1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
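/* Instantiate the convenience emitters: each ALUn(op) expands to an
 * fs_visitor::op() helper that allocates a new fs_inst for the matching
 * BRW_OPCODE_* out of mem_ctx.
 */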
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
182
183 /** Gen4 predicated IF. */
184 fs_inst *
185 fs_visitor::IF(uint32_t predicate)
186 {
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
188 inst->predicate = predicate;
189 return inst;
190 }
191
192 /** Gen6+ IF with embedded comparison. */
193 fs_inst *
194 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
195 {
196 assert(intel->gen >= 6);
197 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
198 reg_null_d, src0, src1);
199 inst->conditional_mod = condition;
200 return inst;
201 }
202
203 /**
204 * CMP: Sets the low bit of the destination channels with the result
205 * of the comparison, while the upper bits are undefined, and updates
206 * the flag register with the packed 16 bits of the result.
207 */
208 fs_inst *
209 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
210 {
211 fs_inst *inst;
212
213 /* Take the instruction:
214 *
215 * CMP null<d> src0<f> src1<f>
216 *
217 * Original gen4 does type conversion to the destination type before
218 * comparison, producing garbage results for floating point comparisons.
219 * gen5 does the comparison on the execution type (resolved source types),
220 * so dst type doesn't matter. gen6 does comparison and then uses the
221 * result as if it was the dst type with no conversion, which happens to
222 * mostly work out for float-interpreted-as-int since our comparisons are
223 * for >0, =0, <0.
224 */
225 if (intel->gen == 4) {
226 dst.type = src0.type;
227 if (dst.file == HW_REG)
228 dst.fixed_hw_reg.type = dst.type;
229 }
230
231 resolve_ud_negate(&src0);
232 resolve_ud_negate(&src1);
233
234 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
235 inst->conditional_mod = condition;
236
237 return inst;
238 }
239
240 exec_list
241 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
242 fs_reg varying_offset,
243 uint32_t const_offset)
244 {
245 exec_list instructions;
246 fs_inst *inst;
247
248 /* We have our constant surface use a pitch of 4 bytes, so our index can
249 * be any component of a vector, and then we load 4 contiguous
250 * components starting from that.
251 *
252 * We break down the const_offset to a portion added to the variable
253 * offset and a portion done using reg_offset, which means that if you
254 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
255 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
256 * CSE can later notice that those loads are all the same and eliminate
257 * the redundant ones.
258 */
259 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
260 instructions.push_tail(ADD(vec4_offset,
261 varying_offset, const_offset & ~3));
262
263 int scale = 1;
264 if (intel->gen == 4 && dispatch_width == 8) {
265 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
266 * u, v, r) as parameters, or we can just use the SIMD16 message
267 * consisting of (header, u). We choose the second, at the cost of a
268 * longer return length.
269 */
270 scale = 2;
271 }
272
273 enum opcode op;
274 if (intel->gen >= 7)
275 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
276 else
277 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
278 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
279 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
280 inst->regs_written = 4 * scale;
281 instructions.push_tail(inst);
282
283 if (intel->gen < 7) {
284 inst->base_mrf = 13;
285 inst->header_present = true;
286 if (intel->gen == 4)
287 inst->mlen = 3;
288 else
289 inst->mlen = 1 + dispatch_width / 8;
290 }
291
292 vec4_result.reg_offset += (const_offset & 3) * scale;
293 instructions.push_tail(MOV(dst, vec4_result));
294
295 return instructions;
296 }
297
298 /**
299 * A helper for MOV generation for fixing up broken hardware SEND dependency
300 * handling.
301 */
302 fs_inst *
303 fs_visitor::DEP_RESOLVE_MOV(int grf)
304 {
305 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
306
307 inst->ir = NULL;
308 inst->annotation = "send dependency resolve";
309
310 /* The caller always wants uncompressed to emit the minimal extra
311 * dependencies, and to avoid having to deal with aligning its regs to 2.
312 */
313 inst->force_uncompressed = true;
314
315 return inst;
316 }
317
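/** Field-by-field comparison of opcode, operands, and message/flag state,
 * for detecting instructions that duplicate one another.
 */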
318 bool
319 fs_inst::equals(fs_inst *inst)
320 {
321 return (opcode == inst->opcode &&
322 dst.equals(inst->dst) &&
323 src[0].equals(inst->src[0]) &&
324 src[1].equals(inst->src[1]) &&
325 src[2].equals(inst->src[2]) &&
326 saturate == inst->saturate &&
327 predicate == inst->predicate &&
328 conditional_mod == inst->conditional_mod &&
329 mlen == inst->mlen &&
330 base_mrf == inst->base_mrf &&
331 sampler == inst->sampler &&
332 target == inst->target &&
333 eot == inst->eot &&
334 header_present == inst->header_present &&
335 shadow_compare == inst->shadow_compare &&
336 offset == inst->offset);
337 }
338
339 bool
340 fs_inst::overwrites_reg(const fs_reg &reg)
341 {
342 return (reg.file == dst.file &&
343 reg.reg == dst.reg &&
344 reg.reg_offset >= dst.reg_offset &&
345 reg.reg_offset < dst.reg_offset + regs_written);
346 }
347
348 bool
349 fs_inst::is_send_from_grf()
350 {
351 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
352 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
353 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
354 src[1].file == GRF));
355 }
356
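/* Source modifiers (negate/abs) can't be used on Gen6 math instructions or
 * on messages whose operands are read directly from the GRF.
 */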
357 bool
358 fs_visitor::can_do_source_mods(fs_inst *inst)
359 {
360 if (intel->gen == 6 && inst->is_math())
361 return false;
362
363 if (inst->is_send_from_grf())
364 return false;
365
366 return true;
367 }
368
369 void
370 fs_reg::init()
371 {
372 memset(this, 0, sizeof(*this));
373 this->smear = -1;
374 }
375
376 /** Generic unset register constructor. */
377 fs_reg::fs_reg()
378 {
379 init();
380 this->file = BAD_FILE;
381 }
382
383 /** Immediate value constructor. */
384 fs_reg::fs_reg(float f)
385 {
386 init();
387 this->file = IMM;
388 this->type = BRW_REGISTER_TYPE_F;
389 this->imm.f = f;
390 }
391
392 /** Immediate value constructor. */
393 fs_reg::fs_reg(int32_t i)
394 {
395 init();
396 this->file = IMM;
397 this->type = BRW_REGISTER_TYPE_D;
398 this->imm.i = i;
399 }
400
401 /** Immediate value constructor. */
402 fs_reg::fs_reg(uint32_t u)
403 {
404 init();
405 this->file = IMM;
406 this->type = BRW_REGISTER_TYPE_UD;
407 this->imm.u = u;
408 }
409
410 /** Fixed brw_reg Immediate value constructor. */
411 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
412 {
413 init();
414 this->file = HW_REG;
415 this->fixed_hw_reg = fixed_hw_reg;
416 this->type = fixed_hw_reg.type;
417 }
418
419 bool
420 fs_reg::equals(const fs_reg &r) const
421 {
422 return (file == r.file &&
423 reg == r.reg &&
424 reg_offset == r.reg_offset &&
425 type == r.type &&
426 negate == r.negate &&
427 abs == r.abs &&
428 !reladdr && !r.reladdr &&
429 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
430 sizeof(fixed_hw_reg)) == 0 &&
431 smear == r.smear &&
432 imm.u == r.imm.u);
433 }
434
435 bool
436 fs_reg::is_zero() const
437 {
438 if (file != IMM)
439 return false;
440
441 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
442 }
443
444 bool
445 fs_reg::is_one() const
446 {
447 if (file != IMM)
448 return false;
449
450 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
451 }
452
453 bool
454 fs_reg::is_valid_3src() const
455 {
456 return file == GRF || file == UNIFORM;
457 }
458
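/** Returns the number of virtual GRF slots a value of the given GLSL type
 * occupies: one slot per scalar component, with samplers taking no space.
 */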
459 int
460 fs_visitor::type_size(const struct glsl_type *type)
461 {
462 unsigned int size, i;
463
464 switch (type->base_type) {
465 case GLSL_TYPE_UINT:
466 case GLSL_TYPE_INT:
467 case GLSL_TYPE_FLOAT:
468 case GLSL_TYPE_BOOL:
469 return type->components();
470 case GLSL_TYPE_ARRAY:
471 return type_size(type->fields.array) * type->length;
472 case GLSL_TYPE_STRUCT:
473 size = 0;
474 for (i = 0; i < type->length; i++) {
475 size += type_size(type->fields.structure[i].type);
476 }
477 return size;
478 case GLSL_TYPE_SAMPLER:
479 /* Samplers take up no register space, since they're baked in at
480 * link time.
481 */
482 return 0;
483 case GLSL_TYPE_VOID:
484 case GLSL_TYPE_ERROR:
485 case GLSL_TYPE_INTERFACE:
486 assert(!"not reached");
487 break;
488 }
489
490 return 0;
491 }
492
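/* Reads the architecture TIMESTAMP register into a virtual GRF for the
 * shader-time measurement code; the comments below describe which fields of
 * the register are meaningful.
 */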
493 fs_reg
494 fs_visitor::get_timestamp()
495 {
496 assert(intel->gen >= 7);
497
498 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
499 BRW_ARF_TIMESTAMP,
500 0),
501 BRW_REGISTER_TYPE_UD));
502
503 fs_reg dst = fs_reg(this, glsl_type::uint_type);
504
505 fs_inst *mov = emit(MOV(dst, ts));
506 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
507 * even if it's not enabled in the dispatch.
508 */
509 mov->force_writemask_all = true;
510 mov->force_uncompressed = true;
511
512 /* The caller wants the low 32 bits of the timestamp. Since it's running
513 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
514 * which is plenty of time for our purposes. It is identical across the
515 * EUs, but since it's tracking GPU core speed it will increment at a
516 * varying rate as render P-states change.
517 *
518 * The caller could also check if render P-states have changed (or anything
519 * else that might disrupt timing) by setting smear to 2 and checking if
520 * that field is != 0.
521 */
522 dst.smear = 0;
523
524 return dst;
525 }
526
527 void
528 fs_visitor::emit_shader_time_begin()
529 {
530 current_annotation = "shader time start";
531 shader_start_time = get_timestamp();
532 }
533
534 void
535 fs_visitor::emit_shader_time_end()
536 {
537 current_annotation = "shader time end";
538
539 enum shader_time_shader_type type, written_type, reset_type;
540 if (dispatch_width == 8) {
541 type = ST_FS8;
542 written_type = ST_FS8_WRITTEN;
543 reset_type = ST_FS8_RESET;
544 } else {
545 assert(dispatch_width == 16);
546 type = ST_FS16;
547 written_type = ST_FS16_WRITTEN;
548 reset_type = ST_FS16_RESET;
549 }
550
551 fs_reg shader_end_time = get_timestamp();
552
553 /* Check that there weren't any timestamp reset events (assuming these
554 * were the only two timestamp reads that happened).
555 */
556 fs_reg reset = shader_end_time;
557 reset.smear = 2;
558 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
559 test->conditional_mod = BRW_CONDITIONAL_Z;
560 emit(IF(BRW_PREDICATE_NORMAL));
561
562 push_force_uncompressed();
563 fs_reg start = shader_start_time;
564 start.negate = true;
565 fs_reg diff = fs_reg(this, glsl_type::uint_type);
566 emit(ADD(diff, start, shader_end_time));
567
568 /* If there were no instructions between the two timestamp gets, the diff
569 * is 2 cycles. Remove that overhead, so I can forget about that when
570 * trying to determine the time taken for single instructions.
571 */
572 emit(ADD(diff, diff, fs_reg(-2u)));
573
574 emit_shader_time_write(type, diff);
575 emit_shader_time_write(written_type, fs_reg(1u));
576 emit(BRW_OPCODE_ELSE);
577 emit_shader_time_write(reset_type, fs_reg(1u));
578 emit(BRW_OPCODE_ENDIF);
579
580 pop_force_uncompressed();
581 }
582
583 void
584 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
585 fs_reg value)
586 {
587 int shader_time_index =
588 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
589 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
590
591 fs_reg payload;
592 if (dispatch_width == 8)
593 payload = fs_reg(this, glsl_type::uvec2_type);
594 else
595 payload = fs_reg(this, glsl_type::uint_type);
596
597 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
598 fs_reg(), payload, offset, value));
599 }
600
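/** Marks the compile as failed and records a printf-formatted reason. Only
 * the first failure message is kept, and it is printed when DEBUG_WM is set.
 */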
601 void
602 fs_visitor::fail(const char *format, ...)
603 {
604 va_list va;
605 char *msg;
606
607 if (failed)
608 return;
609
610 failed = true;
611
612 va_start(va, format);
613 msg = ralloc_vasprintf(mem_ctx, format, va);
614 va_end(va);
615 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
616
617 this->fail_msg = msg;
618
619 if (INTEL_DEBUG & DEBUG_WM) {
620 fprintf(stderr, "%s", msg);
621 }
622 }
623
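/* Convenience emit() overloads: each builds an fs_inst for the given opcode
 * and operands and hands it to the fs_inst-taking emit(), which appends it
 * to the instruction stream.
 */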
624 fs_inst *
625 fs_visitor::emit(enum opcode opcode)
626 {
627 return emit(fs_inst(opcode));
628 }
629
630 fs_inst *
631 fs_visitor::emit(enum opcode opcode, fs_reg dst)
632 {
633 return emit(fs_inst(opcode, dst));
634 }
635
636 fs_inst *
637 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
638 {
639 return emit(fs_inst(opcode, dst, src0));
640 }
641
642 fs_inst *
643 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
644 {
645 return emit(fs_inst(opcode, dst, src0, src1));
646 }
647
648 fs_inst *
649 fs_visitor::emit(enum opcode opcode, fs_reg dst,
650 fs_reg src0, fs_reg src1, fs_reg src2)
651 {
652 return emit(fs_inst(opcode, dst, src0, src1, src2));
653 }
654
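/* Nesting counters for regions that must be emitted uncompressed (SIMD8) or
 * on the second half of a SIMD16 dispatch; while a counter is nonzero, newly
 * emitted instructions get the corresponding force_* flag set.
 */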
655 void
656 fs_visitor::push_force_uncompressed()
657 {
658 force_uncompressed_stack++;
659 }
660
661 void
662 fs_visitor::pop_force_uncompressed()
663 {
664 force_uncompressed_stack--;
665 assert(force_uncompressed_stack >= 0);
666 }
667
668 void
669 fs_visitor::push_force_sechalf()
670 {
671 force_sechalf_stack++;
672 }
673
674 void
675 fs_visitor::pop_force_sechalf()
676 {
677 force_sechalf_stack--;
678 assert(force_sechalf_stack >= 0);
679 }
680
681 /**
682 * Returns true if the instruction has a flag that means it won't
683 * update an entire destination register.
684 *
685 * For example, dead code elimination and live variable analysis want to know
686 * when a write to a variable screens off any preceding values that were in
687 * it.
688 */
689 bool
690 fs_inst::is_partial_write()
691 {
692 return (this->predicate ||
693 this->force_uncompressed ||
694 this->force_sechalf);
695 }
696
697 /**
698 * Returns how many MRFs an FS opcode will write over.
699 *
700 * Note that this is not the 0 or 1 implied writes in an actual gen
701 * instruction -- the FS opcodes often generate MOVs in addition.
702 */
703 int
704 fs_visitor::implied_mrf_writes(fs_inst *inst)
705 {
706 if (inst->mlen == 0)
707 return 0;
708
709 switch (inst->opcode) {
710 case SHADER_OPCODE_RCP:
711 case SHADER_OPCODE_RSQ:
712 case SHADER_OPCODE_SQRT:
713 case SHADER_OPCODE_EXP2:
714 case SHADER_OPCODE_LOG2:
715 case SHADER_OPCODE_SIN:
716 case SHADER_OPCODE_COS:
717 return 1 * dispatch_width / 8;
718 case SHADER_OPCODE_POW:
719 case SHADER_OPCODE_INT_QUOTIENT:
720 case SHADER_OPCODE_INT_REMAINDER:
721 return 2 * dispatch_width / 8;
722 case SHADER_OPCODE_TEX:
723 case FS_OPCODE_TXB:
724 case SHADER_OPCODE_TXD:
725 case SHADER_OPCODE_TXF:
726 case SHADER_OPCODE_TXF_MS:
727 case SHADER_OPCODE_TXL:
728 case SHADER_OPCODE_TXS:
729 case SHADER_OPCODE_LOD:
730 return 1;
731 case FS_OPCODE_FB_WRITE:
732 return 2;
733 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
734 case FS_OPCODE_UNSPILL:
735 return 1;
736 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
737 return inst->mlen;
738 case FS_OPCODE_SPILL:
739 return 2;
740 default:
741 assert(!"not reached");
742 return inst->mlen;
743 }
744 }
745
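/** Allocates a new virtual GRF of the given size in registers, growing the
 * size-tracking array as needed, and returns its index.
 */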
746 int
747 fs_visitor::virtual_grf_alloc(int size)
748 {
749 if (virtual_grf_array_size <= virtual_grf_count) {
750 if (virtual_grf_array_size == 0)
751 virtual_grf_array_size = 16;
752 else
753 virtual_grf_array_size *= 2;
754 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
755 virtual_grf_array_size);
756 }
757 virtual_grf_sizes[virtual_grf_count] = size;
758 return virtual_grf_count++;
759 }
760
761 /** Fixed HW reg constructor. */
762 fs_reg::fs_reg(enum register_file file, int reg)
763 {
764 init();
765 this->file = file;
766 this->reg = reg;
767 this->type = BRW_REGISTER_TYPE_F;
768 }
769
770 /** Fixed HW reg constructor. */
771 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
772 {
773 init();
774 this->file = file;
775 this->reg = reg;
776 this->type = type;
777 }
778
779 /** Automatic reg constructor. */
780 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
781 {
782 init();
783
784 this->file = GRF;
785 this->reg = v->virtual_grf_alloc(v->type_size(type));
786 this->reg_offset = 0;
787 this->type = brw_type_for_base_type(type);
788 }
789
790 fs_reg *
791 fs_visitor::variable_storage(ir_variable *var)
792 {
793 return (fs_reg *)hash_table_find(this->variable_ht, var);
794 }
795
796 void
797 import_uniforms_callback(const void *key,
798 void *data,
799 void *closure)
800 {
801 struct hash_table *dst_ht = (struct hash_table *)closure;
802 const fs_reg *reg = (const fs_reg *)data;
803
804 if (reg->file != UNIFORM)
805 return;
806
807 hash_table_insert(dst_ht, data, key);
808 }
809
810 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
811 * This brings in those uniform definitions
812 */
813 void
814 fs_visitor::import_uniforms(fs_visitor *v)
815 {
816 hash_table_call_foreach(v->variable_ht,
817 import_uniforms_callback,
818 variable_ht);
819 this->params_remap = v->params_remap;
820 this->nr_params_remap = v->nr_params_remap;
821 }
822
823 /* Our support for uniforms is piggy-backed on the struct
824 * gl_fragment_program, because that's where the values actually
825 * get stored, rather than in some global gl_shader_program uniform
826 * store.
827 */
828 void
829 fs_visitor::setup_uniform_values(ir_variable *ir)
830 {
831 int namelen = strlen(ir->name);
832
833 /* The data for our (non-builtin) uniforms is stored in a series of
834 * gl_uniform_driver_storage structs for each subcomponent that
835 * glGetUniformLocation() could name. We know it's been set up in the same
836 * order we'd walk the type, so walk the list of storage and find anything
837 * with our name, or the prefix of a component that starts with our name.
838 */
839 unsigned params_before = c->prog_data.nr_params;
840 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
841 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
842
843 if (strncmp(ir->name, storage->name, namelen) != 0 ||
844 (storage->name[namelen] != 0 &&
845 storage->name[namelen] != '.' &&
846 storage->name[namelen] != '[')) {
847 continue;
848 }
849
850 unsigned slots = storage->type->component_slots();
851 if (storage->array_elements)
852 slots *= storage->array_elements;
853
854 for (unsigned i = 0; i < slots; i++) {
855 c->prog_data.param[c->prog_data.nr_params++] =
856 &storage->storage[i].f;
857 }
858 }
859
860 /* Make sure we actually initialized the right amount of stuff here. */
861 assert(params_before + ir->type->component_slots() ==
862 c->prog_data.nr_params);
863 (void)params_before;
864 }
865
866
867 /* Our support for builtin uniforms is even scarier than non-builtin.
868 * It sits on top of the PROG_STATE_VAR parameters that are
869 * automatically updated from GL context state.
870 */
871 void
872 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
873 {
874 const ir_state_slot *const slots = ir->state_slots;
875 assert(ir->state_slots != NULL);
876
877 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
878 /* This state reference has already been setup by ir_to_mesa, but we'll
879 * get the same index back here.
880 */
881 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
882 (gl_state_index *)slots[i].tokens);
883
884 /* Add each of the unique swizzles of the element as a parameter.
885 * This'll end up matching the expected layout of the
886 * array/matrix/structure we're trying to fill in.
887 */
888 int last_swiz = -1;
889 for (unsigned int j = 0; j < 4; j++) {
890 int swiz = GET_SWZ(slots[i].swizzle, j);
891 if (swiz == last_swiz)
892 break;
893 last_swiz = swiz;
894
895 c->prog_data.param[c->prog_data.nr_params++] =
896 &fp->Base.Parameters->ParameterValues[index][swiz].f;
897 }
898 }
899 }
900
901 fs_reg *
902 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
903 {
904 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
905 fs_reg wpos = *reg;
906 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
907
908 /* gl_FragCoord.x */
909 if (ir->pixel_center_integer) {
910 emit(MOV(wpos, this->pixel_x));
911 } else {
912 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
913 }
914 wpos.reg_offset++;
915
916 /* gl_FragCoord.y */
917 if (!flip && ir->pixel_center_integer) {
918 emit(MOV(wpos, this->pixel_y));
919 } else {
920 fs_reg pixel_y = this->pixel_y;
921 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
922
923 if (flip) {
924 pixel_y.negate = true;
925 offset += c->key.drawable_height - 1.0;
926 }
927
928 emit(ADD(wpos, pixel_y, fs_reg(offset)));
929 }
930 wpos.reg_offset++;
931
932 /* gl_FragCoord.z */
933 if (intel->gen >= 6) {
934 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
935 } else {
936 emit(FS_OPCODE_LINTERP, wpos,
937 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
938 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 interp_reg(VARYING_SLOT_POS, 2));
940 }
941 wpos.reg_offset++;
942
943 /* gl_FragCoord.w: Already set up in emit_interpolation */
944 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
945
946 return reg;
947 }
948
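/* Emits the LINTERP for one attribute channel, picking the barycentric
 * coordinate set (perspective vs. noperspective, pixel vs. centroid) that
 * matches the interpolation qualifier on Gen6+; older hardware only has
 * perspective pixel barycentrics.
 */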
949 fs_inst *
950 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
951 glsl_interp_qualifier interpolation_mode,
952 bool is_centroid)
953 {
954 brw_wm_barycentric_interp_mode barycoord_mode;
955 if (intel->gen >= 6) {
956 if (is_centroid) {
957 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
958 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
959 else
960 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
961 } else {
962 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
963 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
964 else
965 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
966 }
967 } else {
968 /* On Ironlake and below, there is only one interpolation mode.
969 * Centroid interpolation doesn't mean anything on this hardware --
970 * there is no multisampling.
971 */
972 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
973 }
974 return emit(FS_OPCODE_LINTERP, attr,
975 this->delta_x[barycoord_mode],
976 this->delta_y[barycoord_mode], interp);
977 }
978
979 fs_reg *
980 fs_visitor::emit_general_interpolation(ir_variable *ir)
981 {
982 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
983 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
984 fs_reg attr = *reg;
985
986 unsigned int array_elements;
987 const glsl_type *type;
988
989 if (ir->type->is_array()) {
990 array_elements = ir->type->length;
991 if (array_elements == 0) {
992 fail("dereferenced array '%s' has length 0\n", ir->name);
993 }
994 type = ir->type->fields.array;
995 } else {
996 array_elements = 1;
997 type = ir->type;
998 }
999
1000 glsl_interp_qualifier interpolation_mode =
1001 ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003 int location = ir->location;
1004 for (unsigned int i = 0; i < array_elements; i++) {
1005 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006 if (urb_setup[location] == -1) {
1007 /* If there's no incoming setup data for this slot, don't
1008 * emit interpolation for it.
1009 */
1010 attr.reg_offset += type->vector_elements;
1011 location++;
1012 continue;
1013 }
1014
1015 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016 /* Constant interpolation (flat shading) case. The SF has
1017 * handed us defined values in only the constant offset
1018 * field of the setup reg.
1019 */
1020 for (unsigned int k = 0; k < type->vector_elements; k++) {
1021 struct brw_reg interp = interp_reg(location, k);
1022 interp = suboffset(interp, 3);
1023 interp.type = reg->type;
1024 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025 attr.reg_offset++;
1026 }
1027 } else {
1028 /* Smooth/noperspective interpolation case. */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 /* FINISHME: At some point we probably want to push
1031 * this farther by giving similar treatment to the
1032 * other potentially constant components of the
1033 * attribute, as well as making brw_vs_constval.c
1034 * handle varyings other than gl_TexCoord.
1035 */
1036 struct brw_reg interp = interp_reg(location, k);
1037 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1038 ir->centroid);
1039 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1040 /* Get the pixel/sample mask into f0 so that we know
1041 * which pixels are lit. Then, for each channel that is
1042 * unlit, replace the centroid data with non-centroid
1043 * data.
1044 */
1045 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1046 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1047 interpolation_mode, false);
1048 inst->predicate = BRW_PREDICATE_NORMAL;
1049 inst->predicate_inverse = true;
1050 }
1051 if (intel->gen < 6) {
1052 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1053 }
1054 attr.reg_offset++;
1055 }
1056
1057 }
1058 location++;
1059 }
1060 }
1061
1062 return reg;
1063 }
1064
1065 fs_reg *
1066 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1067 {
1068 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1069
1070 /* The frontfacing comes in as a bit in the thread payload. */
1071 if (intel->gen >= 6) {
1072 emit(BRW_OPCODE_ASR, *reg,
1073 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1074 fs_reg(15));
1075 emit(BRW_OPCODE_NOT, *reg, *reg);
1076 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1077 } else {
1078 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1079 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1080 * us front face
1081 */
1082 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1083 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1084 }
1085
1086 return reg;
1087 }
1088
1089 fs_reg
1090 fs_visitor::fix_math_operand(fs_reg src)
1091 {
1092 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1093 * might be able to do better by doing execsize = 1 math and then
1094 * expanding that result out, but we would need to be careful with
1095 * masking.
1096 *
1097 * The hardware ignores source modifiers (negate and abs) on math
1098 * instructions, so we also move to a temp to set those up.
1099 */
1100 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1101 !src.abs && !src.negate)
1102 return src;
1103
1104 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1105 * operands to math
1106 */
1107 if (intel->gen >= 7 && src.file != IMM)
1108 return src;
1109
1110 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1111 expanded.type = src.type;
1112 emit(BRW_OPCODE_MOV, expanded, src);
1113 return expanded;
1114 }
1115
1116 fs_inst *
1117 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1118 {
1119 switch (opcode) {
1120 case SHADER_OPCODE_RCP:
1121 case SHADER_OPCODE_RSQ:
1122 case SHADER_OPCODE_SQRT:
1123 case SHADER_OPCODE_EXP2:
1124 case SHADER_OPCODE_LOG2:
1125 case SHADER_OPCODE_SIN:
1126 case SHADER_OPCODE_COS:
1127 break;
1128 default:
1129 assert(!"not reached: bad math opcode");
1130 return NULL;
1131 }
1132
1133 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1134 * might be able to do better by doing execsize = 1 math and then
1135 * expanding that result out, but we would need to be careful with
1136 * masking.
1137 *
1138 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1139 * instructions, so we also move to a temp to set those up.
1140 */
1141 if (intel->gen >= 6)
1142 src = fix_math_operand(src);
1143
1144 fs_inst *inst = emit(opcode, dst, src);
1145
1146 if (intel->gen < 6) {
1147 inst->base_mrf = 2;
1148 inst->mlen = dispatch_width / 8;
1149 }
1150
1151 return inst;
1152 }
1153
1154 fs_inst *
1155 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1156 {
1157 int base_mrf = 2;
1158 fs_inst *inst;
1159
1160 switch (opcode) {
1161 case SHADER_OPCODE_INT_QUOTIENT:
1162 case SHADER_OPCODE_INT_REMAINDER:
1163 if (intel->gen >= 7 && dispatch_width == 16)
1164 fail("16-wide INTDIV unsupported\n");
1165 break;
1166 case SHADER_OPCODE_POW:
1167 break;
1168 default:
1169 assert(!"not reached: unsupported binary math opcode.");
1170 return NULL;
1171 }
1172
1173 if (intel->gen >= 6) {
1174 src0 = fix_math_operand(src0);
1175 src1 = fix_math_operand(src1);
1176
1177 inst = emit(opcode, dst, src0, src1);
1178 } else {
1179 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1180 * "Message Payload":
1181 *
1182 * "Operand0[7]. For the INT DIV functions, this operand is the
1183 * denominator."
1184 * ...
1185 * "Operand1[7]. For the INT DIV functions, this operand is the
1186 * numerator."
1187 */
1188 bool is_int_div = opcode != SHADER_OPCODE_POW;
1189 fs_reg &op0 = is_int_div ? src1 : src0;
1190 fs_reg &op1 = is_int_div ? src0 : src1;
1191
1192 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1193 inst = emit(opcode, dst, op0, reg_null_f);
1194
1195 inst->base_mrf = base_mrf;
1196 inst->mlen = 2 * dispatch_width / 8;
1197 }
1198 return inst;
1199 }
1200
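/* Sets up push constants (the CURBE): records how many registers of
 * uniforms are pushed into the payload and rewrites UNIFORM-file sources to
 * the fixed hardware registers where those values land.
 */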
1201 void
1202 fs_visitor::assign_curb_setup()
1203 {
1204 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1205 if (dispatch_width == 8) {
1206 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1207 } else {
1208 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1209 }
1210
1211 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1212 foreach_list(node, &this->instructions) {
1213 fs_inst *inst = (fs_inst *)node;
1214
1215 for (unsigned int i = 0; i < 3; i++) {
1216 if (inst->src[i].file == UNIFORM) {
1217 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1218 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1219 constant_nr / 8,
1220 constant_nr % 8);
1221
1222 inst->src[i].file = HW_REG;
1223 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1224 }
1225 }
1226 }
1227 }
1228
1229 void
1230 fs_visitor::calculate_urb_setup()
1231 {
1232 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1233 urb_setup[i] = -1;
1234 }
1235
1236 int urb_next = 0;
1237 /* Figure out where each of the incoming setup attributes lands. */
1238 if (intel->gen >= 6) {
1239 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1241 urb_setup[i] = urb_next++;
1242 }
1243 }
1244 } else {
1245 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1246 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247 /* Point size is packed into the header, not as a general attribute */
1248 if (i == VARYING_SLOT_PSIZ)
1249 continue;
1250
1251 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1252 /* The back color slot is skipped when the front color is
1253 * also written to. In addition, some slots can be
1254 * written in the vertex shader and not read in the
1255 * fragment shader. So the register number must always be
1256 * incremented, mapped or not.
1257 */
1258 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1259 urb_setup[i] = urb_next;
1260 urb_next++;
1261 }
1262 }
1263
1264 /*
1265 * It's an FS-only attribute, and we did interpolation for this attribute
1266 * in the SF thread. So, count it here, too.
1267 *
1268 * See compile_sf_prog() for more info.
1269 */
1270 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1271 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1272 }
1273
1274 /* Each attribute is 4 setup channels, each of which is half a reg. */
1275 c->prog_data.urb_read_length = urb_next * 2;
1276 }
1277
1278 void
1279 fs_visitor::assign_urb_setup()
1280 {
1281 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1282
1283 /* Offset all the urb_setup[] index by the actual position of the
1284 * setup regs, now that the location of the constants has been chosen.
1285 */
1286 foreach_list(node, &this->instructions) {
1287 fs_inst *inst = (fs_inst *)node;
1288
1289 if (inst->opcode == FS_OPCODE_LINTERP) {
1290 assert(inst->src[2].file == HW_REG);
1291 inst->src[2].fixed_hw_reg.nr += urb_start;
1292 }
1293
1294 if (inst->opcode == FS_OPCODE_CINTERP) {
1295 assert(inst->src[0].file == HW_REG);
1296 inst->src[0].fixed_hw_reg.nr += urb_start;
1297 }
1298 }
1299
1300 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1301 }
1302
1303 /**
1304 * Split large virtual GRFs into separate components if we can.
1305 *
1306 * This is mostly duplicated with what brw_fs_vector_splitting does,
1307 * but that's really conservative because it's afraid of doing
1308 * splitting that doesn't result in real progress after the rest of
1309 * the optimization phases, which would cause infinite looping in
1310 * optimization. We can do it once here, safely. This also has the
1311 * opportunity to split interpolated values, or maybe even uniforms,
1312 * which we don't have at the IR level.
1313 *
1314 * We want to split, because virtual GRFs are what we register
1315 * allocate and spill (due to contiguousness requirements for some
1316 * instructions), and they're what we naturally generate in the
1317 * codegen process, but most virtual GRFs don't actually need to be
1318 * contiguous sets of GRFs. If we split, we'll end up with reduced
1319 * live intervals and better dead code elimination and coalescing.
1320 */
1321 void
1322 fs_visitor::split_virtual_grfs()
1323 {
1324 int num_vars = this->virtual_grf_count;
1325 bool split_grf[num_vars];
1326 int new_virtual_grf[num_vars];
1327
1328 /* Try to split anything > 0 sized. */
1329 for (int i = 0; i < num_vars; i++) {
1330 if (this->virtual_grf_sizes[i] != 1)
1331 split_grf[i] = true;
1332 else
1333 split_grf[i] = false;
1334 }
1335
1336 if (brw->has_pln &&
1337 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1338 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1339 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1340 * Gen6, that was the only supported interpolation mode, and since Gen6,
1341 * delta_x and delta_y are in fixed hardware registers.
1342 */
1343 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1344 false;
1345 }
1346
1347 foreach_list(node, &this->instructions) {
1348 fs_inst *inst = (fs_inst *)node;
1349
1350 /* If there's a SEND message that requires contiguous destination
1351 * registers, no splitting is allowed.
1352 */
1353 if (inst->regs_written > 1) {
1354 split_grf[inst->dst.reg] = false;
1355 }
1356
1357 /* If we're sending from a GRF, don't split it, on the assumption that
1358 * the send is reading the whole thing.
1359 */
1360 if (inst->is_send_from_grf()) {
1361 split_grf[inst->src[0].reg] = false;
1362 }
1363 }
1364
1365 /* Allocate new space for split regs. Note that the virtual
1366 * numbers will be contiguous.
1367 */
1368 for (int i = 0; i < num_vars; i++) {
1369 if (split_grf[i]) {
1370 new_virtual_grf[i] = virtual_grf_alloc(1);
1371 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1372 int reg = virtual_grf_alloc(1);
1373 assert(reg == new_virtual_grf[i] + j - 1);
1374 (void) reg;
1375 }
1376 this->virtual_grf_sizes[i] = 1;
1377 }
1378 }
1379
1380 foreach_list(node, &this->instructions) {
1381 fs_inst *inst = (fs_inst *)node;
1382
1383 if (inst->dst.file == GRF &&
1384 split_grf[inst->dst.reg] &&
1385 inst->dst.reg_offset != 0) {
1386 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1387 inst->dst.reg_offset - 1);
1388 inst->dst.reg_offset = 0;
1389 }
1390 for (int i = 0; i < 3; i++) {
1391 if (inst->src[i].file == GRF &&
1392 split_grf[inst->src[i].reg] &&
1393 inst->src[i].reg_offset != 0) {
1394 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1395 inst->src[i].reg_offset - 1);
1396 inst->src[i].reg_offset = 0;
1397 }
1398 }
1399 }
1400 this->live_intervals_valid = false;
1401 }
1402
1403 /**
1404 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1405 *
1406 * During code generation, we create tons of temporary variables, many of
1407 * which get immediately killed and are never used again. Yet, in later
1408 * optimization and analysis passes, such as compute_live_intervals, we need
1409 * to loop over all the virtual GRFs. Compacting them can save a lot of
1410 * overhead.
1411 */
1412 void
1413 fs_visitor::compact_virtual_grfs()
1414 {
1415 /* Mark which virtual GRFs are used, and count how many. */
1416 int remap_table[this->virtual_grf_count];
1417 memset(remap_table, -1, sizeof(remap_table));
1418
1419 foreach_list(node, &this->instructions) {
1420 const fs_inst *inst = (const fs_inst *) node;
1421
1422 if (inst->dst.file == GRF)
1423 remap_table[inst->dst.reg] = 0;
1424
1425 for (int i = 0; i < 3; i++) {
1426 if (inst->src[i].file == GRF)
1427 remap_table[inst->src[i].reg] = 0;
1428 }
1429 }
1430
1431 /* In addition to registers used in instructions, fs_visitor keeps
1432 * direct references to certain special values which must be patched:
1433 */
1434 fs_reg *special[] = {
1435 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1436 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1437 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1438 &delta_x[0], &delta_x[1], &delta_x[2],
1439 &delta_x[3], &delta_x[4], &delta_x[5],
1440 &delta_y[0], &delta_y[1], &delta_y[2],
1441 &delta_y[3], &delta_y[4], &delta_y[5],
1442 };
1443 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1444 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1445
1446 /* Treat all special values as used, to be conservative */
1447 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1448 if (special[i]->file == GRF)
1449 remap_table[special[i]->reg] = 0;
1450 }
1451
1452 /* Compact the GRF arrays. */
1453 int new_index = 0;
1454 for (int i = 0; i < this->virtual_grf_count; i++) {
1455 if (remap_table[i] != -1) {
1456 remap_table[i] = new_index;
1457 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1458 if (live_intervals_valid) {
1459 virtual_grf_start[new_index] = virtual_grf_start[i];
1460 virtual_grf_end[new_index] = virtual_grf_end[i];
1461 }
1462 ++new_index;
1463 }
1464 }
1465
1466 this->virtual_grf_count = new_index;
1467
1468 /* Patch all the instructions to use the newly renumbered registers */
1469 foreach_list(node, &this->instructions) {
1470 fs_inst *inst = (fs_inst *) node;
1471
1472 if (inst->dst.file == GRF)
1473 inst->dst.reg = remap_table[inst->dst.reg];
1474
1475 for (int i = 0; i < 3; i++) {
1476 if (inst->src[i].file == GRF)
1477 inst->src[i].reg = remap_table[inst->src[i].reg];
1478 }
1479 }
1480
1481 /* Patch all the references to special values */
1482 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1483 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1484 special[i]->reg = remap_table[special[i]->reg];
1485 }
1486 }
1487
1488 bool
1489 fs_visitor::remove_dead_constants()
1490 {
1491 if (dispatch_width == 8) {
1492 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1493 this->nr_params_remap = c->prog_data.nr_params;
1494
1495 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1496 this->params_remap[i] = -1;
1497
1498 /* Find which params are still in use. */
1499 foreach_list(node, &this->instructions) {
1500 fs_inst *inst = (fs_inst *)node;
1501
1502 for (int i = 0; i < 3; i++) {
1503 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1504
1505 if (inst->src[i].file != UNIFORM)
1506 continue;
1507
1508 /* Section 5.11 of the OpenGL 4.3 spec says:
1509 *
1510 * "Out-of-bounds reads return undefined values, which include
1511 * values from other variables of the active program or zero."
1512 */
1513 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1514 constant_nr = 0;
1515 }
1516
1517 /* For now, set this to non-negative. We'll give it the
1518 * actual new number in a moment, in order to keep the
1519 * register numbers nicely ordered.
1520 */
1521 this->params_remap[constant_nr] = 0;
1522 }
1523 }
1524
1525 /* Figure out what the new numbers for the params will be. At some
1526 * point when we're doing uniform array access, we're going to want
1527 * to keep the distinction between .reg and .reg_offset, but for
1528 * now we don't care.
1529 */
1530 unsigned int new_nr_params = 0;
1531 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1532 if (this->params_remap[i] != -1) {
1533 this->params_remap[i] = new_nr_params++;
1534 }
1535 }
1536
1537 /* Update the list of params to be uploaded to match our new numbering. */
1538 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1539 int remapped = this->params_remap[i];
1540
1541 if (remapped == -1)
1542 continue;
1543
1544 c->prog_data.param[remapped] = c->prog_data.param[i];
1545 }
1546
1547 c->prog_data.nr_params = new_nr_params;
1548 } else {
1549 /* This should have been generated in the 8-wide pass already. */
1550 assert(this->params_remap);
1551 }
1552
1553 /* Now do the renumbering of the shader to remove unused params. */
1554 foreach_list(node, &this->instructions) {
1555 fs_inst *inst = (fs_inst *)node;
1556
1557 for (int i = 0; i < 3; i++) {
1558 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1559
1560 if (inst->src[i].file != UNIFORM)
1561 continue;
1562
1563 /* As above, alias out-of-bounds constants to 0. */
1564 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1565 constant_nr = 0;
1566 }
1567 assert(this->params_remap[constant_nr] != -1);
1568 inst->src[i].reg = this->params_remap[constant_nr];
1569 inst->src[i].reg_offset = 0;
1570 }
1571 }
1572
1573 return true;
1574 }
1575
1576 /*
1577 * Implements array access of uniforms by inserting a
1578 * PULL_CONSTANT_LOAD instruction.
1579 *
1580 * Unlike temporary GRF array access (where we don't support it due to
1581 * the difficulty of doing relative addressing on instruction
1582 * destinations), we could potentially do array access of uniforms
1583 * that were loaded in GRF space as push constants. In real-world
1584 * usage we've seen, though, the arrays being used are always larger
1585 * than we could load as push constants, so just always move all
1586 * uniform array access out to a pull constant buffer.
1587 */
1588 void
1589 fs_visitor::move_uniform_array_access_to_pull_constants()
1590 {
1591 int pull_constant_loc[c->prog_data.nr_params];
1592
1593 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1594 pull_constant_loc[i] = -1;
1595 }
1596
1597 /* Walk through and find array access of uniforms. Put a copy of that
1598 * uniform in the pull constant buffer.
1599 *
1600 * Note that we don't move constant-indexed accesses to arrays. No
1601 * testing has been done of the performance impact of this choice.
1602 */
1603 foreach_list_safe(node, &this->instructions) {
1604 fs_inst *inst = (fs_inst *)node;
1605
1606 for (int i = 0 ; i < 3; i++) {
1607 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1608 continue;
1609
1610 int uniform = inst->src[i].reg;
1611
1612 /* If this array isn't already present in the pull constant buffer,
1613 * add it.
1614 */
1615 if (pull_constant_loc[uniform] == -1) {
1616 const float **values = &c->prog_data.param[uniform];
1617
1618 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1619
1620 assert(param_size[uniform]);
1621
1622 for (int j = 0; j < param_size[uniform]; j++) {
1623 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1624 values[j];
1625 }
1626 }
1627
1628 /* Set up the annotation tracking for new generated instructions. */
1629 base_ir = inst->ir;
1630 current_annotation = inst->annotation;
1631
1632 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1633 fs_reg temp = fs_reg(this, glsl_type::float_type);
1634 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1635 surf_index,
1636 *inst->src[i].reladdr,
1637 pull_constant_loc[uniform] +
1638 inst->src[i].reg_offset);
1639 inst->insert_before(&list);
1640
1641 inst->src[i].file = temp.file;
1642 inst->src[i].reg = temp.reg;
1643 inst->src[i].reg_offset = temp.reg_offset;
1644 inst->src[i].reladdr = NULL;
1645 }
1646 }
1647 }
1648
1649 /**
1650 * Choose accesses from the UNIFORM file to demote to using the pull
1651 * constant buffer.
1652 *
1653 * We allow a fragment shader to have more than the specified minimum
1654 * maximum number of fragment shader uniform components (64). If
1655 * there are too many of these, they'd fill up all of register space.
1656 * So, this will push some of them out to the pull constant buffer and
1657 * update the program to load them.
1658 */
1659 void
1660 fs_visitor::setup_pull_constants()
1661 {
1662 /* Only allow 16 registers (128 uniform components) as push constants. */
1663 unsigned int max_uniform_components = 16 * 8;
1664 if (c->prog_data.nr_params <= max_uniform_components)
1665 return;
1666
1667 if (dispatch_width == 16) {
1668 fail("Pull constants not supported in 16-wide\n");
1669 return;
1670 }
1671
1672 /* Just demote the end of the list. We could probably do better
1673 * here, demoting things that are rarely used in the program first.
1674 */
1675 unsigned int pull_uniform_base = max_uniform_components;
1676
1677 int pull_constant_loc[c->prog_data.nr_params];
1678 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1679 if (i < pull_uniform_base) {
1680 pull_constant_loc[i] = -1;
1681 } else {
1682 pull_constant_loc[i] = -1;
1683 /* If our constant is already being uploaded for reladdr purposes,
1684 * reuse it.
1685 */
1686 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1687 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1688 pull_constant_loc[i] = j;
1689 break;
1690 }
1691 }
1692 if (pull_constant_loc[i] == -1) {
1693 int pull_index = c->prog_data.nr_pull_params++;
1694 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1695 pull_constant_loc[i] = pull_index;
1696 }
1697 }
1698 }
1699 c->prog_data.nr_params = pull_uniform_base;
1700
1701 foreach_list(node, &this->instructions) {
1702 fs_inst *inst = (fs_inst *)node;
1703
1704 for (int i = 0; i < 3; i++) {
1705 if (inst->src[i].file != UNIFORM)
1706 continue;
1707
1708 int pull_index = pull_constant_loc[inst->src[i].reg +
1709 inst->src[i].reg_offset];
1710 if (pull_index == -1)
1711 continue;
1712
1713 assert(!inst->src[i].reladdr);
1714
1715 fs_reg dst = fs_reg(this, glsl_type::float_type);
1716 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1717 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1718 fs_inst *pull =
1719 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1720 dst, index, offset);
1721 pull->ir = inst->ir;
1722 pull->annotation = inst->annotation;
1723
1724 inst->insert_before(pull);
1725
1726 inst->src[i].file = GRF;
1727 inst->src[i].reg = dst.reg;
1728 inst->src[i].reg_offset = 0;
1729 inst->src[i].smear = pull_index & 3;
1730 }
1731 }
1732 }
1733
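/* Local algebraic simplification: MUL by an immediate 1.0 or 0.0 and ADD of
 * an immediate 0.0 are rewritten into plain MOVs.
 */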
1734 bool
1735 fs_visitor::opt_algebraic()
1736 {
1737 bool progress = false;
1738
1739 foreach_list(node, &this->instructions) {
1740 fs_inst *inst = (fs_inst *)node;
1741
1742 switch (inst->opcode) {
1743 case BRW_OPCODE_MUL:
1744 if (inst->src[1].file != IMM)
1745 continue;
1746
1747 /* a * 1.0 = a */
1748 if (inst->src[1].is_one()) {
1749 inst->opcode = BRW_OPCODE_MOV;
1750 inst->src[1] = reg_undef;
1751 progress = true;
1752 break;
1753 }
1754
1755 /* a * 0.0 = 0.0 */
1756 if (inst->src[1].is_zero()) {
1757 inst->opcode = BRW_OPCODE_MOV;
1758 inst->src[0] = inst->src[1];
1759 inst->src[1] = reg_undef;
1760 progress = true;
1761 break;
1762 }
1763
1764 break;
1765 case BRW_OPCODE_ADD:
1766 if (inst->src[1].file != IMM)
1767 continue;
1768
1769 /* a + 0.0 = a */
1770 if (inst->src[1].is_zero()) {
1771 inst->opcode = BRW_OPCODE_MOV;
1772 inst->src[1] = reg_undef;
1773 progress = true;
1774 break;
1775 }
1776 break;
1777 default:
1778 break;
1779 }
1780 }
1781
1782 return progress;
1783 }
1784
1785 /**
1786 * Removes any instructions writing a VGRF where that VGRF is not used by any
1787 * later instruction.
1788 */
1789 bool
1790 fs_visitor::dead_code_eliminate()
1791 {
1792 bool progress = false;
1793 int pc = 0;
1794
1795 calculate_live_intervals();
1796
1797 foreach_list_safe(node, &this->instructions) {
1798 fs_inst *inst = (fs_inst *)node;
1799
1800 if (inst->dst.file == GRF) {
1801 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1802 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1803 inst->remove();
1804 progress = true;
1805 }
1806 }
1807
1808 pc++;
1809 }
1810
1811 if (progress)
1812 live_intervals_valid = false;
1813
1814 return progress;
1815 }
1816
1817 struct dead_code_hash_key
1818 {
1819 int vgrf;
1820 int reg_offset;
1821 };
1822
1823 static bool
1824 dead_code_hash_compare(const void *a, const void *b)
1825 {
1826 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1827 }
1828
1829 static void
1830 clear_dead_code_hash(struct hash_table *ht)
1831 {
1832 struct hash_entry *entry;
1833
1834 hash_table_foreach(ht, entry) {
1835 _mesa_hash_table_remove(ht, entry);
1836 }
1837 }
1838
1839 static void
1840 insert_dead_code_hash(struct hash_table *ht,
1841 int vgrf, int reg_offset, fs_inst *inst)
1842 {
1843 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1844 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1845
1846 key->vgrf = vgrf;
1847 key->reg_offset = reg_offset;
1848
1849 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1850 }
1851
1852 static struct hash_entry *
1853 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1854 {
1855 struct dead_code_hash_key key;
1856
1857 key.vgrf = vgrf;
1858 key.reg_offset = reg_offset;
1859
1860 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1861 }
1862
1863 static void
1864 remove_dead_code_hash(struct hash_table *ht,
1865 int vgrf, int reg_offset)
1866 {
1867 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1868 if (!entry)
1869 return;
1870
1871 _mesa_hash_table_remove(ht, entry);
1872 }
1873
1874 /**
1875 * Walks basic blocks, removing any regs that are written but not read before
1876 * being redefined.
1877 *
1878 * The dead_code_eliminate() function implements a global dead code
1879 * elimination, but it only handles the removing the last write to a register
1880 * if it's never read. This one can handle intermediate writes, but only
1881 * within a basic block.
1882 */
1883 bool
1884 fs_visitor::dead_code_eliminate_local()
1885 {
1886 struct hash_table *ht;
1887 bool progress = false;
1888
1889 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1890
1891 foreach_list_safe(node, &this->instructions) {
1892 fs_inst *inst = (fs_inst *)node;
1893
1894 /* At a basic block, empty the HT since we don't understand dataflow
1895 * here.
1896 */
1897 if (inst->is_control_flow()) {
1898 clear_dead_code_hash(ht);
1899 continue;
1900 }
1901
1902 /* Clear the HT of any instructions that got read. */
1903 for (int i = 0; i < 3; i++) {
1904 fs_reg src = inst->src[i];
1905 if (src.file != GRF)
1906 continue;
1907
1908 int read = 1;
1909 if (inst->is_send_from_grf())
1910 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1911
1912 for (int reg_offset = src.reg_offset;
1913 reg_offset < src.reg_offset + read;
1914 reg_offset++) {
1915 remove_dead_code_hash(ht, src.reg, reg_offset);
1916 }
1917 }
1918
1919 /* Add any update of a GRF to the HT, removing a previous write if it
1920 * wasn't read.
1921 */
1922 if (inst->dst.file == GRF) {
1923 if (inst->regs_written > 1) {
1924 /* We don't know how to trim channels from an instruction's
1925 * writes, so we can't incrementally remove unread channels from
1926 * it. Just remove whatever it overwrites from the table
1927 */
1928 for (int i = 0; i < inst->regs_written; i++) {
1929 remove_dead_code_hash(ht,
1930 inst->dst.reg,
1931 inst->dst.reg_offset + i);
1932 }
1933 } else {
1934 struct hash_entry *entry =
1935 get_dead_code_hash_entry(ht, inst->dst.reg,
1936 inst->dst.reg_offset);
1937
1938 if (inst->is_partial_write()) {
1939 /* For a partial write, we can't remove any previous dead code
1940 * candidate, since we're just modifying their result, but we can
1941 * be dead code eliminiated ourselves.
1942 */
1943 if (entry) {
1944 entry->data = inst;
1945 } else {
1946 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1947 inst);
1948 }
1949 } else {
1950 if (entry) {
1951 /* We're completely updating a channel, and there was a
1952 * previous write to the channel that wasn't read. Kill it!
1953 */
1954 fs_inst *dead_inst = (fs_inst *)entry->data;
1955 dead_inst->remove();
1956 progress = true;
1957 _mesa_hash_table_remove(ht, entry);
1958 }
1959
1960 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1961 inst);
1962 }
1963 }
1964 }
1965 }
1966
1967 _mesa_hash_table_destroy(ht, NULL);
1968
1969 if (progress)
1970 live_intervals_valid = false;
1971
1972 return progress;
1973 }
1974
1975 /**
1976 * Implements a second type of register coalescing: This one checks if
1977 * the two regs involved in a raw move don't interfere, in which case
1978 * they can both be stored in the same place and the MOV removed.
1979 */
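/* For instance (hypothetical IR): given
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov vgrf7, vgrf3
 *
 * if vgrf3 and vgrf7 don't interfere, every reference to vgrf3 is redirected
 * to vgrf7's storage and the MOV is removed.
 */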
1980 bool
1981 fs_visitor::register_coalesce_2()
1982 {
1983 bool progress = false;
1984
1985 calculate_live_intervals();
1986
1987 foreach_list_safe(node, &this->instructions) {
1988 fs_inst *inst = (fs_inst *)node;
1989
1990 if (inst->opcode != BRW_OPCODE_MOV ||
1991 inst->is_partial_write() ||
1992 inst->saturate ||
1993 inst->src[0].file != GRF ||
1994 inst->src[0].negate ||
1995 inst->src[0].abs ||
1996 inst->src[0].smear != -1 ||
1997 inst->dst.file != GRF ||
1998 inst->dst.type != inst->src[0].type ||
1999 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2000 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2001 continue;
2002 }
2003
2004 int reg_from = inst->src[0].reg;
2005 assert(inst->src[0].reg_offset == 0);
2006 int reg_to = inst->dst.reg;
2007 int reg_to_offset = inst->dst.reg_offset;
2008
2009 foreach_list(node, &this->instructions) {
2010 fs_inst *scan_inst = (fs_inst *)node;
2011
2012 if (scan_inst->dst.file == GRF &&
2013 scan_inst->dst.reg == reg_from) {
2014 scan_inst->dst.reg = reg_to;
2015 scan_inst->dst.reg_offset = reg_to_offset;
2016 }
2017 for (int i = 0; i < 3; i++) {
2018 if (scan_inst->src[i].file == GRF &&
2019 scan_inst->src[i].reg == reg_from) {
2020 scan_inst->src[i].reg = reg_to;
2021 scan_inst->src[i].reg_offset = reg_to_offset;
2022 }
2023 }
2024 }
2025
2026 inst->remove();
2027
2028 /* We don't need to recalculate live intervals inside the loop despite
2029 * flagging live_intervals_valid because we only use live intervals for
2030 * the interferes test, and we must have had a situation where the
2031 * intervals were:
2032 *
2033 *  from  to
2034 *   ^
2035 *   |
2036 *   v
2037 *        ^
2038 *        |
2039 *        v
2040 *
2041 * Some register R that might get coalesced with one of these two could
2042 * only be referencing "to", otherwise "from"'s range would have been
2043 * longer. R's range could also only start at the end of "to" or later,
2044 * otherwise it will conflict with "to" when we try to coalesce "to"
2045 * into R anyway.
2046 */
2047 live_intervals_valid = false;
2048
2049 progress = true;
2050 continue;
2051 }
2052
2053 return progress;
2054 }
2055
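/* The original coalescing pass: for a raw MOV from a GRF (or uniform) into a
 * GRF that dominates the rest of the program (i.e. isn't inside an if or
 * loop), scan forward to make sure neither register is written again, then
 * rewrite later readers of the destination to read the source directly and
 * remove the MOV.
 */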
2056 bool
2057 fs_visitor::register_coalesce()
2058 {
2059 bool progress = false;
2060 int if_depth = 0;
2061 int loop_depth = 0;
2062
2063 foreach_list_safe(node, &this->instructions) {
2064 fs_inst *inst = (fs_inst *)node;
2065
2066 /* Make sure that we dominate the instructions we're going to
2067 * scan for interference with our coalescing, or we won't have
2068 * scanned enough to know whether anything actually interferes
2069 * with it. We don't dominate the following instructions if
2070 * we're in a loop or an if block.
2071 */
2072 switch (inst->opcode) {
2073 case BRW_OPCODE_DO:
2074 loop_depth++;
2075 break;
2076 case BRW_OPCODE_WHILE:
2077 loop_depth--;
2078 break;
2079 case BRW_OPCODE_IF:
2080 if_depth++;
2081 break;
2082 case BRW_OPCODE_ENDIF:
2083 if_depth--;
2084 break;
2085 default:
2086 break;
2087 }
2088 if (loop_depth || if_depth)
2089 continue;
2090
2091 if (inst->opcode != BRW_OPCODE_MOV ||
2092 inst->is_partial_write() ||
2093 inst->saturate ||
2094 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2095 inst->src[0].file != UNIFORM) ||
2096 inst->dst.type != inst->src[0].type)
2097 continue;
2098
2099 bool has_source_modifiers = (inst->src[0].abs ||
2100 inst->src[0].negate ||
2101 inst->src[0].smear != -1 ||
2102 inst->src[0].file == UNIFORM);
2103
2104 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2105 * them: check for no writes to either one until the exit of the
2106 * program.
2107 */
2108 bool interfered = false;
2109
2110 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2111 !scan_inst->is_tail_sentinel();
2112 scan_inst = (fs_inst *)scan_inst->next) {
2113 if (scan_inst->dst.file == GRF) {
2114 if (scan_inst->overwrites_reg(inst->dst) ||
2115 scan_inst->overwrites_reg(inst->src[0])) {
2116 interfered = true;
2117 break;
2118 }
2119 }
2120
2121 /* The gen6 MATH instruction can't handle source modifiers or
2122 * unusual register regions, so avoid coalescing those for
2123 * now. We should do something more specific.
2124 */
2125 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2126 interfered = true;
2127 break;
2128 }
2129
2130 /* The accumulator result appears to get used for the
2131 * conditional modifier generation. When negating a UD
2132 * value, there is a 33rd bit generated for the sign in the
2133 * accumulator value, so now you can't check, for example,
2134 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2135 */
2136 if (scan_inst->conditional_mod &&
2137 inst->src[0].negate &&
2138 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2139 interfered = true;
2140 break;
2141 }
2142 }
2143 if (interfered) {
2144 continue;
2145 }
2146
2147 /* Rewrite the later usage to point at the source of the move to
2148 * be removed.
2149 */
2150 for (fs_inst *scan_inst = inst;
2151 !scan_inst->is_tail_sentinel();
2152 scan_inst = (fs_inst *)scan_inst->next) {
2153 for (int i = 0; i < 3; i++) {
2154 if (scan_inst->src[i].file == GRF &&
2155 scan_inst->src[i].reg == inst->dst.reg &&
2156 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2157 fs_reg new_src = inst->src[0];
2158 if (scan_inst->src[i].abs) {
2159 new_src.negate = 0;
2160 new_src.abs = 1;
2161 }
2162 new_src.negate ^= scan_inst->src[i].negate;
2163 scan_inst->src[i] = new_src;
2164 }
2165 }
2166 }
2167
2168 inst->remove();
2169 progress = true;
2170 }
2171
2172 if (progress)
2173 live_intervals_valid = false;
2174
2175 return progress;
2176 }
2177
2178
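/* Looks for a MOV from a GRF to an MRF and, when the GRF has no later
 * readers, rewrites the instruction that produced the GRF value to write
 * straight into the MRF instead, eliminating the MOV.
 */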
2179 bool
2180 fs_visitor::compute_to_mrf()
2181 {
2182 bool progress = false;
2183 int next_ip = 0;
2184
2185 calculate_live_intervals();
2186
2187 foreach_list_safe(node, &this->instructions) {
2188 fs_inst *inst = (fs_inst *)node;
2189
2190 int ip = next_ip;
2191 next_ip++;
2192
2193 if (inst->opcode != BRW_OPCODE_MOV ||
2194 inst->is_partial_write() ||
2195 inst->dst.file != MRF || inst->src[0].file != GRF ||
2196 inst->dst.type != inst->src[0].type ||
2197 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2198 continue;
2199
2200 /* Work out which hardware MRF registers are written by this
2201 * instruction.
2202 */
2203 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2204 int mrf_high;
2205 if (inst->dst.reg & BRW_MRF_COMPR4) {
2206 mrf_high = mrf_low + 4;
2207 } else if (dispatch_width == 16 &&
2208 (!inst->force_uncompressed && !inst->force_sechalf)) {
2209 mrf_high = mrf_low + 1;
2210 } else {
2211 mrf_high = mrf_low;
2212 }
2213
2214 /* Can't compute-to-MRF this GRF if someone else was going to
2215 * read it later.
2216 */
2217 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2218 continue;
2219
2220 /* Found a move of a GRF to an MRF. Let's see if we can rewrite the
2221 * instruction that produced this GRF value to write directly into the MRF.
2222 */
2223 fs_inst *scan_inst;
2224 for (scan_inst = (fs_inst *)inst->prev;
2225 scan_inst->prev != NULL;
2226 scan_inst = (fs_inst *)scan_inst->prev) {
2227 if (scan_inst->dst.file == GRF &&
2228 scan_inst->dst.reg == inst->src[0].reg) {
2229 /* Found the last write to the register we want to turn
2230 * into a compute-to-MRF.
2231 */
2232
2233 /* If this one instruction didn't populate all the
2234 * channels, bail. We might be able to rewrite everything
2235 * that writes that reg, but it would require smarter
2236 * tracking to delay the rewriting until complete success.
2237 */
2238 if (scan_inst->is_partial_write())
2239 break;
2240
2241 /* Instructions writing more than one register would need us to
2242 * understand coalescing out more than one MOV at a time.
2243 */
2244 if (scan_inst->regs_written > 1)
2245 break;
2246
2247 /* SEND instructions can't have MRF as a destination. */
2248 if (scan_inst->mlen)
2249 break;
2250
2251 if (intel->gen == 6) {
2252 /* gen6 math instructions must have the destination be
2253 * GRF, so no compute-to-MRF for them.
2254 */
2255 if (scan_inst->is_math()) {
2256 break;
2257 }
2258 }
2259
2260 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2261 /* Found the creator of our MRF's source value. */
2262 scan_inst->dst.file = MRF;
2263 scan_inst->dst.reg = inst->dst.reg;
2264 scan_inst->saturate |= inst->saturate;
2265 inst->remove();
2266 progress = true;
2267 }
2268 break;
2269 }
2270
2271 /* We don't handle control flow here. Most computation of
2272 * values that end up in MRFs happens shortly before the MRF
2273 * write anyway.
2274 */
2275 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2276 break;
2277
2278 /* You can't read from an MRF, so if someone else reads our
2279 * MRF's source GRF that we wanted to rewrite, that stops us.
2280 */
2281 bool interfered = false;
2282 for (int i = 0; i < 3; i++) {
2283 if (scan_inst->src[i].file == GRF &&
2284 scan_inst->src[i].reg == inst->src[0].reg &&
2285 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2286 interfered = true;
2287 }
2288 }
2289 if (interfered)
2290 break;
2291
2292 if (scan_inst->dst.file == MRF) {
2293 /* If somebody else writes our MRF here, we can't
2294 * compute-to-MRF before that.
2295 */
2296 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2297 int scan_mrf_high;
2298
2299 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2300 scan_mrf_high = scan_mrf_low + 4;
2301 } else if (dispatch_width == 16 &&
2302 (!scan_inst->force_uncompressed &&
2303 !scan_inst->force_sechalf)) {
2304 scan_mrf_high = scan_mrf_low + 1;
2305 } else {
2306 scan_mrf_high = scan_mrf_low;
2307 }
2308
2309 if (mrf_low == scan_mrf_low ||
2310 mrf_low == scan_mrf_high ||
2311 mrf_high == scan_mrf_low ||
2312 mrf_high == scan_mrf_high) {
2313 break;
2314 }
2315 }
2316
2317 if (scan_inst->mlen > 0) {
2318 /* Found a SEND instruction, which means that there are
2319 * live values in MRFs from base_mrf to base_mrf +
2320 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2321 * above it.
2322 */
2323 if (mrf_low >= scan_inst->base_mrf &&
2324 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2325 break;
2326 }
2327 if (mrf_high >= scan_inst->base_mrf &&
2328 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2329 break;
2330 }
2331 }
2332 }
2333 }
2334
2335 if (progress)
2336 live_intervals_valid = false;
2337
2338 return progress;
2339 }
2340
2341 /**
2342 * Walks through basic blocks, looking for repeated MRF writes and
2343 * removing the later ones.
2344 */
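/* A hypothetical example of what gets cleaned up here:
 *
 *    mov m3, vgrf5
 *    mul vgrf6, vgrf5, vgrf2
 *    mov m3, vgrf5      <- identical to the still-valid earlier write: removed
 */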
2345 bool
2346 fs_visitor::remove_duplicate_mrf_writes()
2347 {
2348 fs_inst *last_mrf_move[16];
2349 bool progress = false;
2350
2351 /* We'd need to update the MRF tracking to handle compressed instructions, so skip 16-wide. */
2352 if (dispatch_width == 16)
2353 return false;
2354
2355 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2356
2357 foreach_list_safe(node, &this->instructions) {
2358 fs_inst *inst = (fs_inst *)node;
2359
2360 if (inst->is_control_flow()) {
2361 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2362 }
2363
2364 if (inst->opcode == BRW_OPCODE_MOV &&
2365 inst->dst.file == MRF) {
2366 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2367 if (prev_inst && inst->equals(prev_inst)) {
2368 inst->remove();
2369 progress = true;
2370 continue;
2371 }
2372 }
2373
2374 /* Clear out the last-write records for MRFs that were overwritten. */
2375 if (inst->dst.file == MRF) {
2376 last_mrf_move[inst->dst.reg] = NULL;
2377 }
2378
2379 if (inst->mlen > 0) {
2380 /* Found a SEND instruction, which will include two or fewer
2381 * implied MRF writes. We could do better here.
2382 */
2383 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2384 last_mrf_move[inst->base_mrf + i] = NULL;
2385 }
2386 }
2387
2388 /* Clear out any MRF move records whose sources got overwritten. */
2389 if (inst->dst.file == GRF) {
2390 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2391 if (last_mrf_move[i] &&
2392 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2393 last_mrf_move[i] = NULL;
2394 }
2395 }
2396 }
2397
2398 if (inst->opcode == BRW_OPCODE_MOV &&
2399 inst->dst.file == MRF &&
2400 inst->src[0].file == GRF &&
2401 !inst->is_partial_write()) {
2402 last_mrf_move[inst->dst.reg] = inst;
2403 }
2404 }
2405
2406 if (progress)
2407 live_intervals_valid = false;
2408
2409 return progress;
2410 }
2411
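/* Helper for the gen4 SEND dependency workarounds below: clears the
 * needs-dependency flags for any GRF in [first_grf, first_grf + grf_len)
 * that @p inst reads as a source, covering both halves of a 16-wide
 * instruction.
 */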
2412 static void
2413 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2414 int first_grf, int grf_len)
2415 {
2416 bool inst_16wide = (dispatch_width > 8 &&
2417 !inst->force_uncompressed &&
2418 !inst->force_sechalf);
2419
2420 /* Clear the flag for registers that actually got read (as expected). */
2421 for (int i = 0; i < 3; i++) {
2422 int grf;
2423 if (inst->src[i].file == GRF) {
2424 grf = inst->src[i].reg;
2425 } else if (inst->src[i].file == HW_REG &&
2426 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2427 grf = inst->src[i].fixed_hw_reg.nr;
2428 } else {
2429 continue;
2430 }
2431
2432 if (grf >= first_grf &&
2433 grf < first_grf + grf_len) {
2434 deps[grf - first_grf] = false;
2435 if (inst_16wide)
2436 deps[grf - first_grf + 1] = false;
2437 }
2438 }
2439 }
2440
2441 /**
2442 * Implements this workaround for the original 965:
2443 *
2444 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2445 * check for post destination dependencies on this instruction, software
2446 * must ensure that there is no destination hazard for the case of ‘write
2447 * followed by a posted write’ shown in the following example.
2448 *
2449 * 1. mov r3 0
2450 * 2. send r3.xy <rest of send instruction>
2451 * 3. mov r2 r3
2452 *
2453 * Due to no post-destination dependency check on the ‘send’, the above
2454 * code sequence could have two instructions (1 and 2) in flight at the
2455 * same time that both consider ‘r3’ as the target of their final writes."
2456 */
2457 void
2458 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2459 {
2460 int reg_size = dispatch_width / 8;
2461 int write_len = inst->regs_written * reg_size;
2462 int first_write_grf = inst->dst.reg;
2463 bool needs_dep[BRW_MAX_MRF];
2464 assert(write_len < (int)sizeof(needs_dep) - 1);
2465
2466 memset(needs_dep, false, sizeof(needs_dep));
2467 memset(needs_dep, true, write_len);
2468
2469 clear_deps_for_inst_src(inst, dispatch_width,
2470 needs_dep, first_write_grf, write_len);
2471
2472 /* Walk backwards looking for writes to the registers we're writing that
2473 * haven't been read since being written. If we hit the start of the program,
2474 * we assume that there are no outstanding dependencies on entry to the
2475 * program.
2476 */
2477 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2478 scan_inst != NULL;
2479 scan_inst = (fs_inst *)scan_inst->prev) {
2480
2481 /* If we hit control flow, assume that there *are* outstanding
2482 * dependencies, and force their cleanup before our instruction.
2483 */
2484 if (scan_inst->is_control_flow()) {
2485 for (int i = 0; i < write_len; i++) {
2486 if (needs_dep[i]) {
2487 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2488 }
2489 }
2490 return;
2491 }
2492
2493 bool scan_inst_16wide = (dispatch_width > 8 &&
2494 !scan_inst->force_uncompressed &&
2495 !scan_inst->force_sechalf);
2496
2497 /* We insert our reads as late as possible on the assumption that any
2498 * non-MOV instruction that might have left us an outstanding
2499 * dependency has more latency than a MOV.
2500 */
2501 if (scan_inst->dst.file == GRF) {
2502 for (int i = 0; i < scan_inst->regs_written; i++) {
2503 int reg = scan_inst->dst.reg + i * reg_size;
2504
2505 if (reg >= first_write_grf &&
2506 reg < first_write_grf + write_len &&
2507 needs_dep[reg - first_write_grf]) {
2508 inst->insert_before(DEP_RESOLVE_MOV(reg));
2509 needs_dep[reg - first_write_grf] = false;
2510 if (scan_inst_16wide)
2511 needs_dep[reg - first_write_grf + 1] = false;
2512 }
2513 }
2514 }
2515
2516 /* Clear the flag for registers that actually got read (as expected). */
2517 clear_deps_for_inst_src(scan_inst, dispatch_width,
2518 needs_dep, first_write_grf, write_len);
2519
2520 /* Continue the loop only if we haven't resolved all the dependencies */
2521 int i;
2522 for (i = 0; i < write_len; i++) {
2523 if (needs_dep[i])
2524 break;
2525 }
2526 if (i == write_len)
2527 return;
2528 }
2529 }
2530
2531 /**
2532 * Implements this workaround for the original 965:
2533 *
2534 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2535 * used as a destination register until after it has been sourced by an
2536 * instruction with a different destination register."
2537 */
2538 void
2539 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2540 {
2541 int write_len = inst->regs_written * dispatch_width / 8;
2542 int first_write_grf = inst->dst.reg;
2543 bool needs_dep[BRW_MAX_MRF];
2544 assert(write_len < (int)sizeof(needs_dep) - 1);
2545
2546 memset(needs_dep, false, sizeof(needs_dep));
2547 memset(needs_dep, true, write_len);
2548 /* Walk forwards looking for writes to registers we're writing which aren't
2549 * read before being written.
2550 */
2551 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2552 !scan_inst->is_tail_sentinel();
2553 scan_inst = (fs_inst *)scan_inst->next) {
2554 /* If we hit control flow, force resolve all remaining dependencies. */
2555 if (scan_inst->is_control_flow()) {
2556 for (int i = 0; i < write_len; i++) {
2557 if (needs_dep[i])
2558 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2559 }
2560 return;
2561 }
2562
2563 /* Clear the flag for registers that actually got read (as expected). */
2564 clear_deps_for_inst_src(scan_inst, dispatch_width,
2565 needs_dep, first_write_grf, write_len);
2566
2567 /* We insert our reads as late as possible since they're reading the
2568 * result of a SEND, which has massive latency.
2569 */
2570 if (scan_inst->dst.file == GRF &&
2571 scan_inst->dst.reg >= first_write_grf &&
2572 scan_inst->dst.reg < first_write_grf + write_len &&
2573 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2574 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2575 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2576 }
2577
2578 /* Continue the loop only if we haven't resolved all the dependencies */
2579 int i;
2580 for (i = 0; i < write_len; i++) {
2581 if (needs_dep[i])
2582 break;
2583 }
2584 if (i == write_len)
2585 return;
2586 }
2587
2588 /* If we hit the end of the program, resolve all remaining dependencies out
2589 * of paranoia.
2590 */
2591 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2592 assert(last_inst->eot);
2593 for (int i = 0; i < write_len; i++) {
2594 if (needs_dep[i])
2595 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2596 }
2597 }
2598
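/* Entry point for the gen4 SEND hazard workarounds above: after register
 * allocation, adds the pre- and post-send dependency resolves around every
 * message instruction that writes a GRF.
 */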
2599 void
2600 fs_visitor::insert_gen4_send_dependency_workarounds()
2601 {
2602 if (intel->gen != 4 || intel->is_g4x)
2603 return;
2604
2605 /* Note that we're done with register allocation, so GRF fs_regs always
2606 * have a .reg_offset of 0.
2607 */
2608
2609 foreach_list_safe(node, &this->instructions) {
2610 fs_inst *inst = (fs_inst *)node;
2611
2612 if (inst->mlen != 0 && inst->dst.file == GRF) {
2613 insert_gen4_pre_send_dependency_workarounds(inst);
2614 insert_gen4_post_send_dependency_workarounds(inst);
2615 }
2616 }
2617 }
2618
2619 /**
2620 * Turns the generic expression-style uniform pull constant load instruction
2621 * into a hardware-specific series of instructions for loading a pull
2622 * constant.
2623 *
2624 * The expression style allows the CSE pass before this to optimize out
2625 * repeated loads from the same offset, and gives the pre-register-allocation
2626 * scheduling full flexibility, while the conversion to native instructions
2627 * allows the post-register-allocation scheduler the best information
2628 * possible.
2629 *
2630 * Note that execution masking for setting up pull constant loads is special:
2631 * the channels that need to be written are unrelated to the current execution
2632 * mask, since a later instruction will use one of the result channels as a
2633 * source operand for all 8 or 16 of its channels.
2634 */
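/* On gen7, the lowering below turns (sketched in pseudo-IR; operand order is
 * illustrative)
 *
 *    uniform_pull_constant_load vgrf8, surf_index, byte_offset
 *
 * into
 *
 *    set_simd4x2_offset payload, byte_offset/4
 *    uniform_pull_constant_load_gen7 vgrf8, surf_index, payload
 *
 * while on older generations it just assigns the fixed MRF and message
 * length the send will use.
 */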
2635 void
2636 fs_visitor::lower_uniform_pull_constant_loads()
2637 {
2638 foreach_list(node, &this->instructions) {
2639 fs_inst *inst = (fs_inst *)node;
2640
2641 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2642 continue;
2643
2644 if (intel->gen >= 7) {
2645 /* The offset arg before was a vec4-aligned byte offset. We need to
2646 * turn it into a dword offset.
2647 */
2648 fs_reg const_offset_reg = inst->src[1];
2649 assert(const_offset_reg.file == IMM &&
2650 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2651 const_offset_reg.imm.u /= 4;
2652 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2653
2654 /* This is actually going to be a MOV, but since only the first dword
2655 * is accessed, we have a special opcode to do just that one. Note
2656 * that this needs to be an operation that will be considered a def
2657 * by live variable analysis, or register allocation will explode.
2658 */
2659 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2660 payload, const_offset_reg);
2661 setup->force_writemask_all = true;
2662
2663 setup->ir = inst->ir;
2664 setup->annotation = inst->annotation;
2665 inst->insert_before(setup);
2666
2667 /* Similarly, this will only populate the first 4 channels of the
2668 * result register (since we only use smear values from 0-3), but we
2669 * don't tell the optimizer.
2670 */
2671 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2672 inst->src[1] = payload;
2673
2674 this->live_intervals_valid = false;
2675 } else {
2676 /* Before register allocation, we didn't tell the scheduler about the
2677 * MRF we use. We know it's safe to use this MRF because nothing
2678 * else does except for register spill/unspill, which generates and
2679 * uses its MRF within a single IR instruction.
2680 */
2681 inst->base_mrf = 14;
2682 inst->mlen = 1;
2683 }
2684 }
2685 }
2686
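/* Prints one IR instruction in a single-line, human-readable form, roughly
 * (illustrative) "(+f0.0) add.sat vgrf7, vgrf3, vgrf2", followed by any
 * 1sthalf/2ndhalf flags.
 */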
2687 void
2688 fs_visitor::dump_instruction(backend_instruction *be_inst)
2689 {
2690 fs_inst *inst = (fs_inst *)be_inst;
2691
2692 if (inst->predicate) {
2693 printf("(%cf0.%d) ",
2694 inst->predicate_inverse ? '-' : '+',
2695 inst->flag_subreg);
2696 }
2697
2698 printf("%s", brw_instruction_name(inst->opcode));
2699 if (inst->saturate)
2700 printf(".sat");
2701 if (inst->conditional_mod) {
2702 printf(".cmod");
2703 if (!inst->predicate &&
2704 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2705 inst->opcode != BRW_OPCODE_IF &&
2706 inst->opcode != BRW_OPCODE_WHILE))) {
2707 printf(".f0.%d", inst->flag_subreg);
2708 }
2709 }
2710 printf(" ");
2711
2712
2713 switch (inst->dst.file) {
2714 case GRF:
2715 printf("vgrf%d", inst->dst.reg);
2716 if (inst->dst.reg_offset)
2717 printf("+%d", inst->dst.reg_offset);
2718 break;
2719 case MRF:
2720 printf("m%d", inst->dst.reg);
2721 break;
2722 case BAD_FILE:
2723 printf("(null)");
2724 break;
2725 case UNIFORM:
2726 printf("***u%d***", inst->dst.reg);
2727 break;
2728 default:
2729 printf("???");
2730 break;
2731 }
2732 printf(", ");
2733
2734 for (int i = 0; i < 3; i++) {
2735 if (inst->src[i].negate)
2736 printf("-");
2737 if (inst->src[i].abs)
2738 printf("|");
2739 switch (inst->src[i].file) {
2740 case GRF:
2741 printf("vgrf%d", inst->src[i].reg);
2742 if (inst->src[i].reg_offset)
2743 printf("+%d", inst->src[i].reg_offset);
2744 break;
2745 case MRF:
2746 printf("***m%d***", inst->src[i].reg);
2747 break;
2748 case UNIFORM:
2749 printf("u%d", inst->src[i].reg);
2750 if (inst->src[i].reg_offset)
2751 printf(".%d", inst->src[i].reg_offset);
2752 break;
2753 case BAD_FILE:
2754 printf("(null)");
2755 break;
2756 case IMM:
2757 switch (inst->src[i].type) {
2758 case BRW_REGISTER_TYPE_F:
2759 printf("%ff", inst->src[i].imm.f);
2760 break;
2761 case BRW_REGISTER_TYPE_D:
2762 printf("%dd", inst->src[i].imm.i);
2763 break;
2764 case BRW_REGISTER_TYPE_UD:
2765 printf("%uu", inst->src[i].imm.u);
2766 break;
2767 default:
2768 printf("???");
2769 break;
2770 }
2771 break;
2772 default:
2773 printf("???");
2774 break;
2775 }
2776 if (inst->src[i].abs)
2777 printf("|");
2778
2779 if (i < 2)
2780 printf(", ");
2781 }
2782
2783 printf(" ");
2784
2785 if (inst->force_uncompressed)
2786 printf("1sthalf ");
2787
2788 if (inst->force_sechalf)
2789 printf("2ndhalf ");
2790
2791 printf("\n");
2792 }
2793
2794 /**
2795 * Possibly returns an instruction that set up @param reg.
2796 *
2797 * Sometimes we want to take the result of some expression/variable
2798 * dereference tree and rewrite the instruction generating the result
2799 * of the tree. When processing the tree, we know that the
2800 * instructions generated are all writing temporaries that are dead
2801 * outside of this tree. So, if we have some instructions that write
2802 * a temporary, we're free to point that temp write somewhere else.
2803 *
2804 * Note that this doesn't guarantee that the returned instruction wrote
2805 * only reg -- it might be the size=4 destination of a texture instruction.
2806 */
2807 fs_inst *
2808 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2809 fs_inst *end,
2810 fs_reg reg)
2811 {
2812 if (end == start ||
2813 end->is_partial_write() ||
2814 reg.reladdr ||
2815 !reg.equals(end->dst)) {
2816 return NULL;
2817 } else {
2818 return end;
2819 }
2820 }
2821
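/* Lays out the gen6+ thread payload: the fixed header registers, then the
 * enabled barycentric coordinate sets, then source depth and source W,
 * accumulating the total in c->nr_payload_regs as it goes.
 */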
2822 void
2823 fs_visitor::setup_payload_gen6()
2824 {
2825 bool uses_depth =
2826 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2827 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2828
2829 assert(intel->gen >= 6);
2830
2831 /* R0-1: masks, pixel X/Y coordinates. */
2832 c->nr_payload_regs = 2;
2833 /* R2: only for 32-pixel dispatch. */
2834
2835 /* R3-26: barycentric interpolation coordinates. These appear in the
2836 * same order that they appear in the brw_wm_barycentric_interp_mode
2837 * enum. Each set of coordinates occupies 2 registers if dispatch width
2838 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2839 * appear if they were enabled using the "Barycentric Interpolation
2840 * Mode" bits in WM_STATE.
2841 */
2842 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2843 if (barycentric_interp_modes & (1 << i)) {
2844 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2845 c->nr_payload_regs += 2;
2846 if (dispatch_width == 16) {
2847 c->nr_payload_regs += 2;
2848 }
2849 }
2850 }
2851
2852 /* R27: interpolated depth if uses source depth */
2853 if (uses_depth) {
2854 c->source_depth_reg = c->nr_payload_regs;
2855 c->nr_payload_regs++;
2856 if (dispatch_width == 16) {
2857 /* R28: interpolated depth if not 8-wide. */
2858 c->nr_payload_regs++;
2859 }
2860 }
2861 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2862 if (uses_depth) {
2863 c->source_w_reg = c->nr_payload_regs;
2864 c->nr_payload_regs++;
2865 if (dispatch_width == 16) {
2866 /* R30: interpolated W if not 8-wide. */
2867 c->nr_payload_regs++;
2868 }
2869 }
2870 /* R31: MSAA position offsets. */
2871 /* R32-: bary for 32-pixel. */
2872 /* R58-59: interp W for 32-pixel. */
2873
2874 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2875 c->source_depth_to_render_target = true;
2876 }
2877 }
2878
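/* Top-level driver for one compile: sets up the payload, visits the GLSL IR
 * (or fixed fragment program) to build our IR, runs the optimization loop to
 * a fixed point, then schedules, allocates registers, and applies the gen4
 * SEND workarounds.
 */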
2879 bool
2880 fs_visitor::run()
2881 {
2882 sanity_param_count = fp->Base.Parameters->NumParameters;
2883 uint32_t orig_nr_params = c->prog_data.nr_params;
2884
2885 if (intel->gen >= 6)
2886 setup_payload_gen6();
2887 else
2888 setup_payload_gen4();
2889
2890 if (0) {
2891 emit_dummy_fs();
2892 } else {
2893 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2894 emit_shader_time_begin();
2895
2896 calculate_urb_setup();
2897 if (intel->gen < 6)
2898 emit_interpolation_setup_gen4();
2899 else
2900 emit_interpolation_setup_gen6();
2901
2902 /* We handle discards by keeping track of the still-live pixels in f0.1.
2903 * Initialize it with the dispatched pixels.
2904 */
2905 if (fp->UsesKill) {
2906 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2907 discard_init->flag_subreg = 1;
2908 }
2909
2910 /* Generate FS IR for main(). (The visitor only descends into
2911 * functions called "main".)
2912 */
2913 if (shader) {
2914 foreach_list(node, &*shader->ir) {
2915 ir_instruction *ir = (ir_instruction *)node;
2916 base_ir = ir;
2917 this->result = reg_undef;
2918 ir->accept(this);
2919 }
2920 } else {
2921 emit_fragment_program_code();
2922 }
2923 base_ir = NULL;
2924 if (failed)
2925 return false;
2926
2927 emit(FS_OPCODE_PLACEHOLDER_HALT);
2928
2929 emit_fb_writes();
2930
2931 split_virtual_grfs();
2932
2933 move_uniform_array_access_to_pull_constants();
2934 setup_pull_constants();
2935
2936 bool progress;
2937 do {
2938 progress = false;
2939
2940 compact_virtual_grfs();
2941
2942 progress = remove_duplicate_mrf_writes() || progress;
2943
2944 progress = opt_algebraic() || progress;
2945 progress = opt_cse() || progress;
2946 progress = opt_copy_propagate() || progress;
2947 progress = dead_code_eliminate() || progress;
2948 progress = dead_code_eliminate_local() || progress;
2949 progress = register_coalesce() || progress;
2950 progress = register_coalesce_2() || progress;
2951 progress = compute_to_mrf() || progress;
2952 } while (progress);
2953
2954 remove_dead_constants();
2955
2956 schedule_instructions(false);
2957
2958 lower_uniform_pull_constant_loads();
2959
2960 assign_curb_setup();
2961 assign_urb_setup();
2962
2963 if (0) {
2964 /* Debug of register spilling: Go spill everything. */
2965 for (int i = 0; i < virtual_grf_count; i++) {
2966 spill_reg(i);
2967 }
2968 }
2969
2970 if (0)
2971 assign_regs_trivial();
2972 else {
2973 while (!assign_regs()) {
2974 if (failed)
2975 break;
2976 }
2977 }
2978 }
2979 assert(force_uncompressed_stack == 0);
2980 assert(force_sechalf_stack == 0);
2981
2982 /* This must come after all optimization and register allocation, since
2983 * it inserts dead code that happens to have side effects, and it does
2984 * so based on the actual physical registers in use.
2985 */
2986 insert_gen4_send_dependency_workarounds();
2987
2988 if (failed)
2989 return false;
2990
2991 schedule_instructions(true);
2992
2993 if (dispatch_width == 8) {
2994 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2995 } else {
2996 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2997
2998 /* Make sure we didn't try to sneak in an extra uniform */
2999 assert(orig_nr_params == c->prog_data.nr_params);
3000 (void) orig_nr_params;
3001 }
3002
3003 /* If any state parameters were appended, then ParameterValues could have
3004 * been realloced, in which case the driver uniform storage set up by
3005 * _mesa_associate_uniform_storage() would point to freed memory. Make
3006 * sure that didn't happen.
3007 */
3008 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3009
3010 return !failed;
3011 }
3012
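/* Compiles the fragment program: always builds an 8-wide version, attempts a
 * 16-wide version when the hardware and shader allow it, and hands both off
 * to fs_generator to produce the final assembly.
 */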
3013 const unsigned *
3014 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3015 struct gl_fragment_program *fp,
3016 struct gl_shader_program *prog,
3017 unsigned *final_assembly_size)
3018 {
3019 struct intel_context *intel = &brw->intel;
3020 bool start_busy = false;
3021 float start_time = 0;
3022
3023 if (unlikely(intel->perf_debug)) {
3024 start_busy = (intel->batch.last_bo &&
3025 drm_intel_bo_busy(intel->batch.last_bo));
3026 start_time = get_time();
3027 }
3028
3029 struct brw_shader *shader = NULL;
3030 if (prog)
3031 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3032
3033 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3034 if (prog) {
3035 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3036 _mesa_print_ir(shader->ir, NULL);
3037 printf("\n\n");
3038 } else {
3039 printf("ARB_fragment_program %d ir for native fragment shader\n",
3040 fp->Base.Id);
3041 _mesa_print_program(&fp->Base);
3042 }
3043 }
3044
3045 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3046 */
3047 fs_visitor v(brw, c, prog, fp, 8);
3048 if (!v.run()) {
3049 if (prog) {
3050 prog->LinkStatus = false;
3051 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3052 }
3053
3054 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3055 v.fail_msg);
3056
3057 return NULL;
3058 }
3059
3060 exec_list *simd16_instructions = NULL;
3061 fs_visitor v2(brw, c, prog, fp, 16);
3062 bool no16 = INTEL_DEBUG & DEBUG_NO16;
3063 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
3064 v2.import_uniforms(&v);
3065 if (!v2.run()) {
3066 perf_debug("16-wide shader failed to compile, falling back to "
3067 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3068 } else {
3069 simd16_instructions = &v2.instructions;
3070 }
3071 }
3072
3073 c->prog_data.dispatch_width = 8;
3074
3075 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3076 const unsigned *generated = g.generate_assembly(&v.instructions,
3077 simd16_instructions,
3078 final_assembly_size);
3079
3080 if (unlikely(intel->perf_debug) && shader) {
3081 if (shader->compiled_once)
3082 brw_wm_debug_recompile(brw, prog, &c->key);
3083 shader->compiled_once = true;
3084
3085 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
3086 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3087 (get_time() - start_time) * 1000);
3088 }
3089 }
3090
3091 return generated;
3092 }
3093
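/* Link-time precompile hook: builds a plausible brw_wm_prog_key from the
 * shader alone (guessing the non-shader state) and compiles with it, so a
 * matching program is likely to be ready by the time the shader is drawn
 * with.
 */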
3094 bool
3095 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3096 {
3097 struct brw_context *brw = brw_context(ctx);
3098 struct intel_context *intel = &brw->intel;
3099 struct brw_wm_prog_key key;
3100
3101 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3102 return true;
3103
3104 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3105 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3106 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3107 bool program_uses_dfdy = fp->UsesDFdy;
3108
3109 memset(&key, 0, sizeof(key));
3110
3111 if (intel->gen < 6) {
3112 if (fp->UsesKill)
3113 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3114
3115 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3116 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3117
3118 /* Just assume depth testing. */
3119 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3120 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3121 }
3122
3123 if (intel->gen < 6)
3124 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3125
3126 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3127 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3128 continue;
3129
3130 if (intel->gen < 6) {
3131 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3132 key.input_slots_valid |= BITFIELD64_BIT(i);
3133 }
3134 }
3135
3136 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3137
3138 for (int i = 0; i < MAX_SAMPLERS; i++) {
3139 if (fp->Base.ShadowSamplers & (1 << i)) {
3140 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3141 key.tex.swizzles[i] =
3142 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3143 } else {
3144 /* Color sampler: assume no swizzling. */
3145 key.tex.swizzles[i] = SWIZZLE_XYZW;
3146 }
3147 }
3148
3149 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3150 key.drawable_height = ctx->DrawBuffer->Height;
3151 }
3152
3153 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3154 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3155 }
3156
3157 key.nr_color_regions = 1;
3158
3159 key.program_string_id = bfp->id;
3160
3161 uint32_t old_prog_offset = brw->wm.prog_offset;
3162 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3163
3164 bool success = do_wm_prog(brw, prog, bfp, &key);
3165
3166 brw->wm.prog_offset = old_prog_offset;
3167 brw->wm.prog_data = old_prog_data;
3168
3169 return success;
3170 }