i965: fix problem with constant out of bounds access (v2)
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51 #include "glsl/ir_print_visitor.h"
52
53 void
54 fs_inst::init()
55 {
56 memset(this, 0, sizeof(*this));
57 this->opcode = BRW_OPCODE_NOP;
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67 }
68
69 fs_inst::fs_inst()
70 {
71 init();
72 }
73
74 fs_inst::fs_inst(enum opcode opcode)
75 {
76 init();
77 this->opcode = opcode;
78 }
79
80 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
81 {
82 init();
83 this->opcode = opcode;
84 this->dst = dst;
85
86 if (dst.file == GRF)
87 assert(dst.reg_offset >= 0);
88 }
89
90 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
91 {
92 init();
93 this->opcode = opcode;
94 this->dst = dst;
95 this->src[0] = src0;
96
97 if (dst.file == GRF)
98 assert(dst.reg_offset >= 0);
99 if (src[0].file == GRF)
100 assert(src[0].reg_offset >= 0);
101 }
102
103 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
104 {
105 init();
106 this->opcode = opcode;
107 this->dst = dst;
108 this->src[0] = src0;
109 this->src[1] = src1;
110
111 if (dst.file == GRF)
112 assert(dst.reg_offset >= 0);
113 if (src[0].file == GRF)
114 assert(src[0].reg_offset >= 0);
115 if (src[1].file == GRF)
116 assert(src[1].reg_offset >= 0);
117 }
118
119 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
120 fs_reg src0, fs_reg src1, fs_reg src2)
121 {
122 init();
123 this->opcode = opcode;
124 this->dst = dst;
125 this->src[0] = src0;
126 this->src[1] = src1;
127 this->src[2] = src2;
128
129 if (dst.file == GRF)
130 assert(dst.reg_offset >= 0);
131 if (src[0].file == GRF)
132 assert(src[0].reg_offset >= 0);
133 if (src[1].file == GRF)
134 assert(src[1].reg_offset >= 0);
135 if (src[2].file == GRF)
136 assert(src[2].reg_offset >= 0);
137 }
138
139 #define ALU1(op) \
140 fs_inst * \
141 fs_visitor::op(fs_reg dst, fs_reg src0) \
142 { \
143 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
144 }
145
146 #define ALU2(op) \
147 fs_inst * \
148 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
149 { \
150 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
156 { \
157 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
158 }
159
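/* For reference, a sketch of what these helper macros expand to: ALU2(ADD),
 * run through the preprocessor, becomes roughly
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so each opcode named below gets a small emit helper returning the new
 * fs_inst.
 */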
160 ALU1(NOT)
161 ALU1(MOV)
162 ALU1(FRC)
163 ALU1(RNDD)
164 ALU1(RNDE)
165 ALU1(RNDZ)
166 ALU2(ADD)
167 ALU2(MUL)
168 ALU2(MACH)
169 ALU2(AND)
170 ALU2(OR)
171 ALU2(XOR)
172 ALU2(SHL)
173 ALU2(SHR)
174 ALU2(ASR)
175 ALU3(LRP)
176 ALU1(BFREV)
177 ALU3(BFE)
178 ALU2(BFI1)
179 ALU3(BFI2)
180 ALU1(FBH)
181 ALU1(FBL)
182 ALU1(CBIT)
183
184 /** Gen4 predicated IF. */
185 fs_inst *
186 fs_visitor::IF(uint32_t predicate)
187 {
188 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
189 inst->predicate = predicate;
190 return inst;
191 }
192
193 /** Gen6+ IF with embedded comparison. */
194 fs_inst *
195 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
196 {
197 assert(intel->gen >= 6);
198 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
199 reg_null_d, src0, src1);
200 inst->conditional_mod = condition;
201 return inst;
202 }
203
204 /**
205 * CMP: Sets the low bit of the destination channels with the result
206 * of the comparison, while the upper bits are undefined, and updates
207 * the flag register with the packed 16 bits of the result.
208 */
209 fs_inst *
210 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
211 {
212 fs_inst *inst;
213
214 /* Take the instruction:
215 *
216 * CMP null<d> src0<f> src1<f>
217 *
218 * Original gen4 does type conversion to the destination type before
219 * comparison, producing garbage results for floating point comparisons.
220 * gen5 does the comparison on the execution type (resolved source types),
221 * so dst type doesn't matter. gen6 does comparison and then uses the
222 * result as if it was the dst type with no conversion, which happens to
223 * mostly work out for float-interpreted-as-int since our comparisons are
224 * for >0, =0, <0.
225 */
226 if (intel->gen == 4) {
227 dst.type = src0.type;
228 if (dst.file == HW_REG)
229 dst.fixed_hw_reg.type = dst.type;
230 }
231
232 resolve_ud_negate(&src0);
233 resolve_ud_negate(&src1);
234
235 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
236 inst->conditional_mod = condition;
237
238 return inst;
239 }
240
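/* Illustrative use of the helper above (the operands are invented, not taken
 * from a particular caller): setting the flag register for "x >= 0.0f" and
 * predicating a later instruction on the result would look roughly like
 *
 *    emit(CMP(reg_null_f, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    fs_inst *inst = emit(MOV(dst, src));
 *    inst->predicate = BRW_PREDICATE_NORMAL;
 */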
241 exec_list
242 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
243 fs_reg varying_offset,
244 uint32_t const_offset)
245 {
246 exec_list instructions;
247 fs_inst *inst;
248
249 /* We have our constant surface use a pitch of 4 bytes, so our index can
250 * be any component of a vector, and then we load 4 contiguous
251 * components starting from that.
252 *
253 * We break down the const_offset to a portion added to the variable
254 * offset and a portion done using reg_offset, which means that if you
255 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
256 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
257 * CSE can later notice that those loads are all the same and eliminate
258 * the redundant ones.
259 */
260 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
261 instructions.push_tail(ADD(vec4_offset,
262 varying_offset, const_offset & ~3));
263
264 int scale = 1;
265 if (intel->gen == 4 && dispatch_width == 8) {
266 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
267 * u, v, r) as parameters, or we can just use the SIMD16 message
268 * consisting of (header, u). We choose the second, at the cost of a
269 * longer return length.
270 */
271 scale = 2;
272 }
273
274 enum opcode op;
275 if (intel->gen >= 7)
276 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
277 else
278 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
279 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
280 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
281 inst->regs_written = 4 * scale;
282 instructions.push_tail(inst);
283
284 if (intel->gen < 7) {
285 inst->base_mrf = 13;
286 inst->header_present = true;
287 if (intel->gen == 4)
288 inst->mlen = 3;
289 else
290 inst->mlen = 1 + dispatch_width / 8;
291 }
292
293 vec4_result.reg_offset += (const_offset & 3) * scale;
294 instructions.push_tail(MOV(dst, vec4_result));
295
296 return instructions;
297 }
298
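/* A worked example of the const_offset split above (the offset is invented):
 * with const_offset == 6, the ADD computes vec4_offset = varying_offset + 4
 * (i.e. 6 & ~3), the load returns the 4 contiguous components starting there,
 * and the final MOV reads vec4_result at reg_offset (6 & 3) * scale, i.e.
 * component 2.
 */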
299 /**
300 * A helper for MOV generation for fixing up broken hardware SEND dependency
301 * handling.
302 */
303 fs_inst *
304 fs_visitor::DEP_RESOLVE_MOV(int grf)
305 {
306 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
307
308 inst->ir = NULL;
309 inst->annotation = "send dependency resolve";
310
311 /* The caller always wants uncompressed to emit the minimal extra
312 * dependencies, and to avoid having to deal with aligning its regs to 2.
313 */
314 inst->force_uncompressed = true;
315
316 return inst;
317 }
318
319 bool
320 fs_inst::equals(fs_inst *inst)
321 {
322 return (opcode == inst->opcode &&
323 dst.equals(inst->dst) &&
324 src[0].equals(inst->src[0]) &&
325 src[1].equals(inst->src[1]) &&
326 src[2].equals(inst->src[2]) &&
327 saturate == inst->saturate &&
328 predicate == inst->predicate &&
329 conditional_mod == inst->conditional_mod &&
330 mlen == inst->mlen &&
331 base_mrf == inst->base_mrf &&
332 sampler == inst->sampler &&
333 target == inst->target &&
334 eot == inst->eot &&
335 header_present == inst->header_present &&
336 shadow_compare == inst->shadow_compare &&
337 offset == inst->offset);
338 }
339
340 bool
341 fs_inst::overwrites_reg(const fs_reg &reg)
342 {
343 return (reg.file == dst.file &&
344 reg.reg == dst.reg &&
345 reg.reg_offset >= dst.reg_offset &&
346 reg.reg_offset < dst.reg_offset + regs_written);
347 }
348
349 bool
350 fs_inst::is_send_from_grf()
351 {
352 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
353 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
354 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
355 src[1].file == GRF));
356 }
357
358 bool
359 fs_visitor::can_do_source_mods(fs_inst *inst)
360 {
361 if (intel->gen == 6 && inst->is_math())
362 return false;
363
364 if (inst->is_send_from_grf())
365 return false;
366
367 return true;
368 }
369
370 void
371 fs_reg::init()
372 {
373 memset(this, 0, sizeof(*this));
374 this->smear = -1;
375 }
376
377 /** Generic unset register constructor. */
378 fs_reg::fs_reg()
379 {
380 init();
381 this->file = BAD_FILE;
382 }
383
384 /** Immediate value constructor. */
385 fs_reg::fs_reg(float f)
386 {
387 init();
388 this->file = IMM;
389 this->type = BRW_REGISTER_TYPE_F;
390 this->imm.f = f;
391 }
392
393 /** Immediate value constructor. */
394 fs_reg::fs_reg(int32_t i)
395 {
396 init();
397 this->file = IMM;
398 this->type = BRW_REGISTER_TYPE_D;
399 this->imm.i = i;
400 }
401
402 /** Immediate value constructor. */
403 fs_reg::fs_reg(uint32_t u)
404 {
405 init();
406 this->file = IMM;
407 this->type = BRW_REGISTER_TYPE_UD;
408 this->imm.u = u;
409 }
410
411 /** Fixed brw_reg Immediate value constructor. */
412 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
413 {
414 init();
415 this->file = HW_REG;
416 this->fixed_hw_reg = fixed_hw_reg;
417 this->type = fixed_hw_reg.type;
418 }
419
420 bool
421 fs_reg::equals(const fs_reg &r) const
422 {
423 return (file == r.file &&
424 reg == r.reg &&
425 reg_offset == r.reg_offset &&
426 type == r.type &&
427 negate == r.negate &&
428 abs == r.abs &&
429 !reladdr && !r.reladdr &&
430 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
431 sizeof(fixed_hw_reg)) == 0 &&
432 smear == r.smear &&
433 imm.u == r.imm.u);
434 }
435
436 bool
437 fs_reg::is_zero() const
438 {
439 if (file != IMM)
440 return false;
441
442 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
443 }
444
445 bool
446 fs_reg::is_one() const
447 {
448 if (file != IMM)
449 return false;
450
451 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
452 }
453
454 bool
455 fs_reg::is_valid_3src() const
456 {
457 return file == GRF || file == UNIFORM;
458 }
459
460 int
461 fs_visitor::type_size(const struct glsl_type *type)
462 {
463 unsigned int size, i;
464
465 switch (type->base_type) {
466 case GLSL_TYPE_UINT:
467 case GLSL_TYPE_INT:
468 case GLSL_TYPE_FLOAT:
469 case GLSL_TYPE_BOOL:
470 return type->components();
471 case GLSL_TYPE_ARRAY:
472 return type_size(type->fields.array) * type->length;
473 case GLSL_TYPE_STRUCT:
474 size = 0;
475 for (i = 0; i < type->length; i++) {
476 size += type_size(type->fields.structure[i].type);
477 }
478 return size;
479 case GLSL_TYPE_SAMPLER:
480 /* Samplers take up no register space, since they're baked in at
481 * link time.
482 */
483 return 0;
484 case GLSL_TYPE_VOID:
485 case GLSL_TYPE_ERROR:
486 case GLSL_TYPE_INTERFACE:
487 assert(!"not reached");
488 break;
489 }
490
491 return 0;
492 }
493
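/* Some concrete values this returns, following the cases above: a float or
 * bool is 1 slot, a vec4 is 4, a mat3 is 9, "vec2 v[3]" is 6, and a
 * struct { vec3 a; float b; } is 4.  Samplers contribute 0.
 */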
494 fs_reg
495 fs_visitor::get_timestamp()
496 {
497 assert(intel->gen >= 7);
498
499 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
500 BRW_ARF_TIMESTAMP,
501 0),
502 BRW_REGISTER_TYPE_UD));
503
504 fs_reg dst = fs_reg(this, glsl_type::uint_type);
505
506 fs_inst *mov = emit(MOV(dst, ts));
507 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
508 * even if it's not enabled in the dispatch.
509 */
510 mov->force_writemask_all = true;
511 mov->force_uncompressed = true;
512
513 /* The caller wants the low 32 bits of the timestamp. Since it's running
514 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
515 * which is plenty of time for our purposes. It is identical across the
516 * EUs, but since it's tracking GPU core speed it will increment at a
517 * varying rate as render P-states change.
518 *
519 * The caller could also check if render P-states have changed (or anything
520 * else that might disrupt timing) by setting smear to 2 and checking if
521 * that field is != 0.
522 */
523 dst.smear = 0;
524
525 return dst;
526 }
527
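/* Rough arithmetic behind the rollover note above: a 32-bit counter ticking
 * at ~1.2 GHz wraps after about 2^32 / 1.2e9 ~= 3.6 seconds, which is where
 * the "every ~3 seconds" estimate comes from.
 */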
528 void
529 fs_visitor::emit_shader_time_begin()
530 {
531 current_annotation = "shader time start";
532 shader_start_time = get_timestamp();
533 }
534
535 void
536 fs_visitor::emit_shader_time_end()
537 {
538 current_annotation = "shader time end";
539
540 enum shader_time_shader_type type, written_type, reset_type;
541 if (dispatch_width == 8) {
542 type = ST_FS8;
543 written_type = ST_FS8_WRITTEN;
544 reset_type = ST_FS8_RESET;
545 } else {
546 assert(dispatch_width == 16);
547 type = ST_FS16;
548 written_type = ST_FS16_WRITTEN;
549 reset_type = ST_FS16_RESET;
550 }
551
552 fs_reg shader_end_time = get_timestamp();
553
554 /* Check that there weren't any timestamp reset events (assuming these
555 * were the only two timestamp reads that happened).
556 */
557 fs_reg reset = shader_end_time;
558 reset.smear = 2;
559 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
560 test->conditional_mod = BRW_CONDITIONAL_Z;
561 emit(IF(BRW_PREDICATE_NORMAL));
562
563 push_force_uncompressed();
564 fs_reg start = shader_start_time;
565 start.negate = true;
566 fs_reg diff = fs_reg(this, glsl_type::uint_type);
567 emit(ADD(diff, start, shader_end_time));
568
569 /* If there were no instructions between the two timestamp gets, the diff
570 * is 2 cycles. Remove that overhead, so I can forget about that when
571 * trying to determine the time taken for single instructions.
572 */
573 emit(ADD(diff, diff, fs_reg(-2u)));
574
575 emit_shader_time_write(type, diff);
576 emit_shader_time_write(written_type, fs_reg(1u));
577 emit(BRW_OPCODE_ELSE);
578 emit_shader_time_write(reset_type, fs_reg(1u));
579 emit(BRW_OPCODE_ENDIF);
580
581 pop_force_uncompressed();
582 }
583
584 void
585 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
586 fs_reg value)
587 {
588 int shader_time_index =
589 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
590 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
591
592 fs_reg payload;
593 if (dispatch_width == 8)
594 payload = fs_reg(this, glsl_type::uvec2_type);
595 else
596 payload = fs_reg(this, glsl_type::uint_type);
597
598 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
599 fs_reg(), payload, offset, value));
600 }
601
602 void
603 fs_visitor::fail(const char *format, ...)
604 {
605 va_list va;
606 char *msg;
607
608 if (failed)
609 return;
610
611 failed = true;
612
613 va_start(va, format);
614 msg = ralloc_vasprintf(mem_ctx, format, va);
615 va_end(va);
616 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
617
618 this->fail_msg = msg;
619
620 if (INTEL_DEBUG & DEBUG_WM) {
621 fprintf(stderr, "%s", msg);
622 }
623 }
624
625 fs_inst *
626 fs_visitor::emit(enum opcode opcode)
627 {
628 return emit(fs_inst(opcode));
629 }
630
631 fs_inst *
632 fs_visitor::emit(enum opcode opcode, fs_reg dst)
633 {
634 return emit(fs_inst(opcode, dst));
635 }
636
637 fs_inst *
638 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
639 {
640 return emit(fs_inst(opcode, dst, src0));
641 }
642
643 fs_inst *
644 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
645 {
646 return emit(fs_inst(opcode, dst, src0, src1));
647 }
648
649 fs_inst *
650 fs_visitor::emit(enum opcode opcode, fs_reg dst,
651 fs_reg src0, fs_reg src1, fs_reg src2)
652 {
653 return emit(fs_inst(opcode, dst, src0, src1, src2));
654 }
655
656 void
657 fs_visitor::push_force_uncompressed()
658 {
659 force_uncompressed_stack++;
660 }
661
662 void
663 fs_visitor::pop_force_uncompressed()
664 {
665 force_uncompressed_stack--;
666 assert(force_uncompressed_stack >= 0);
667 }
668
669 void
670 fs_visitor::push_force_sechalf()
671 {
672 force_sechalf_stack++;
673 }
674
675 void
676 fs_visitor::pop_force_sechalf()
677 {
678 force_sechalf_stack--;
679 assert(force_sechalf_stack >= 0);
680 }
681
682 /**
683 * Returns true if the instruction has a flag that means it won't
684 * update an entire destination register.
685 *
686 * For example, dead code elimination and live variable analysis want to know
687 * when a write to a variable screens off any preceding values that were in
688 * it.
689 */
690 bool
691 fs_inst::is_partial_write()
692 {
693 return (this->predicate ||
694 this->force_uncompressed ||
695 this->force_sechalf);
696 }
697
698 /**
699 * Returns how many MRFs an FS opcode will write over.
700 *
701 * Note that this is not the 0 or 1 implied writes in an actual gen
702 * instruction -- the FS opcodes often generate MOVs in addition.
703 */
704 int
705 fs_visitor::implied_mrf_writes(fs_inst *inst)
706 {
707 if (inst->mlen == 0)
708 return 0;
709
710 switch (inst->opcode) {
711 case SHADER_OPCODE_RCP:
712 case SHADER_OPCODE_RSQ:
713 case SHADER_OPCODE_SQRT:
714 case SHADER_OPCODE_EXP2:
715 case SHADER_OPCODE_LOG2:
716 case SHADER_OPCODE_SIN:
717 case SHADER_OPCODE_COS:
718 return 1 * dispatch_width / 8;
719 case SHADER_OPCODE_POW:
720 case SHADER_OPCODE_INT_QUOTIENT:
721 case SHADER_OPCODE_INT_REMAINDER:
722 return 2 * dispatch_width / 8;
723 case SHADER_OPCODE_TEX:
724 case FS_OPCODE_TXB:
725 case SHADER_OPCODE_TXD:
726 case SHADER_OPCODE_TXF:
727 case SHADER_OPCODE_TXF_MS:
728 case SHADER_OPCODE_TXL:
729 case SHADER_OPCODE_TXS:
730 case SHADER_OPCODE_LOD:
731 return 1;
732 case FS_OPCODE_FB_WRITE:
733 return 2;
734 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
735 case FS_OPCODE_UNSPILL:
736 return 1;
737 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
738 return inst->mlen;
739 case FS_OPCODE_SPILL:
740 return 2;
741 default:
742 assert(!"not reached");
743 return inst->mlen;
744 }
745 }
746
747 int
748 fs_visitor::virtual_grf_alloc(int size)
749 {
750 if (virtual_grf_array_size <= virtual_grf_count) {
751 if (virtual_grf_array_size == 0)
752 virtual_grf_array_size = 16;
753 else
754 virtual_grf_array_size *= 2;
755 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
756 virtual_grf_array_size);
757 }
758 virtual_grf_sizes[virtual_grf_count] = size;
759 return virtual_grf_count++;
760 }
761
762 /** Fixed HW reg constructor. */
763 fs_reg::fs_reg(enum register_file file, int reg)
764 {
765 init();
766 this->file = file;
767 this->reg = reg;
768 this->type = BRW_REGISTER_TYPE_F;
769 }
770
771 /** Fixed HW reg constructor. */
772 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
773 {
774 init();
775 this->file = file;
776 this->reg = reg;
777 this->type = type;
778 }
779
780 /** Automatic reg constructor. */
781 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
782 {
783 init();
784
785 this->file = GRF;
786 this->reg = v->virtual_grf_alloc(v->type_size(type));
787 this->reg_offset = 0;
788 this->type = brw_type_for_base_type(type);
789 }
790
791 fs_reg *
792 fs_visitor::variable_storage(ir_variable *var)
793 {
794 return (fs_reg *)hash_table_find(this->variable_ht, var);
795 }
796
797 void
798 import_uniforms_callback(const void *key,
799 void *data,
800 void *closure)
801 {
802 struct hash_table *dst_ht = (struct hash_table *)closure;
803 const fs_reg *reg = (const fs_reg *)data;
804
805 if (reg->file != UNIFORM)
806 return;
807
808 hash_table_insert(dst_ht, data, key);
809 }
810
811 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
812 * This brings in those uniform definitions
813 */
814 void
815 fs_visitor::import_uniforms(fs_visitor *v)
816 {
817 hash_table_call_foreach(v->variable_ht,
818 import_uniforms_callback,
819 variable_ht);
820 this->params_remap = v->params_remap;
821 }
822
823 /* Our support for uniforms is piggy-backed on the struct
824 * gl_fragment_program, because that's where the values actually
825 * get stored, rather than in some global gl_shader_program uniform
826 * store.
827 */
828 void
829 fs_visitor::setup_uniform_values(ir_variable *ir)
830 {
831 int namelen = strlen(ir->name);
832
833 /* The data for our (non-builtin) uniforms is stored in a series of
834 * gl_uniform_driver_storage structs for each subcomponent that
835 * glGetUniformLocation() could name. We know it's been set up in the same
836 * order we'd walk the type, so walk the list of storage and find anything
837 * with our name, or the prefix of a component that starts with our name.
838 */
839 unsigned params_before = c->prog_data.nr_params;
840 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
841 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
842
843 if (strncmp(ir->name, storage->name, namelen) != 0 ||
844 (storage->name[namelen] != 0 &&
845 storage->name[namelen] != '.' &&
846 storage->name[namelen] != '[')) {
847 continue;
848 }
849
850 unsigned slots = storage->type->component_slots();
851 if (storage->array_elements)
852 slots *= storage->array_elements;
853
854 for (unsigned i = 0; i < slots; i++) {
855 c->prog_data.param[c->prog_data.nr_params++] =
856 &storage->storage[i].f;
857 }
858 }
859
860 /* Make sure we actually initialized the right amount of stuff here. */
861 assert(params_before + ir->type->component_slots() ==
862 c->prog_data.nr_params);
863 (void)params_before;
864 }
865
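/* An example of the prefix matching above (the uniform names are invented):
 * with ir->name == "light", storage entries named "light", "light.position"
 * and "light[2]" are accepted, while "lighting" is rejected, because the
 * character right after the matched prefix must be '\0', '.' or '['.
 */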
866
867 /* Our support for builtin uniforms is even scarier than non-builtin.
868 * It sits on top of the PROG_STATE_VAR parameters that are
869 * automatically updated from GL context state.
870 */
871 void
872 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
873 {
874 const ir_state_slot *const slots = ir->state_slots;
875 assert(ir->state_slots != NULL);
876
877 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
878 /* This state reference has already been setup by ir_to_mesa, but we'll
879 * get the same index back here.
880 */
881 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
882 (gl_state_index *)slots[i].tokens);
883
884 /* Add each of the unique swizzles of the element as a parameter.
885 * This'll end up matching the expected layout of the
886 * array/matrix/structure we're trying to fill in.
887 */
888 int last_swiz = -1;
889 for (unsigned int j = 0; j < 4; j++) {
890 int swiz = GET_SWZ(slots[i].swizzle, j);
891 if (swiz == last_swiz)
892 break;
893 last_swiz = swiz;
894
895 c->prog_data.param[c->prog_data.nr_params++] =
896 &fp->Base.Parameters->ParameterValues[index][swiz].f;
897 }
898 }
899 }
900
901 fs_reg *
902 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
903 {
904 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
905 fs_reg wpos = *reg;
906 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
907
908 /* gl_FragCoord.x */
909 if (ir->pixel_center_integer) {
910 emit(MOV(wpos, this->pixel_x));
911 } else {
912 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
913 }
914 wpos.reg_offset++;
915
916 /* gl_FragCoord.y */
917 if (!flip && ir->pixel_center_integer) {
918 emit(MOV(wpos, this->pixel_y));
919 } else {
920 fs_reg pixel_y = this->pixel_y;
921 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
922
923 if (flip) {
924 pixel_y.negate = true;
925 offset += c->key.drawable_height - 1.0;
926 }
927
928 emit(ADD(wpos, pixel_y, fs_reg(offset)));
929 }
930 wpos.reg_offset++;
931
932 /* gl_FragCoord.z */
933 if (intel->gen >= 6) {
934 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
935 } else {
936 emit(FS_OPCODE_LINTERP, wpos,
937 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
938 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 interp_reg(VARYING_SLOT_POS, 2));
940 }
941 wpos.reg_offset++;
942
943 /* gl_FragCoord.w: Already set up in emit_interpolation */
944 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
945
946 return reg;
947 }
948
949 fs_inst *
950 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
951 glsl_interp_qualifier interpolation_mode,
952 bool is_centroid)
953 {
954 brw_wm_barycentric_interp_mode barycoord_mode;
955 if (intel->gen >= 6) {
956 if (is_centroid) {
957 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
958 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
959 else
960 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
961 } else {
962 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
963 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
964 else
965 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
966 }
967 } else {
968 /* On Ironlake and below, there is only one interpolation mode.
969 * Centroid interpolation doesn't mean anything on this hardware --
970 * there is no multisampling.
971 */
972 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
973 }
974 return emit(FS_OPCODE_LINTERP, attr,
975 this->delta_x[barycoord_mode],
976 this->delta_y[barycoord_mode], interp);
977 }
978
979 fs_reg *
980 fs_visitor::emit_general_interpolation(ir_variable *ir)
981 {
982 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
983 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
984 fs_reg attr = *reg;
985
986 unsigned int array_elements;
987 const glsl_type *type;
988
989 if (ir->type->is_array()) {
990 array_elements = ir->type->length;
991 if (array_elements == 0) {
992 fail("dereferenced array '%s' has length 0\n", ir->name);
993 }
994 type = ir->type->fields.array;
995 } else {
996 array_elements = 1;
997 type = ir->type;
998 }
999
1000 glsl_interp_qualifier interpolation_mode =
1001 ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003 int location = ir->location;
1004 for (unsigned int i = 0; i < array_elements; i++) {
1005 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006 if (urb_setup[location] == -1) {
1007 /* If there's no incoming setup data for this slot, don't
1008 * emit interpolation for it.
1009 */
1010 attr.reg_offset += type->vector_elements;
1011 location++;
1012 continue;
1013 }
1014
1015 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016 /* Constant interpolation (flat shading) case. The SF has
1017 * handed us defined values in only the constant offset
1018 * field of the setup reg.
1019 */
1020 for (unsigned int k = 0; k < type->vector_elements; k++) {
1021 struct brw_reg interp = interp_reg(location, k);
1022 interp = suboffset(interp, 3);
1023 interp.type = reg->type;
1024 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025 attr.reg_offset++;
1026 }
1027 } else {
1028 /* Smooth/noperspective interpolation case. */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 /* FINISHME: At some point we probably want to push
1031 * this farther by giving similar treatment to the
1032 * other potentially constant components of the
1033 * attribute, as well as making brw_vs_constval.c
1034 * handle varyings other than gl_TexCoord.
1035 */
1036 struct brw_reg interp = interp_reg(location, k);
1037 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1038 ir->centroid);
1039 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1040 /* Get the pixel/sample mask into f0 so that we know
1041 * which pixels are lit. Then, for each channel that is
1042 * unlit, replace the centroid data with non-centroid
1043 * data.
1044 */
1045 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1046 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1047 interpolation_mode, false);
1048 inst->predicate = BRW_PREDICATE_NORMAL;
1049 inst->predicate_inverse = true;
1050 }
1051 if (intel->gen < 6) {
1052 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1053 }
1054 attr.reg_offset++;
1055 }
1056
1057 }
1058 location++;
1059 }
1060 }
1061
1062 return reg;
1063 }
1064
1065 fs_reg *
1066 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1067 {
1068 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1069
1070 /* The frontfacing comes in as a bit in the thread payload. */
1071 if (intel->gen >= 6) {
1072 emit(BRW_OPCODE_ASR, *reg,
1073 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1074 fs_reg(15));
1075 emit(BRW_OPCODE_NOT, *reg, *reg);
1076 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1077 } else {
1078 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1079 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1080 * us front face
1081 */
1082 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1083 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1084 }
1085
1086 return reg;
1087 }
1088
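/* To unpack the gen6+ path above (as a reading of the code, not of the PRM):
 * the ASR by 15 brings bit 15 of g0.0:D -- set for back-facing primitives --
 * down to bit 0, the NOT inverts it, and the AND with 1 masks off everything
 * else, leaving 1 for front-facing pixels and 0 otherwise.
 */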
1089 fs_reg
1090 fs_visitor::fix_math_operand(fs_reg src)
1091 {
1092 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1093 * might be able to do better by doing execsize = 1 math and then
1094 * expanding that result out, but we would need to be careful with
1095 * masking.
1096 *
1097 * The hardware ignores source modifiers (negate and abs) on math
1098 * instructions, so we also move to a temp to set those up.
1099 */
1100 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1101 !src.abs && !src.negate)
1102 return src;
1103
1104 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1105 * operands to math
1106 */
1107 if (intel->gen >= 7 && src.file != IMM)
1108 return src;
1109
1110 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1111 expanded.type = src.type;
1112 emit(BRW_OPCODE_MOV, expanded, src);
1113 return expanded;
1114 }
1115
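/* For instance, on gen6 a negated GRF source or an immediate operand takes
 * the expansion path above: it is first copied into a fresh temporary with a
 * plain MOV and the math instruction then reads that temporary, while an
 * unmodified GRF source is returned untouched.
 */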
1116 fs_inst *
1117 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1118 {
1119 switch (opcode) {
1120 case SHADER_OPCODE_RCP:
1121 case SHADER_OPCODE_RSQ:
1122 case SHADER_OPCODE_SQRT:
1123 case SHADER_OPCODE_EXP2:
1124 case SHADER_OPCODE_LOG2:
1125 case SHADER_OPCODE_SIN:
1126 case SHADER_OPCODE_COS:
1127 break;
1128 default:
1129 assert(!"not reached: bad math opcode");
1130 return NULL;
1131 }
1132
1133 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1134 * might be able to do better by doing execsize = 1 math and then
1135 * expanding that result out, but we would need to be careful with
1136 * masking.
1137 *
1138 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1139 * instructions, so we also move to a temp to set those up.
1140 */
1141 if (intel->gen >= 6)
1142 src = fix_math_operand(src);
1143
1144 fs_inst *inst = emit(opcode, dst, src);
1145
1146 if (intel->gen < 6) {
1147 inst->base_mrf = 2;
1148 inst->mlen = dispatch_width / 8;
1149 }
1150
1151 return inst;
1152 }
1153
1154 fs_inst *
1155 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1156 {
1157 int base_mrf = 2;
1158 fs_inst *inst;
1159
1160 switch (opcode) {
1161 case SHADER_OPCODE_INT_QUOTIENT:
1162 case SHADER_OPCODE_INT_REMAINDER:
1163 if (intel->gen >= 7 && dispatch_width == 16)
1164 fail("16-wide INTDIV unsupported\n");
1165 break;
1166 case SHADER_OPCODE_POW:
1167 break;
1168 default:
1169 assert(!"not reached: unsupported binary math opcode.");
1170 return NULL;
1171 }
1172
1173 if (intel->gen >= 6) {
1174 src0 = fix_math_operand(src0);
1175 src1 = fix_math_operand(src1);
1176
1177 inst = emit(opcode, dst, src0, src1);
1178 } else {
1179 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1180 * "Message Payload":
1181 *
1182 * "Operand0[7]. For the INT DIV functions, this operand is the
1183 * denominator."
1184 * ...
1185 * "Operand1[7]. For the INT DIV functions, this operand is the
1186 * numerator."
1187 */
1188 bool is_int_div = opcode != SHADER_OPCODE_POW;
1189 fs_reg &op0 = is_int_div ? src1 : src0;
1190 fs_reg &op1 = is_int_div ? src0 : src1;
1191
1192 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1193 inst = emit(opcode, dst, op0, reg_null_f);
1194
1195 inst->base_mrf = base_mrf;
1196 inst->mlen = 2 * dispatch_width / 8;
1197 }
1198 return inst;
1199 }
1200
1201 void
1202 fs_visitor::assign_curb_setup()
1203 {
1204 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1205 if (dispatch_width == 8) {
1206 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1207 } else {
1208 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1209 }
1210
1211 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1212 foreach_list(node, &this->instructions) {
1213 fs_inst *inst = (fs_inst *)node;
1214
1215 for (unsigned int i = 0; i < 3; i++) {
1216 if (inst->src[i].file == UNIFORM) {
1217 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1218 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1219 constant_nr / 8,
1220 constant_nr % 8);
1221
1222 inst->src[i].file = HW_REG;
1223 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1224 }
1225 }
1226 }
1227 }
1228
1229 void
1230 fs_visitor::calculate_urb_setup()
1231 {
1232 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1233 urb_setup[i] = -1;
1234 }
1235
1236 int urb_next = 0;
1237 /* Figure out where each of the incoming setup attributes lands. */
1238 if (intel->gen >= 6) {
1239 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1241 urb_setup[i] = urb_next++;
1242 }
1243 }
1244 } else {
1245 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1246 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247 /* Point size is packed into the header, not as a general attribute */
1248 if (i == VARYING_SLOT_PSIZ)
1249 continue;
1250
1251 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1252 /* The back color slot is skipped when the front color is
1253 * also written to. In addition, some slots can be
1254 * written in the vertex shader and not read in the
1255 * fragment shader. So the register number must always be
1256 * incremented, mapped or not.
1257 */
1258 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1259 urb_setup[i] = urb_next;
1260 urb_next++;
1261 }
1262 }
1263
1264 /*
1265 * It's an FS-only attribute, and we did interpolation for this attribute
1266 * in the SF thread, so count it here, too.
1267 *
1268 * See compile_sf_prog() for more info.
1269 */
1270 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1271 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1272 }
1273
1274 /* Each attribute is 4 setup channels, each of which is half a reg. */
1275 c->prog_data.urb_read_length = urb_next * 2;
1276 }
1277
1278 void
1279 fs_visitor::assign_urb_setup()
1280 {
1281 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1282
1283 /* Offset all the urb_setup[] indices by the actual position of the
1284 * setup regs, now that the location of the constants has been chosen.
1285 */
1286 foreach_list(node, &this->instructions) {
1287 fs_inst *inst = (fs_inst *)node;
1288
1289 if (inst->opcode == FS_OPCODE_LINTERP) {
1290 assert(inst->src[2].file == HW_REG);
1291 inst->src[2].fixed_hw_reg.nr += urb_start;
1292 }
1293
1294 if (inst->opcode == FS_OPCODE_CINTERP) {
1295 assert(inst->src[0].file == HW_REG);
1296 inst->src[0].fixed_hw_reg.nr += urb_start;
1297 }
1298 }
1299
1300 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1301 }
1302
1303 /**
1304 * Split large virtual GRFs into separate components if we can.
1305 *
1306 * This is mostly duplicated with what brw_fs_vector_splitting does,
1307 * but that's really conservative because it's afraid of doing
1308 * splitting that doesn't result in real progress after the rest of
1309 * the optimization phases, which would cause infinite looping in
1310 * optimization. We can do it once here, safely. This also has the
1311 * opportunity to split interpolated values, or maybe even uniforms,
1312 * which we don't have at the IR level.
1313 *
1314 * We want to split, because virtual GRFs are what we register
1315 * allocate and spill (due to contiguousness requirements for some
1316 * instructions), and they're what we naturally generate in the
1317 * codegen process, but most virtual GRFs don't actually need to be
1318 * contiguous sets of GRFs. If we split, we'll end up with reduced
1319 * live intervals and better dead code elimination and coalescing.
1320 */
1321 void
1322 fs_visitor::split_virtual_grfs()
1323 {
1324 int num_vars = this->virtual_grf_count;
1325 bool split_grf[num_vars];
1326 int new_virtual_grf[num_vars];
1327
1328 /* Try to split anything more than one register in size. */
1329 for (int i = 0; i < num_vars; i++) {
1330 if (this->virtual_grf_sizes[i] != 1)
1331 split_grf[i] = true;
1332 else
1333 split_grf[i] = false;
1334 }
1335
1336 if (brw->has_pln &&
1337 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1338 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1339 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1340 * Gen6, that was the only supported interpolation mode, and since Gen6,
1341 * delta_x and delta_y are in fixed hardware registers.
1342 */
1343 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1344 false;
1345 }
1346
1347 foreach_list(node, &this->instructions) {
1348 fs_inst *inst = (fs_inst *)node;
1349
1350 /* If there's a SEND message that requires contiguous destination
1351 * registers, no splitting is allowed.
1352 */
1353 if (inst->regs_written > 1) {
1354 split_grf[inst->dst.reg] = false;
1355 }
1356
1357 /* If we're sending from a GRF, don't split it, on the assumption that
1358 * the send is reading the whole thing.
1359 */
1360 if (inst->is_send_from_grf()) {
1361 split_grf[inst->src[0].reg] = false;
1362 }
1363 }
1364
1365 /* Allocate new space for split regs. Note that the virtual
1366 * numbers will be contiguous.
1367 */
1368 for (int i = 0; i < num_vars; i++) {
1369 if (split_grf[i]) {
1370 new_virtual_grf[i] = virtual_grf_alloc(1);
1371 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1372 int reg = virtual_grf_alloc(1);
1373 assert(reg == new_virtual_grf[i] + j - 1);
1374 (void) reg;
1375 }
1376 this->virtual_grf_sizes[i] = 1;
1377 }
1378 }
1379
1380 foreach_list(node, &this->instructions) {
1381 fs_inst *inst = (fs_inst *)node;
1382
1383 if (inst->dst.file == GRF &&
1384 split_grf[inst->dst.reg] &&
1385 inst->dst.reg_offset != 0) {
1386 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1387 inst->dst.reg_offset - 1);
1388 inst->dst.reg_offset = 0;
1389 }
1390 for (int i = 0; i < 3; i++) {
1391 if (inst->src[i].file == GRF &&
1392 split_grf[inst->src[i].reg] &&
1393 inst->src[i].reg_offset != 0) {
1394 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1395 inst->src[i].reg_offset - 1);
1396 inst->src[i].reg_offset = 0;
1397 }
1398 }
1399 }
1400 this->live_intervals_valid = false;
1401 }
1402
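/* A small illustration of the pass above (register numbers are invented): a
 * vec4 temporary allocated as one 4-register VGRF is rewritten so that each
 * reg_offset becomes its own 1-register VGRF, while something like a texture
 * result stays contiguous because its SEND has regs_written > 1, and
 * send-from-GRF sources are likewise left alone.
 */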
1403 /**
1404 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1405 *
1406 * During code generation, we create tons of temporary variables, many of
1407 * which get immediately killed and are never used again. Yet, in later
1408 * optimization and analysis passes, such as compute_live_intervals, we need
1409 * to loop over all the virtual GRFs. Compacting them can save a lot of
1410 * overhead.
1411 */
1412 void
1413 fs_visitor::compact_virtual_grfs()
1414 {
1415 /* Mark which virtual GRFs are used, and count how many. */
1416 int remap_table[this->virtual_grf_count];
1417 memset(remap_table, -1, sizeof(remap_table));
1418
1419 foreach_list(node, &this->instructions) {
1420 const fs_inst *inst = (const fs_inst *) node;
1421
1422 if (inst->dst.file == GRF)
1423 remap_table[inst->dst.reg] = 0;
1424
1425 for (int i = 0; i < 3; i++) {
1426 if (inst->src[i].file == GRF)
1427 remap_table[inst->src[i].reg] = 0;
1428 }
1429 }
1430
1431 /* In addition to registers used in instructions, fs_visitor keeps
1432 * direct references to certain special values which must be patched:
1433 */
1434 fs_reg *special[] = {
1435 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1436 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1437 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1438 &delta_x[0], &delta_x[1], &delta_x[2],
1439 &delta_x[3], &delta_x[4], &delta_x[5],
1440 &delta_y[0], &delta_y[1], &delta_y[2],
1441 &delta_y[3], &delta_y[4], &delta_y[5],
1442 };
1443 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1444 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1445
1446 /* Treat all special values as used, to be conservative */
1447 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1448 if (special[i]->file == GRF)
1449 remap_table[special[i]->reg] = 0;
1450 }
1451
1452 /* Compact the GRF arrays. */
1453 int new_index = 0;
1454 for (int i = 0; i < this->virtual_grf_count; i++) {
1455 if (remap_table[i] != -1) {
1456 remap_table[i] = new_index;
1457 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1458 if (live_intervals_valid) {
1459 virtual_grf_start[new_index] = virtual_grf_start[i];
1460 virtual_grf_end[new_index] = virtual_grf_end[i];
1461 }
1462 ++new_index;
1463 }
1464 }
1465
1466 this->virtual_grf_count = new_index;
1467
1468 /* Patch all the instructions to use the newly renumbered registers */
1469 foreach_list(node, &this->instructions) {
1470 fs_inst *inst = (fs_inst *) node;
1471
1472 if (inst->dst.file == GRF)
1473 inst->dst.reg = remap_table[inst->dst.reg];
1474
1475 for (int i = 0; i < 3; i++) {
1476 if (inst->src[i].file == GRF)
1477 inst->src[i].reg = remap_table[inst->src[i].reg];
1478 }
1479 }
1480
1481 /* Patch all the references to special values */
1482 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1483 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1484 special[i]->reg = remap_table[special[i]->reg];
1485 }
1486 }
1487
1488 bool
1489 fs_visitor::remove_dead_constants()
1490 {
1491 if (dispatch_width == 8) {
1492 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1493
1494 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1495 this->params_remap[i] = -1;
1496
1497 /* Find which params are still in use. */
1498 foreach_list(node, &this->instructions) {
1499 fs_inst *inst = (fs_inst *)node;
1500
1501 for (int i = 0; i < 3; i++) {
1502 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1503
1504 if (inst->src[i].file != UNIFORM)
1505 continue;
1506
1507 /* If we get a negative constant nr, or one greater than we can
1508 * handle, indexing params_remap would be out of bounds; we can't
1509 * just refuse to compile, so alias such accesses to constant 0.
1510 */
1511 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1512 constant_nr = 0;
1513 }
1514
1515 /* For now, set this to non-negative. We'll give it the
1516 * actual new number in a moment, in order to keep the
1517 * register numbers nicely ordered.
1518 */
1519 this->params_remap[constant_nr] = 0;
1520 }
1521 }
1522
1523 /* Figure out what the new numbers for the params will be. At some
1524 * point when we're doing uniform array access, we're going to want
1525 * to keep the distinction between .reg and .reg_offset, but for
1526 * now we don't care.
1527 */
1528 unsigned int new_nr_params = 0;
1529 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1530 if (this->params_remap[i] != -1) {
1531 this->params_remap[i] = new_nr_params++;
1532 }
1533 }
1534
1535 /* Update the list of params to be uploaded to match our new numbering. */
1536 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1537 int remapped = this->params_remap[i];
1538
1539 if (remapped == -1)
1540 continue;
1541
1542 c->prog_data.param[remapped] = c->prog_data.param[i];
1543 }
1544
1545 c->prog_data.nr_params = new_nr_params;
1546 } else {
1547 /* This should have been generated in the 8-wide pass already. */
1548 assert(this->params_remap);
1549 }
1550
1551 /* Now do the renumbering of the shader to remove unused params. */
1552 foreach_list(node, &this->instructions) {
1553 fs_inst *inst = (fs_inst *)node;
1554
1555 for (int i = 0; i < 3; i++) {
1556 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1557
1558 if (inst->src[i].file != UNIFORM)
1559 continue;
1560
1561 /* As above, alias out-of-bounds constant accesses to 0. */
1562 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1563 constant_nr = 0;
1564 }
1565 assert(this->params_remap[constant_nr] != -1);
1566 inst->src[i].reg = this->params_remap[constant_nr];
1567 inst->src[i].reg_offset = 0;
1568 }
1569 }
1570
1571 return true;
1572 }
1573
1574 /*
1575 * Implements array access of uniforms by inserting a
1576 * PULL_CONSTANT_LOAD instruction.
1577 *
1578 * Unlike temporary GRF array access (where we don't support it due to
1579 * the difficulty of doing relative addressing on instruction
1580 * destinations), we could potentially do array access of uniforms
1581 * that were loaded in GRF space as push constants. In real-world
1582 * usage we've seen, though, the arrays being used are always larger
1583 * than we could load as push constants, so just always move all
1584 * uniform array access out to a pull constant buffer.
1585 */
1586 void
1587 fs_visitor::move_uniform_array_access_to_pull_constants()
1588 {
1589 int pull_constant_loc[c->prog_data.nr_params];
1590
1591 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1592 pull_constant_loc[i] = -1;
1593 }
1594
1595 /* Walk through and find array access of uniforms. Put a copy of that
1596 * uniform in the pull constant buffer.
1597 *
1598 * Note that we don't move constant-indexed accesses to arrays. No
1599 * testing has been done of the performance impact of this choice.
1600 */
1601 foreach_list_safe(node, &this->instructions) {
1602 fs_inst *inst = (fs_inst *)node;
1603
1604 for (int i = 0 ; i < 3; i++) {
1605 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1606 continue;
1607
1608 int uniform = inst->src[i].reg;
1609
1610 /* If this array isn't already present in the pull constant buffer,
1611 * add it.
1612 */
1613 if (pull_constant_loc[uniform] == -1) {
1614 const float **values = &c->prog_data.param[uniform];
1615
1616 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1617
1618 assert(param_size[uniform]);
1619
1620 for (int j = 0; j < param_size[uniform]; j++) {
1621 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1622 values[j];
1623 }
1624 }
1625
1626 /* Set up the annotation tracking for new generated instructions. */
1627 base_ir = inst->ir;
1628 current_annotation = inst->annotation;
1629
1630 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1631 fs_reg temp = fs_reg(this, glsl_type::float_type);
1632 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1633 surf_index,
1634 *inst->src[i].reladdr,
1635 pull_constant_loc[uniform] +
1636 inst->src[i].reg_offset);
1637 inst->insert_before(&list);
1638
1639 inst->src[i].file = temp.file;
1640 inst->src[i].reg = temp.reg;
1641 inst->src[i].reg_offset = temp.reg_offset;
1642 inst->src[i].reladdr = NULL;
1643 }
1644 }
1645 }
1646
1647 /**
1648 * Choose accesses from the UNIFORM file to demote to using the pull
1649 * constant buffer.
1650 *
1651 * We allow a fragment shader to have more than the specified minimum
1652 * maximum number of fragment shader uniform components (64). If
1653 * there are too many of these, they'd fill up all of register space.
1654 * So, this will push some of them out to the pull constant buffer and
1655 * update the program to load them.
1656 */
1657 void
1658 fs_visitor::setup_pull_constants()
1659 {
1660 /* Only allow 16 registers (128 uniform components) as push constants. */
1661 unsigned int max_uniform_components = 16 * 8;
1662 if (c->prog_data.nr_params <= max_uniform_components)
1663 return;
1664
1665 if (dispatch_width == 16) {
1666 fail("Pull constants not supported in 16-wide\n");
1667 return;
1668 }
1669
1670 /* Just demote the end of the list. We could probably do better
1671 * here, demoting things that are rarely used in the program first.
1672 */
1673 unsigned int pull_uniform_base = max_uniform_components;
1674
1675 int pull_constant_loc[c->prog_data.nr_params];
1676 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1677 if (i < pull_uniform_base) {
1678 pull_constant_loc[i] = -1;
1679 } else {
1680 pull_constant_loc[i] = -1;
1681 /* If our constant is already being uploaded for reladdr purposes,
1682 * reuse it.
1683 */
1684 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1685 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1686 pull_constant_loc[i] = j;
1687 break;
1688 }
1689 }
1690 if (pull_constant_loc[i] == -1) {
1691 int pull_index = c->prog_data.nr_pull_params++;
1692 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1693 pull_constant_loc[i] = pull_index;
1694 }
1695 }
1696 }
1697 c->prog_data.nr_params = pull_uniform_base;
1698
1699 foreach_list(node, &this->instructions) {
1700 fs_inst *inst = (fs_inst *)node;
1701
1702 for (int i = 0; i < 3; i++) {
1703 if (inst->src[i].file != UNIFORM)
1704 continue;
1705
1706 int pull_index = pull_constant_loc[inst->src[i].reg +
1707 inst->src[i].reg_offset];
1708 if (pull_index == -1)
1709 continue;
1710
1711 assert(!inst->src[i].reladdr);
1712
1713 fs_reg dst = fs_reg(this, glsl_type::float_type);
1714 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1715 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1716 fs_inst *pull =
1717 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1718 dst, index, offset);
1719 pull->ir = inst->ir;
1720 pull->annotation = inst->annotation;
1721
1722 inst->insert_before(pull);
1723
1724 inst->src[i].file = GRF;
1725 inst->src[i].reg = dst.reg;
1726 inst->src[i].reg_offset = 0;
1727 inst->src[i].smear = pull_index & 3;
1728 }
1729 }
1730 }
1731
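/* Worked example of the demotion above (the index is invented): a uniform
 * component at pull_index 5 is fetched with offset (5 * 4) & ~15 == 16 and
 * then read with smear == 5 & 3 == 1, i.e. component 1 of the loaded block.
 */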
1732 bool
1733 fs_visitor::opt_algebraic()
1734 {
1735 bool progress = false;
1736
1737 foreach_list(node, &this->instructions) {
1738 fs_inst *inst = (fs_inst *)node;
1739
1740 switch (inst->opcode) {
1741 case BRW_OPCODE_MUL:
1742 if (inst->src[1].file != IMM)
1743 continue;
1744
1745 /* a * 1.0 = a */
1746 if (inst->src[1].is_one()) {
1747 inst->opcode = BRW_OPCODE_MOV;
1748 inst->src[1] = reg_undef;
1749 progress = true;
1750 break;
1751 }
1752
1753 /* a * 0.0 = 0.0 */
1754 if (inst->src[1].is_zero()) {
1755 inst->opcode = BRW_OPCODE_MOV;
1756 inst->src[0] = inst->src[1];
1757 inst->src[1] = reg_undef;
1758 progress = true;
1759 break;
1760 }
1761
1762 break;
1763 case BRW_OPCODE_ADD:
1764 if (inst->src[1].file != IMM)
1765 continue;
1766
1767 /* a + 0.0 = a */
1768 if (inst->src[1].is_zero()) {
1769 inst->opcode = BRW_OPCODE_MOV;
1770 inst->src[1] = reg_undef;
1771 progress = true;
1772 break;
1773 }
1774 break;
1775 default:
1776 break;
1777 }
1778 }
1779
1780 return progress;
1781 }
1782
1783 /**
1784 * Removes any instructions writing a VGRF where that VGRF is not used by any
1785 * later instruction.
1786 */
1787 bool
1788 fs_visitor::dead_code_eliminate()
1789 {
1790 bool progress = false;
1791 int pc = 0;
1792
1793 calculate_live_intervals();
1794
1795 foreach_list_safe(node, &this->instructions) {
1796 fs_inst *inst = (fs_inst *)node;
1797
1798 if (inst->dst.file == GRF) {
1799 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1800 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1801 inst->remove();
1802 progress = true;
1803 }
1804 }
1805
1806 pc++;
1807 }
1808
1809 if (progress)
1810 live_intervals_valid = false;
1811
1812 return progress;
1813 }
1814
1815 struct dead_code_hash_key
1816 {
1817 int vgrf;
1818 int reg_offset;
1819 };
1820
1821 static bool
1822 dead_code_hash_compare(const void *a, const void *b)
1823 {
1824 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1825 }
1826
1827 static void
1828 clear_dead_code_hash(struct hash_table *ht)
1829 {
1830 struct hash_entry *entry;
1831
1832 hash_table_foreach(ht, entry) {
1833 _mesa_hash_table_remove(ht, entry);
1834 }
1835 }
1836
1837 static void
1838 insert_dead_code_hash(struct hash_table *ht,
1839 int vgrf, int reg_offset, fs_inst *inst)
1840 {
1841 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1842 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1843
1844 key->vgrf = vgrf;
1845 key->reg_offset = reg_offset;
1846
1847 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1848 }
1849
1850 static struct hash_entry *
1851 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1852 {
1853 struct dead_code_hash_key key;
1854
1855 key.vgrf = vgrf;
1856 key.reg_offset = reg_offset;
1857
1858 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1859 }
1860
1861 static void
1862 remove_dead_code_hash(struct hash_table *ht,
1863 int vgrf, int reg_offset)
1864 {
1865 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1866 if (!entry)
1867 return;
1868
1869 _mesa_hash_table_remove(ht, entry);
1870 }
1871
1872 /**
1873 * Walks basic blocks, removing any regs that are written but not read before
1874 * being redefined.
1875 *
1876 * The dead_code_eliminate() function implements a global dead code
1877 * elimination, but it only handles removing the last write to a register
1878 * if it's never read. This one can handle intermediate writes, but only
1879 * within a basic block.
1880 */
1881 bool
1882 fs_visitor::dead_code_eliminate_local()
1883 {
1884 struct hash_table *ht;
1885 bool progress = false;
1886
1887 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1888
1889 foreach_list_safe(node, &this->instructions) {
1890 fs_inst *inst = (fs_inst *)node;
1891
1892 /* At a basic block boundary, empty the HT since we don't understand
1893 * dataflow across blocks.
1894 */
1895 if (inst->is_control_flow()) {
1896 clear_dead_code_hash(ht);
1897 continue;
1898 }
1899
1900 /* Clear the HT of any instructions that got read. */
1901 for (int i = 0; i < 3; i++) {
1902 fs_reg src = inst->src[i];
1903 if (src.file != GRF)
1904 continue;
1905
1906 int read = 1;
1907 if (inst->is_send_from_grf())
1908 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1909
1910 for (int reg_offset = src.reg_offset;
1911 reg_offset < src.reg_offset + read;
1912 reg_offset++) {
1913 remove_dead_code_hash(ht, src.reg, reg_offset);
1914 }
1915 }
1916
1917 /* Add any update of a GRF to the HT, removing a previous write if it
1918 * wasn't read.
1919 */
1920 if (inst->dst.file == GRF) {
1921 if (inst->regs_written > 1) {
1922 /* We don't know how to trim channels from an instruction's
1923 * writes, so we can't incrementally remove unread channels from
1924 * it. Just remove whatever it overwrites from the table
1925 */
1926 for (int i = 0; i < inst->regs_written; i++) {
1927 remove_dead_code_hash(ht,
1928 inst->dst.reg,
1929 inst->dst.reg_offset + i);
1930 }
1931 } else {
1932 struct hash_entry *entry =
1933 get_dead_code_hash_entry(ht, inst->dst.reg,
1934 inst->dst.reg_offset);
1935
1936 if (inst->is_partial_write()) {
1937 /* For a partial write, we can't remove any previous dead code
1938 * candidate, since we're just modifying its result, but we can
1939 * be dead-code eliminated ourselves.
1940 */
1941 if (entry) {
1942 entry->data = inst;
1943 } else {
1944 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1945 inst);
1946 }
1947 } else {
1948 if (entry) {
1949 /* We're completely updating a channel, and there was a
1950 * previous write to the channel that wasn't read. Kill it!
1951 */
1952 fs_inst *inst = (fs_inst *)entry->data;
1953 inst->remove();
1954 progress = true;
1955 _mesa_hash_table_remove(ht, entry);
1956 }
1957
1958 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1959 inst);
1960 }
1961 }
1962 }
1963 }
1964
1965 _mesa_hash_table_destroy(ht, NULL);
1966
1967 if (progress)
1968 live_intervals_valid = false;
1969
1970 return progress;
1971 }
1972
1973 /**
1974 * Implements a second type of register coalescing: This one checks if
1975 * the two regs involved in a raw move don't interfere, in which case
1976 * they can both be stored in the same place and the MOV removed.
1977 */
1978 bool
1979 fs_visitor::register_coalesce_2()
1980 {
1981 bool progress = false;
1982
1983 calculate_live_intervals();
1984
1985 foreach_list_safe(node, &this->instructions) {
1986 fs_inst *inst = (fs_inst *)node;
1987
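      /* Only raw MOVs qualify: same type on both sides, a single-register
       * GRF source with no modifiers, a full (non-partial) write, and the
       * two virtual GRFs must not interfere.
       */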
1988 if (inst->opcode != BRW_OPCODE_MOV ||
1989 inst->is_partial_write() ||
1990 inst->saturate ||
1991 inst->src[0].file != GRF ||
1992 inst->src[0].negate ||
1993 inst->src[0].abs ||
1994 inst->src[0].smear != -1 ||
1995 inst->dst.file != GRF ||
1996 inst->dst.type != inst->src[0].type ||
1997 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1998 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1999 continue;
2000 }
2001
2002 int reg_from = inst->src[0].reg;
2003 assert(inst->src[0].reg_offset == 0);
2004 int reg_to = inst->dst.reg;
2005 int reg_to_offset = inst->dst.reg_offset;
2006
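      /* Rewrite every use of reg_from, as destination or source, to point
       * at reg_to at the coalesced offset.
       */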
2007 foreach_list(node, &this->instructions) {
2008 fs_inst *scan_inst = (fs_inst *)node;
2009
2010 if (scan_inst->dst.file == GRF &&
2011 scan_inst->dst.reg == reg_from) {
2012 scan_inst->dst.reg = reg_to;
2013 scan_inst->dst.reg_offset = reg_to_offset;
2014 }
2015 for (int i = 0; i < 3; i++) {
2016 if (scan_inst->src[i].file == GRF &&
2017 scan_inst->src[i].reg == reg_from) {
2018 scan_inst->src[i].reg = reg_to;
2019 scan_inst->src[i].reg_offset = reg_to_offset;
2020 }
2021 }
2022 }
2023
2024 inst->remove();
2025
2026 /* We don't need to recalculate live intervals inside the loop despite
2027 * flagging live_intervals_valid because we only use live intervals for
2028 * the interferes test, and we must have had a situation where the
2029 * intervals were:
2030 *
2031 *     from            to
2032 *       ^
2033 *       |
2034 *       v
2035 *                      ^
2036 *                      |
2037 *                      v
2038 *
2039 * Some register R that might get coalesced with one of these two could
2040 * only be referencing "to", otherwise "from"'s range would have been
2041 * longer. R's range could also only start at the end of "to" or later,
2042 * otherwise it will conflict with "to" when we try to coalesce "to"
2043 * into R anyway.
2044 */
2045 live_intervals_valid = false;
2046
2047 progress = true;
2048 continue;
2049 }
2050
2051 return progress;
2052 }
2053
2054 bool
2055 fs_visitor::register_coalesce()
2056 {
2057 bool progress = false;
2058 int if_depth = 0;
2059 int loop_depth = 0;
2060
2061 foreach_list_safe(node, &this->instructions) {
2062 fs_inst *inst = (fs_inst *)node;
2063
2064 /* Make sure that we dominate the instructions we're going to
2065 * scan for interference with our coalescing, or we won't have
2066 * scanned far enough to see whether anything interferes. We
2067 * don't dominate the following instructions if we're inside a
2068 * loop or an if block.
2069 */
2070 switch (inst->opcode) {
2071 case BRW_OPCODE_DO:
2072 loop_depth++;
2073 break;
2074 case BRW_OPCODE_WHILE:
2075 loop_depth--;
2076 break;
2077 case BRW_OPCODE_IF:
2078 if_depth++;
2079 break;
2080 case BRW_OPCODE_ENDIF:
2081 if_depth--;
2082 break;
2083 default:
2084 break;
2085 }
2086 if (loop_depth || if_depth)
2087 continue;
2088
2089 if (inst->opcode != BRW_OPCODE_MOV ||
2090 inst->is_partial_write() ||
2091 inst->saturate ||
2092 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2093 inst->src[0].file != UNIFORM) ||
2094 inst->dst.type != inst->src[0].type)
2095 continue;
2096
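      /* Note whether the source carries modifiers or comes from a uniform
       * (an unusual register region); such sources can't be propagated into
       * every instruction, which the scan below checks.
       */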
2097 bool has_source_modifiers = (inst->src[0].abs ||
2098 inst->src[0].negate ||
2099 inst->src[0].smear != -1 ||
2100 inst->src[0].file == UNIFORM);
2101
2102 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2103 * them: check for no writes to either one until the exit of the
2104 * program.
2105 */
2106 bool interfered = false;
2107
2108 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2109 !scan_inst->is_tail_sentinel();
2110 scan_inst = (fs_inst *)scan_inst->next) {
2111 if (scan_inst->dst.file == GRF) {
2112 if (scan_inst->overwrites_reg(inst->dst) ||
2113 scan_inst->overwrites_reg(inst->src[0])) {
2114 interfered = true;
2115 break;
2116 }
2117 }
2118
2119 /* The gen6 MATH instruction can't handle source modifiers or
2120 * unusual register regions, so avoid coalescing those for
2121 * now. We should do something more specific.
2122 */
2123 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2124 interfered = true;
2125 break;
2126 }
2127
2128 /* The accumulator result appears to get used for the
2129 * conditional modifier generation. When negating a UD
2130 * value, there is a 33rd bit generated for the sign in the
2131 * accumulator value, so now you can't check, for example,
2132 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2133 */
2134 if (scan_inst->conditional_mod &&
2135 inst->src[0].negate &&
2136 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2137 interfered = true;
2138 break;
2139 }
2140 }
2141 if (interfered) {
2142 continue;
2143 }
2144
2145 /* Rewrite the later usage to point at the source of the move to
2146 * be removed.
2147 */
2148 for (fs_inst *scan_inst = inst;
2149 !scan_inst->is_tail_sentinel();
2150 scan_inst = (fs_inst *)scan_inst->next) {
2151 for (int i = 0; i < 3; i++) {
2152 if (scan_inst->src[i].file == GRF &&
2153 scan_inst->src[i].reg == inst->dst.reg &&
2154 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
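            /* Fold the use's own modifiers into the replacement source:
             * abs discards any negate on the moved value, and negates
             * compose by XOR.
             */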
2155 fs_reg new_src = inst->src[0];
2156 if (scan_inst->src[i].abs) {
2157 new_src.negate = 0;
2158 new_src.abs = 1;
2159 }
2160 new_src.negate ^= scan_inst->src[i].negate;
2161 scan_inst->src[i] = new_src;
2162 }
2163 }
2164 }
2165
2166 inst->remove();
2167 progress = true;
2168 }
2169
2170 if (progress)
2171 live_intervals_valid = false;
2172
2173 return progress;
2174 }
2175
2176
2177 bool
2178 fs_visitor::compute_to_mrf()
2179 {
2180 bool progress = false;
2181 int next_ip = 0;
2182
2183 calculate_live_intervals();
2184
2185 foreach_list_safe(node, &this->instructions) {
2186 fs_inst *inst = (fs_inst *)node;
2187
2188 int ip = next_ip;
2189 next_ip++;
2190
2191 if (inst->opcode != BRW_OPCODE_MOV ||
2192 inst->is_partial_write() ||
2193 inst->dst.file != MRF || inst->src[0].file != GRF ||
2194 inst->dst.type != inst->src[0].type ||
2195 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2196 continue;
2197
2198 /* Work out which hardware MRF registers are written by this
2199 * instruction.
2200 */
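      /* A COMPR4 destination writes both mrf_low and mrf_low + 4, while an
       * ordinary compressed (16-wide) write covers two adjacent MRFs;
       * otherwise only a single MRF is written.
       */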
2201 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2202 int mrf_high;
2203 if (inst->dst.reg & BRW_MRF_COMPR4) {
2204 mrf_high = mrf_low + 4;
2205 } else if (dispatch_width == 16 &&
2206 (!inst->force_uncompressed && !inst->force_sechalf)) {
2207 mrf_high = mrf_low + 1;
2208 } else {
2209 mrf_high = mrf_low;
2210 }
2211
2212 /* Can't compute-to-MRF this GRF if someone else was going to
2213 * read it later.
2214 */
2215 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2216 continue;
2217
2218 /* Found a move of a GRF to a MRF. Let's see if we can go
2219 * rewrite the thing that made this GRF to write into the MRF.
2220 */
2221 fs_inst *scan_inst;
2222 for (scan_inst = (fs_inst *)inst->prev;
2223 scan_inst->prev != NULL;
2224 scan_inst = (fs_inst *)scan_inst->prev) {
2225 if (scan_inst->dst.file == GRF &&
2226 scan_inst->dst.reg == inst->src[0].reg) {
2227 /* Found the last thing to write our reg we want to turn
2228 * into a compute-to-MRF.
2229 */
2230
2231 /* If this one instruction didn't populate all the
2232 * channels, bail. We might be able to rewrite everything
2233 * that writes that reg, but it would require smarter
2234 * tracking to delay the rewriting until complete success.
2235 */
2236 if (scan_inst->is_partial_write())
2237 break;
2238
2239 /* Things returning more than one register would need us to
2240 * understand coalescing out more than one MOV at a time.
2241 */
2242 if (scan_inst->regs_written > 1)
2243 break;
2244
2245 /* SEND instructions can't have MRF as a destination. */
2246 if (scan_inst->mlen)
2247 break;
2248
2249 if (intel->gen == 6) {
2250 /* gen6 math instructions must have the destination be
2251 * GRF, so no compute-to-MRF for them.
2252 */
2253 if (scan_inst->is_math()) {
2254 break;
2255 }
2256 }
2257
2258 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2259 /* Found the creator of our MRF's source value. */
2260 scan_inst->dst.file = MRF;
2261 scan_inst->dst.reg = inst->dst.reg;
2262 scan_inst->saturate |= inst->saturate;
2263 inst->remove();
2264 progress = true;
2265 }
2266 break;
2267 }
2268
2269 /* We don't handle control flow here. Most computation of
2270 * values that end up in MRFs happens shortly before the MRF
2271 * write anyway.
2272 */
2273 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2274 break;
2275
2276 /* You can't read from an MRF, so if someone else reads our
2277 * MRF's source GRF that we wanted to rewrite, that stops us.
2278 */
2279 bool interfered = false;
2280 for (int i = 0; i < 3; i++) {
2281 if (scan_inst->src[i].file == GRF &&
2282 scan_inst->src[i].reg == inst->src[0].reg &&
2283 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2284 interfered = true;
2285 }
2286 }
2287 if (interfered)
2288 break;
2289
2290 if (scan_inst->dst.file == MRF) {
2291 /* If somebody else writes our MRF here, we can't
2292 * compute-to-MRF before that.
2293 */
2294 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2295 int scan_mrf_high;
2296
2297 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2298 scan_mrf_high = scan_mrf_low + 4;
2299 } else if (dispatch_width == 16 &&
2300 (!scan_inst->force_uncompressed &&
2301 !scan_inst->force_sechalf)) {
2302 scan_mrf_high = scan_mrf_low + 1;
2303 } else {
2304 scan_mrf_high = scan_mrf_low;
2305 }
2306
2307 if (mrf_low == scan_mrf_low ||
2308 mrf_low == scan_mrf_high ||
2309 mrf_high == scan_mrf_low ||
2310 mrf_high == scan_mrf_high) {
2311 break;
2312 }
2313 }
2314
2315 if (scan_inst->mlen > 0) {
2316 /* Found a SEND instruction, which means that there are
2317 * live values in MRFs from base_mrf to base_mrf +
2318 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2319 * above it.
2320 */
2321 if (mrf_low >= scan_inst->base_mrf &&
2322 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2323 break;
2324 }
2325 if (mrf_high >= scan_inst->base_mrf &&
2326 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2327 break;
2328 }
2329 }
2330 }
2331 }
2332
2333 if (progress)
2334 live_intervals_valid = false;
2335
2336 return progress;
2337 }
2338
2339 /**
2340 * Walks through basic blocks, looking for repeated MRF writes and
2341 * removing the later ones.
2342 */
2343 bool
2344 fs_visitor::remove_duplicate_mrf_writes()
2345 {
2346 fs_inst *last_mrf_move[16];
2347 bool progress = false;
2348
2349 /* The MRF tracking below would need updating to handle compressed (16-wide) instructions; skip this pass for now. */
2350 if (dispatch_width == 16)
2351 return false;
2352
2353 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2354
2355 foreach_list_safe(node, &this->instructions) {
2356 fs_inst *inst = (fs_inst *)node;
2357
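      /* Forget all tracked moves at control flow; this pass only reasons
       * within a basic block.
       */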
2358 if (inst->is_control_flow()) {
2359 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2360 }
2361
2362 if (inst->opcode == BRW_OPCODE_MOV &&
2363 inst->dst.file == MRF) {
2364 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2365 if (prev_inst && inst->equals(prev_inst)) {
2366 inst->remove();
2367 progress = true;
2368 continue;
2369 }
2370 }
2371
2372 /* Clear out the last-write records for MRFs that were overwritten. */
2373 if (inst->dst.file == MRF) {
2374 last_mrf_move[inst->dst.reg] = NULL;
2375 }
2376
2377 if (inst->mlen > 0) {
2378 /* Found a SEND instruction, which will include two or fewer
2379 * implied MRF writes. We could do better here.
2380 */
2381 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2382 last_mrf_move[inst->base_mrf + i] = NULL;
2383 }
2384 }
2385
2386 /* Clear out any MRF move records whose sources got overwritten. */
2387 if (inst->dst.file == GRF) {
2388 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2389 if (last_mrf_move[i] &&
2390 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2391 last_mrf_move[i] = NULL;
2392 }
2393 }
2394 }
2395
2396 if (inst->opcode == BRW_OPCODE_MOV &&
2397 inst->dst.file == MRF &&
2398 inst->src[0].file == GRF &&
2399 !inst->is_partial_write()) {
2400 last_mrf_move[inst->dst.reg] = inst;
2401 }
2402 }
2403
2404 if (progress)
2405 live_intervals_valid = false;
2406
2407 return progress;
2408 }
2409
2410 static void
2411 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2412 int first_grf, int grf_len)
2413 {
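   /* A compressed (16-wide) access spans two consecutive GRFs, so reads
    * below clear two dependency flags instead of one.
    */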
2414 bool inst_16wide = (dispatch_width > 8 &&
2415 !inst->force_uncompressed &&
2416 !inst->force_sechalf);
2417
2418 /* Clear the flag for registers that actually got read (as expected). */
2419 for (int i = 0; i < 3; i++) {
2420 int grf;
2421 if (inst->src[i].file == GRF) {
2422 grf = inst->src[i].reg;
2423 } else if (inst->src[i].file == HW_REG &&
2424 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2425 grf = inst->src[i].fixed_hw_reg.nr;
2426 } else {
2427 continue;
2428 }
2429
2430 if (grf >= first_grf &&
2431 grf < first_grf + grf_len) {
2432 deps[grf - first_grf] = false;
2433 if (inst_16wide)
2434 deps[grf - first_grf + 1] = false;
2435 }
2436 }
2437 }
2438
2439 /**
2440 * Implements this workaround for the original 965:
2441 *
2442 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2443 * check for post destination dependencies on this instruction, software
2444 * must ensure that there is no destination hazard for the case of ‘write
2445 * followed by a posted write’ shown in the following example.
2446 *
2447 * 1. mov r3 0
2448 * 2. send r3.xy <rest of send instruction>
2449 * 3. mov r2 r3
2450 *
2451 * Due to no post-destination dependency check on the ‘send’, the above
2452 * code sequence could have two instructions (1 and 2) in flight at the
2453 * same time that both consider ‘r3’ as the target of their final writes.
2454 */
2455 void
2456 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2457 {
2458 int reg_size = dispatch_width / 8;
2459 int write_len = inst->regs_written * reg_size;
2460 int first_write_grf = inst->dst.reg;
2461 bool needs_dep[BRW_MAX_MRF];
2462 assert(write_len < (int)sizeof(needs_dep) - 1);
2463
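   /* Start by flagging every GRF this send writes as a potential hazard;
    * the backwards walk below clears flags as reads are found or resolves
    * them with dependency MOVs.
    */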
2464 memset(needs_dep, false, sizeof(needs_dep));
2465 memset(needs_dep, true, write_len);
2466
2467 clear_deps_for_inst_src(inst, dispatch_width,
2468 needs_dep, first_write_grf, write_len);
2469
2470 /* Walk backwards looking for writes to registers we're writing which
2471 * aren't read since being written. If we hit the start of the program,
2472 * we assume that there are no outstanding dependencies on entry to the
2473 * program.
2474 */
2475 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2476 scan_inst != NULL;
2477 scan_inst = (fs_inst *)scan_inst->prev) {
2478
2479 /* If we hit control flow, assume that there *are* outstanding
2480 * dependencies, and force their cleanup before our instruction.
2481 */
2482 if (scan_inst->is_control_flow()) {
2483 for (int i = 0; i < write_len; i++) {
2484 if (needs_dep[i]) {
2485 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2486 }
2487 }
2488 return;
2489 }
2490
2491 bool scan_inst_16wide = (dispatch_width > 8 &&
2492 !scan_inst->force_uncompressed &&
2493 !scan_inst->force_sechalf);
2494
2495 /* We insert our reads as late as possible, on the assumption that any
2496 * non-MOV instruction that might have left us an outstanding
2497 * dependency has more latency than a MOV.
2498 */
2499 if (scan_inst->dst.file == GRF) {
2500 for (int i = 0; i < scan_inst->regs_written; i++) {
2501 int reg = scan_inst->dst.reg + i * reg_size;
2502
2503 if (reg >= first_write_grf &&
2504 reg < first_write_grf + write_len &&
2505 needs_dep[reg - first_write_grf]) {
2506 inst->insert_before(DEP_RESOLVE_MOV(reg));
2507 needs_dep[reg - first_write_grf] = false;
2508 if (scan_inst_16wide)
2509 needs_dep[reg - first_write_grf + 1] = false;
2510 }
2511 }
2512 }
2513
2514 /* Clear the flag for registers that actually got read (as expected). */
2515 clear_deps_for_inst_src(scan_inst, dispatch_width,
2516 needs_dep, first_write_grf, write_len);
2517
2518 /* Continue the loop only if we haven't resolved all the dependencies */
2519 int i;
2520 for (i = 0; i < write_len; i++) {
2521 if (needs_dep[i])
2522 break;
2523 }
2524 if (i == write_len)
2525 return;
2526 }
2527 }
2528
2529 /**
2530 * Implements this workaround for the original 965:
2531 *
2532 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2533 * used as a destination register until after it has been sourced by an
2534 * instruction with a different destination register."
2535 */
2536 void
2537 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2538 {
2539 int write_len = inst->regs_written * dispatch_width / 8;
2540 int first_write_grf = inst->dst.reg;
2541 bool needs_dep[BRW_MAX_MRF];
2542 assert(write_len < (int)sizeof(needs_dep) - 1);
2543
2544 memset(needs_dep, false, sizeof(needs_dep));
2545 memset(needs_dep, true, write_len);
2546 /* Walk forwards looking for writes to registers we're writing which aren't
2547 * read before being written.
2548 */
2549 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2550 !scan_inst->is_tail_sentinel();
2551 scan_inst = (fs_inst *)scan_inst->next) {
2552 /* If we hit control flow, force resolve all remaining dependencies. */
2553 if (scan_inst->is_control_flow()) {
2554 for (int i = 0; i < write_len; i++) {
2555 if (needs_dep[i])
2556 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2557 }
2558 return;
2559 }
2560
2561 /* Clear the flag for registers that actually got read (as expected). */
2562 clear_deps_for_inst_src(scan_inst, dispatch_width,
2563 needs_dep, first_write_grf, write_len);
2564
2565 /* We insert our reads as late as possible since they're reading the
2566 * result of a SEND, which has massive latency.
2567 */
2568 if (scan_inst->dst.file == GRF &&
2569 scan_inst->dst.reg >= first_write_grf &&
2570 scan_inst->dst.reg < first_write_grf + write_len &&
2571 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2572 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2573 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2574 }
2575
2576 /* Continue the loop only if we haven't resolved all the dependencies */
2577 int i;
2578 for (i = 0; i < write_len; i++) {
2579 if (needs_dep[i])
2580 break;
2581 }
2582 if (i == write_len)
2583 return;
2584 }
2585
2586 /* If we hit the end of the program, resolve all remaining dependencies out
2587 * of paranoia.
2588 */
2589 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2590 assert(last_inst->eot);
2591 for (int i = 0; i < write_len; i++) {
2592 if (needs_dep[i])
2593 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2594 }
2595 }
2596
2597 void
2598 fs_visitor::insert_gen4_send_dependency_workarounds()
2599 {
2600 if (intel->gen != 4 || intel->is_g4x)
2601 return;
2602
2603 /* Note that we're done with register allocation, so GRF fs_regs always
2604 * have a .reg_offset of 0.
2605 */
2606
2607 foreach_list_safe(node, &this->instructions) {
2608 fs_inst *inst = (fs_inst *)node;
2609
2610 if (inst->mlen != 0 && inst->dst.file == GRF) {
2611 insert_gen4_pre_send_dependency_workarounds(inst);
2612 insert_gen4_post_send_dependency_workarounds(inst);
2613 }
2614 }
2615 }
2616
2617 /**
2618 * Turns the generic expression-style uniform pull constant load instruction
2619 * into a hardware-specific series of instructions for loading a pull
2620 * constant.
2621 *
2622 * The expression style allows the CSE pass before this to optimize out
2623 * repeated loads from the same offset, and gives the pre-register-allocation
2624 * scheduling full flexibility, while the conversion to native instructions
2625 * allows the post-register-allocation scheduler the best information
2626 * possible.
2627 *
2628 * Note that execution masking for setting up pull constant loads is special:
2629 * the channels that need to be written are unrelated to the current execution
2630 * mask, since a later instruction will use one of the result channels as a
2631 * source operand for all 8 or 16 of its channels.
2632 */
2633 void
2634 fs_visitor::lower_uniform_pull_constant_loads()
2635 {
2636 foreach_list(node, &this->instructions) {
2637 fs_inst *inst = (fs_inst *)node;
2638
2639 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2640 continue;
2641
2642 if (intel->gen >= 7) {
2643 /* The offset arg before was a vec4-aligned byte offset. We need to
2644 * turn it into a dword offset.
2645 */
2646 fs_reg const_offset_reg = inst->src[1];
2647 assert(const_offset_reg.file == IMM &&
2648 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2649 const_offset_reg.imm.u /= 4;
2650 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2651
2652 /* This is actually going to be a MOV, but since only the first dword
2653 * is accessed, we have a special opcode to do just that one. Note
2654 * that this needs to be an operation that will be considered a def
2655 * by live variable analysis, or register allocation will explode.
2656 */
2657 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2658 payload, const_offset_reg);
2659 setup->force_writemask_all = true;
2660
2661 setup->ir = inst->ir;
2662 setup->annotation = inst->annotation;
2663 inst->insert_before(setup);
2664
2665 /* Similarly, this will only populate the first 4 channels of the
2666 * result register (since we only use smear values from 0-3), but we
2667 * don't tell the optimizer.
2668 */
2669 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2670 inst->src[1] = payload;
2671
2672 this->live_intervals_valid = false;
2673 } else {
2674 /* Before register allocation, we didn't tell the scheduler about the
2675 * MRF we use. We know it's safe to use this MRF because nothing
2676 * else does except for register spill/unspill, which generates and
2677 * uses its MRF within a single IR instruction.
2678 */
2679 inst->base_mrf = 14;
2680 inst->mlen = 1;
2681 }
2682 }
2683 }
2684
2685 void
2686 fs_visitor::dump_instruction(backend_instruction *be_inst)
2687 {
2688 fs_inst *inst = (fs_inst *)be_inst;
2689
2690 if (inst->predicate) {
2691 printf("(%cf0.%d) ",
2692 inst->predicate_inverse ? '-' : '+',
2693 inst->flag_subreg);
2694 }
2695
2696 printf("%s", brw_instruction_name(inst->opcode));
2697 if (inst->saturate)
2698 printf(".sat");
2699 if (inst->conditional_mod) {
2700 printf(".cmod");
2701 if (!inst->predicate &&
2702 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2703 inst->opcode != BRW_OPCODE_IF &&
2704 inst->opcode != BRW_OPCODE_WHILE))) {
2705 printf(".f0.%d\n", inst->flag_subreg);
2706 }
2707 }
2708 printf(" ");
2709
2710
2711 switch (inst->dst.file) {
2712 case GRF:
2713 printf("vgrf%d", inst->dst.reg);
2714 if (inst->dst.reg_offset)
2715 printf("+%d", inst->dst.reg_offset);
2716 break;
2717 case MRF:
2718 printf("m%d", inst->dst.reg);
2719 break;
2720 case BAD_FILE:
2721 printf("(null)");
2722 break;
2723 case UNIFORM:
2724 printf("***u%d***", inst->dst.reg);
2725 break;
2726 default:
2727 printf("???");
2728 break;
2729 }
2730 printf(", ");
2731
2732 for (int i = 0; i < 3; i++) {
2733 if (inst->src[i].negate)
2734 printf("-");
2735 if (inst->src[i].abs)
2736 printf("|");
2737 switch (inst->src[i].file) {
2738 case GRF:
2739 printf("vgrf%d", inst->src[i].reg);
2740 if (inst->src[i].reg_offset)
2741 printf("+%d", inst->src[i].reg_offset);
2742 break;
2743 case MRF:
2744 printf("***m%d***", inst->src[i].reg);
2745 break;
2746 case UNIFORM:
2747 printf("u%d", inst->src[i].reg);
2748 if (inst->src[i].reg_offset)
2749 printf(".%d", inst->src[i].reg_offset);
2750 break;
2751 case BAD_FILE:
2752 printf("(null)");
2753 break;
2754 case IMM:
2755 switch (inst->src[i].type) {
2756 case BRW_REGISTER_TYPE_F:
2757 printf("%ff", inst->src[i].imm.f);
2758 break;
2759 case BRW_REGISTER_TYPE_D:
2760 printf("%dd", inst->src[i].imm.i);
2761 break;
2762 case BRW_REGISTER_TYPE_UD:
2763 printf("%uu", inst->src[i].imm.u);
2764 break;
2765 default:
2766 printf("???");
2767 break;
2768 }
2769 break;
2770 default:
2771 printf("???");
2772 break;
2773 }
2774 if (inst->src[i].abs)
2775 printf("|");
2776
2777 if (i < 2)
2778 printf(", ");
2779 }
2780
2781 printf(" ");
2782
2783 if (inst->force_uncompressed)
2784 printf("1sthalf ");
2785
2786 if (inst->force_sechalf)
2787 printf("2ndhalf ");
2788
2789 printf("\n");
2790 }
2791
2792 /**
2793 * Possibly returns an instruction that set up @param reg.
2794 *
2795 * Sometimes we want to take the result of some expression/variable
2796 * dereference tree and rewrite the instruction generating the result
2797 * of the tree. When processing the tree, we know that the
2798 * instructions generated are all writing temporaries that are dead
2799 * outside of this tree. So, if we have some instructions that write
2800 * a temporary, we're free to point that temp write somewhere else.
2801 *
2802 * Note that this doesn't guarantee that the returned instruction wrote
2803 * only reg -- it might be the size=4 destination of a texture instruction.
2804 */
2805 fs_inst *
2806 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2807 fs_inst *end,
2808 fs_reg reg)
2809 {
2810 if (end == start ||
2811 end->is_partial_write() ||
2812 reg.reladdr ||
2813 !reg.equals(end->dst)) {
2814 return NULL;
2815 } else {
2816 return end;
2817 }
2818 }
2819
2820 void
2821 fs_visitor::setup_payload_gen6()
2822 {
2823 bool uses_depth =
2824 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2825 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2826
2827 assert(intel->gen >= 6);
2828
2829 /* R0-1: masks, pixel X/Y coordinates. */
2830 c->nr_payload_regs = 2;
2831 /* R2: only for 32-pixel dispatch. */
2832
2833 /* R3-26: barycentric interpolation coordinates. These appear in the
2834 * same order that they appear in the brw_wm_barycentric_interp_mode
2835 * enum. Each set of coordinates occupies 2 registers if dispatch width
2836 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2837 * appear if they were enabled using the "Barycentric Interpolation
2838 * Mode" bits in WM_STATE.
2839 */
2840 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2841 if (barycentric_interp_modes & (1 << i)) {
2842 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2843 c->nr_payload_regs += 2;
2844 if (dispatch_width == 16) {
2845 c->nr_payload_regs += 2;
2846 }
2847 }
2848 }
2849
2850 /* R27: interpolated depth if uses source depth */
2851 if (uses_depth) {
2852 c->source_depth_reg = c->nr_payload_regs;
2853 c->nr_payload_regs++;
2854 if (dispatch_width == 16) {
2855 /* R28: interpolated depth if not 8-wide. */
2856 c->nr_payload_regs++;
2857 }
2858 }
2859 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2860 if (uses_depth) {
2861 c->source_w_reg = c->nr_payload_regs;
2862 c->nr_payload_regs++;
2863 if (dispatch_width == 16) {
2864 /* R30: interpolated W if not 8-wide. */
2865 c->nr_payload_regs++;
2866 }
2867 }
2868 /* R31: MSAA position offsets. */
2869 /* R32-: bary for 32-pixel. */
2870 /* R58-59: interp W for 32-pixel. */
2871
2872 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2873 c->source_depth_to_render_target = true;
2874 }
2875 }
2876
2877 bool
2878 fs_visitor::run()
2879 {
2880 sanity_param_count = fp->Base.Parameters->NumParameters;
2881 uint32_t orig_nr_params = c->prog_data.nr_params;
2882
2883 if (intel->gen >= 6)
2884 setup_payload_gen6();
2885 else
2886 setup_payload_gen4();
2887
2888 if (0) {
2889 emit_dummy_fs();
2890 } else {
2891 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2892 emit_shader_time_begin();
2893
2894 calculate_urb_setup();
2895 if (intel->gen < 6)
2896 emit_interpolation_setup_gen4();
2897 else
2898 emit_interpolation_setup_gen6();
2899
2900 /* We handle discards by keeping track of the still-live pixels in f0.1.
2901 * Initialize it with the dispatched pixels.
2902 */
2903 if (fp->UsesKill) {
2904 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2905 discard_init->flag_subreg = 1;
2906 }
2907
2908 /* Generate FS IR for main(). (the visitor only descends into
2909 * functions called "main").
2910 */
2911 if (shader) {
2912 foreach_list(node, &*shader->ir) {
2913 ir_instruction *ir = (ir_instruction *)node;
2914 base_ir = ir;
2915 this->result = reg_undef;
2916 ir->accept(this);
2917 }
2918 } else {
2919 emit_fragment_program_code();
2920 }
2921 base_ir = NULL;
2922 if (failed)
2923 return false;
2924
2925 emit(FS_OPCODE_PLACEHOLDER_HALT);
2926
2927 emit_fb_writes();
2928
2929 split_virtual_grfs();
2930
2931 move_uniform_array_access_to_pull_constants();
2932 setup_pull_constants();
2933
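      /* Run the optimization passes repeatedly until none of them makes
       * further progress (a fixed point).
       */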
2934 bool progress;
2935 do {
2936 progress = false;
2937
2938 compact_virtual_grfs();
2939
2940 progress = remove_duplicate_mrf_writes() || progress;
2941
2942 progress = opt_algebraic() || progress;
2943 progress = opt_cse() || progress;
2944 progress = opt_copy_propagate() || progress;
2945 progress = dead_code_eliminate() || progress;
2946 progress = dead_code_eliminate_local() || progress;
2947 progress = register_coalesce() || progress;
2948 progress = register_coalesce_2() || progress;
2949 progress = compute_to_mrf() || progress;
2950 } while (progress);
2951
2952 remove_dead_constants();
2953
2954 schedule_instructions(false);
2955
2956 lower_uniform_pull_constant_loads();
2957
2958 assign_curb_setup();
2959 assign_urb_setup();
2960
2961 if (0) {
2962 /* Debug of register spilling: Go spill everything. */
2963 for (int i = 0; i < virtual_grf_count; i++) {
2964 spill_reg(i);
2965 }
2966 }
2967
2968 if (0)
2969 assign_regs_trivial();
2970 else {
2971 while (!assign_regs()) {
2972 if (failed)
2973 break;
2974 }
2975 }
2976 }
2977 assert(force_uncompressed_stack == 0);
2978 assert(force_sechalf_stack == 0);
2979
2980 /* This must come after all optimization and register allocation, since
2981 * it inserts dead code that happens to have side effects, and it does
2982 * so based on the actual physical registers in use.
2983 */
2984 insert_gen4_send_dependency_workarounds();
2985
2986 if (failed)
2987 return false;
2988
2989 schedule_instructions(true);
2990
2991 if (dispatch_width == 8) {
2992 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2993 } else {
2994 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2995
2996 /* Make sure we didn't try to sneak in an extra uniform */
2997 assert(orig_nr_params == c->prog_data.nr_params);
2998 (void) orig_nr_params;
2999 }
3000
3001 /* If any state parameters were appended, then ParameterValues could have
3002 * been realloced, in which case the driver uniform storage set up by
3003 * _mesa_associate_uniform_storage() would point to freed memory. Make
3004 * sure that didn't happen.
3005 */
3006 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3007
3008 return !failed;
3009 }
3010
3011 const unsigned *
3012 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3013 struct gl_fragment_program *fp,
3014 struct gl_shader_program *prog,
3015 unsigned *final_assembly_size)
3016 {
3017 struct intel_context *intel = &brw->intel;
3018 bool start_busy = false;
3019 float start_time = 0;
3020
3021 if (unlikely(intel->perf_debug)) {
3022 start_busy = (intel->batch.last_bo &&
3023 drm_intel_bo_busy(intel->batch.last_bo));
3024 start_time = get_time();
3025 }
3026
3027 struct brw_shader *shader = NULL;
3028 if (prog)
3029 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3030
3031 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3032 if (prog) {
3033 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3034 _mesa_print_ir(shader->ir, NULL);
3035 printf("\n\n");
3036 } else {
3037 printf("ARB_fragment_program %d ir for native fragment shader\n",
3038 fp->Base.Id);
3039 _mesa_print_program(&fp->Base);
3040 }
3041 }
3042
3043 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3044 */
3045 fs_visitor v(brw, c, prog, fp, 8);
3046 if (!v.run()) {
3047 if (prog) {
3048 prog->LinkStatus = false;
3049 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3050 }
3051
3052 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3053 v.fail_msg);
3054
3055 return NULL;
3056 }
3057
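   /* Optionally build a 16-wide variant as well; it stays NULL if the
    * attempt is skipped or fails, in which case only the 8-wide program is
    * emitted.
    */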
3058 exec_list *simd16_instructions = NULL;
3059 fs_visitor v2(brw, c, prog, fp, 16);
3060 bool no16 = INTEL_DEBUG & DEBUG_NO16;
3061 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
3062 v2.import_uniforms(&v);
3063 if (!v2.run()) {
3064 perf_debug("16-wide shader failed to compile, falling back to "
3065 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3066 } else {
3067 simd16_instructions = &v2.instructions;
3068 }
3069 }
3070
3071 c->prog_data.dispatch_width = 8;
3072
3073 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3074 const unsigned *generated = g.generate_assembly(&v.instructions,
3075 simd16_instructions,
3076 final_assembly_size);
3077
3078 if (unlikely(intel->perf_debug) && shader) {
3079 if (shader->compiled_once)
3080 brw_wm_debug_recompile(brw, prog, &c->key);
3081 shader->compiled_once = true;
3082
3083 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
3084 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3085 (get_time() - start_time) * 1000);
3086 }
3087 }
3088
3089 return generated;
3090 }
3091
3092 bool
3093 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3094 {
3095 struct brw_context *brw = brw_context(ctx);
3096 struct intel_context *intel = &brw->intel;
3097 struct brw_wm_prog_key key;
3098
3099 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3100 return true;
3101
3102 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3103 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3104 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3105 bool program_uses_dfdy = fp->UsesDFdy;
3106
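   /* Build a guess at the program key for likely-default GL state, so the
    * precompiled program matches the key computed at draw time in the
    * common case.
    */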
3107 memset(&key, 0, sizeof(key));
3108
3109 if (intel->gen < 6) {
3110 if (fp->UsesKill)
3111 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3112
3113 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3114 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3115
3116 /* Just assume depth testing. */
3117 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3118 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3119 }
3120
3121 if (intel->gen < 6)
3122 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3123
3124 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3125 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3126 continue;
3127
3128 if (intel->gen < 6) {
3129 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3130 key.input_slots_valid |= BITFIELD64_BIT(i);
3131 }
3132 }
3133
3134 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3135
3136 for (int i = 0; i < MAX_SAMPLERS; i++) {
3137 if (fp->Base.ShadowSamplers & (1 << i)) {
3138 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3139 key.tex.swizzles[i] =
3140 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3141 } else {
3142 /* Color sampler: assume no swizzling. */
3143 key.tex.swizzles[i] = SWIZZLE_XYZW;
3144 }
3145 }
3146
3147 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3148 key.drawable_height = ctx->DrawBuffer->Height;
3149 }
3150
3151 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3152 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3153 }
3154
3155 key.nr_color_regions = 1;
3156
3157 key.program_string_id = bfp->id;
3158
3159 uint32_t old_prog_offset = brw->wm.prog_offset;
3160 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3161
3162 bool success = do_wm_prog(brw, prog, bfp, &key);
3163
3164 brw->wm.prog_offset = old_prog_offset;
3165 brw->wm.prog_data = old_prog_data;
3166
3167 return success;
3168 }