i965/fs: Avoid inappropriate optimization with regs_written > 1.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 #define ALU3(op) \
150 fs_inst * \
151 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
152 { \
153 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
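/* As an illustrative expansion (it follows directly from the macros above),
 * ALU2(ADD) defines a small builder that allocates the instruction out of
 * mem_ctx without emitting it:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so a typical call site reads emit(ADD(dst, src0, src1)).
 */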
172
173 /** Gen4 predicated IF. */
174 fs_inst *
175 fs_visitor::IF(uint32_t predicate)
176 {
177 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
178 inst->predicate = predicate;
179 return inst;
180 }
181
182 /** Gen6+ IF with embedded comparison. */
183 fs_inst *
184 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
185 {
186 assert(intel->gen >= 6);
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
188 reg_null_d, src0, src1);
189 inst->conditional_mod = condition;
190 return inst;
191 }
192
193 /**
194 * CMP: Sets the low bit of the destination channels with the result
195 * of the comparison, while the upper bits are undefined, and updates
196 * the flag register with the packed 16 bits of the result.
197 */
198 fs_inst *
199 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
200 {
201 fs_inst *inst;
202
203 /* Take the instruction:
204 *
205 * CMP null<d> src0<f> src1<f>
206 *
207 * Original gen4 does type conversion to the destination type before
208 * comparison, producing garbage results for floating point comparisons.
209 * gen5 does the comparison on the execution type (resolved source types),
210 * so dst type doesn't matter. gen6 does comparison and then uses the
211 * result as if it was the dst type with no conversion, which happens to
212 * mostly work out for float-interpreted-as-int since our comparisons are
213 * for >0, =0, <0.
214 */
215 if (intel->gen == 4) {
216 dst.type = src0.type;
217 if (dst.file == FIXED_HW_REG)
218 dst.fixed_hw_reg.type = dst.type;
219 }
220
221 resolve_ud_negate(&src0);
222 resolve_ud_negate(&src1);
223
224 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
225 inst->conditional_mod = condition;
226
227 return inst;
228 }
229
230 exec_list
231 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
232 fs_reg varying_offset,
233 uint32_t const_offset)
234 {
235 exec_list instructions;
236 fs_inst *inst;
237
238 fs_reg offset = fs_reg(this, glsl_type::uint_type);
239 instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
240
241 if (intel->gen >= 7) {
242 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
243 dst, surf_index, offset);
244 instructions.push_tail(inst);
245 } else {
246 int base_mrf = 13;
247 bool header_present = true;
248
249 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
250 mrf.type = BRW_REGISTER_TYPE_D;
251
252 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
253 * dword-aligned byte offset.
254 */
255 if (intel->gen == 6) {
256 instructions.push_tail(MOV(mrf, offset));
257 } else {
258 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
259 }
260 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
261 dst, surf_index);
262 inst->header_present = header_present;
263 inst->base_mrf = base_mrf;
264 inst->mlen = header_present + dispatch_width / 8;
265
266 instructions.push_tail(inst);
267 }
268
269 return instructions;
270 }
271
272 /**
273 * A helper for MOV generation for fixing up broken hardware SEND dependency
274 * handling.
275 */
276 fs_inst *
277 fs_visitor::DEP_RESOLVE_MOV(int grf)
278 {
279 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
280
281 inst->ir = NULL;
282 inst->annotation = "send dependency resolve";
283
284 /* The caller always wants uncompressed to emit the minimal extra
285 * dependencies, and to avoid having to deal with aligning its regs to 2.
286 */
287 inst->force_uncompressed = true;
288
289 return inst;
290 }
291
292 bool
293 fs_inst::equals(fs_inst *inst)
294 {
295 return (opcode == inst->opcode &&
296 dst.equals(inst->dst) &&
297 src[0].equals(inst->src[0]) &&
298 src[1].equals(inst->src[1]) &&
299 src[2].equals(inst->src[2]) &&
300 saturate == inst->saturate &&
301 predicate == inst->predicate &&
302 conditional_mod == inst->conditional_mod &&
303 mlen == inst->mlen &&
304 base_mrf == inst->base_mrf &&
305 sampler == inst->sampler &&
306 target == inst->target &&
307 eot == inst->eot &&
308 header_present == inst->header_present &&
309 shadow_compare == inst->shadow_compare &&
310 offset == inst->offset);
311 }
312
313 int
314 fs_inst::regs_written()
315 {
316 if (is_tex())
317 return 4;
318
319 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
320 * but we don't currently use them...nor do we have an opcode for them.
321 */
322
323 return 1;
324 }
325
326 bool
327 fs_inst::overwrites_reg(const fs_reg &reg)
328 {
329 return (reg.file == dst.file &&
330 reg.reg == dst.reg &&
331 reg.reg_offset >= dst.reg_offset &&
332 reg.reg_offset < dst.reg_offset + regs_written());
333 }
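/* For example, a texture instruction reports regs_written() == 4 above, so
 * with dst.reg_offset == 0 it is treated as overwriting reg_offsets 0
 * through 3 of dst.reg; later optimization passes rely on this range check
 * to leave all four of those registers alone.
 */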
334
335 bool
336 fs_inst::is_tex()
337 {
338 return (opcode == SHADER_OPCODE_TEX ||
339 opcode == FS_OPCODE_TXB ||
340 opcode == SHADER_OPCODE_TXD ||
341 opcode == SHADER_OPCODE_TXF ||
342 opcode == SHADER_OPCODE_TXF_MS ||
343 opcode == SHADER_OPCODE_TXL ||
344 opcode == SHADER_OPCODE_TXS ||
345 opcode == SHADER_OPCODE_LOD);
346 }
347
348 bool
349 fs_inst::is_math()
350 {
351 return (opcode == SHADER_OPCODE_RCP ||
352 opcode == SHADER_OPCODE_RSQ ||
353 opcode == SHADER_OPCODE_SQRT ||
354 opcode == SHADER_OPCODE_EXP2 ||
355 opcode == SHADER_OPCODE_LOG2 ||
356 opcode == SHADER_OPCODE_SIN ||
357 opcode == SHADER_OPCODE_COS ||
358 opcode == SHADER_OPCODE_INT_QUOTIENT ||
359 opcode == SHADER_OPCODE_INT_REMAINDER ||
360 opcode == SHADER_OPCODE_POW);
361 }
362
363 bool
364 fs_inst::is_control_flow()
365 {
366 switch (opcode) {
367 case BRW_OPCODE_DO:
368 case BRW_OPCODE_WHILE:
369 case BRW_OPCODE_IF:
370 case BRW_OPCODE_ELSE:
371 case BRW_OPCODE_ENDIF:
372 case BRW_OPCODE_BREAK:
373 case BRW_OPCODE_CONTINUE:
374 return true;
375 default:
376 return false;
377 }
378 }
379
380 bool
381 fs_inst::is_send_from_grf()
382 {
383 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
384 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
385 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
386 src[1].file == GRF));
387 }
388
389 bool
390 fs_visitor::can_do_source_mods(fs_inst *inst)
391 {
392 if (intel->gen == 6 && inst->is_math())
393 return false;
394
395 if (inst->is_send_from_grf())
396 return false;
397
398 return true;
399 }
400
401 void
402 fs_reg::init()
403 {
404 memset(this, 0, sizeof(*this));
405 this->smear = -1;
406 }
407
408 /** Generic unset register constructor. */
409 fs_reg::fs_reg()
410 {
411 init();
412 this->file = BAD_FILE;
413 }
414
415 /** Immediate value constructor. */
416 fs_reg::fs_reg(float f)
417 {
418 init();
419 this->file = IMM;
420 this->type = BRW_REGISTER_TYPE_F;
421 this->imm.f = f;
422 }
423
424 /** Immediate value constructor. */
425 fs_reg::fs_reg(int32_t i)
426 {
427 init();
428 this->file = IMM;
429 this->type = BRW_REGISTER_TYPE_D;
430 this->imm.i = i;
431 }
432
433 /** Immediate value constructor. */
434 fs_reg::fs_reg(uint32_t u)
435 {
436 init();
437 this->file = IMM;
438 this->type = BRW_REGISTER_TYPE_UD;
439 this->imm.u = u;
440 }
441
442 /** Fixed brw_reg Immediate value constructor. */
443 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
444 {
445 init();
446 this->file = FIXED_HW_REG;
447 this->fixed_hw_reg = fixed_hw_reg;
448 this->type = fixed_hw_reg.type;
449 }
450
451 bool
452 fs_reg::equals(const fs_reg &r) const
453 {
454 return (file == r.file &&
455 reg == r.reg &&
456 reg_offset == r.reg_offset &&
457 type == r.type &&
458 negate == r.negate &&
459 abs == r.abs &&
460 !reladdr && !r.reladdr &&
461 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
462 sizeof(fixed_hw_reg)) == 0 &&
463 smear == r.smear &&
464 imm.u == r.imm.u);
465 }
466
467 bool
468 fs_reg::is_zero() const
469 {
470 if (file != IMM)
471 return false;
472
473 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
474 }
475
476 bool
477 fs_reg::is_one() const
478 {
479 if (file != IMM)
480 return false;
481
482 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
483 }
484
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_VOID:
510 case GLSL_TYPE_ERROR:
511 case GLSL_TYPE_INTERFACE:
512 assert(!"not reached");
513 break;
514 }
515
516 return 0;
517 }
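/* A few worked examples of the sizes returned above, in scalar components:
 * a float is 1, a vec4 is 4, a mat3 is 9 (3 columns * 3 rows), a float[4]
 * array is 4, a struct { vec3 a; float b; } is 3 + 1 = 4, and a sampler is 0.
 */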
518
519 fs_reg
520 fs_visitor::get_timestamp()
521 {
522 assert(intel->gen >= 7);
523
524 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
525 BRW_ARF_TIMESTAMP,
526 0),
527 BRW_REGISTER_TYPE_UD));
528
529 fs_reg dst = fs_reg(this, glsl_type::uint_type);
530
531 fs_inst *mov = emit(MOV(dst, ts));
532 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
533 * even if it's not enabled in the dispatch.
534 */
535 mov->force_writemask_all = true;
536 mov->force_uncompressed = true;
537
538 /* The caller wants the low 32 bits of the timestamp. Since it's running
539 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
540 * which is plenty of time for our purposes. It is identical across the
541 * EUs, but since it's tracking GPU core speed it will increment at a
542 * varying rate as render P-states change.
543 *
544 * The caller could also check if render P-states have changed (or anything
545 * else that might disrupt timing) by setting smear to 2 and checking if
546 * that field is != 0.
547 */
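/* Rough arithmetic behind the ~3 second figure: the low 32 bits wrap after
 * 2^32 cycles, and 2^32 / ~1.2e9 cycles per second is roughly 3.6 seconds.
 */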
548 dst.smear = 0;
549
550 return dst;
551 }
552
553 void
554 fs_visitor::emit_shader_time_begin()
555 {
556 current_annotation = "shader time start";
557 shader_start_time = get_timestamp();
558 }
559
560 void
561 fs_visitor::emit_shader_time_end()
562 {
563 current_annotation = "shader time end";
564
565 enum shader_time_shader_type type, written_type, reset_type;
566 if (dispatch_width == 8) {
567 type = ST_FS8;
568 written_type = ST_FS8_WRITTEN;
569 reset_type = ST_FS8_RESET;
570 } else {
571 assert(dispatch_width == 16);
572 type = ST_FS16;
573 written_type = ST_FS16_WRITTEN;
574 reset_type = ST_FS16_RESET;
575 }
576
577 fs_reg shader_end_time = get_timestamp();
578
579 /* Check that there weren't any timestamp reset events (assuming these
580 * were the only two timestamp reads that happened).
581 */
582 fs_reg reset = shader_end_time;
583 reset.smear = 2;
584 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
585 test->conditional_mod = BRW_CONDITIONAL_Z;
586 emit(IF(BRW_PREDICATE_NORMAL));
587
588 push_force_uncompressed();
589 fs_reg start = shader_start_time;
590 start.negate = true;
591 fs_reg diff = fs_reg(this, glsl_type::uint_type);
592 emit(ADD(diff, start, shader_end_time));
593
594 /* If there were no instructions between the two timestamp gets, the diff
595 * is 2 cycles. Remove that overhead, so I can forget about that when
596 * trying to determine the time taken for single instructions.
597 */
598 emit(ADD(diff, diff, fs_reg(-2u)));
599
600 emit_shader_time_write(type, diff);
601 emit_shader_time_write(written_type, fs_reg(1u));
602 emit(BRW_OPCODE_ELSE);
603 emit_shader_time_write(reset_type, fs_reg(1u));
604 emit(BRW_OPCODE_ENDIF);
605
606 pop_force_uncompressed();
607 }
608
609 void
610 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
611 fs_reg value)
612 {
613 int shader_time_index = brw_get_shader_time_index(brw, prog, &fp->Base,
614 type);
615 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
616
617 fs_reg payload;
618 if (dispatch_width == 8)
619 payload = fs_reg(this, glsl_type::uvec2_type);
620 else
621 payload = fs_reg(this, glsl_type::uint_type);
622
623 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
624 fs_reg(), payload, offset, value));
625 }
626
627 void
628 fs_visitor::fail(const char *format, ...)
629 {
630 va_list va;
631 char *msg;
632
633 if (failed)
634 return;
635
636 failed = true;
637
638 va_start(va, format);
639 msg = ralloc_vasprintf(mem_ctx, format, va);
640 va_end(va);
641 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
642
643 this->fail_msg = msg;
644
645 if (INTEL_DEBUG & DEBUG_WM) {
646 fprintf(stderr, "%s", msg);
647 }
648 }
649
650 fs_inst *
651 fs_visitor::emit(enum opcode opcode)
652 {
653 return emit(fs_inst(opcode));
654 }
655
656 fs_inst *
657 fs_visitor::emit(enum opcode opcode, fs_reg dst)
658 {
659 return emit(fs_inst(opcode, dst));
660 }
661
662 fs_inst *
663 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
664 {
665 return emit(fs_inst(opcode, dst, src0));
666 }
667
668 fs_inst *
669 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
670 {
671 return emit(fs_inst(opcode, dst, src0, src1));
672 }
673
674 fs_inst *
675 fs_visitor::emit(enum opcode opcode, fs_reg dst,
676 fs_reg src0, fs_reg src1, fs_reg src2)
677 {
678 return emit(fs_inst(opcode, dst, src0, src1, src2));
679 }
680
681 void
682 fs_visitor::push_force_uncompressed()
683 {
684 force_uncompressed_stack++;
685 }
686
687 void
688 fs_visitor::pop_force_uncompressed()
689 {
690 force_uncompressed_stack--;
691 assert(force_uncompressed_stack >= 0);
692 }
693
694 void
695 fs_visitor::push_force_sechalf()
696 {
697 force_sechalf_stack++;
698 }
699
700 void
701 fs_visitor::pop_force_sechalf()
702 {
703 force_sechalf_stack--;
704 assert(force_sechalf_stack >= 0);
705 }
706
707 /**
708 * Returns how many MRFs an FS opcode will write over.
709 *
710 * Note that this does not count just the 0 or 1 MRF writes implied by a single gen
711 * instruction -- the FS opcodes often generate MOVs in addition.
712 */
713 int
714 fs_visitor::implied_mrf_writes(fs_inst *inst)
715 {
716 if (inst->mlen == 0)
717 return 0;
718
719 switch (inst->opcode) {
720 case SHADER_OPCODE_RCP:
721 case SHADER_OPCODE_RSQ:
722 case SHADER_OPCODE_SQRT:
723 case SHADER_OPCODE_EXP2:
724 case SHADER_OPCODE_LOG2:
725 case SHADER_OPCODE_SIN:
726 case SHADER_OPCODE_COS:
727 return 1 * dispatch_width / 8;
728 case SHADER_OPCODE_POW:
729 case SHADER_OPCODE_INT_QUOTIENT:
730 case SHADER_OPCODE_INT_REMAINDER:
731 return 2 * dispatch_width / 8;
732 case SHADER_OPCODE_TEX:
733 case FS_OPCODE_TXB:
734 case SHADER_OPCODE_TXD:
735 case SHADER_OPCODE_TXF:
736 case SHADER_OPCODE_TXF_MS:
737 case SHADER_OPCODE_TXL:
738 case SHADER_OPCODE_TXS:
739 case SHADER_OPCODE_LOD:
740 return 1;
741 case FS_OPCODE_FB_WRITE:
742 return 2;
743 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
744 case FS_OPCODE_UNSPILL:
745 return 1;
746 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
747 return inst->header_present;
748 case FS_OPCODE_SPILL:
749 return 2;
750 default:
751 assert(!"not reached");
752 return inst->mlen;
753 }
754 }
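/* Worked example: a SIMD16 POW (dispatch_width == 16) counts as
 * 2 * 16 / 8 = 4 MRF registers written, while the same opcode in SIMD8
 * counts as 2.
 */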
755
756 int
757 fs_visitor::virtual_grf_alloc(int size)
758 {
759 if (virtual_grf_array_size <= virtual_grf_count) {
760 if (virtual_grf_array_size == 0)
761 virtual_grf_array_size = 16;
762 else
763 virtual_grf_array_size *= 2;
764 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
765 virtual_grf_array_size);
766 }
767 virtual_grf_sizes[virtual_grf_count] = size;
768 return virtual_grf_count++;
769 }
770
771 /** Fixed HW reg constructor. */
772 fs_reg::fs_reg(enum register_file file, int reg)
773 {
774 init();
775 this->file = file;
776 this->reg = reg;
777 this->type = BRW_REGISTER_TYPE_F;
778 }
779
780 /** Fixed HW reg constructor. */
781 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
782 {
783 init();
784 this->file = file;
785 this->reg = reg;
786 this->type = type;
787 }
788
789 /** Automatic reg constructor. */
790 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
791 {
792 init();
793
794 this->file = GRF;
795 this->reg = v->virtual_grf_alloc(v->type_size(type));
796 this->reg_offset = 0;
797 this->type = brw_type_for_base_type(type);
798 }
799
800 fs_reg *
801 fs_visitor::variable_storage(ir_variable *var)
802 {
803 return (fs_reg *)hash_table_find(this->variable_ht, var);
804 }
805
806 void
807 import_uniforms_callback(const void *key,
808 void *data,
809 void *closure)
810 {
811 struct hash_table *dst_ht = (struct hash_table *)closure;
812 const fs_reg *reg = (const fs_reg *)data;
813
814 if (reg->file != UNIFORM)
815 return;
816
817 hash_table_insert(dst_ht, data, key);
818 }
819
820 /* For 16-wide, we need to follow the uniform setup done for the 8-wide dispatch.
821 * This brings in those uniform definitions.
822 */
823 void
824 fs_visitor::import_uniforms(fs_visitor *v)
825 {
826 hash_table_call_foreach(v->variable_ht,
827 import_uniforms_callback,
828 variable_ht);
829 this->params_remap = v->params_remap;
830 }
831
832 /* Our support for uniforms is piggy-backed on the struct
833 * gl_fragment_program, because that's where the values actually
834 * get stored, rather than in some global gl_shader_program uniform
835 * store.
836 */
837 void
838 fs_visitor::setup_uniform_values(ir_variable *ir)
839 {
840 int namelen = strlen(ir->name);
841
842 /* The data for our (non-builtin) uniforms is stored in a series of
843 * gl_uniform_driver_storage structs for each subcomponent that
844 * glGetUniformLocation() could name. We know it's been set up in the same
845 * order we'd walk the type, so walk the list of storage and find anything
846 * with our name, or the prefix of a component that starts with our name.
847 */
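/* For example (with illustrative names), a uniform "light" of struct type
 * shows up in storage as entries like "light.position" and "light.color";
 * both match below because the character right after the shared prefix is
 * '.', while an unrelated "light2" is rejected since '2' is none of NUL,
 * '.' or '['.
 */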
848 unsigned params_before = c->prog_data.nr_params;
849 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
850 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
851
852 if (strncmp(ir->name, storage->name, namelen) != 0 ||
853 (storage->name[namelen] != 0 &&
854 storage->name[namelen] != '.' &&
855 storage->name[namelen] != '[')) {
856 continue;
857 }
858
859 unsigned slots = storage->type->component_slots();
860 if (storage->array_elements)
861 slots *= storage->array_elements;
862
863 for (unsigned i = 0; i < slots; i++) {
864 c->prog_data.param[c->prog_data.nr_params++] =
865 &storage->storage[i].f;
866 }
867 }
868
869 /* Make sure we actually initialized the right amount of stuff here. */
870 assert(params_before + ir->type->component_slots() ==
871 c->prog_data.nr_params);
872 }
873
874
875 /* Our support for builtin uniforms is even scarier than non-builtin.
876 * It sits on top of the PROG_STATE_VAR parameters that are
877 * automatically updated from GL context state.
878 */
879 void
880 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
881 {
882 const ir_state_slot *const slots = ir->state_slots;
883 assert(ir->state_slots != NULL);
884
885 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
886 /* This state reference has already been setup by ir_to_mesa, but we'll
887 * get the same index back here.
888 */
889 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
890 (gl_state_index *)slots[i].tokens);
891
892 /* Add each of the unique swizzles of the element as a parameter.
893 * This'll end up matching the expected layout of the
894 * array/matrix/structure we're trying to fill in.
895 */
896 int last_swiz = -1;
897 for (unsigned int j = 0; j < 4; j++) {
898 int swiz = GET_SWZ(slots[i].swizzle, j);
899 if (swiz == last_swiz)
900 break;
901 last_swiz = swiz;
902
903 c->prog_data.param[c->prog_data.nr_params++] =
904 &fp->Base.Parameters->ParameterValues[index][swiz].f;
905 }
906 }
907 }
908
909 fs_reg *
910 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
911 {
912 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
913 fs_reg wpos = *reg;
914 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
915
916 /* gl_FragCoord.x */
917 if (ir->pixel_center_integer) {
918 emit(MOV(wpos, this->pixel_x));
919 } else {
920 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
921 }
922 wpos.reg_offset++;
923
924 /* gl_FragCoord.y */
925 if (!flip && ir->pixel_center_integer) {
926 emit(MOV(wpos, this->pixel_y));
927 } else {
928 fs_reg pixel_y = this->pixel_y;
929 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
930
931 if (flip) {
932 pixel_y.negate = true;
933 offset += c->key.drawable_height - 1.0;
934 }
935
936 emit(ADD(wpos, pixel_y, fs_reg(offset)));
937 }
938 wpos.reg_offset++;
939
940 /* gl_FragCoord.z */
941 if (intel->gen >= 6) {
942 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
943 } else {
944 emit(FS_OPCODE_LINTERP, wpos,
945 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
946 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
947 interp_reg(VARYING_SLOT_POS, 2));
948 }
949 wpos.reg_offset++;
950
951 /* gl_FragCoord.w: Already set up in emit_interpolation */
952 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
953
954 return reg;
955 }
956
957 fs_inst *
958 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
959 glsl_interp_qualifier interpolation_mode,
960 bool is_centroid)
961 {
962 brw_wm_barycentric_interp_mode barycoord_mode;
963 if (is_centroid) {
964 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
965 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
966 else
967 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
968 } else {
969 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
970 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
971 else
972 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
973 }
974 return emit(FS_OPCODE_LINTERP, attr,
975 this->delta_x[barycoord_mode],
976 this->delta_y[barycoord_mode], interp);
977 }
978
979 fs_reg *
980 fs_visitor::emit_general_interpolation(ir_variable *ir)
981 {
982 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
983 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
984 fs_reg attr = *reg;
985
986 unsigned int array_elements;
987 const glsl_type *type;
988
989 if (ir->type->is_array()) {
990 array_elements = ir->type->length;
991 if (array_elements == 0) {
992 fail("dereferenced array '%s' has length 0\n", ir->name);
993 }
994 type = ir->type->fields.array;
995 } else {
996 array_elements = 1;
997 type = ir->type;
998 }
999
1000 glsl_interp_qualifier interpolation_mode =
1001 ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003 int location = ir->location;
1004 for (unsigned int i = 0; i < array_elements; i++) {
1005 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006 if (urb_setup[location] == -1) {
1007 /* If there's no incoming setup data for this slot, don't
1008 * emit interpolation for it.
1009 */
1010 attr.reg_offset += type->vector_elements;
1011 location++;
1012 continue;
1013 }
1014
1015 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016 /* Constant interpolation (flat shading) case. The SF has
1017 * handed us defined values in only the constant offset
1018 * field of the setup reg.
1019 */
1020 for (unsigned int k = 0; k < type->vector_elements; k++) {
1021 struct brw_reg interp = interp_reg(location, k);
1022 interp = suboffset(interp, 3);
1023 interp.type = reg->type;
1024 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025 attr.reg_offset++;
1026 }
1027 } else {
1028 /* Smooth/noperspective interpolation case. */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 /* FINISHME: At some point we probably want to push
1031 * this farther by giving similar treatment to the
1032 * other potentially constant components of the
1033 * attribute, as well as making brw_vs_constval.c
1034 * handle varyings other than gl_TexCoord.
1035 */
1036 if (location >= VARYING_SLOT_TEX0 &&
1037 location <= VARYING_SLOT_TEX7 &&
1038 k == 3 && !(c->key.proj_attrib_mask
1039 & BITFIELD64_BIT(location))) {
1040 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1041 } else {
1042 struct brw_reg interp = interp_reg(location, k);
1043 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1044 ir->centroid);
1045 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1046 /* Get the pixel/sample mask into f0 so that we know
1047 * which pixels are lit. Then, for each channel that is
1048 * unlit, replace the centroid data with non-centroid
1049 * data.
1050 */
1051 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1052 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1053 interpolation_mode, false);
1054 inst->predicate = BRW_PREDICATE_NORMAL;
1055 inst->predicate_inverse = true;
1056 }
1057 if (intel->gen < 6) {
1058 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1059 }
1060 }
1061 attr.reg_offset++;
1062 }
1063
1064 }
1065 location++;
1066 }
1067 }
1068
1069 return reg;
1070 }
1071
1072 fs_reg *
1073 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1074 {
1075 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1076
1077 /* The frontfacing comes in as a bit in the thread payload. */
1078 if (intel->gen >= 6) {
1079 emit(BRW_OPCODE_ASR, *reg,
1080 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1081 fs_reg(15));
1082 emit(BRW_OPCODE_NOT, *reg, *reg);
1083 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1084 } else {
1085 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1086 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1087 * us front face
1088 */
1089 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1090 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1091 }
1092
1093 return reg;
1094 }
1095
1096 fs_reg
1097 fs_visitor::fix_math_operand(fs_reg src)
1098 {
1099 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1100 * might be able to do better by doing execsize = 1 math and then
1101 * expanding that result out, but we would need to be careful with
1102 * masking.
1103 *
1104 * The hardware ignores source modifiers (negate and abs) on math
1105 * instructions, so we also move to a temp to set those up.
1106 */
1107 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1108 !src.abs && !src.negate)
1109 return src;
1110
1111 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1112 * operands to math
1113 */
1114 if (intel->gen >= 7 && src.file != IMM)
1115 return src;
1116
1117 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1118 expanded.type = src.type;
1119 emit(BRW_OPCODE_MOV, expanded, src);
1120 return expanded;
1121 }
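/* For instance, on gen6 a math source with the negate modifier set can't be
 * consumed directly, so the MOV above copies the (negated) value into a
 * fresh temporary and the math instruction reads that temporary instead.
 */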
1122
1123 fs_inst *
1124 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1125 {
1126 switch (opcode) {
1127 case SHADER_OPCODE_RCP:
1128 case SHADER_OPCODE_RSQ:
1129 case SHADER_OPCODE_SQRT:
1130 case SHADER_OPCODE_EXP2:
1131 case SHADER_OPCODE_LOG2:
1132 case SHADER_OPCODE_SIN:
1133 case SHADER_OPCODE_COS:
1134 break;
1135 default:
1136 assert(!"not reached: bad math opcode");
1137 return NULL;
1138 }
1139
1140 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1141 * might be able to do better by doing execsize = 1 math and then
1142 * expanding that result out, but we would need to be careful with
1143 * masking.
1144 *
1145 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1146 * instructions, so we also move to a temp to set those up.
1147 */
1148 if (intel->gen >= 6)
1149 src = fix_math_operand(src);
1150
1151 fs_inst *inst = emit(opcode, dst, src);
1152
1153 if (intel->gen < 6) {
1154 inst->base_mrf = 2;
1155 inst->mlen = dispatch_width / 8;
1156 }
1157
1158 return inst;
1159 }
1160
1161 fs_inst *
1162 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1163 {
1164 int base_mrf = 2;
1165 fs_inst *inst;
1166
1167 switch (opcode) {
1168 case SHADER_OPCODE_INT_QUOTIENT:
1169 case SHADER_OPCODE_INT_REMAINDER:
1170 if (intel->gen >= 7 && dispatch_width == 16)
1171 fail("16-wide INTDIV unsupported\n");
1172 break;
1173 case SHADER_OPCODE_POW:
1174 break;
1175 default:
1176 assert(!"not reached: unsupported binary math opcode.");
1177 return NULL;
1178 }
1179
1180 if (intel->gen >= 6) {
1181 src0 = fix_math_operand(src0);
1182 src1 = fix_math_operand(src1);
1183
1184 inst = emit(opcode, dst, src0, src1);
1185 } else {
1186 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1187 * "Message Payload":
1188 *
1189 * "Operand0[7]. For the INT DIV functions, this operand is the
1190 * denominator."
1191 * ...
1192 * "Operand1[7]. For the INT DIV functions, this operand is the
1193 * numerator."
1194 */
1195 bool is_int_div = opcode != SHADER_OPCODE_POW;
1196 fs_reg &op0 = is_int_div ? src1 : src0;
1197 fs_reg &op1 = is_int_div ? src0 : src1;
1198
1199 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1200 inst = emit(opcode, dst, op0, reg_null_f);
1201
1202 inst->base_mrf = base_mrf;
1203 inst->mlen = 2 * dispatch_width / 8;
1204 }
1205 return inst;
1206 }
1207
1208 void
1209 fs_visitor::assign_curb_setup()
1210 {
1211 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1212 if (dispatch_width == 8) {
1213 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1214 } else {
1215 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1216 }
1217
1218 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1219 foreach_list(node, &this->instructions) {
1220 fs_inst *inst = (fs_inst *)node;
1221
1222 for (unsigned int i = 0; i < 3; i++) {
1223 if (inst->src[i].file == UNIFORM) {
1224 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1225 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1226 constant_nr / 8,
1227 constant_nr % 8);
1228
1229 inst->src[i].file = FIXED_HW_REG;
1230 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1231 }
1232 }
1233 }
1234 }
1235
1236 void
1237 fs_visitor::calculate_urb_setup()
1238 {
1239 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240 urb_setup[i] = -1;
1241 }
1242
1243 int urb_next = 0;
1244 /* Figure out where each of the incoming setup attributes lands. */
1245 if (intel->gen >= 6) {
1246 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1248 urb_setup[i] = urb_next++;
1249 }
1250 }
1251 } else {
1252 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1253 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1254 /* Point size is packed into the header, not as a general attribute */
1255 if (i == VARYING_SLOT_PSIZ)
1256 continue;
1257
1258 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1259 /* The back color slot is skipped when the front color is
1260 * also written to. In addition, some slots can be
1261 * written in the vertex shader and not read in the
1262 * fragment shader. So the register number must always be
1263 * incremented, mapped or not.
1264 */
1265 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1266 urb_setup[i] = urb_next;
1267 urb_next++;
1268 }
1269 }
1270
1271 /*
1272 * It's an FS-only attribute, and we did interpolation for this attribute
1273 * in the SF thread. So, count it here, too.
1274 *
1275 * See compile_sf_prog() for more info.
1276 */
1277 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1278 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1279 }
1280
1281 /* Each attribute is 4 setup channels, each of which is half a reg. */
1282 c->prog_data.urb_read_length = urb_next * 2;
1283 }
1284
1285 void
1286 fs_visitor::assign_urb_setup()
1287 {
1288 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1289
1290 /* Offset all the urb_setup[] index by the actual position of the
1291 * setup regs, now that the location of the constants has been chosen.
1292 */
1293 foreach_list(node, &this->instructions) {
1294 fs_inst *inst = (fs_inst *)node;
1295
1296 if (inst->opcode == FS_OPCODE_LINTERP) {
1297 assert(inst->src[2].file == FIXED_HW_REG);
1298 inst->src[2].fixed_hw_reg.nr += urb_start;
1299 }
1300
1301 if (inst->opcode == FS_OPCODE_CINTERP) {
1302 assert(inst->src[0].file == FIXED_HW_REG);
1303 inst->src[0].fixed_hw_reg.nr += urb_start;
1304 }
1305 }
1306
1307 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1308 }
1309
1310 /**
1311 * Split large virtual GRFs into separate components if we can.
1312 *
1313 * This mostly duplicates what brw_fs_vector_splitting does,
1314 * but that's really conservative because it's afraid of doing
1315 * splitting that doesn't result in real progress after the rest of
1316 * the optimization phases, which would cause infinite looping in
1317 * optimization. We can do it once here, safely. This also has the
1318 * opportunity to split interpolated values, or maybe even uniforms,
1319 * which we don't have at the IR level.
1320 *
1321 * We want to split, because virtual GRFs are what we register
1322 * allocate and spill (due to contiguousness requirements for some
1323 * instructions), and they're what we naturally generate in the
1324 * codegen process, but most virtual GRFs don't actually need to be
1325 * contiguous sets of GRFs. If we split, we'll end up with reduced
1326 * live intervals and better dead code elimination and coalescing.
1327 */
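/* Sketch of the effect, with hypothetical register numbers:
 *
 *    before:  ADD vgrf3+2, vgrf3+2, vgrf7     (vgrf3 has size 4)
 *    after:   ADD vgrf9,   vgrf9,   vgrf7     (vgrf3 resized to 1; offsets
 *                                              1..3 now live in vgrf8..10)
 *
 * reg_offset 0 stays in the original register, so only operands with a
 * non-zero reg_offset get rewritten.
 */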
1328 void
1329 fs_visitor::split_virtual_grfs()
1330 {
1331 int num_vars = this->virtual_grf_count;
1332 bool split_grf[num_vars];
1333 int new_virtual_grf[num_vars];
1334
1335 /* Try to split anything larger than one register. */
1336 for (int i = 0; i < num_vars; i++) {
1337 if (this->virtual_grf_sizes[i] != 1)
1338 split_grf[i] = true;
1339 else
1340 split_grf[i] = false;
1341 }
1342
1343 if (brw->has_pln &&
1344 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1345 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1346 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1347 * Gen6, that was the only supported interpolation mode, and since Gen6,
1348 * delta_x and delta_y are in fixed hardware registers.
1349 */
1350 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1351 false;
1352 }
1353
1354 foreach_list(node, &this->instructions) {
1355 fs_inst *inst = (fs_inst *)node;
1356
1357 /* If there's a SEND message that requires contiguous destination
1358 * registers, no splitting is allowed.
1359 */
1360 if (inst->regs_written() > 1) {
1361 split_grf[inst->dst.reg] = false;
1362 }
1363
1364 /* If we're sending from a GRF, don't split it, on the assumption that
1365 * the send is reading the whole thing.
1366 */
1367 if (inst->is_send_from_grf()) {
1368 split_grf[inst->src[0].reg] = false;
1369 }
1370 }
1371
1372 /* Allocate new space for split regs. Note that the virtual
1373 * numbers will be contiguous.
1374 */
1375 for (int i = 0; i < num_vars; i++) {
1376 if (split_grf[i]) {
1377 new_virtual_grf[i] = virtual_grf_alloc(1);
1378 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1379 int reg = virtual_grf_alloc(1);
1380 assert(reg == new_virtual_grf[i] + j - 1);
1381 (void) reg;
1382 }
1383 this->virtual_grf_sizes[i] = 1;
1384 }
1385 }
1386
1387 foreach_list(node, &this->instructions) {
1388 fs_inst *inst = (fs_inst *)node;
1389
1390 if (inst->dst.file == GRF &&
1391 split_grf[inst->dst.reg] &&
1392 inst->dst.reg_offset != 0) {
1393 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1394 inst->dst.reg_offset - 1);
1395 inst->dst.reg_offset = 0;
1396 }
1397 for (int i = 0; i < 3; i++) {
1398 if (inst->src[i].file == GRF &&
1399 split_grf[inst->src[i].reg] &&
1400 inst->src[i].reg_offset != 0) {
1401 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1402 inst->src[i].reg_offset - 1);
1403 inst->src[i].reg_offset = 0;
1404 }
1405 }
1406 }
1407 this->live_intervals_valid = false;
1408 }
1409
1410 /**
1411 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1412 *
1413 * During code generation, we create tons of temporary variables, many of
1414 * which get immediately killed and are never used again. Yet, in later
1415 * optimization and analysis passes, such as compute_live_intervals, we need
1416 * to loop over all the virtual GRFs. Compacting them can save a lot of
1417 * overhead.
1418 */
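/* Illustrative example with hypothetical numbers: if only vgrfs 0, 3 and 7
 * out of 8 are referenced (or appear in the special[] list below), the remap
 * table built here becomes
 *
 *    remap_table[] = { 0, -1, -1, 1, -1, -1, -1, 2 }
 *
 * and virtual_grf_count drops from 8 to 3.
 */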
1419 void
1420 fs_visitor::compact_virtual_grfs()
1421 {
1422 /* Mark which virtual GRFs are used, and count how many. */
1423 int remap_table[this->virtual_grf_count];
1424 memset(remap_table, -1, sizeof(remap_table));
1425
1426 foreach_list(node, &this->instructions) {
1427 const fs_inst *inst = (const fs_inst *) node;
1428
1429 if (inst->dst.file == GRF)
1430 remap_table[inst->dst.reg] = 0;
1431
1432 for (int i = 0; i < 3; i++) {
1433 if (inst->src[i].file == GRF)
1434 remap_table[inst->src[i].reg] = 0;
1435 }
1436 }
1437
1438 /* In addition to registers used in instructions, fs_visitor keeps
1439 * direct references to certain special values which must be patched:
1440 */
1441 fs_reg *special[] = {
1442 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1443 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1444 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1445 &delta_x[0], &delta_x[1], &delta_x[2],
1446 &delta_x[3], &delta_x[4], &delta_x[5],
1447 &delta_y[0], &delta_y[1], &delta_y[2],
1448 &delta_y[3], &delta_y[4], &delta_y[5],
1449 };
1450 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1451 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1452
1453 /* Treat all special values as used, to be conservative */
1454 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1455 if (special[i]->file == GRF)
1456 remap_table[special[i]->reg] = 0;
1457 }
1458
1459 /* Compact the GRF arrays. */
1460 int new_index = 0;
1461 for (int i = 0; i < this->virtual_grf_count; i++) {
1462 if (remap_table[i] != -1) {
1463 remap_table[i] = new_index;
1464 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1465 if (live_intervals_valid) {
1466 virtual_grf_use[new_index] = virtual_grf_use[i];
1467 virtual_grf_def[new_index] = virtual_grf_def[i];
1468 }
1469 ++new_index;
1470 }
1471 }
1472
1473 this->virtual_grf_count = new_index;
1474
1475 /* Patch all the instructions to use the newly renumbered registers */
1476 foreach_list(node, &this->instructions) {
1477 fs_inst *inst = (fs_inst *) node;
1478
1479 if (inst->dst.file == GRF)
1480 inst->dst.reg = remap_table[inst->dst.reg];
1481
1482 for (int i = 0; i < 3; i++) {
1483 if (inst->src[i].file == GRF)
1484 inst->src[i].reg = remap_table[inst->src[i].reg];
1485 }
1486 }
1487
1488 /* Patch all the references to special values */
1489 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1490 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1491 special[i]->reg = remap_table[special[i]->reg];
1492 }
1493 }
1494
1495 bool
1496 fs_visitor::remove_dead_constants()
1497 {
1498 if (dispatch_width == 8) {
1499 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1500
1501 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1502 this->params_remap[i] = -1;
1503
1504 /* Find which params are still in use. */
1505 foreach_list(node, &this->instructions) {
1506 fs_inst *inst = (fs_inst *)node;
1507
1508 for (int i = 0; i < 3; i++) {
1509 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1510
1511 if (inst->src[i].file != UNIFORM)
1512 continue;
1513
1514 assert(constant_nr < (int)c->prog_data.nr_params);
1515
1516 /* For now, set this to non-negative. We'll give it the
1517 * actual new number in a moment, in order to keep the
1518 * register numbers nicely ordered.
1519 */
1520 this->params_remap[constant_nr] = 0;
1521 }
1522 }
1523
1524 /* Figure out what the new numbers for the params will be. At some
1525 * point when we're doing uniform array access, we're going to want
1526 * to keep the distinction between .reg and .reg_offset, but for
1527 * now we don't care.
1528 */
1529 unsigned int new_nr_params = 0;
1530 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1531 if (this->params_remap[i] != -1) {
1532 this->params_remap[i] = new_nr_params++;
1533 }
1534 }
1535
1536 /* Update the list of params to be uploaded to match our new numbering. */
1537 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1538 int remapped = this->params_remap[i];
1539
1540 if (remapped == -1)
1541 continue;
1542
1543 c->prog_data.param[remapped] = c->prog_data.param[i];
1544 }
1545
1546 c->prog_data.nr_params = new_nr_params;
1547 } else {
1548 /* This should have been generated in the 8-wide pass already. */
1549 assert(this->params_remap);
1550 }
1551
1552 /* Now do the renumbering of the shader to remove unused params. */
1553 foreach_list(node, &this->instructions) {
1554 fs_inst *inst = (fs_inst *)node;
1555
1556 for (int i = 0; i < 3; i++) {
1557 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1558
1559 if (inst->src[i].file != UNIFORM)
1560 continue;
1561
1562 assert(this->params_remap[constant_nr] != -1);
1563 inst->src[i].reg = this->params_remap[constant_nr];
1564 inst->src[i].reg_offset = 0;
1565 }
1566 }
1567
1568 return true;
1569 }
1570
1571 /*
1572 * Implements array access of uniforms by inserting a
1573 * PULL_CONSTANT_LOAD instruction.
1574 *
1575 * Unlike temporary GRF array access (where we don't support it due to
1576 * the difficulty of doing relative addressing on instruction
1577 * destinations), we could potentially do array access of uniforms
1578 * that were loaded in GRF space as push constants. In real-world
1579 * usage we've seen, though, the arrays being used are always larger
1580 * than we could load as push constants, so just always move all
1581 * uniform array access out to a pull constant buffer.
1582 */
1583 void
1584 fs_visitor::move_uniform_array_access_to_pull_constants()
1585 {
1586 int pull_constant_loc[c->prog_data.nr_params];
1587
1588 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1589 pull_constant_loc[i] = -1;
1590 }
1591
1592 /* Walk through and find array access of uniforms. Put a copy of that
1593 * uniform in the pull constant buffer.
1594 *
1595 * Note that we don't move constant-indexed accesses to arrays. No
1596 * testing has been done of the performance impact of this choice.
1597 */
1598 foreach_list_safe(node, &this->instructions) {
1599 fs_inst *inst = (fs_inst *)node;
1600
1601 for (int i = 0 ; i < 3; i++) {
1602 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1603 continue;
1604
1605 int uniform = inst->src[i].reg;
1606
1607 /* If this array isn't already present in the pull constant buffer,
1608 * add it.
1609 */
1610 if (pull_constant_loc[uniform] == -1) {
1611 const float **values = &c->prog_data.param[uniform];
1612
1613 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1614
1615 assert(param_size[uniform]);
1616
1617 for (int j = 0; j < param_size[uniform]; j++) {
1618 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1619 values[j];
1620 }
1621 }
1622
1623 /* Set up the annotation tracking for newly generated instructions. */
1624 base_ir = inst->ir;
1625 current_annotation = inst->annotation;
1626
1627 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1628 fs_reg temp = fs_reg(this, glsl_type::float_type);
1629 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1630 surf_index,
1631 *inst->src[i].reladdr,
1632 pull_constant_loc[uniform] +
1633 inst->src[i].reg_offset);
1634 inst->insert_before(&list);
1635
1636 inst->src[i].file = temp.file;
1637 inst->src[i].reg = temp.reg;
1638 inst->src[i].reg_offset = temp.reg_offset;
1639 inst->src[i].reladdr = NULL;
1640 }
1641 }
1642 }
1643
1644 /**
1645 * Choose accesses from the UNIFORM file to demote to using the pull
1646 * constant buffer.
1647 *
1648 * We allow a fragment shader to have more than the GL-specified minimum
1649 * value for the maximum number of fragment shader uniform components (64). If
1650 * there are too many of these, they'd fill up all of register space.
1651 * So, this will push some of them out to the pull constant buffer and
1652 * update the program to load them.
1653 */
1654 void
1655 fs_visitor::setup_pull_constants()
1656 {
1657 /* Only allow 16 registers (128 uniform components) as push constants. */
1658 unsigned int max_uniform_components = 16 * 8;
1659 if (c->prog_data.nr_params <= max_uniform_components)
1660 return;
1661
1662 if (dispatch_width == 16) {
1663 fail("Pull constants not supported in 16-wide\n");
1664 return;
1665 }
1666
1667 /* Just demote the end of the list. We could probably do better
1668 * here, demoting things that are rarely used in the program first.
1669 */
1670 unsigned int pull_uniform_base = max_uniform_components;
1671
1672 int pull_constant_loc[c->prog_data.nr_params];
1673 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1674 if (i < pull_uniform_base) {
1675 pull_constant_loc[i] = -1;
1676 } else {
1677 pull_constant_loc[i] = -1;
1678 /* If our constant is already being uploaded for reladdr purposes,
1679 * reuse it.
1680 */
1681 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1682 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1683 pull_constant_loc[i] = j;
1684 break;
1685 }
1686 }
1687 if (pull_constant_loc[i] == -1) {
1688 int pull_index = c->prog_data.nr_pull_params++;
1689 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1690 pull_constant_loc[i] = pull_index;
1691 }
1692 }
1693 }
1694 c->prog_data.nr_params = pull_uniform_base;
1695
1696 foreach_list(node, &this->instructions) {
1697 fs_inst *inst = (fs_inst *)node;
1698
1699 for (int i = 0; i < 3; i++) {
1700 if (inst->src[i].file != UNIFORM)
1701 continue;
1702
1703 int pull_index = pull_constant_loc[inst->src[i].reg +
1704 inst->src[i].reg_offset];
1705 if (pull_index == -1)
1706 continue;
1707
1708 assert(!inst->src[i].reladdr);
1709
1710 fs_reg dst = fs_reg(this, glsl_type::float_type);
1711 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1712 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1713 fs_inst *pull =
1714 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1715 dst, index, offset);
1716 pull->ir = inst->ir;
1717 pull->annotation = inst->annotation;
1718
1719 inst->insert_before(pull);
1720
1721 inst->src[i].file = GRF;
1722 inst->src[i].reg = dst.reg;
1723 inst->src[i].reg_offset = 0;
1724 inst->src[i].smear = pull_index & 3;
1725 }
1726 }
1727 }
1728
1729 bool
1730 fs_visitor::opt_algebraic()
1731 {
1732 bool progress = false;
1733
1734 foreach_list(node, &this->instructions) {
1735 fs_inst *inst = (fs_inst *)node;
1736
1737 switch (inst->opcode) {
1738 case BRW_OPCODE_MUL:
1739 if (inst->src[1].file != IMM)
1740 continue;
1741
1742 /* a * 1.0 = a */
1743 if (inst->src[1].is_one()) {
1744 inst->opcode = BRW_OPCODE_MOV;
1745 inst->src[1] = reg_undef;
1746 progress = true;
1747 break;
1748 }
1749
1750 /* a * 0.0 = 0.0 */
1751 if (inst->src[1].is_zero()) {
1752 inst->opcode = BRW_OPCODE_MOV;
1753 inst->src[0] = inst->src[1];
1754 inst->src[1] = reg_undef;
1755 progress = true;
1756 break;
1757 }
1758
1759 break;
1760 case BRW_OPCODE_ADD:
1761 if (inst->src[1].file != IMM)
1762 continue;
1763
1764 /* a + 0.0 = a */
1765 if (inst->src[1].is_zero()) {
1766 inst->opcode = BRW_OPCODE_MOV;
1767 inst->src[1] = reg_undef;
1768 progress = true;
1769 break;
1770 }
1771 break;
1772 default:
1773 break;
1774 }
1775 }
1776
1777 return progress;
1778 }
1779
1780 /**
1781 * Must be called after calculate_live_intervals() to remove unused
1782 * writes to registers -- register allocation will fail otherwise
1783 * because something that is def'd but never used won't be considered to
1784 * interfere with other regs.
1785 */
1786 bool
1787 fs_visitor::dead_code_eliminate()
1788 {
1789 bool progress = false;
1790 int pc = 0;
1791
1792 calculate_live_intervals();
1793
1794 foreach_list_safe(node, &this->instructions) {
1795 fs_inst *inst = (fs_inst *)node;
1796
1797 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1798 inst->remove();
1799 progress = true;
1800 }
1801
1802 pc++;
1803 }
1804
1805 if (progress)
1806 live_intervals_valid = false;
1807
1808 return progress;
1809 }
1810
1811 /**
1812 * Implements a second type of register coalescing: This one checks if
1813 * the two regs involved in a raw move don't interfere, in which case
1814 * they can both be stored in the same place and the MOV removed.
1815 */
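/* Minimal sketch with hypothetical registers: given a raw copy
 *
 *    MOV vgrf5, vgrf3
 *
 * where vgrf3 and vgrf5 have non-overlapping live intervals, every def and
 * use of vgrf3 is renamed to vgrf5 (at the MOV's destination reg_offset)
 * and the MOV itself is removed.
 */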
1816 bool
1817 fs_visitor::register_coalesce_2()
1818 {
1819 bool progress = false;
1820
1821 calculate_live_intervals();
1822
1823 foreach_list_safe(node, &this->instructions) {
1824 fs_inst *inst = (fs_inst *)node;
1825
1826 if (inst->opcode != BRW_OPCODE_MOV ||
1827 inst->predicate ||
1828 inst->saturate ||
1829 inst->src[0].file != GRF ||
1830 inst->src[0].negate ||
1831 inst->src[0].abs ||
1832 inst->src[0].smear != -1 ||
1833 inst->dst.file != GRF ||
1834 inst->dst.type != inst->src[0].type ||
1835 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1836 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1837 continue;
1838 }
1839
1840 int reg_from = inst->src[0].reg;
1841 assert(inst->src[0].reg_offset == 0);
1842 int reg_to = inst->dst.reg;
1843 int reg_to_offset = inst->dst.reg_offset;
1844
1845 foreach_list(node, &this->instructions) {
1846 fs_inst *scan_inst = (fs_inst *)node;
1847
1848 if (scan_inst->dst.file == GRF &&
1849 scan_inst->dst.reg == reg_from) {
1850 scan_inst->dst.reg = reg_to;
1851 scan_inst->dst.reg_offset = reg_to_offset;
1852 }
1853 for (int i = 0; i < 3; i++) {
1854 if (scan_inst->src[i].file == GRF &&
1855 scan_inst->src[i].reg == reg_from) {
1856 scan_inst->src[i].reg = reg_to;
1857 scan_inst->src[i].reg_offset = reg_to_offset;
1858 }
1859 }
1860 }
1861
1862 inst->remove();
1863
1864 /* We don't need to recalculate live intervals inside the loop despite
1865 * flagging live_intervals_valid because we only use live intervals for
1866 * the interferes test, and we must have had a situation where the
1867 * intervals were:
1868 *
1869 *  from  to
1870 *  ^
1871 *  |
1872 *  v
1873 *        ^
1874 *        |
1875 *        v
1876 *
1877 * Some register R that might get coalesced with one of these two could
1878 * only be referencing "to", otherwise "from"'s range would have been
1879 * longer. R's range could also only start at the end of "to" or later,
1880 * otherwise it will conflict with "to" when we try to coalesce "to"
1881 * into R anyway.
1882 */
1883 live_intervals_valid = false;
1884
1885 progress = true;
1886 continue;
1887 }
1888
1889 return progress;
1890 }
1891
1892 bool
1893 fs_visitor::register_coalesce()
1894 {
1895 bool progress = false;
1896 int if_depth = 0;
1897 int loop_depth = 0;
1898
1899 foreach_list_safe(node, &this->instructions) {
1900 fs_inst *inst = (fs_inst *)node;
1901
1902 /* Make sure that we dominate the instructions we're going to
1903 * scan for interfering with our coalescing, or we won't have
1904 * scanned enough to see if anything interferes with our
1905 * coalescing. We don't dominate the following instructions if
1906 * we're in a loop or an if block.
1907 */
1908 switch (inst->opcode) {
1909 case BRW_OPCODE_DO:
1910 loop_depth++;
1911 break;
1912 case BRW_OPCODE_WHILE:
1913 loop_depth--;
1914 break;
1915 case BRW_OPCODE_IF:
1916 if_depth++;
1917 break;
1918 case BRW_OPCODE_ENDIF:
1919 if_depth--;
1920 break;
1921 default:
1922 break;
1923 }
1924 if (loop_depth || if_depth)
1925 continue;
1926
1927 if (inst->opcode != BRW_OPCODE_MOV ||
1928 inst->predicate ||
1929 inst->saturate ||
1930 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1931 inst->src[0].file != UNIFORM) ||
1932 inst->dst.type != inst->src[0].type)
1933 continue;
1934
1935 bool has_source_modifiers = (inst->src[0].abs ||
1936 inst->src[0].negate ||
1937 inst->src[0].smear != -1 ||
1938 inst->src[0].file == UNIFORM);
1939
1940 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1941 * them: check for no writes to either one until the exit of the
1942 * program.
1943 */
1944 bool interfered = false;
1945
1946 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1947 !scan_inst->is_tail_sentinel();
1948 scan_inst = (fs_inst *)scan_inst->next) {
1949 if (scan_inst->dst.file == GRF) {
1950 if (scan_inst->overwrites_reg(inst->dst) ||
1951 scan_inst->overwrites_reg(inst->src[0])) {
1952 interfered = true;
1953 break;
1954 }
1955 }
1956
1957 /* The gen6 MATH instruction can't handle source modifiers or
1958 * unusual register regions, so avoid coalescing those for
1959 * now. We should do something more specific.
1960 */
1961 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1962 interfered = true;
1963 break;
1964 }
1965
1966 /* The accumulator result appears to get used for the
1967 * conditional modifier generation. When negating a UD
1968 * value, there is a 33rd bit generated for the sign in the
1969 * accumulator value, so now you can't check, for example,
1970 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1971 */
1972 if (scan_inst->conditional_mod &&
1973 inst->src[0].negate &&
1974 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1975 interfered = true;
1976 break;
1977 }
1978 }
1979 if (interfered) {
1980 continue;
1981 }
1982
1983 /* Rewrite the later usage to point at the source of the move to
1984 * be removed.
1985 */
1986 for (fs_inst *scan_inst = inst;
1987 !scan_inst->is_tail_sentinel();
1988 scan_inst = (fs_inst *)scan_inst->next) {
1989 for (int i = 0; i < 3; i++) {
1990 if (scan_inst->src[i].file == GRF &&
1991 scan_inst->src[i].reg == inst->dst.reg &&
1992 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1993 fs_reg new_src = inst->src[0];
1994 if (scan_inst->src[i].abs) {
1995 new_src.negate = 0;
1996 new_src.abs = 1;
1997 }
1998 new_src.negate ^= scan_inst->src[i].negate;
1999 scan_inst->src[i] = new_src;
2000 }
2001 }
2002 }
2003
2004 inst->remove();
2005 progress = true;
2006 }
2007
2008 if (progress)
2009 live_intervals_valid = false;
2010
2011 return progress;
2012 }
2013
2014
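/**
 * Looks for a MOV from a GRF into an MRF and tries to rewrite the
 * instruction that produced the GRF value so that it writes straight into
 * the MRF, eliminating the copy.  In outline (illustrative syntax):
 *
 *    add  vgrf4, vgrf2, vgrf3
 *    mov  m2, vgrf4
 *
 * becomes
 *
 *    add  m2, vgrf2, vgrf3
 *
 * provided vgrf4 has no other readers and m2 is not otherwise written, or
 * kept live for a SEND, between the two instructions.
 */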
2015 bool
2016 fs_visitor::compute_to_mrf()
2017 {
2018 bool progress = false;
2019 int next_ip = 0;
2020
2021 calculate_live_intervals();
2022
2023 foreach_list_safe(node, &this->instructions) {
2024 fs_inst *inst = (fs_inst *)node;
2025
2026 int ip = next_ip;
2027 next_ip++;
2028
2029 if (inst->opcode != BRW_OPCODE_MOV ||
2030 inst->predicate ||
2031 inst->dst.file != MRF || inst->src[0].file != GRF ||
2032 inst->dst.type != inst->src[0].type ||
2033 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2034 continue;
2035
2036 /* Work out which hardware MRF registers are written by this
2037 * instruction.
2038 */
2039 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2040 int mrf_high;
2041 if (inst->dst.reg & BRW_MRF_COMPR4) {
2042 mrf_high = mrf_low + 4;
2043 } else if (dispatch_width == 16 &&
2044 (!inst->force_uncompressed && !inst->force_sechalf)) {
2045 mrf_high = mrf_low + 1;
2046 } else {
2047 mrf_high = mrf_low;
2048 }
2049
2050 /* Can't compute-to-MRF this GRF if someone else was going to
2051 * read it later.
2052 */
2053 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2054 continue;
2055
2056 /* Found a move of a GRF to a MRF. Let's see if we can go
2057 * rewrite the thing that made this GRF to write into the MRF.
2058 */
2059 fs_inst *scan_inst;
2060 for (scan_inst = (fs_inst *)inst->prev;
2061 scan_inst->prev != NULL;
2062 scan_inst = (fs_inst *)scan_inst->prev) {
2063 if (scan_inst->dst.file == GRF &&
2064 scan_inst->dst.reg == inst->src[0].reg) {
2065 /* Found the last thing to write our reg we want to turn
2066 * into a compute-to-MRF.
2067 */
2068
2069 /* If it's predicated, it (probably) didn't populate all
2070 * the channels. We might be able to rewrite everything
2071 * that writes that reg, but it would require smarter
2072 * tracking to delay the rewriting until complete success.
2073 */
2074 if (scan_inst->predicate)
2075 break;
2076
2077 /* If it's half of register setup and not the same half as
2078 * our MOV we're trying to remove, bail for now.
2079 */
2080 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2081 scan_inst->force_sechalf != inst->force_sechalf) {
2082 break;
2083 }
2084
2085 /* Things returning more than one register would need us to
2086 * understand coalescing out more than one MOV at a time.
2087 */
2088 if (scan_inst->regs_written() > 1)
2089 break;
2090
2091 /* SEND instructions can't have MRF as a destination. */
2092 if (scan_inst->mlen)
2093 break;
2094
2095 if (intel->gen == 6) {
2096 /* gen6 math instructions must have the destination be
2097 * GRF, so no compute-to-MRF for them.
2098 */
2099 if (scan_inst->is_math()) {
2100 break;
2101 }
2102 }
2103
2104 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2105 /* Found the creator of our MRF's source value. */
2106 scan_inst->dst.file = MRF;
2107 scan_inst->dst.reg = inst->dst.reg;
2108 scan_inst->saturate |= inst->saturate;
2109 inst->remove();
2110 progress = true;
2111 }
2112 break;
2113 }
2114
2115 /* We don't handle control flow here. Most computation of
2116 * values that end up in MRFs happens shortly before the MRF
2117 * write anyway.
2118 */
2119 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2120 break;
2121
2122 /* You can't read from an MRF, so if someone else reads our
2123 * MRF's source GRF that we wanted to rewrite, that stops us.
2124 */
2125 bool interfered = false;
2126 for (int i = 0; i < 3; i++) {
2127 if (scan_inst->src[i].file == GRF &&
2128 scan_inst->src[i].reg == inst->src[0].reg &&
2129 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2130 interfered = true;
2131 }
2132 }
2133 if (interfered)
2134 break;
2135
2136 if (scan_inst->dst.file == MRF) {
2137 /* If somebody else writes our MRF here, we can't
2138 * compute-to-MRF before that.
2139 */
2140 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2141 int scan_mrf_high;
2142
2143 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2144 scan_mrf_high = scan_mrf_low + 4;
2145 } else if (dispatch_width == 16 &&
2146 (!scan_inst->force_uncompressed &&
2147 !scan_inst->force_sechalf)) {
2148 scan_mrf_high = scan_mrf_low + 1;
2149 } else {
2150 scan_mrf_high = scan_mrf_low;
2151 }
2152
2153 if (mrf_low == scan_mrf_low ||
2154 mrf_low == scan_mrf_high ||
2155 mrf_high == scan_mrf_low ||
2156 mrf_high == scan_mrf_high) {
2157 break;
2158 }
2159 }
2160
2161 if (scan_inst->mlen > 0) {
2162 /* Found a SEND instruction, which means that there are
2163 * live values in MRFs from base_mrf to base_mrf +
2164 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2165 * above it.
2166 */
2167 if (mrf_low >= scan_inst->base_mrf &&
2168 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2169 break;
2170 }
2171 if (mrf_high >= scan_inst->base_mrf &&
2172 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2173 break;
2174 }
2175 }
2176 }
2177 }
2178
2179 if (progress)
2180 live_intervals_valid = false;
2181
2182 return progress;
2183 }
2184
2185 /**
2186 * Walks through basic blocks, looking for repeated MRF writes and
2187 * removing the later ones.
2188 */
2189 bool
2190 fs_visitor::remove_duplicate_mrf_writes()
2191 {
2192 fs_inst *last_mrf_move[16];
2193 bool progress = false;
2194
2195 /* We don't yet track the MRF writes of compressed (16-wide) instructions, so skip this pass for them. */
2196 if (dispatch_width == 16)
2197 return false;
2198
2199 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2200
2201 foreach_list_safe(node, &this->instructions) {
2202 fs_inst *inst = (fs_inst *)node;
2203
2204 if (inst->is_control_flow()) {
2205 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2206 }
2207
2208 if (inst->opcode == BRW_OPCODE_MOV &&
2209 inst->dst.file == MRF) {
2210 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2211 if (prev_inst && inst->equals(prev_inst)) {
2212 inst->remove();
2213 progress = true;
2214 continue;
2215 }
2216 }
2217
2218 /* Clear out the last-write records for MRFs that were overwritten. */
2219 if (inst->dst.file == MRF) {
2220 last_mrf_move[inst->dst.reg] = NULL;
2221 }
2222
2223 if (inst->mlen > 0) {
2224 /* Found a SEND instruction, which will include two or fewer
2225 * implied MRF writes. We could do better here.
2226 */
2227 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2228 last_mrf_move[inst->base_mrf + i] = NULL;
2229 }
2230 }
2231
2232 /* Clear out any MRF move records whose sources got overwritten. */
2233 if (inst->dst.file == GRF) {
2234 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2235 if (last_mrf_move[i] &&
2236 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2237 last_mrf_move[i] = NULL;
2238 }
2239 }
2240 }
2241
2242 if (inst->opcode == BRW_OPCODE_MOV &&
2243 inst->dst.file == MRF &&
2244 inst->src[0].file == GRF &&
2245 !inst->predicate) {
2246 last_mrf_move[inst->dst.reg] = inst;
2247 }
2248 }
2249
2250 if (progress)
2251 live_intervals_valid = false;
2252
2253 return progress;
2254 }
2255
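/**
 * Helper for the gen4 SEND dependency workarounds below: clears the
 * needs-dependency flags for any GRF in [first_grf, first_grf + grf_len)
 * that the given instruction reads, since reading the register is itself
 * enough to resolve the outstanding write.
 */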
2256 static void
2257 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2258 int first_grf, int grf_len)
2259 {
2260 bool inst_16wide = (dispatch_width > 8 &&
2261 !inst->force_uncompressed &&
2262 !inst->force_sechalf);
2263
2264 /* Clear the flag for registers that actually got read (as expected). */
2265 for (int i = 0; i < 3; i++) {
2266 int grf;
2267 if (inst->src[i].file == GRF) {
2268 grf = inst->src[i].reg;
2269 } else if (inst->src[i].file == FIXED_HW_REG &&
2270 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2271 grf = inst->src[i].fixed_hw_reg.nr;
2272 } else {
2273 continue;
2274 }
2275
2276 if (grf >= first_grf &&
2277 grf < first_grf + grf_len) {
2278 deps[grf - first_grf] = false;
2279 if (inst_16wide)
2280 deps[grf - first_grf + 1] = false;
2281 }
2282 }
2283 }
2284
2285 /**
2286 * Implements this workaround for the original 965:
2287 *
2288 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2289 * check for post destination dependencies on this instruction, software
2290 * must ensure that there is no destination hazard for the case of ‘write
2291 * followed by a posted write’ shown in the following example.
2292 *
2293 * 1. mov r3 0
2294 * 2. send r3.xy <rest of send instruction>
2295 * 3. mov r2 r3
2296 *
2297 * Due to no post-destination dependency check on the ‘send’, the above
2298 * code sequence could have two instructions (1 and 2) in flight at the
2299 * same time that both consider ‘r3’ as the target of their final writes.
2300 */
2301 void
2302 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2303 {
2304 int reg_size = dispatch_width / 8;
2305 int write_len = inst->regs_written() * reg_size;
2306 int first_write_grf = inst->dst.reg;
2307 bool needs_dep[BRW_MAX_MRF];
2308 assert(write_len < (int)sizeof(needs_dep) - 1);
2309
2310 memset(needs_dep, false, sizeof(needs_dep));
2311 memset(needs_dep, true, write_len);
2312
2313 clear_deps_for_inst_src(inst, dispatch_width,
2314 needs_dep, first_write_grf, write_len);
2315
2316 /* Walk backwards looking for writes to registers we're writing which
2317 * aren't read since being written. If we hit the start of the program,
2318 * we assume that there are no outstanding dependencies on entry to the
2319 * program.
2320 */
2321 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2322 scan_inst != NULL;
2323 scan_inst = (fs_inst *)scan_inst->prev) {
2324
2325 /* If we hit control flow, assume that there *are* outstanding
2326 * dependencies, and force their cleanup before our instruction.
2327 */
2328 if (scan_inst->is_control_flow()) {
2329 for (int i = 0; i < write_len; i++) {
2330 if (needs_dep[i]) {
2331 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2332 }
2333 }
2334 }
2335
2336 bool scan_inst_16wide = (dispatch_width > 8 &&
2337 !scan_inst->force_uncompressed &&
2338 !scan_inst->force_sechalf);
2339
2340 /* We insert our reads as late as possible on the assumption that any
2341 * instruction (other than a MOV) that might have left us an outstanding
2342 * dependency has more latency than a MOV.
2343 */
2344 if (scan_inst->dst.file == GRF) {
2345 for (int i = 0; i < scan_inst->regs_written(); i++) {
2346 int reg = scan_inst->dst.reg + i * reg_size;
2347
2348 if (reg >= first_write_grf &&
2349 reg < first_write_grf + write_len &&
2350 needs_dep[reg - first_write_grf]) {
2351 inst->insert_before(DEP_RESOLVE_MOV(reg));
2352 needs_dep[reg - first_write_grf] = false;
2353 if (scan_inst_16wide)
2354 needs_dep[reg - first_write_grf + 1] = false;
2355 }
2356 }
2357 }
2358
2359 /* Clear the flag for registers that actually got read (as expected). */
2360 clear_deps_for_inst_src(scan_inst, dispatch_width,
2361 needs_dep, first_write_grf, write_len);
2362
2363 /* Continue the loop only if we haven't resolved all the dependencies */
2364 int i;
2365 for (i = 0; i < write_len; i++) {
2366 if (needs_dep[i])
2367 break;
2368 }
2369 if (i == write_len)
2370 return;
2371 }
2372 }
2373
2374 /**
2375 * Implements this workaround for the original 965:
2376 *
2377 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2378 * used as a destination register until after it has been sourced by an
2379 * instruction with a different destination register.
2380 */
2381 void
2382 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2383 {
2384 int write_len = inst->regs_written() * dispatch_width / 8;
2385 int first_write_grf = inst->dst.reg;
2386 bool needs_dep[BRW_MAX_MRF];
2387 assert(write_len < (int)sizeof(needs_dep) - 1);
2388
2389 memset(needs_dep, false, sizeof(needs_dep));
2390 memset(needs_dep, true, write_len);
2391 /* Walk forwards looking for writes to registers we're writing which aren't
2392 * read before being written.
2393 */
2394 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2395 !scan_inst->is_tail_sentinel();
2396 scan_inst = (fs_inst *)scan_inst->next) {
2397 /* If we hit control flow, force resolve all remaining dependencies. */
2398 if (scan_inst->is_control_flow()) {
2399 for (int i = 0; i < write_len; i++) {
2400 if (needs_dep[i])
2401 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2402 }
2403 }
2404
2405 /* Clear the flag for registers that actually got read (as expected). */
2406 clear_deps_for_inst_src(scan_inst, dispatch_width,
2407 needs_dep, first_write_grf, write_len);
2408
2409 /* We insert our reads as late as possible since they're reading the
2410 * result of a SEND, which has massive latency.
2411 */
2412 if (scan_inst->dst.file == GRF &&
2413 scan_inst->dst.reg >= first_write_grf &&
2414 scan_inst->dst.reg < first_write_grf + write_len &&
2415 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2416 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2417 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2418 }
2419
2420 /* Continue the loop only if we haven't resolved all the dependencies */
2421 int i;
2422 for (i = 0; i < write_len; i++) {
2423 if (needs_dep[i])
2424 break;
2425 }
2426 if (i == write_len)
2427 return;
2428 }
2429
2430 /* If we hit the end of the program, resolve all remaining dependencies out
2431 * of paranoia.
2432 */
2433 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2434 assert(last_inst->eot);
2435 for (int i = 0; i < write_len; i++) {
2436 if (needs_dep[i])
2437 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2438 }
2439 }
2440
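/**
 * Applies both of the gen4 SEND dependency workarounds above to every
 * message-sending instruction whose destination is a GRF.  Only original
 * gen4 hardware (not G4x) needs this.
 */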
2441 void
2442 fs_visitor::insert_gen4_send_dependency_workarounds()
2443 {
2444 if (intel->gen != 4 || intel->is_g4x)
2445 return;
2446
2447 /* Note that we're done with register allocation, so GRF fs_regs always
2448 * have a .reg_offset of 0.
2449 */
2450
2451 foreach_list_safe(node, &this->instructions) {
2452 fs_inst *inst = (fs_inst *)node;
2453
2454 if (inst->mlen != 0 && inst->dst.file == GRF) {
2455 insert_gen4_pre_send_dependency_workarounds(inst);
2456 insert_gen4_post_send_dependency_workarounds(inst);
2457 }
2458 }
2459 }
2460
2461 /**
2462 * Turns the generic expression-style uniform pull constant load instruction
2463 * into a hardware-specific series of instructions for loading a pull
2464 * constant.
2465 *
2466 * The expression style allows the CSE pass before this to optimize out
2467 * repeated loads from the same offset, and gives the pre-register-allocation
2468 * scheduling full flexibility, while the conversion to native instructions
2469 * allows the post-register-allocation scheduler the best information
2470 * possible.
2471 *
2472 * Note that execution masking for setting up pull constant loads is special:
2473 * the channels that need to be written are unrelated to the current execution
2474 * mask, since a later instruction will use one of the result channels as a
2475 * source operand for all 8 or 16 of its channels.
2476 */
2477 void
2478 fs_visitor::lower_uniform_pull_constant_loads()
2479 {
2480 foreach_list(node, &this->instructions) {
2481 fs_inst *inst = (fs_inst *)node;
2482
2483 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2484 continue;
2485
2486 if (intel->gen >= 7) {
2487 /* The offset arg before was a vec4-aligned byte offset. We need to
2488 * turn it into a dword offset.
2489 */
2490 fs_reg const_offset_reg = inst->src[1];
2491 assert(const_offset_reg.file == IMM &&
2492 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2493 const_offset_reg.imm.u /= 4;
2494 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2495
2496 /* This is actually going to be a MOV, but since only the first dword
2497 * is accessed, we have a special opcode to do just that one. Note
2498 * that this needs to be an operation that will be considered a def
2499 * by live variable analysis, or register allocation will explode.
2500 */
2501 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2502 payload, const_offset_reg);
2503 setup->force_writemask_all = true;
2504
2505 setup->ir = inst->ir;
2506 setup->annotation = inst->annotation;
2507 inst->insert_before(setup);
2508
2509 /* Similarly, this will only populate the first 4 channels of the
2510 * result register (since we only use smear values from 0-3), but we
2511 * don't tell the optimizer.
2512 */
2513 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2514 inst->src[1] = payload;
2515
2516 this->live_intervals_valid = false;
2517 } else {
2518 /* Before register allocation, we didn't tell the scheduler about the
2519 * MRF we use. We know it's safe to use this MRF because nothing
2520 * else does except for register spill/unspill, which generates and
2521 * uses its MRF within a single IR instruction.
2522 */
2523 inst->base_mrf = 14;
2524 inst->mlen = 1;
2525 }
2526 }
2527 }
2528
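/**
 * Prints one IR instruction in a compact single-line form (predicate,
 * opcode and modifiers, destination, then up to three sources) for
 * debugging.
 */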
2529 void
2530 fs_visitor::dump_instruction(fs_inst *inst)
2531 {
2532 if (inst->predicate) {
2533 printf("(%cf0.%d) ",
2534 inst->predicate_inverse ? '-' : '+',
2535 inst->flag_subreg);
2536 }
2537
2538 printf("%s", brw_instruction_name(inst->opcode));
2539 if (inst->saturate)
2540 printf(".sat");
2541 if (inst->conditional_mod) {
2542 printf(".cmod");
2543 if (!inst->predicate &&
2544 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2545 inst->opcode != BRW_OPCODE_IF &&
2546 inst->opcode != BRW_OPCODE_WHILE))) {
2547 printf(".f0.%d\n", inst->flag_subreg);
2548 }
2549 }
2550 printf(" ");
2551
2552
2553 switch (inst->dst.file) {
2554 case GRF:
2555 printf("vgrf%d", inst->dst.reg);
2556 if (inst->dst.reg_offset)
2557 printf("+%d", inst->dst.reg_offset);
2558 break;
2559 case MRF:
2560 printf("m%d", inst->dst.reg);
2561 break;
2562 case BAD_FILE:
2563 printf("(null)");
2564 break;
2565 case UNIFORM:
2566 printf("***u%d***", inst->dst.reg);
2567 break;
2568 default:
2569 printf("???");
2570 break;
2571 }
2572 printf(", ");
2573
2574 for (int i = 0; i < 3; i++) {
2575 if (inst->src[i].negate)
2576 printf("-");
2577 if (inst->src[i].abs)
2578 printf("|");
2579 switch (inst->src[i].file) {
2580 case GRF:
2581 printf("vgrf%d", inst->src[i].reg);
2582 if (inst->src[i].reg_offset)
2583 printf("+%d", inst->src[i].reg_offset);
2584 break;
2585 case MRF:
2586 printf("***m%d***", inst->src[i].reg);
2587 break;
2588 case UNIFORM:
2589 printf("u%d", inst->src[i].reg);
2590 if (inst->src[i].reg_offset)
2591 printf(".%d", inst->src[i].reg_offset);
2592 break;
2593 case BAD_FILE:
2594 printf("(null)");
2595 break;
2596 case IMM:
2597 switch (inst->src[i].type) {
2598 case BRW_REGISTER_TYPE_F:
2599 printf("%ff", inst->src[i].imm.f);
2600 break;
2601 case BRW_REGISTER_TYPE_D:
2602 printf("%dd", inst->src[i].imm.i);
2603 break;
2604 case BRW_REGISTER_TYPE_UD:
2605 printf("%uu", inst->src[i].imm.u);
2606 break;
2607 default:
2608 printf("???");
2609 break;
2610 }
2611 break;
2612 default:
2613 printf("???");
2614 break;
2615 }
2616 if (inst->src[i].abs)
2617 printf("|");
2618
2619 if (i < 2)
2620 printf(", ");
2621 }
2622
2623 printf(" ");
2624
2625 if (inst->force_uncompressed)
2626 printf("1sthalf ");
2627
2628 if (inst->force_sechalf)
2629 printf("2ndhalf ");
2630
2631 printf("\n");
2632 }
2633
2634 void
2635 fs_visitor::dump_instructions()
2636 {
2637 int ip = 0;
2638 foreach_list(node, &this->instructions) {
2639 fs_inst *inst = (fs_inst *)node;
2640 printf("%d: ", ip++);
2641 dump_instruction(inst);
2642 }
2643 }
2644
2645 /**
2646 * Possibly returns an instruction that set up @param reg.
2647 *
2648 * Sometimes we want to take the result of some expression/variable
2649 * dereference tree and rewrite the instruction generating the result
2650 * of the tree. When processing the tree, we know that the
2651 * instructions generated are all writing temporaries that are dead
2652 * outside of this tree. So, if we have some instructions that write
2653 * a temporary, we're free to point that temp write somewhere else.
2654 *
2655 * Note that this doesn't guarantee that the instruction generated
2656 * only reg -- it might be the size=4 destination of a texture instruction.
2657 */
2658 fs_inst *
2659 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2660 fs_inst *end,
2661 fs_reg reg)
2662 {
2663 if (end == start ||
2664 end->predicate ||
2665 end->force_uncompressed ||
2666 end->force_sechalf ||
2667 reg.reladdr ||
2668 !reg.equals(end->dst)) {
2669 return NULL;
2670 } else {
2671 return end;
2672 }
2673 }
2674
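/**
 * Lays out the gen6+ thread payload: masks and pixel X/Y, the enabled
 * barycentric coordinate sets, and interpolated depth/W, recording the
 * payload register number where each piece lands so later code can source
 * it.
 */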
2675 void
2676 fs_visitor::setup_payload_gen6()
2677 {
2678 struct intel_context *intel = &brw->intel;
2679 bool uses_depth =
2680 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2681 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2682
2683 assert(intel->gen >= 6);
2684
2685 /* R0-1: masks, pixel X/Y coordinates. */
2686 c->nr_payload_regs = 2;
2687 /* R2: only for 32-pixel dispatch. */
2688
2689 /* R3-26: barycentric interpolation coordinates. These appear in the
2690 * same order that they appear in the brw_wm_barycentric_interp_mode
2691 * enum. Each set of coordinates occupies 2 registers if dispatch width
2692 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2693 * appear if they were enabled using the "Barycentric Interpolation
2694 * Mode" bits in WM_STATE.
2695 */
2696 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2697 if (barycentric_interp_modes & (1 << i)) {
2698 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2699 c->nr_payload_regs += 2;
2700 if (dispatch_width == 16) {
2701 c->nr_payload_regs += 2;
2702 }
2703 }
2704 }
2705
2706 /* R27: interpolated depth if uses source depth */
2707 if (uses_depth) {
2708 c->source_depth_reg = c->nr_payload_regs;
2709 c->nr_payload_regs++;
2710 if (dispatch_width == 16) {
2711 /* R28: interpolated depth if not 8-wide. */
2712 c->nr_payload_regs++;
2713 }
2714 }
2715 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2716 if (uses_depth) {
2717 c->source_w_reg = c->nr_payload_regs;
2718 c->nr_payload_regs++;
2719 if (dispatch_width == 16) {
2720 /* R30: interpolated W if not 8-wide. */
2721 c->nr_payload_regs++;
2722 }
2723 }
2724 /* R31: MSAA position offsets. */
2725 /* R32-: bary for 32-pixel. */
2726 /* R58-59: interp W for 32-pixel. */
2727
2728 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2729 c->source_depth_to_render_target = true;
2730 }
2731 }
2732
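/**
 * Top-level driver for a single compile at the current dispatch width:
 * builds FS IR from the GLSL IR (or from ARB_fragment_program code), runs
 * the optimization loop to a fixed point, then schedules, register
 * allocates, and applies the hardware workarounds.
 */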
2733 bool
2734 fs_visitor::run()
2735 {
2736 sanity_param_count = fp->Base.Parameters->NumParameters;
2737 uint32_t orig_nr_params = c->prog_data.nr_params;
2738
2739 if (intel->gen >= 6)
2740 setup_payload_gen6();
2741 else
2742 setup_payload_gen4();
2743
2744 if (0) {
2745 emit_dummy_fs();
2746 } else {
2747 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2748 emit_shader_time_begin();
2749
2750 calculate_urb_setup();
2751 if (intel->gen < 6)
2752 emit_interpolation_setup_gen4();
2753 else
2754 emit_interpolation_setup_gen6();
2755
2756 /* We handle discards by keeping track of the still-live pixels in f0.1.
2757 * Initialize it with the dispatched pixels.
2758 */
2759 if (fp->UsesKill) {
2760 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2761 discard_init->flag_subreg = 1;
2762 }
2763
2764 /* Generate FS IR for main(). (the visitor only descends into
2765 * functions called "main").
2766 */
2767 if (shader) {
2768 foreach_list(node, &*shader->ir) {
2769 ir_instruction *ir = (ir_instruction *)node;
2770 base_ir = ir;
2771 this->result = reg_undef;
2772 ir->accept(this);
2773 }
2774 } else {
2775 emit_fragment_program_code();
2776 }
2777 base_ir = NULL;
2778 if (failed)
2779 return false;
2780
2781 emit(FS_OPCODE_PLACEHOLDER_HALT);
2782
2783 emit_fb_writes();
2784
2785 split_virtual_grfs();
2786
2787 move_uniform_array_access_to_pull_constants();
2788 setup_pull_constants();
2789
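      /* Run the optimization passes repeatedly until none of them reports
       * further progress, since one pass often exposes opportunities for
       * another.
       */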
2790 bool progress;
2791 do {
2792 progress = false;
2793
2794 compact_virtual_grfs();
2795
2796 progress = remove_duplicate_mrf_writes() || progress;
2797
2798 progress = opt_algebraic() || progress;
2799 progress = opt_cse() || progress;
2800 progress = opt_copy_propagate() || progress;
2801 progress = dead_code_eliminate() || progress;
2802 progress = register_coalesce() || progress;
2803 progress = register_coalesce_2() || progress;
2804 progress = compute_to_mrf() || progress;
2805 } while (progress);
2806
2807 remove_dead_constants();
2808
2809 schedule_instructions(false);
2810
2811 lower_uniform_pull_constant_loads();
2812
2813 assign_curb_setup();
2814 assign_urb_setup();
2815
2816 if (0) {
2817 /* Debug of register spilling: Go spill everything. */
2818 for (int i = 0; i < virtual_grf_count; i++) {
2819 spill_reg(i);
2820 }
2821 }
2822
2823 if (0)
2824 assign_regs_trivial();
2825 else {
2826 while (!assign_regs()) {
2827 if (failed)
2828 break;
2829 }
2830 }
2831 }
2832 assert(force_uncompressed_stack == 0);
2833 assert(force_sechalf_stack == 0);
2834
2835 /* This must come after all optimization and register allocation, since
2836 * it inserts dead code that happens to have side effects, and it does
2837 * so based on the actual physical registers in use.
2838 */
2839 insert_gen4_send_dependency_workarounds();
2840
2841 if (failed)
2842 return false;
2843
2844 schedule_instructions(true);
2845
2846 if (dispatch_width == 8) {
2847 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2848 } else {
2849 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2850
2851 /* Make sure we didn't try to sneak in an extra uniform */
2852 assert(orig_nr_params == c->prog_data.nr_params);
2853 (void) orig_nr_params;
2854 }
2855
2856 /* If any state parameters were appended, then ParameterValues could have
2857 * been realloced, in which case the driver uniform storage set up by
2858 * _mesa_associate_uniform_storage() would point to freed memory. Make
2859 * sure that didn't happen.
2860 */
2861 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2862
2863 return !failed;
2864 }
2865
2866 const unsigned *
2867 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2868 struct gl_fragment_program *fp,
2869 struct gl_shader_program *prog,
2870 unsigned *final_assembly_size)
2871 {
2872 struct intel_context *intel = &brw->intel;
2873 bool start_busy = false;
2874 float start_time = 0;
2875
2876 if (unlikely(intel->perf_debug)) {
2877 start_busy = (intel->batch.last_bo &&
2878 drm_intel_bo_busy(intel->batch.last_bo));
2879 start_time = get_time();
2880 }
2881
2882 struct brw_shader *shader = NULL;
2883 if (prog)
2884 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2885
2886 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2887 if (shader) {
2888 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2889 _mesa_print_ir(shader->ir, NULL);
2890 printf("\n\n");
2891 } else {
2892 printf("ARB_fragment_program %d ir for native fragment shader\n",
2893 fp->Base.Id);
2894 _mesa_print_program(&fp->Base);
2895 }
2896 }
2897
2898 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2899 */
2900 fs_visitor v(brw, c, prog, fp, 8);
2901 if (!v.run()) {
2902 prog->LinkStatus = false;
2903 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2904
2905 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2906 v.fail_msg);
2907
2908 return NULL;
2909 }
2910
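   /* An 8-wide program was built above; on gen5+ (when no pull constants
    * are in use and the DEBUG_NO16 flag isn't set) also try a 16-wide
    * version and hand both to the generator, which falls back to the
    * 8-wide code alone if the 16-wide compile fails.
    */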
2911 exec_list *simd16_instructions = NULL;
2912 fs_visitor v2(brw, c, prog, fp, 16);
2913 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2914 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2915 v2.import_uniforms(&v);
2916 if (!v2.run()) {
2917 perf_debug("16-wide shader failed to compile, falling back to "
2918 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2919 } else {
2920 simd16_instructions = &v2.instructions;
2921 }
2922 }
2923
2924 c->prog_data.dispatch_width = 8;
2925
2926 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2927 const unsigned *generated = g.generate_assembly(&v.instructions,
2928 simd16_instructions,
2929 final_assembly_size);
2930
2931 if (unlikely(intel->perf_debug) && shader) {
2932 if (shader->compiled_once)
2933 brw_wm_debug_recompile(brw, prog, &c->key);
2934 shader->compiled_once = true;
2935
2936 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2937 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2938 (get_time() - start_time) * 1000);
2939 }
2940 }
2941
2942 return generated;
2943 }
2944
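/**
 * Precompiles the fragment shader at link time using a guessed program key
 * (depth test assumed on for older gens, default texture swizzles, a single
 * color region, and so on), then restores the previous WM program state, so
 * this mostly just warms the program cache for the likely draw-time key.
 */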
2945 bool
2946 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2947 {
2948 struct brw_context *brw = brw_context(ctx);
2949 struct intel_context *intel = &brw->intel;
2950 struct brw_wm_prog_key key;
2951
2952 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2953 return true;
2954
2955 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2956 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2957 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2958 bool program_uses_dfdy = fp->UsesDFdy;
2959
2960 memset(&key, 0, sizeof(key));
2961
2962 if (intel->gen < 6) {
2963 if (fp->UsesKill)
2964 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2965
2966 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2967 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2968
2969 /* Just assume depth testing. */
2970 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2971 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2972 }
2973
2974 if (prog->Name != 0)
2975 key.proj_attrib_mask = ~(GLbitfield64) 0;
2976 else {
2977 /* Bit VARYING_BIT_POS of key.proj_attrib_mask is never used, so to
2978 * avoid unnecessary recompiles, always set it to 1.
2979 */
2980 key.proj_attrib_mask |= VARYING_BIT_POS;
2981 }
2982
2983 if (intel->gen < 6)
2984 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
2985
2986 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
2987 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2988 continue;
2989
2990 if (prog->Name == 0)
2991 key.proj_attrib_mask |= BITFIELD64_BIT(i);
2992
2993 if (intel->gen < 6) {
2994 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
2995 key.input_slots_valid |= BITFIELD64_BIT(i);
2996 }
2997 }
2998
2999 key.clamp_fragment_color = true;
3000
3001 for (int i = 0; i < MAX_SAMPLERS; i++) {
3002 if (fp->Base.ShadowSamplers & (1 << i)) {
3003 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3004 key.tex.swizzles[i] =
3005 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3006 } else {
3007 /* Color sampler: assume no swizzling. */
3008 key.tex.swizzles[i] = SWIZZLE_XYZW;
3009 }
3010 }
3011
3012 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3013 key.drawable_height = ctx->DrawBuffer->Height;
3014 }
3015
3016 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3017 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3018 }
3019
3020 key.nr_color_regions = 1;
3021
3022 key.program_string_id = bfp->id;
3023
3024 uint32_t old_prog_offset = brw->wm.prog_offset;
3025 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3026
3027 bool success = do_wm_prog(brw, prog, bfp, &key);
3028
3029 brw->wm.prog_offset = old_prog_offset;
3030 brw->wm.prog_data = old_prog_data;
3031
3032 return success;
3033 }