/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }
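
/* For reference, each of the instantiations below expands to a trivial
 * allocating helper. ALU2(ADD), for example, becomes:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */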

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter. gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
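
/* A typical use, seen later in this file for the gen4/5 front-facing
 * computation, emits the comparison and reads the per-channel boolean
 * result back out of the destination:
 *
 *    emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
 */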

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}
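
/* Worked example of the pre-gen7 path above (a sketch, assuming an
 * incoming dword offset of 7): the MUL by fs_reg(4) turns it into the
 * dword-aligned byte offset 28 that the gen4/5 message expects, while
 * gen6 passes the value 7 through unchanged.
 */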

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}
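
/* For example, since a texture instruction writes four registers
 * (regs_written() == 4), a texturing destination at reg_offset 0
 * overwrites any fs_reg naming the same file and reg with reg_offset
 * 0 through 3.
 */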

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_control_flow()
{
   switch (opcode) {
   case BRW_OPCODE_DO:
   case BRW_OPCODE_WHILE:
   case BRW_OPCODE_IF:
   case BRW_OPCODE_ELSE:
   case BRW_OPCODE_ENDIF:
   case BRW_OPCODE_BREAK:
   case BRW_OPCODE_CONTINUE:
      return true;
   default:
      return false;
   }
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
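
/* For example, a vec4 counts as 4 scalar components, a mat4 (16 floats)
 * as 16, and a float[3] array as 3; a struct { vec3 a; float b; } sums
 * its members to 4.
 */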

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp. Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes. It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles. Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index <= brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
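
/* For instance, a two-operand SHADER_OPCODE_POW in 16-wide dispatch
 * reports 2 * 16 / 8 = 4 MRF writes: two operands, each occupying two
 * message registers at that execution width.
 */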

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name. We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}
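
/* The name test above accepts exact matches and sub-storage of the
 * variable: for a uniform "color" (an illustrative name), storage entries
 * named "color", "color[2]", or "color.field" all match, while "colormap"
 * is rejected because the character after the prefix is neither '\0',
 * '.', nor '['.
 */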

/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit. Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }
         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
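
/* A sketch of what the gen6+ sequence above computes, assuming the
 * back-facing flag sits in bit 15 of the D-typed g0.0 payload dword:
 *
 *    ASR  tmp, g0.0<D>, 15   // pull bit 15 down to bit 0 (sign-extended)
 *    NOT  tmp, tmp           // invert: back face 1 -> 0, front face 0 -> 1
 *    AND  dst, tmp, 1        // mask to a clean 0/1 boolean
 */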

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}
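
/* For example (a sketch): on gen6, a pow(x, 2.0) can't feed the
 * fs_reg(2.0f) immediate to the math instruction directly, so the helper
 * emits
 *
 *    MOV tmp, 2.0f
 *
 * and returns tmp for use as the math operand; on gen7 a plain GRF source
 * would come back unchanged.
 */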

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7]. For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7]. For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to. In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader. So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization. We can do it once here, safely. This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs. If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous. We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs. Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
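
/* For example, a virtual GRF of size 3 keeps its old number for offset 0
 * (its size now trimmed to 1) and gets two freshly allocated single-register
 * GRFs for offsets 1 and 2; any use at reg_offset N is renumbered to
 * new_virtual_grf + N - 1 with reg_offset 0.
 */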

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again. Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs. Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative. We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be. At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants. In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms. Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays. No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64). If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list. We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}
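
/* Worked example of the demotion above (assuming a constant with
 * pull_index 70): its byte offset 70 * 4 = 280 rounds down to the
 * 16-byte-aligned 272 for the load, and smear = 70 & 3 = 2 picks the
 * third dword out of the fetched block.
 */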

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something deffed but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
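
/* The removal test above reads: if the last use of a virtual GRF
 * (virtual_grf_use[reg]) is at or before the current instruction counter,
 * nothing downstream reads this write, so the instruction is dead and can
 * be dropped.
 */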
1805
1806 /**
1807 * Implements a second type of register coalescing: This one checks if
1808 * the two regs involved in a raw move don't interfere, in which case
1809 * they can both by stored in the same place and the MOV removed.
1810 */
1811 bool
1812 fs_visitor::register_coalesce_2()
1813 {
1814 bool progress = false;
1815
1816 calculate_live_intervals();
1817
1818 foreach_list_safe(node, &this->instructions) {
1819 fs_inst *inst = (fs_inst *)node;
1820
1821 if (inst->opcode != BRW_OPCODE_MOV ||
1822 inst->predicate ||
1823 inst->saturate ||
1824 inst->src[0].file != GRF ||
1825 inst->src[0].negate ||
1826 inst->src[0].abs ||
1827 inst->src[0].smear != -1 ||
1828 inst->dst.file != GRF ||
1829 inst->dst.type != inst->src[0].type ||
1830 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1831 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1832 continue;
1833 }
1834
1835 int reg_from = inst->src[0].reg;
1836 assert(inst->src[0].reg_offset == 0);
1837 int reg_to = inst->dst.reg;
1838 int reg_to_offset = inst->dst.reg_offset;
1839
1840 foreach_list(node, &this->instructions) {
1841 fs_inst *scan_inst = (fs_inst *)node;
1842
1843 if (scan_inst->dst.file == GRF &&
1844 scan_inst->dst.reg == reg_from) {
1845 scan_inst->dst.reg = reg_to;
1846 scan_inst->dst.reg_offset = reg_to_offset;
1847 }
1848 for (int i = 0; i < 3; i++) {
1849 if (scan_inst->src[i].file == GRF &&
1850 scan_inst->src[i].reg == reg_from) {
1851 scan_inst->src[i].reg = reg_to;
1852 scan_inst->src[i].reg_offset = reg_to_offset;
1853 }
1854 }
1855 }
1856
1857 inst->remove();
1858
1859 /* We don't need to recalculate live intervals inside the loop despite
1860 * flagging live_intervals_valid because we only use live intervals for
1861 * the interferes test, and we must have had a situation where the
1862 * intervals were:
1863 *
1864 * from to
1865 * ^
1866 * |
1867 * v
1868 * ^
1869 * |
1870 * v
1871 *
1872 * Some register R that might get coalesced with one of these two could
1873 * only be referencing "to", otherwise "from"'s range would have been
1874 * longer. R's range could also only start at the end of "to" or later,
1875 * otherwise it will conflict with "to" when we try to coalesce "to"
1876 * into Rw anyway.
1877 */
1878 live_intervals_valid = false;
1879
1880 progress = true;
1881 continue;
1882 }
1883
1884 return progress;
1885 }
1886
1887 bool
1888 fs_visitor::register_coalesce()
1889 {
1890 bool progress = false;
1891 int if_depth = 0;
1892 int loop_depth = 0;
1893
1894 foreach_list_safe(node, &this->instructions) {
1895 fs_inst *inst = (fs_inst *)node;
1896
1897 /* Make sure that we dominate the instructions we're going to
1898 * scan for interfering with our coalescing, or we won't have
1899 * scanned enough to see if anything interferes with our
1900 * coalescing. We don't dominate the following instructions if
1901 * we're in a loop or an if block.
1902 */
1903 switch (inst->opcode) {
1904 case BRW_OPCODE_DO:
1905 loop_depth++;
1906 break;
1907 case BRW_OPCODE_WHILE:
1908 loop_depth--;
1909 break;
1910 case BRW_OPCODE_IF:
1911 if_depth++;
1912 break;
1913 case BRW_OPCODE_ENDIF:
1914 if_depth--;
1915 break;
1916 default:
1917 break;
1918 }
1919 if (loop_depth || if_depth)
1920 continue;
1921
1922 if (inst->opcode != BRW_OPCODE_MOV ||
1923 inst->predicate ||
1924 inst->saturate ||
1925 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1926 inst->src[0].file != UNIFORM)||
1927 inst->dst.type != inst->src[0].type)
1928 continue;
1929
1930 bool has_source_modifiers = (inst->src[0].abs ||
1931 inst->src[0].negate ||
1932 inst->src[0].smear != -1 ||
1933 inst->src[0].file == UNIFORM);
1934
1935 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1936 * them: check for no writes to either one until the exit of the
1937 * program.
1938 */
1939 bool interfered = false;
1940
1941 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1942 !scan_inst->is_tail_sentinel();
1943 scan_inst = (fs_inst *)scan_inst->next) {
1944 if (scan_inst->dst.file == GRF) {
1945 if (scan_inst->overwrites_reg(inst->dst) ||
1946 scan_inst->overwrites_reg(inst->src[0])) {
1947 interfered = true;
1948 break;
1949 }
1950 }
1951
1952 /* The gen6 MATH instruction can't handle source modifiers or
1953 * unusual register regions, so avoid coalescing those for
1954 * now. We should do something more specific.
1955 */
1956 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1957 interfered = true;
1958 break;
1959 }
1960
1961 	 /* The accumulator result appears to get used for the
1962 	  * conditional modifier generation. When negating a UD
1963 	  * value, a 33rd sign bit is generated in the accumulator,
1964 	  * so a comparison (for example, equality) against a 32-bit
1965 	  * value no longer works. See piglit fs-op-neg-uint.
1966 	  */
1967 if (scan_inst->conditional_mod &&
1968 inst->src[0].negate &&
1969 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1970 interfered = true;
1971 break;
1972 }
1973 }
1974 if (interfered) {
1975 continue;
1976 }
1977
1978 /* Rewrite the later usage to point at the source of the move to
1979 * be removed.
1980 */
1981 for (fs_inst *scan_inst = inst;
1982 !scan_inst->is_tail_sentinel();
1983 scan_inst = (fs_inst *)scan_inst->next) {
1984 for (int i = 0; i < 3; i++) {
1985 if (scan_inst->src[i].file == GRF &&
1986 scan_inst->src[i].reg == inst->dst.reg &&
1987 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1988 fs_reg new_src = inst->src[0];
1989 if (scan_inst->src[i].abs) {
1990 new_src.negate = 0;
1991 new_src.abs = 1;
1992 }
1993 new_src.negate ^= scan_inst->src[i].negate;
1994 scan_inst->src[i] = new_src;
1995 }
1996 }
1997 }
1998
1999 inst->remove();
2000 progress = true;
2001 }
2002
2003 if (progress)
2004 live_intervals_valid = false;
2005
2006 return progress;
2007 }
2008
2009
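/**
 * Attempts to rewrite the instruction that computed a GRF value so that it
 * writes directly into the MRF that a following MOV copies it to, letting
 * the MOV be removed.
 *
 * A sketch of the transformation (register numbers are illustrative):
 *
 *    add vgrf4, vgrf2, vgrf3
 *    mov m1, vgrf4
 * ->
 *    add m1, vgrf2, vgrf3
 */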
2010 bool
2011 fs_visitor::compute_to_mrf()
2012 {
2013 bool progress = false;
2014 int next_ip = 0;
2015
2016 calculate_live_intervals();
2017
2018 foreach_list_safe(node, &this->instructions) {
2019 fs_inst *inst = (fs_inst *)node;
2020
2021 int ip = next_ip;
2022 next_ip++;
2023
2024 if (inst->opcode != BRW_OPCODE_MOV ||
2025 inst->predicate ||
2026 inst->dst.file != MRF || inst->src[0].file != GRF ||
2027 inst->dst.type != inst->src[0].type ||
2028 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2029 continue;
2030
2031 /* Work out which hardware MRF registers are written by this
2032 * instruction.
2033 */
2034 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2035 int mrf_high;
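      /* With COMPR4 addressing, the two halves of a compressed instruction
       * land in m and m+4 rather than in m and m+1.
       */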
2036 if (inst->dst.reg & BRW_MRF_COMPR4) {
2037 mrf_high = mrf_low + 4;
2038 } else if (dispatch_width == 16 &&
2039 (!inst->force_uncompressed && !inst->force_sechalf)) {
2040 mrf_high = mrf_low + 1;
2041 } else {
2042 mrf_high = mrf_low;
2043 }
2044
2045 /* Can't compute-to-MRF this GRF if someone else was going to
2046 * read it later.
2047 */
2048 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2049 continue;
2050
2051 /* Found a move of a GRF to a MRF. Let's see if we can go
2052 * rewrite the thing that made this GRF to write into the MRF.
2053 */
2054 fs_inst *scan_inst;
2055 for (scan_inst = (fs_inst *)inst->prev;
2056 scan_inst->prev != NULL;
2057 scan_inst = (fs_inst *)scan_inst->prev) {
2058 if (scan_inst->dst.file == GRF &&
2059 scan_inst->dst.reg == inst->src[0].reg) {
2060 /* Found the last thing to write our reg we want to turn
2061 * into a compute-to-MRF.
2062 */
2063
2064 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2065 if (scan_inst->mlen) {
2066 break;
2067 }
2068
2069 /* If it's predicated, it (probably) didn't populate all
2070 * the channels. We might be able to rewrite everything
2071 * that writes that reg, but it would require smarter
2072 * tracking to delay the rewriting until complete success.
2073 */
2074 if (scan_inst->predicate)
2075 break;
2076
2077 /* If it's half of register setup and not the same half as
2078 * our MOV we're trying to remove, bail for now.
2079 */
2080 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2081 scan_inst->force_sechalf != inst->force_sechalf) {
2082 break;
2083 }
2084
2089 if (intel->gen >= 6) {
2090 /* gen6 math instructions must have the destination be
2091 * GRF, so no compute-to-MRF for them.
2092 */
2093 if (scan_inst->is_math()) {
2094 break;
2095 }
2096 }
2097
2098 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2099 /* Found the creator of our MRF's source value. */
2100 scan_inst->dst.file = MRF;
2101 scan_inst->dst.reg = inst->dst.reg;
2102 scan_inst->saturate |= inst->saturate;
2103 inst->remove();
2104 progress = true;
2105 }
2106 break;
2107 }
2108
2109 	 /* We don't handle control flow here. Most computation of
2110 	  * values that end up in MRFs happens shortly before the MRF
2111 	  * write anyway.
2112 	  */
2113 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2114 break;
2115
2116 /* You can't read from an MRF, so if someone else reads our
2117 * MRF's source GRF that we wanted to rewrite, that stops us.
2118 */
2119 bool interfered = false;
2120 for (int i = 0; i < 3; i++) {
2121 if (scan_inst->src[i].file == GRF &&
2122 scan_inst->src[i].reg == inst->src[0].reg &&
2123 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2124 interfered = true;
2125 }
2126 }
2127 if (interfered)
2128 break;
2129
2130 if (scan_inst->dst.file == MRF) {
2131 /* If somebody else writes our MRF here, we can't
2132 * compute-to-MRF before that.
2133 */
2134 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2135 int scan_mrf_high;
2136
2137 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2138 scan_mrf_high = scan_mrf_low + 4;
2139 } else if (dispatch_width == 16 &&
2140 (!scan_inst->force_uncompressed &&
2141 !scan_inst->force_sechalf)) {
2142 scan_mrf_high = scan_mrf_low + 1;
2143 } else {
2144 scan_mrf_high = scan_mrf_low;
2145 }
2146
2147 if (mrf_low == scan_mrf_low ||
2148 mrf_low == scan_mrf_high ||
2149 mrf_high == scan_mrf_low ||
2150 mrf_high == scan_mrf_high) {
2151 break;
2152 }
2153 }
2154
2155 if (scan_inst->mlen > 0) {
2156 /* Found a SEND instruction, which means that there are
2157 * live values in MRFs from base_mrf to base_mrf +
2158 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2159 * above it.
2160 */
2161 if (mrf_low >= scan_inst->base_mrf &&
2162 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2163 break;
2164 }
2165 if (mrf_high >= scan_inst->base_mrf &&
2166 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2167 break;
2168 }
2169 }
2170 }
2171 }
2172
2173 if (progress)
2174 live_intervals_valid = false;
2175
2176 return progress;
2177 }
2178
2179 /**
2180 * Walks through basic blocks, looking for repeated MRF writes and
2181 * removing the later ones.
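 *
 * For example (a sketch; register numbers are illustrative):
 *
 *    mov m2, vgrf3
 *    ... no intervening write to m2 or vgrf3 ...
 *    mov m2, vgrf3
 *
 * where the second MOV is redundant and can be removed.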
2182 */
2183 bool
2184 fs_visitor::remove_duplicate_mrf_writes()
2185 {
2186 fs_inst *last_mrf_move[16];
2187 bool progress = false;
2188
2189 /* Need to update the MRF tracking for compressed instructions. */
2190 if (dispatch_width == 16)
2191 return false;
2192
2193 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2194
2195 foreach_list_safe(node, &this->instructions) {
2196 fs_inst *inst = (fs_inst *)node;
2197
2198 if (inst->is_control_flow()) {
2199 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2200 }
2201
2202 if (inst->opcode == BRW_OPCODE_MOV &&
2203 inst->dst.file == MRF) {
2204 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2205 if (prev_inst && inst->equals(prev_inst)) {
2206 inst->remove();
2207 progress = true;
2208 continue;
2209 }
2210 }
2211
2212 /* Clear out the last-write records for MRFs that were overwritten. */
2213 if (inst->dst.file == MRF) {
2214 last_mrf_move[inst->dst.reg] = NULL;
2215 }
2216
2217 if (inst->mlen > 0) {
2218 /* Found a SEND instruction, which will include two or fewer
2219 * implied MRF writes. We could do better here.
2220 */
2221 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2222 last_mrf_move[inst->base_mrf + i] = NULL;
2223 }
2224 }
2225
2226 /* Clear out any MRF move records whose sources got overwritten. */
2227 if (inst->dst.file == GRF) {
2228 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2229 if (last_mrf_move[i] &&
2230 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2231 last_mrf_move[i] = NULL;
2232 }
2233 }
2234 }
2235
2236 if (inst->opcode == BRW_OPCODE_MOV &&
2237 inst->dst.file == MRF &&
2238 inst->src[0].file == GRF &&
2239 !inst->predicate) {
2240 last_mrf_move[inst->dst.reg] = inst;
2241 }
2242 }
2243
2244 if (progress)
2245 live_intervals_valid = false;
2246
2247 return progress;
2248 }
2249
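/**
 * Clears the dependency flags for the GRFs that @param inst reads within
 * the tracked range [first_grf, first_grf + grf_len), since a read
 * resolves any outstanding dependency on that register.
 */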
2250 static void
2251 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2252 int first_grf, int grf_len)
2253 {
2254 bool inst_16wide = (dispatch_width > 8 &&
2255 !inst->force_uncompressed &&
2256 !inst->force_sechalf);
2257
2258 /* Clear the flag for registers that actually got read (as expected). */
2259 for (int i = 0; i < 3; i++) {
2260 int grf;
2261 if (inst->src[i].file == GRF) {
2262 grf = inst->src[i].reg;
2263 } else if (inst->src[i].file == FIXED_HW_REG &&
2264 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2265 grf = inst->src[i].fixed_hw_reg.nr;
2266 } else {
2267 continue;
2268 }
2269
2270 if (grf >= first_grf &&
2271 grf < first_grf + grf_len) {
2272 deps[grf - first_grf] = false;
2273 if (inst_16wide)
2274 deps[grf - first_grf + 1] = false;
2275 }
2276 }
2277 }
2278
2279 /**
2280 * Implements this workaround for the original 965:
2281 *
2282 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2283 * check for post destination dependencies on this instruction, software
2284 * must ensure that there is no destination hazard for the case of ‘write
2285 * followed by a posted write’ shown in the following example.
2286 *
2287 * 1. mov r3 0
2288 * 2. send r3.xy <rest of send instruction>
2289 * 3. mov r2 r3
2290 *
2291 * Due to no post-destination dependency check on the ‘send’, the above
2292 * code sequence could have two instructions (1 and 2) in flight at the
2293  *      same time that both consider ‘r3’ as the target of their final writes."
2294 */
2295 void
2296 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2297 {
2298 int write_len = inst->regs_written() * dispatch_width / 8;
2299 int first_write_grf = inst->dst.reg;
2300 bool needs_dep[BRW_MAX_MRF];
2301 assert(write_len < (int)sizeof(needs_dep) - 1);
2302
2303 memset(needs_dep, false, sizeof(needs_dep));
2304 memset(needs_dep, true, write_len);
2305
2306 clear_deps_for_inst_src(inst, dispatch_width,
2307 needs_dep, first_write_grf, write_len);
2308
2309 /* Walk backwards looking for writes to registers we're writing which
2310 * aren't read since being written. If we hit the start of the program,
2311 * we assume that there are no outstanding dependencies on entry to the
2312 * program.
2313 */
2314 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2315 scan_inst != NULL;
2316 scan_inst = (fs_inst *)scan_inst->prev) {
2317
2318 /* If we hit control flow, assume that there *are* outstanding
2319 * dependencies, and force their cleanup before our instruction.
2320 */
2321 if (scan_inst->is_control_flow()) {
2322 for (int i = 0; i < write_len; i++) {
2323 if (needs_dep[i]) {
2324 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2325 }
2326 }
	 /* The MOVs above resolve every outstanding dependency; there's
	  * nothing left to scan for.
	  */
	 return;
2327       }
2328
2329 bool scan_inst_16wide = (dispatch_width > 8 &&
2330 !scan_inst->force_uncompressed &&
2331 !scan_inst->force_sechalf);
2332
2333 /* We insert our reads as late as possible on the assumption that any
2334 * instruction but a MOV that might have left us an outstanding
2335 * dependency has more latency than a MOV.
2336 */
2337 if (scan_inst->dst.file == GRF &&
2338 scan_inst->dst.reg >= first_write_grf &&
2339 scan_inst->dst.reg < first_write_grf + write_len &&
2340 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2341 inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2342 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2343 if (scan_inst_16wide)
2344 needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
2345 }
2346
2347 /* Clear the flag for registers that actually got read (as expected). */
2348 clear_deps_for_inst_src(scan_inst, dispatch_width,
2349 needs_dep, first_write_grf, write_len);
2350
2351 /* Continue the loop only if we haven't resolved all the dependencies */
2352 int i;
2353 for (i = 0; i < write_len; i++) {
2354 if (needs_dep[i])
2355 break;
2356 }
2357 if (i == write_len)
2358 return;
2359 }
2360 }
2361
2362 /**
2363 * Implements this workaround for the original 965:
2364 *
2365 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2366 * used as a destination register until after it has been sourced by an
2367  *    instruction with a different destination register."
2368 */
2369 void
2370 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2371 {
2372 int write_len = inst->regs_written() * dispatch_width / 8;
2373 int first_write_grf = inst->dst.reg;
2374 bool needs_dep[BRW_MAX_MRF];
2375 assert(write_len < (int)sizeof(needs_dep) - 1);
2376
2377 memset(needs_dep, false, sizeof(needs_dep));
2378 memset(needs_dep, true, write_len);
2379 /* Walk forwards looking for writes to registers we're writing which aren't
2380 * read before being written.
2381 */
2382 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2383 !scan_inst->is_tail_sentinel();
2384 scan_inst = (fs_inst *)scan_inst->next) {
2385 /* If we hit control flow, force resolve all remaining dependencies. */
2386 if (scan_inst->is_control_flow()) {
2387 for (int i = 0; i < write_len; i++) {
2388 if (needs_dep[i])
2389 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2390 }
	 /* The MOVs above resolve every outstanding dependency; there's
	  * nothing left to scan for.
	  */
	 return;
2391       }
2392
2393 /* Clear the flag for registers that actually got read (as expected). */
2394 clear_deps_for_inst_src(scan_inst, dispatch_width,
2395 needs_dep, first_write_grf, write_len);
2396
2397 /* We insert our reads as late as possible since they're reading the
2398 * result of a SEND, which has massive latency.
2399 */
2400 if (scan_inst->dst.file == GRF &&
2401 scan_inst->dst.reg >= first_write_grf &&
2402 scan_inst->dst.reg < first_write_grf + write_len &&
2403 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2404 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2405 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2406 }
2407
2408 /* Continue the loop only if we haven't resolved all the dependencies */
2409 int i;
2410 for (i = 0; i < write_len; i++) {
2411 if (needs_dep[i])
2412 break;
2413 }
2414 if (i == write_len)
2415 return;
2416 }
2417
2418 /* If we hit the end of the program, resolve all remaining dependencies out
2419 * of paranoia.
2420 */
2421 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2422 assert(last_inst->eot);
2423 for (int i = 0; i < write_len; i++) {
2424 if (needs_dep[i])
2425 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2426 }
2427 }
2428
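/**
 * Applies both SEND dependency workarounds to each SEND that writes a GRF.
 * Only the original 965 (gen4, non-G4X) needs them.
 */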
2429 void
2430 fs_visitor::insert_gen4_send_dependency_workarounds()
2431 {
2432 if (intel->gen != 4 || intel->is_g4x)
2433 return;
2434
2435 /* Note that we're done with register allocation, so GRF fs_regs always
2436 * have a .reg_offset of 0.
2437 */
2438
2439 foreach_list_safe(node, &this->instructions) {
2440 fs_inst *inst = (fs_inst *)node;
2441
2442 if (inst->mlen != 0 && inst->dst.file == GRF) {
2443 insert_gen4_pre_send_dependency_workarounds(inst);
2444 insert_gen4_post_send_dependency_workarounds(inst);
2445 }
2446 }
2447 }
2448
2449 /**
2450 * Turns the generic expression-style uniform pull constant load instruction
2451 * into a hardware-specific series of instructions for loading a pull
2452 * constant.
2453 *
2454 * The expression style allows the CSE pass before this to optimize out
2455 * repeated loads from the same offset, and gives the pre-register-allocation
2456 * scheduling full flexibility, while the conversion to native instructions
2457 * allows the post-register-allocation scheduler the best information
2458 * possible.
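 *
 * A sketch of the gen7 lowering (register numbers and the offset are
 * illustrative):
 *
 *    uniform_pull_const dst, surface, 32u
 * ->
 *    mov payload, g0
 *    set_global_offset payload, payload, 2u     (offsets are in owords: 32/16)
 *    uniform_pull_const_gen7 dst, surface, payload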
2459 */
2460 void
2461 fs_visitor::lower_uniform_pull_constant_loads()
2462 {
2463 foreach_list(node, &this->instructions) {
2464 fs_inst *inst = (fs_inst *)node;
2465
2466 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2467 continue;
2468
2469 if (intel->gen >= 7) {
2470 fs_reg const_offset_reg = inst->src[1];
2471 assert(const_offset_reg.file == IMM &&
2472 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2473 const_offset_reg.imm.u /= 16;
2474 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2475 struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
2476 BRW_REGISTER_TYPE_UD);
2477
2478 fs_inst *setup1 = MOV(payload, fs_reg(g0));
2479 setup1->force_writemask_all = true;
2480 /* We don't need the second half of this vgrf to be filled with g1
2481 * in the 16-wide case, but if we use force_uncompressed then live
2482 * variable analysis won't consider this a def!
2483 */
2484
2485 fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
2486 payload, payload,
2487 const_offset_reg);
2488
2489 setup1->ir = inst->ir;
2490 setup1->annotation = inst->annotation;
2491 inst->insert_before(setup1);
2492 setup2->ir = inst->ir;
2493 setup2->annotation = inst->annotation;
2494 inst->insert_before(setup2);
2495 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2496 inst->src[1] = payload;
2497 } else {
2498 /* Before register allocation, we didn't tell the scheduler about the
2499 * MRF we use. We know it's safe to use this MRF because nothing
2500 * else does except for register spill/unspill, which generates and
2501 * uses its MRF within a single IR instruction.
2502 */
2503 inst->base_mrf = 14;
2504 inst->mlen = 1;
2505 }
2506 }
2507 }
2508
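/** Pretty-prints a single FS IR instruction to stdout, for debugging. */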
2509 void
2510 fs_visitor::dump_instruction(fs_inst *inst)
2511 {
2512 if (inst->predicate) {
2513 printf("(%cf0.%d) ",
2514 inst->predicate_inverse ? '-' : '+',
2515 inst->flag_subreg);
2516 }
2517
2518 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2519 opcode_descs[inst->opcode].name) {
2520 printf("%s", opcode_descs[inst->opcode].name);
2521 } else {
2522 switch (inst->opcode) {
2523 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2524 printf("uniform_pull_const");
2525 break;
2526 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2527 printf("uniform_pull_const_gen7");
2528 break;
2529 case FS_OPCODE_SET_GLOBAL_OFFSET:
2530 printf("set_global_offset");
2531 break;
2532 default:
2533 printf("op%d", inst->opcode);
2534 break;
2535 }
2536 }
2537 if (inst->saturate)
2538 printf(".sat");
2539 if (inst->conditional_mod) {
2540 printf(".cmod");
2541 if (!inst->predicate &&
2542 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2543 inst->opcode != BRW_OPCODE_IF &&
2544 inst->opcode != BRW_OPCODE_WHILE))) {
2545 	 printf(".f0.%d", inst->flag_subreg);
2546 }
2547 }
2548 printf(" ");
2549
2550
2551 switch (inst->dst.file) {
2552 case GRF:
2553 printf("vgrf%d", inst->dst.reg);
2554 if (inst->dst.reg_offset)
2555 printf("+%d", inst->dst.reg_offset);
2556 break;
2557 case MRF:
2558 printf("m%d", inst->dst.reg);
2559 break;
2560 case BAD_FILE:
2561 printf("(null)");
2562 break;
2563 case UNIFORM:
2564 printf("***u%d***", inst->dst.reg);
2565 break;
2566 default:
2567 printf("???");
2568 break;
2569 }
2570 printf(", ");
2571
2572 for (int i = 0; i < 3; i++) {
2573 if (inst->src[i].negate)
2574 printf("-");
2575 if (inst->src[i].abs)
2576 printf("|");
2577 switch (inst->src[i].file) {
2578 case GRF:
2579 printf("vgrf%d", inst->src[i].reg);
2580 if (inst->src[i].reg_offset)
2581 printf("+%d", inst->src[i].reg_offset);
2582 break;
2583 case MRF:
2584 printf("***m%d***", inst->src[i].reg);
2585 break;
2586 case UNIFORM:
2587 printf("u%d", inst->src[i].reg);
2588 if (inst->src[i].reg_offset)
2589 printf(".%d", inst->src[i].reg_offset);
2590 break;
2591 case BAD_FILE:
2592 printf("(null)");
2593 break;
2594 case IMM:
2595 switch (inst->src[i].type) {
2596 case BRW_REGISTER_TYPE_F:
2597 printf("%ff", inst->src[i].imm.f);
2598 break;
2599 case BRW_REGISTER_TYPE_D:
2600 printf("%dd", inst->src[i].imm.i);
2601 break;
2602 case BRW_REGISTER_TYPE_UD:
2603 printf("%uu", inst->src[i].imm.u);
2604 break;
2605 default:
2606 printf("???");
2607 break;
2608 }
2609 break;
2610 default:
2611 printf("???");
2612 break;
2613 }
2614 if (inst->src[i].abs)
2615 printf("|");
2616
2617       if (i < 2)
2618 	 printf(", ");
2619 }
2620
2621 printf(" ");
2622
2623 if (inst->force_uncompressed)
2624 printf("1sthalf ");
2625
2626 if (inst->force_sechalf)
2627 printf("2ndhalf ");
2628
2629 printf("\n");
2630 }
2631
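/** Dumps the whole instruction stream, prefixing each instruction with its IP. */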
2632 void
2633 fs_visitor::dump_instructions()
2634 {
2635 int ip = 0;
2636 foreach_list(node, &this->instructions) {
2637 fs_inst *inst = (fs_inst *)node;
2638 printf("%d: ", ip++);
2639 dump_instruction(inst);
2640 }
2641 }
2642
2643 /**
2644 * Possibly returns an instruction that set up @param reg.
2645 *
2646 * Sometimes we want to take the result of some expression/variable
2647 * dereference tree and rewrite the instruction generating the result
2648 * of the tree. When processing the tree, we know that the
2649 * instructions generated are all writing temporaries that are dead
2650 * outside of this tree. So, if we have some instructions that write
2651 * a temporary, we're free to point that temp write somewhere else.
2652 *
2653  * Note that this doesn't guarantee that the returned instruction wrote
2654  * only reg -- it might be the size=4 destination of a texture instruction.
2655 */
2656 fs_inst *
2657 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2658 fs_inst *end,
2659 fs_reg reg)
2660 {
2661 if (end == start ||
2662 end->predicate ||
2663 end->force_uncompressed ||
2664 end->force_sechalf ||
2665 reg.reladdr ||
2666 !reg.equals(end->dst)) {
2667 return NULL;
2668 } else {
2669 return end;
2670 }
2671 }
2672
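/**
 * Lays out the fixed payload registers delivered to gen6+ fragment shader
 * threads (masks, barycentrics, and source depth/W), recording where each
 * lives in the compile state.
 */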
2673 void
2674 fs_visitor::setup_payload_gen6()
2675 {
2676 struct intel_context *intel = &brw->intel;
2677 bool uses_depth =
2678 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2679 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2680
2681 assert(intel->gen >= 6);
2682
2683 /* R0-1: masks, pixel X/Y coordinates. */
2684 c->nr_payload_regs = 2;
2685    /* R2: only for 32-pixel dispatch. */
2686
2687 /* R3-26: barycentric interpolation coordinates. These appear in the
2688 * same order that they appear in the brw_wm_barycentric_interp_mode
2689 * enum. Each set of coordinates occupies 2 registers if dispatch width
2690 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2691 * appear if they were enabled using the "Barycentric Interpolation
2692 * Mode" bits in WM_STATE.
2693 */
2694 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2695 if (barycentric_interp_modes & (1 << i)) {
2696 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2697 c->nr_payload_regs += 2;
2698 if (dispatch_width == 16) {
2699 c->nr_payload_regs += 2;
2700 }
2701 }
2702 }
2703
2704    /* R27: interpolated depth, if source depth is used. */
2705 if (uses_depth) {
2706 c->source_depth_reg = c->nr_payload_regs;
2707 c->nr_payload_regs++;
2708 if (dispatch_width == 16) {
2709 /* R28: interpolated depth if not 8-wide. */
2710 c->nr_payload_regs++;
2711 }
2712 }
2713 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2714 if (uses_depth) {
2715 c->source_w_reg = c->nr_payload_regs;
2716 c->nr_payload_regs++;
2717 if (dispatch_width == 16) {
2718 /* R30: interpolated W if not 8-wide. */
2719 c->nr_payload_regs++;
2720 }
2721 }
2722 /* R31: MSAA position offsets. */
2723 /* R32-: bary for 32-pixel. */
2724 /* R58-59: interp W for 32-pixel. */
2725
2726 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2727 c->source_depth_to_render_target = true;
2728 }
2729 }
2730
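/**
 * Runs the whole FS compile: payload setup, IR translation, the
 * optimization loop, and register allocation. Returns false on failure.
 */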
2731 bool
2732 fs_visitor::run()
2733 {
2734 sanity_param_count = fp->Base.Parameters->NumParameters;
2735 uint32_t orig_nr_params = c->prog_data.nr_params;
2736
2737 if (intel->gen >= 6)
2738 setup_payload_gen6();
2739 else
2740 setup_payload_gen4();
2741
2742 if (0) {
2743 emit_dummy_fs();
2744 } else {
2745 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2746 emit_shader_time_begin();
2747
2748 calculate_urb_setup();
2749 if (intel->gen < 6)
2750 emit_interpolation_setup_gen4();
2751 else
2752 emit_interpolation_setup_gen6();
2753
2754 /* We handle discards by keeping track of the still-live pixels in f0.1.
2755 * Initialize it with the dispatched pixels.
2756 */
2757 if (fp->UsesKill) {
2758 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2759 discard_init->flag_subreg = 1;
2760 }
2761
2762       /* Generate FS IR for main(). (The visitor only descends into
2763        * functions called "main".)
2764 */
2765 if (shader) {
2766 foreach_list(node, &*shader->ir) {
2767 ir_instruction *ir = (ir_instruction *)node;
2768 base_ir = ir;
2769 this->result = reg_undef;
2770 ir->accept(this);
2771 }
2772 } else {
2773 emit_fragment_program_code();
2774 }
2775 base_ir = NULL;
2776 if (failed)
2777 return false;
2778
2779 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2780 emit_shader_time_end();
2781
2782 emit_fb_writes();
2783
2784 split_virtual_grfs();
2785
2786 move_uniform_array_access_to_pull_constants();
2787 setup_pull_constants();
2788
2789 bool progress;
2790 do {
2791 progress = false;
2792
2793 compact_virtual_grfs();
2794
2795 progress = remove_duplicate_mrf_writes() || progress;
2796
2797 progress = opt_algebraic() || progress;
2798 progress = opt_cse() || progress;
2799 progress = opt_copy_propagate() || progress;
2800 progress = dead_code_eliminate() || progress;
2801 progress = register_coalesce() || progress;
2802 progress = register_coalesce_2() || progress;
2803 progress = compute_to_mrf() || progress;
2804 } while (progress);
2805
2806 remove_dead_constants();
2807
2808 schedule_instructions(false);
2809
2810 lower_uniform_pull_constant_loads();
2811
2812 assign_curb_setup();
2813 assign_urb_setup();
2814
2815 if (0) {
2816 /* Debug of register spilling: Go spill everything. */
2817 for (int i = 0; i < virtual_grf_count; i++) {
2818 spill_reg(i);
2819 }
2820 }
2821
2822 if (0)
2823 assign_regs_trivial();
2824 else {
2825 while (!assign_regs()) {
2826 if (failed)
2827 break;
2828 }
2829 }
2830 }
2831 assert(force_uncompressed_stack == 0);
2832 assert(force_sechalf_stack == 0);
2833
2834 /* This must come after all optimization and register allocation, since
2835 * it inserts dead code that happens to have side effects, and it does
2836 * so based on the actual physical registers in use.
2837 */
2838 insert_gen4_send_dependency_workarounds();
2839
2840 if (failed)
2841 return false;
2842
2843 schedule_instructions(true);
2844
2845 if (dispatch_width == 8) {
2846 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2847 } else {
2848 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2849
2850 /* Make sure we didn't try to sneak in an extra uniform */
2851 assert(orig_nr_params == c->prog_data.nr_params);
2852 (void) orig_nr_params;
2853 }
2854
2855 /* If any state parameters were appended, then ParameterValues could have
2856 * been realloced, in which case the driver uniform storage set up by
2857 * _mesa_associate_uniform_storage() would point to freed memory. Make
2858 * sure that didn't happen.
2859 */
2860 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2861
2862 return !failed;
2863 }
2864
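/**
 * Compiles the fragment program to native code, always producing an 8-wide
 * program and, when the hardware and shader allow it, a 16-wide one too.
 */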
2865 const unsigned *
2866 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2867 struct gl_fragment_program *fp,
2868 struct gl_shader_program *prog,
2869 unsigned *final_assembly_size)
2870 {
2871 struct intel_context *intel = &brw->intel;
2872 bool start_busy = false;
2873 float start_time = 0;
2874
2875 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2876 start_busy = (intel->batch.last_bo &&
2877 drm_intel_bo_busy(intel->batch.last_bo));
2878 start_time = get_time();
2879 }
2880
2881 struct brw_shader *shader = NULL;
2882 if (prog)
2883 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2884
2885 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2886 if (shader) {
2887 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2888 _mesa_print_ir(shader->ir, NULL);
2889 printf("\n\n");
2890 } else {
2891 printf("ARB_fragment_program %d ir for native fragment shader\n",
2892 fp->Base.Id);
2893 _mesa_print_program(&fp->Base);
2894 }
2895 }
2896
2897 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2898 */
2899 fs_visitor v(brw, c, prog, fp, 8);
2900 if (!v.run()) {
2901 prog->LinkStatus = false;
2902 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2903
2904 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2905 v.fail_msg);
2906
2907 return NULL;
2908 }
2909
2910 exec_list *simd16_instructions = NULL;
2911 fs_visitor v2(brw, c, prog, fp, 16);
2912 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2913 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2914 v2.import_uniforms(&v);
2915 if (!v2.run()) {
2916 perf_debug("16-wide shader failed to compile, falling back to "
2917 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2918 } else {
2919 simd16_instructions = &v2.instructions;
2920 }
2921 }
2922
2923 c->prog_data.dispatch_width = 8;
2924
2925 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2926 const unsigned *generated = g.generate_assembly(&v.instructions,
2927 simd16_instructions,
2928 final_assembly_size);
2929
2930 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2931 if (shader->compiled_once)
2932 brw_wm_debug_recompile(brw, prog, &c->key);
2933 shader->compiled_once = true;
2934
2935 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2936 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2937 (get_time() - start_time) * 1000);
2938 }
2939 }
2940
2941 return generated;
2942 }
2943
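/**
 * Precompiles the fragment shader at link time under a guessed program key,
 * so a likely variant is already in the cache before the first draw. The
 * previous program pointers are restored afterward.
 */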
2944 bool
2945 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2946 {
2947 struct brw_context *brw = brw_context(ctx);
2948 struct intel_context *intel = &brw->intel;
2949 struct brw_wm_prog_key key;
2950
2951 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2952 return true;
2953
2954 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2955 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2956 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2957 bool program_uses_dfdy = fp->UsesDFdy;
2958
2959 memset(&key, 0, sizeof(key));
2960
2961 if (intel->gen < 6) {
2962 if (fp->UsesKill)
2963 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2964
2965 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2966 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2967
2968 /* Just assume depth testing. */
2969 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2970 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2971 }
2972
2973 if (prog->Name != 0)
2974 key.proj_attrib_mask = 0xffffffff;
2975
2976 if (intel->gen < 6)
2977 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2978
2979 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2980 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2981 continue;
2982
2983 if (prog->Name == 0)
2984 key.proj_attrib_mask |= 1 << i;
2985
2986 if (intel->gen < 6) {
2987 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2988
2989 if (vp_index >= 0)
2990 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2991 }
2992 }
2993
2994 key.clamp_fragment_color = true;
2995
2996 for (int i = 0; i < MAX_SAMPLERS; i++) {
2997 if (fp->Base.ShadowSamplers & (1 << i)) {
2998 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2999 key.tex.swizzles[i] =
3000 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3001 } else {
3002 /* Color sampler: assume no swizzling. */
3003 key.tex.swizzles[i] = SWIZZLE_XYZW;
3004 }
3005 }
3006
3007 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
3008 key.drawable_height = ctx->DrawBuffer->Height;
3009 }
3010
3011 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
3012 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3013 }
3014
3015 key.nr_color_regions = 1;
3016
3017 key.program_string_id = bfp->id;
3018
3019 uint32_t old_prog_offset = brw->wm.prog_offset;
3020 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3021
3022 bool success = do_wm_prog(brw, prog, bfp, &key);
3023
3024 brw->wm.prog_offset = old_prog_offset;
3025 brw->wm.prog_data = old_prog_data;
3026
3027 return success;
3028 }