i965/fs: Add a bit more instruction dumping useful for upcoming work.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 ALU1(NOT)
150 ALU1(MOV)
151 ALU1(FRC)
152 ALU1(RNDD)
153 ALU1(RNDE)
154 ALU1(RNDZ)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(SHL)
162 ALU2(SHR)
163 ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
185 /**
186 * CMP: Sets the low bit of the destination channels with the result
187 * of the comparison, while the upper bits are undefined, and updates
188 * the flag register with the packed 16 bits of the result.
189 */
190 fs_inst *
191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
192 {
193 fs_inst *inst;
194
195 /* Take the instruction:
196 *
197 * CMP null<d> src0<f> src1<f>
198 *
199 * Original gen4 does type conversion to the destination type before
200 * comparison, producing garbage results for floating point comparisons.
201 * gen5 does the comparison on the execution type (resolved source types),
202 * so dst type doesn't matter. gen6 does comparison and then uses the
203 * result as if it was the dst type with no conversion, which happens to
204 * mostly work out for float-interpreted-as-int since our comparisons are
205 * for >0, =0, <0.
206 */
207 if (intel->gen == 4) {
208 dst.type = src0.type;
209 if (dst.file == FIXED_HW_REG)
210 dst.fixed_hw_reg.type = dst.type;
211 }
212
213 resolve_ud_negate(&src0);
214 resolve_ud_negate(&src1);
215
216 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
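
/* Illustrative sketch (not part of the driver) of how a caller pairs CMP
 * with a pre-gen6 predicated IF for "if (a < b)", where a and b are assumed
 * fs_regs already in scope:
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_L));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 *    ...then-block...
 *    emit(BRW_OPCODE_ENDIF);
 *
 * The packed flag bits written by CMP are what BRW_PREDICATE_NORMAL tests.
 */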
221
222 exec_list
223 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
224 fs_reg offset)
225 {
226 exec_list instructions;
227 fs_inst *inst;
228
229 if (intel->gen >= 7) {
230 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
231 dst, surf_index, offset);
232 instructions.push_tail(inst);
233 } else {
234 int base_mrf = 13;
235 bool header_present = true;
236
237 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
238 mrf.type = BRW_REGISTER_TYPE_D;
239
240 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
241 * dword-aligned byte offset.
242 */
243 if (intel->gen == 6) {
244 instructions.push_tail(MOV(mrf, offset));
245 } else {
246 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
247 }
249 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
250 dst, surf_index);
251 inst->header_present = header_present;
252 inst->base_mrf = base_mrf;
253 inst->mlen = header_present + dispatch_width / 8;
254
255 instructions.push_tail(inst);
256 }
257
258 return instructions;
259 }
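
/* Worked example of the offset units above (illustrative): to fetch array
 * element 5, gen6 passes the dword offset 5 through unchanged, while the
 * gen4/5 path multiplies by 4 to form the dword-aligned byte offset 20
 * before the send; gen7 consumes the offset directly in its own opcode.
 */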
260
261 /**
262 * A helper for MOV generation for fixing up broken hardware SEND dependency
263 * handling.
264 */
265 fs_inst *
266 fs_visitor::DEP_RESOLVE_MOV(int grf)
267 {
268 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
269
270 inst->ir = NULL;
271 inst->annotation = "send dependency resolve";
272
273 /* The caller always wants uncompressed to emit the minimal extra
274 * dependencies, and to avoid having to deal with aligning its regs to 2.
275 */
276 inst->force_uncompressed = true;
277
278 return inst;
279 }
280
281 bool
282 fs_inst::equals(fs_inst *inst)
283 {
284 return (opcode == inst->opcode &&
285 dst.equals(inst->dst) &&
286 src[0].equals(inst->src[0]) &&
287 src[1].equals(inst->src[1]) &&
288 src[2].equals(inst->src[2]) &&
289 saturate == inst->saturate &&
290 predicate == inst->predicate &&
291 conditional_mod == inst->conditional_mod &&
292 mlen == inst->mlen &&
293 base_mrf == inst->base_mrf &&
294 sampler == inst->sampler &&
295 target == inst->target &&
296 eot == inst->eot &&
297 header_present == inst->header_present &&
298 shadow_compare == inst->shadow_compare &&
299 offset == inst->offset);
300 }
301
302 int
303 fs_inst::regs_written()
304 {
305 if (is_tex())
306 return 4;
307
308 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
309 * but we don't currently use them...nor do we have an opcode for them.
310 */
311
312 return 1;
313 }
314
315 bool
316 fs_inst::overwrites_reg(const fs_reg &reg)
317 {
318 return (reg.file == dst.file &&
319 reg.reg == dst.reg &&
320 reg.reg_offset >= dst.reg_offset &&
321 reg.reg_offset < dst.reg_offset + regs_written());
322 }
323
324 bool
325 fs_inst::is_tex()
326 {
327 return (opcode == SHADER_OPCODE_TEX ||
328 opcode == FS_OPCODE_TXB ||
329 opcode == SHADER_OPCODE_TXD ||
330 opcode == SHADER_OPCODE_TXF ||
331 opcode == SHADER_OPCODE_TXL ||
332 opcode == SHADER_OPCODE_TXS);
333 }
334
335 bool
336 fs_inst::is_math()
337 {
338 return (opcode == SHADER_OPCODE_RCP ||
339 opcode == SHADER_OPCODE_RSQ ||
340 opcode == SHADER_OPCODE_SQRT ||
341 opcode == SHADER_OPCODE_EXP2 ||
342 opcode == SHADER_OPCODE_LOG2 ||
343 opcode == SHADER_OPCODE_SIN ||
344 opcode == SHADER_OPCODE_COS ||
345 opcode == SHADER_OPCODE_INT_QUOTIENT ||
346 opcode == SHADER_OPCODE_INT_REMAINDER ||
347 opcode == SHADER_OPCODE_POW);
348 }
349
350 bool
351 fs_inst::is_control_flow()
352 {
353 switch (opcode) {
354 case BRW_OPCODE_DO:
355 case BRW_OPCODE_WHILE:
356 case BRW_OPCODE_IF:
357 case BRW_OPCODE_ELSE:
358 case BRW_OPCODE_ENDIF:
359 case BRW_OPCODE_BREAK:
360 case BRW_OPCODE_CONTINUE:
361 return true;
362 default:
363 return false;
364 }
365 }
366
367 bool
368 fs_inst::is_send_from_grf()
369 {
370 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
371 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
372 src[1].file == GRF));
373 }
374
375 bool
376 fs_visitor::can_do_source_mods(fs_inst *inst)
377 {
378 if (intel->gen == 6 && inst->is_math())
379 return false;
380
381 if (inst->is_send_from_grf())
382 return false;
383
384 return true;
385 }
386
387 void
388 fs_reg::init()
389 {
390 memset(this, 0, sizeof(*this));
391 this->smear = -1;
392 }
393
394 /** Generic unset register constructor. */
395 fs_reg::fs_reg()
396 {
397 init();
398 this->file = BAD_FILE;
399 }
400
401 /** Immediate value constructor. */
402 fs_reg::fs_reg(float f)
403 {
404 init();
405 this->file = IMM;
406 this->type = BRW_REGISTER_TYPE_F;
407 this->imm.f = f;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(int32_t i)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_D;
416 this->imm.i = i;
417 }
418
419 /** Immediate value constructor. */
420 fs_reg::fs_reg(uint32_t u)
421 {
422 init();
423 this->file = IMM;
424 this->type = BRW_REGISTER_TYPE_UD;
425 this->imm.u = u;
426 }
427
428 /** Fixed brw_reg Immediate value constructor. */
429 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
430 {
431 init();
432 this->file = FIXED_HW_REG;
433 this->fixed_hw_reg = fixed_hw_reg;
434 this->type = fixed_hw_reg.type;
435 }
436
437 bool
438 fs_reg::equals(const fs_reg &r) const
439 {
440 return (file == r.file &&
441 reg == r.reg &&
442 reg_offset == r.reg_offset &&
443 type == r.type &&
444 negate == r.negate &&
445 abs == r.abs &&
446 !reladdr && !r.reladdr &&
447 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
448 sizeof(fixed_hw_reg)) == 0 &&
449 smear == r.smear &&
450 imm.u == r.imm.u);
451 }
452
453 bool
454 fs_reg::is_zero() const
455 {
456 if (file != IMM)
457 return false;
458
459 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
460 }
461
462 bool
463 fs_reg::is_one() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
469 }
470
471 int
472 fs_visitor::type_size(const struct glsl_type *type)
473 {
474 unsigned int size, i;
475
476 switch (type->base_type) {
477 case GLSL_TYPE_UINT:
478 case GLSL_TYPE_INT:
479 case GLSL_TYPE_FLOAT:
480 case GLSL_TYPE_BOOL:
481 return type->components();
482 case GLSL_TYPE_ARRAY:
483 return type_size(type->fields.array) * type->length;
484 case GLSL_TYPE_STRUCT:
485 size = 0;
486 for (i = 0; i < type->length; i++) {
487 size += type_size(type->fields.structure[i].type);
488 }
489 return size;
490 case GLSL_TYPE_SAMPLER:
491 /* Samplers take up no register space, since they're baked in at
492 * link time.
493 */
494 return 0;
495 case GLSL_TYPE_VOID:
496 case GLSL_TYPE_ERROR:
497 case GLSL_TYPE_INTERFACE:
498 assert(!"not reached");
499 break;
500 }
501
502 return 0;
503 }
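
/* Worked example (illustrative): type_size() of vec4[3] is 4 * 3 = 12
 * slots, a struct { vec3 a; float b; } is 3 + 1 = 4 slots, and a sampler
 * contributes 0.
 */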
504
505 fs_reg
506 fs_visitor::get_timestamp()
507 {
508 assert(intel->gen >= 7);
509
510 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
511 BRW_ARF_TIMESTAMP,
512 0),
513 BRW_REGISTER_TYPE_UD));
514
515 fs_reg dst = fs_reg(this, glsl_type::uint_type);
516
517 fs_inst *mov = emit(MOV(dst, ts));
518 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
519 * even if it's not enabled in the dispatch.
520 */
521 mov->force_writemask_all = true;
522 mov->force_uncompressed = true;
523
524 /* The caller wants the low 32 bits of the timestamp. Since it's running
524 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
526 * which is plenty of time for our purposes. It is identical across the
527 * EUs, but since it's tracking GPU core speed it will increment at a
528 * varying rate as render P-states change.
529 *
530 * The caller could also check if render P-states have changed (or anything
531 * else that might disrupt timing) by setting smear to 2 and checking if
532 * that field is != 0.
533 */
534 dst.smear = 0;
535
536 return dst;
537 }
538
539 void
540 fs_visitor::emit_shader_time_begin()
541 {
542 current_annotation = "shader time start";
543 shader_start_time = get_timestamp();
544 }
545
546 void
547 fs_visitor::emit_shader_time_end()
548 {
549 current_annotation = "shader time end";
550
551 enum shader_time_shader_type type, written_type, reset_type;
552 if (dispatch_width == 8) {
553 type = ST_FS8;
554 written_type = ST_FS8_WRITTEN;
555 reset_type = ST_FS8_RESET;
556 } else {
557 assert(dispatch_width == 16);
558 type = ST_FS16;
559 written_type = ST_FS16_WRITTEN;
560 reset_type = ST_FS16_RESET;
561 }
562
563 fs_reg shader_end_time = get_timestamp();
564
565 /* Check that there weren't any timestamp reset events (assuming these
566 * were the only two timestamp reads that happened).
567 */
568 fs_reg reset = shader_end_time;
569 reset.smear = 2;
570 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
571 test->conditional_mod = BRW_CONDITIONAL_Z;
572 emit(IF(BRW_PREDICATE_NORMAL));
573
574 push_force_uncompressed();
575 fs_reg start = shader_start_time;
576 start.negate = true;
577 fs_reg diff = fs_reg(this, glsl_type::uint_type);
578 emit(ADD(diff, start, shader_end_time));
579
580 /* If there were no instructions between the two timestamp gets, the diff
581 * is 2 cycles. Remove that overhead, so I can forget about that when
582 * trying to determine the time taken for single instructions.
583 */
584 emit(ADD(diff, diff, fs_reg(-2u)));
585
586 emit_shader_time_write(type, diff);
587 emit_shader_time_write(written_type, fs_reg(1u));
588 emit(BRW_OPCODE_ELSE);
589 emit_shader_time_write(reset_type, fs_reg(1u));
590 emit(BRW_OPCODE_ENDIF);
591
592 pop_force_uncompressed();
593 }
594
595 void
596 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
597 fs_reg value)
598 {
599 /* Choose an index in the buffer and set up tracking information for our
600 * printouts.
601 */
602 int shader_time_index = brw->shader_time.num_entries++;
603 assert(shader_time_index < brw->shader_time.max_entries);
604 brw->shader_time.types[shader_time_index] = type;
605 if (prog) {
606 _mesa_reference_shader_program(ctx,
607 &brw->shader_time.programs[shader_time_index],
608 prog);
609 }
610
611 int base_mrf = 6;
612
613 fs_reg offset_mrf = fs_reg(MRF, base_mrf);
614 offset_mrf.type = BRW_REGISTER_TYPE_UD;
615 emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));
616
617 fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
618 time_mrf.type = BRW_REGISTER_TYPE_UD;
619 emit(MOV(time_mrf, value));
620
621 fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
622 inst->base_mrf = base_mrf;
623 inst->mlen = 2;
624 }
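
/* Resulting payload layout (illustrative sketch): with base_mrf = 6, the
 * buffer offset lands in m6 and the value in m7, which is why mlen is 2.
 */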
625
626 void
627 fs_visitor::fail(const char *format, ...)
628 {
629 va_list va;
630 char *msg;
631
632 if (failed)
633 return;
634
635 failed = true;
636
637 va_start(va, format);
638 msg = ralloc_vasprintf(mem_ctx, format, va);
639 va_end(va);
640 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
641
642 this->fail_msg = msg;
643
644 if (INTEL_DEBUG & DEBUG_WM) {
645 fprintf(stderr, "%s", msg);
646 }
647 }
648
649 fs_inst *
650 fs_visitor::emit(enum opcode opcode)
651 {
652 return emit(fs_inst(opcode));
653 }
654
655 fs_inst *
656 fs_visitor::emit(enum opcode opcode, fs_reg dst)
657 {
658 return emit(fs_inst(opcode, dst));
659 }
660
661 fs_inst *
662 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
663 {
664 return emit(fs_inst(opcode, dst, src0));
665 }
666
667 fs_inst *
668 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
669 {
670 return emit(fs_inst(opcode, dst, src0, src1));
671 }
672
673 fs_inst *
674 fs_visitor::emit(enum opcode opcode, fs_reg dst,
675 fs_reg src0, fs_reg src1, fs_reg src2)
676 {
677 return emit(fs_inst(opcode, dst, src0, src1, src2));
678 }
679
680 void
681 fs_visitor::push_force_uncompressed()
682 {
683 force_uncompressed_stack++;
684 }
685
686 void
687 fs_visitor::pop_force_uncompressed()
688 {
689 force_uncompressed_stack--;
690 assert(force_uncompressed_stack >= 0);
691 }
692
693 void
694 fs_visitor::push_force_sechalf()
695 {
696 force_sechalf_stack++;
697 }
698
699 void
700 fs_visitor::pop_force_sechalf()
701 {
702 force_sechalf_stack--;
703 assert(force_sechalf_stack >= 0);
704 }
705
706 /**
707 * Returns how many MRFs an FS opcode will write over.
708 *
709 * Note that this is not the 0 or 1 implied writes in an actual gen
710 * instruction -- the FS opcodes often generate MOVs in addition.
711 */
712 int
713 fs_visitor::implied_mrf_writes(fs_inst *inst)
714 {
715 if (inst->mlen == 0)
716 return 0;
717
718 switch (inst->opcode) {
719 case SHADER_OPCODE_RCP:
720 case SHADER_OPCODE_RSQ:
721 case SHADER_OPCODE_SQRT:
722 case SHADER_OPCODE_EXP2:
723 case SHADER_OPCODE_LOG2:
724 case SHADER_OPCODE_SIN:
725 case SHADER_OPCODE_COS:
726 return 1 * dispatch_width / 8;
727 case SHADER_OPCODE_POW:
728 case SHADER_OPCODE_INT_QUOTIENT:
729 case SHADER_OPCODE_INT_REMAINDER:
730 return 2 * dispatch_width / 8;
731 case SHADER_OPCODE_TEX:
732 case FS_OPCODE_TXB:
733 case SHADER_OPCODE_TXD:
734 case SHADER_OPCODE_TXF:
735 case SHADER_OPCODE_TXL:
736 case SHADER_OPCODE_TXS:
737 return 1;
738 case SHADER_OPCODE_SHADER_TIME_ADD:
739 return 0;
740 case FS_OPCODE_FB_WRITE:
741 return 2;
742 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
743 case FS_OPCODE_UNSPILL:
744 return 1;
745 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
746 return inst->header_present;
747 case FS_OPCODE_SPILL:
748 return 2;
749 default:
750 assert(!"not reached");
751 return inst->mlen;
752 }
753 }
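
/* Worked example (illustrative): SHADER_OPCODE_POW in 16-wide dispatch
 * writes 2 * 16 / 8 = 4 MRFs, versus 2 * 8 / 8 = 2 in 8-wide.
 */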
754
755 int
756 fs_visitor::virtual_grf_alloc(int size)
757 {
758 if (virtual_grf_array_size <= virtual_grf_count) {
759 if (virtual_grf_array_size == 0)
760 virtual_grf_array_size = 16;
761 else
762 virtual_grf_array_size *= 2;
763 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
764 virtual_grf_array_size);
765 }
766 virtual_grf_sizes[virtual_grf_count] = size;
767 return virtual_grf_count++;
768 }
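
/* The size array grows geometrically (16, 32, 64, ...), so a sequence of N
 * allocations costs O(N) amortized reralloc work.
 */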
769
770 /** Fixed HW reg constructor. */
771 fs_reg::fs_reg(enum register_file file, int reg)
772 {
773 init();
774 this->file = file;
775 this->reg = reg;
776 this->type = BRW_REGISTER_TYPE_F;
777 }
778
779 /** Fixed HW reg constructor. */
780 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
781 {
782 init();
783 this->file = file;
784 this->reg = reg;
785 this->type = type;
786 }
787
788 /** Automatic reg constructor. */
789 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
790 {
791 init();
792
793 this->file = GRF;
794 this->reg = v->virtual_grf_alloc(v->type_size(type));
795 this->reg_offset = 0;
796 this->type = brw_type_for_base_type(type);
797 }
798
799 fs_reg *
800 fs_visitor::variable_storage(ir_variable *var)
801 {
802 return (fs_reg *)hash_table_find(this->variable_ht, var);
803 }
804
805 void
806 import_uniforms_callback(const void *key,
807 void *data,
808 void *closure)
809 {
810 struct hash_table *dst_ht = (struct hash_table *)closure;
811 const fs_reg *reg = (const fs_reg *)data;
812
813 if (reg->file != UNIFORM)
814 return;
815
816 hash_table_insert(dst_ht, data, key);
817 }
818
819 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
820 * This brings in those uniform definitions.
821 */
822 void
823 fs_visitor::import_uniforms(fs_visitor *v)
824 {
825 hash_table_call_foreach(v->variable_ht,
826 import_uniforms_callback,
827 variable_ht);
828 this->params_remap = v->params_remap;
829 }
830
831 /* Our support for uniforms is piggy-backed on the struct
832 * gl_fragment_program, because that's where the values actually
833 * get stored, rather than in some global gl_shader_program uniform
834 * store.
835 */
836 void
837 fs_visitor::setup_uniform_values(ir_variable *ir)
838 {
839 int namelen = strlen(ir->name);
840
841 /* The data for our (non-builtin) uniforms is stored in a series of
842 * gl_uniform_driver_storage structs for each subcomponent that
843 * glGetUniformLocation() could name. We know it's been set up in the same
844 * order we'd walk the type, so walk the list of storage and find anything
845 * with our name, or the prefix of a component that starts with our name.
846 */
847 unsigned params_before = c->prog_data.nr_params;
848 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
849 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
850
851 if (strncmp(ir->name, storage->name, namelen) != 0 ||
852 (storage->name[namelen] != 0 &&
853 storage->name[namelen] != '.' &&
854 storage->name[namelen] != '[')) {
855 continue;
856 }
857
858 unsigned slots = storage->type->component_slots();
859 if (storage->array_elements)
860 slots *= storage->array_elements;
861
862 for (unsigned i = 0; i < slots; i++) {
863 c->prog_data.param[c->prog_data.nr_params++] =
864 &storage->storage[i].f;
865 }
866 }
867
868 /* Make sure we actually initialized the right amount of stuff here. */
869 assert(params_before + ir->type->component_slots() ==
870 c->prog_data.nr_params);
871 }
872
873
874 /* Our support for builtin uniforms is even scarier than non-builtin.
875 * It sits on top of the PROG_STATE_VAR parameters that are
876 * automatically updated from GL context state.
877 */
878 void
879 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
880 {
881 const ir_state_slot *const slots = ir->state_slots;
882 assert(ir->state_slots != NULL);
883
884 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
885 /* This state reference has already been set up by ir_to_mesa, but we'll
886 * get the same index back here.
887 */
888 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
889 (gl_state_index *)slots[i].tokens);
890
891 /* Add each of the unique swizzles of the element as a parameter.
892 * This'll end up matching the expected layout of the
893 * array/matrix/structure we're trying to fill in.
894 */
895 int last_swiz = -1;
896 for (unsigned int j = 0; j < 4; j++) {
897 int swiz = GET_SWZ(slots[i].swizzle, j);
898 if (swiz == last_swiz)
899 break;
900 last_swiz = swiz;
901
902 c->prog_data.param[c->prog_data.nr_params++] =
903 &fp->Base.Parameters->ParameterValues[index][swiz].f;
904 }
905 }
906 }
907
908 fs_reg *
909 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
910 {
911 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
912 fs_reg wpos = *reg;
913 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
914
915 /* gl_FragCoord.x */
916 if (ir->pixel_center_integer) {
917 emit(MOV(wpos, this->pixel_x));
918 } else {
919 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
920 }
921 wpos.reg_offset++;
922
923 /* gl_FragCoord.y */
924 if (!flip && ir->pixel_center_integer) {
925 emit(MOV(wpos, this->pixel_y));
926 } else {
927 fs_reg pixel_y = this->pixel_y;
928 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
929
930 if (flip) {
931 pixel_y.negate = true;
932 offset += c->key.drawable_height - 1.0;
933 }
934
935 emit(ADD(wpos, pixel_y, fs_reg(offset)));
936 }
937 wpos.reg_offset++;
938
939 /* gl_FragCoord.z */
940 if (intel->gen >= 6) {
941 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
942 } else {
943 emit(FS_OPCODE_LINTERP, wpos,
944 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
945 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
946 interp_reg(FRAG_ATTRIB_WPOS, 2));
947 }
948 wpos.reg_offset++;
949
950 /* gl_FragCoord.w: Already set up in emit_interpolation */
951 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
952
953 return reg;
954 }
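
/* Worked example of the Y flip above (illustrative): for an assumed
 * 480-pixel-high drawable with half-integer pixel centers, the folded
 * constant is 0.5 + 480 - 1.0 = 479.5, so flipped gl_FragCoord.y is
 * computed as 479.5 - pixel_y via the negate flag plus one ADD.
 */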
955
956 fs_inst *
957 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
958 glsl_interp_qualifier interpolation_mode,
959 bool is_centroid)
960 {
961 brw_wm_barycentric_interp_mode barycoord_mode;
962 if (is_centroid) {
963 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
964 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
965 else
966 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
967 } else {
968 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
969 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
970 else
971 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
972 }
973 return emit(FS_OPCODE_LINTERP, attr,
974 this->delta_x[barycoord_mode],
975 this->delta_y[barycoord_mode], interp);
976 }
977
978 fs_reg *
979 fs_visitor::emit_general_interpolation(ir_variable *ir)
980 {
981 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
982 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
983 fs_reg attr = *reg;
984
985 unsigned int array_elements;
986 const glsl_type *type;
987
988 if (ir->type->is_array()) {
989 array_elements = ir->type->length;
990 if (array_elements == 0) {
991 fail("dereferenced array '%s' has length 0\n", ir->name);
992 }
993 type = ir->type->fields.array;
994 } else {
995 array_elements = 1;
996 type = ir->type;
997 }
998
999 glsl_interp_qualifier interpolation_mode =
1000 ir->determine_interpolation_mode(c->key.flat_shade);
1001
1002 int location = ir->location;
1003 for (unsigned int i = 0; i < array_elements; i++) {
1004 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1005 if (urb_setup[location] == -1) {
1006 /* If there's no incoming setup data for this slot, don't
1007 * emit interpolation for it.
1008 */
1009 attr.reg_offset += type->vector_elements;
1010 location++;
1011 continue;
1012 }
1013
1014 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1015 /* Constant interpolation (flat shading) case. The SF has
1016 * handed us defined values in only the constant offset
1017 * field of the setup reg.
1018 */
1019 for (unsigned int k = 0; k < type->vector_elements; k++) {
1020 struct brw_reg interp = interp_reg(location, k);
1021 interp = suboffset(interp, 3);
1022 interp.type = reg->type;
1023 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1024 attr.reg_offset++;
1025 }
1026 } else {
1027 /* Smooth/noperspective interpolation case. */
1028 for (unsigned int k = 0; k < type->vector_elements; k++) {
1029 /* FINISHME: At some point we probably want to push
1030 * this farther by giving similar treatment to the
1031 * other potentially constant components of the
1032 * attribute, as well as making brw_vs_constval.c
1033 * handle varyings other than gl_TexCoord.
1034 */
1035 if (location >= FRAG_ATTRIB_TEX0 &&
1036 location <= FRAG_ATTRIB_TEX7 &&
1037 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
1038 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1039 } else {
1040 struct brw_reg interp = interp_reg(location, k);
1041 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1042 ir->centroid);
1043 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1044 /* Get the pixel/sample mask into f0 so that we know
1045 * which pixels are lit. Then, for each channel that is
1046 * unlit, replace the centroid data with non-centroid
1047 * data.
1048 */
1049 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1050 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1051 interpolation_mode, false);
1052 inst->predicate = BRW_PREDICATE_NORMAL;
1053 inst->predicate_inverse = true;
1054 }
1055 if (intel->gen < 6) {
1056 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1057 }
1058 }
1059 attr.reg_offset++;
1060 }
1061
1062 }
1063 location++;
1064 }
1065 }
1066
1067 return reg;
1068 }
1069
1070 fs_reg *
1071 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1072 {
1073 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1074
1075 /* The frontfacing comes in as a bit in the thread payload. */
1076 if (intel->gen >= 6) {
1077 emit(BRW_OPCODE_ASR, *reg,
1078 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1079 fs_reg(15));
1080 emit(BRW_OPCODE_NOT, *reg, *reg);
1081 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1082 } else {
1083 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1084 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1085 * us front face
1086 */
1087 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1088 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1089 }
1090
1091 return reg;
1092 }
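
/* Sketch of the gen6 bit math above (illustrative): reading g0.0 as a
 * signed dword, ASR by 15 lands the payload's back-facing bit (bit 15
 * here) in bit 0, NOT inverts it, and AND with 1 masks the result to
 * 1 for front-facing, 0 for back-facing.
 */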
1093
1094 fs_reg
1095 fs_visitor::fix_math_operand(fs_reg src)
1096 {
1097 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1098 * might be able to do better by doing execsize = 1 math and then
1099 * expanding that result out, but we would need to be careful with
1100 * masking.
1101 *
1102 * The hardware ignores source modifiers (negate and abs) on math
1103 * instructions, so we also move to a temp to set those up.
1104 */
1105 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1106 !src.abs && !src.negate)
1107 return src;
1108
1109 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1110 * operands to math
1111 */
1112 if (intel->gen >= 7 && src.file != IMM)
1113 return src;
1114
1115 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1116 expanded.type = src.type;
1117 emit(BRW_OPCODE_MOV, expanded, src);
1118 return expanded;
1119 }
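
/* Illustrative sketch: on gen6, emit_math(SHADER_OPCODE_POW, dst, x,
 * fs_reg(2.0f)) can't consume the immediate directly, so this helper
 * becomes MOV tmp, 2.0f followed by math on tmp; on gen7 only the
 * immediate case still takes the copy.
 */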
1120
1121 fs_inst *
1122 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1123 {
1124 switch (opcode) {
1125 case SHADER_OPCODE_RCP:
1126 case SHADER_OPCODE_RSQ:
1127 case SHADER_OPCODE_SQRT:
1128 case SHADER_OPCODE_EXP2:
1129 case SHADER_OPCODE_LOG2:
1130 case SHADER_OPCODE_SIN:
1131 case SHADER_OPCODE_COS:
1132 break;
1133 default:
1134 assert(!"not reached: bad math opcode");
1135 return NULL;
1136 }
1137
1138 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1139 * might be able to do better by doing execsize = 1 math and then
1140 * expanding that result out, but we would need to be careful with
1141 * masking.
1142 *
1143 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1144 * instructions, so we also move to a temp to set those up.
1145 */
1146 if (intel->gen >= 6)
1147 src = fix_math_operand(src);
1148
1149 fs_inst *inst = emit(opcode, dst, src);
1150
1151 if (intel->gen < 6) {
1152 inst->base_mrf = 2;
1153 inst->mlen = dispatch_width / 8;
1154 }
1155
1156 return inst;
1157 }
1158
1159 fs_inst *
1160 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1161 {
1162 int base_mrf = 2;
1163 fs_inst *inst;
1164
1165 switch (opcode) {
1166 case SHADER_OPCODE_INT_QUOTIENT:
1167 case SHADER_OPCODE_INT_REMAINDER:
1168 if (intel->gen >= 7 && dispatch_width == 16)
1169 fail("16-wide INTDIV unsupported\n");
1170 break;
1171 case SHADER_OPCODE_POW:
1172 break;
1173 default:
1174 assert(!"not reached: unsupported binary math opcode.");
1175 return NULL;
1176 }
1177
1178 if (intel->gen >= 6) {
1179 src0 = fix_math_operand(src0);
1180 src1 = fix_math_operand(src1);
1181
1182 inst = emit(opcode, dst, src0, src1);
1183 } else {
1184 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1185 * "Message Payload":
1186 *
1187 * "Operand0[7]. For the INT DIV functions, this operand is the
1188 * denominator."
1189 * ...
1190 * "Operand1[7]. For the INT DIV functions, this operand is the
1191 * numerator."
1192 */
1193 bool is_int_div = opcode != SHADER_OPCODE_POW;
1194 fs_reg &op0 = is_int_div ? src1 : src0;
1195 fs_reg &op1 = is_int_div ? src0 : src1;
1196
1197 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1198 inst = emit(opcode, dst, op0, reg_null_f);
1199
1200 inst->base_mrf = base_mrf;
1201 inst->mlen = 2 * dispatch_width / 8;
1202 }
1203 return inst;
1204 }
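
/* Sketch of the pre-gen6 operand swap above (illustrative): for
 * INT_QUOTIENT of a/b (src0 = a, src1 = b), the numerator a is MOVed into
 * m3 (base_mrf + 1) as Operand1 and the denominator b is sent as Operand0,
 * matching the PRM wording quoted in the code.
 */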
1205
1206 void
1207 fs_visitor::assign_curb_setup()
1208 {
1209 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1210 if (dispatch_width == 8) {
1211 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1212 } else {
1213 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1214 }
1215
1216 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1217 foreach_list(node, &this->instructions) {
1218 fs_inst *inst = (fs_inst *)node;
1219
1220 for (unsigned int i = 0; i < 3; i++) {
1221 if (inst->src[i].file == UNIFORM) {
1222 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1223 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1224 constant_nr / 8,
1225 constant_nr % 8);
1226
1227 inst->src[i].file = FIXED_HW_REG;
1228 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1229 }
1230 }
1231 }
1232 }
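
/* Worked example (illustrative, assuming nr_payload_regs = 2): UNIFORM
 * slot 11 maps to register 2 + 11 / 8 = 3 at subregister 11 % 8 = 3,
 * i.e. g3.3.
 */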
1233
1234 void
1235 fs_visitor::calculate_urb_setup()
1236 {
1237 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1238 urb_setup[i] = -1;
1239 }
1240
1241 int urb_next = 0;
1242 /* Figure out where each of the incoming setup attributes lands. */
1243 if (intel->gen >= 6) {
1244 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1245 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1246 urb_setup[i] = urb_next++;
1247 }
1248 }
1249 } else {
1250 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1251 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1252 /* Point size is packed into the header, not as a general attribute */
1253 if (i == VERT_RESULT_PSIZ)
1254 continue;
1255
1256 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1257 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1258
1259 /* The back color slot is skipped when the front color is
1260 * also written to. In addition, some slots can be
1261 * written in the vertex shader and not read in the
1262 * fragment shader. So the register number must always be
1263 * incremented, mapped or not.
1264 */
1265 if (fp_index >= 0)
1266 urb_setup[fp_index] = urb_next;
1267 urb_next++;
1268 }
1269 }
1270
1271 /*
1272 * It's a FS only attribute, and we did interpolation for this attribute
1273 * in SF thread. So, count it here, too.
1274 *
1275 * See compile_sf_prog() for more info.
1276 */
1277 if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1278 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1279 }
1280
1281 /* Each attribute is 4 setup channels, each of which is half a reg. */
1282 c->prog_data.urb_read_length = urb_next * 2;
1283 }
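
/* Worked example (illustrative): five active setup attributes yield
 * urb_next = 5 and urb_read_length = 10, since each attribute's four setup
 * channels occupy half a register apiece (4 * 0.5 = 2 regs per attribute).
 */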
1284
1285 void
1286 fs_visitor::assign_urb_setup()
1287 {
1288 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1289
1290 /* Offset all the urb_setup[] indices by the actual position of the
1291 * setup regs, now that the location of the constants has been chosen.
1292 */
1293 foreach_list(node, &this->instructions) {
1294 fs_inst *inst = (fs_inst *)node;
1295
1296 if (inst->opcode == FS_OPCODE_LINTERP) {
1297 assert(inst->src[2].file == FIXED_HW_REG);
1298 inst->src[2].fixed_hw_reg.nr += urb_start;
1299 }
1300
1301 if (inst->opcode == FS_OPCODE_CINTERP) {
1302 assert(inst->src[0].file == FIXED_HW_REG);
1303 inst->src[0].fixed_hw_reg.nr += urb_start;
1304 }
1305 }
1306
1307 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1308 }
1309
1310 /**
1311 * Split large virtual GRFs into separate components if we can.
1312 *
1313 * This is mostly duplicated with what brw_fs_vector_splitting does,
1314 * but that's really conservative because it's afraid of doing
1315 * splitting that doesn't result in real progress after the rest of
1316 * the optimization phases, which would cause infinite looping in
1317 * optimization. We can do it once here, safely. This also has the
1318 * opportunity to split interpolated values, or maybe even uniforms,
1319 * which we don't have at the IR level.
1320 *
1321 * We want to split, because virtual GRFs are what we register
1322 * allocate and spill (due to contiguousness requirements for some
1323 * instructions), and they're what we naturally generate in the
1324 * codegen process, but most virtual GRFs don't actually need to be
1325 * contiguous sets of GRFs. If we split, we'll end up with reduced
1326 * live intervals and better dead code elimination and coalescing.
1327 */
1328 void
1329 fs_visitor::split_virtual_grfs()
1330 {
1331 int num_vars = this->virtual_grf_count;
1332 bool split_grf[num_vars];
1333 int new_virtual_grf[num_vars];
1334
1335 /* Try to split anything > 1 sized. */
1336 for (int i = 0; i < num_vars; i++) {
1337 if (this->virtual_grf_sizes[i] != 1)
1338 split_grf[i] = true;
1339 else
1340 split_grf[i] = false;
1341 }
1342
1343 if (brw->has_pln &&
1344 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1345 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1346 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1347 * Gen6, that was the only supported interpolation mode, and since Gen6,
1348 * delta_x and delta_y are in fixed hardware registers.
1349 */
1350 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1351 false;
1352 }
1353
1354 foreach_list(node, &this->instructions) {
1355 fs_inst *inst = (fs_inst *)node;
1356
1357 /* If there's a SEND message that requires contiguous destination
1358 * registers, no splitting is allowed.
1359 */
1360 if (inst->regs_written() > 1) {
1361 split_grf[inst->dst.reg] = false;
1362 }
1363 }
1364
1365 /* Allocate new space for split regs. Note that the virtual
1366 * numbers will be contiguous.
1367 */
1368 for (int i = 0; i < num_vars; i++) {
1369 if (split_grf[i]) {
1370 new_virtual_grf[i] = virtual_grf_alloc(1);
1371 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1372 int reg = virtual_grf_alloc(1);
1373 assert(reg == new_virtual_grf[i] + j - 1);
1374 (void) reg;
1375 }
1376 this->virtual_grf_sizes[i] = 1;
1377 }
1378 }
1379
1380 foreach_list(node, &this->instructions) {
1381 fs_inst *inst = (fs_inst *)node;
1382
1383 if (inst->dst.file == GRF &&
1384 split_grf[inst->dst.reg] &&
1385 inst->dst.reg_offset != 0) {
1386 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1387 inst->dst.reg_offset - 1);
1388 inst->dst.reg_offset = 0;
1389 }
1390 for (int i = 0; i < 3; i++) {
1391 if (inst->src[i].file == GRF &&
1392 split_grf[inst->src[i].reg] &&
1393 inst->src[i].reg_offset != 0) {
1394 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1395 inst->src[i].reg_offset - 1);
1396 inst->src[i].reg_offset = 0;
1397 }
1398 }
1399 }
1400 this->live_intervals_valid = false;
1401 }
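
/* Illustrative before/after: a size-3 virtual GRF vN accessed at
 * reg_offset 0, 1, 2 keeps vN (now size 1) for offset 0, while offsets 1
 * and 2 are renumbered to new_virtual_grf[N] and new_virtual_grf[N] + 1,
 * each with reg_offset 0.
 */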
1402
1403 /**
1404 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1405 *
1406 * During code generation, we create tons of temporary variables, many of
1407 * which get immediately killed and are never used again. Yet, in later
1408 * optimization and analysis passes, such as compute_live_intervals, we need
1409 * to loop over all the virtual GRFs. Compacting them can save a lot of
1410 * overhead.
1411 */
1412 void
1413 fs_visitor::compact_virtual_grfs()
1414 {
1415 /* Mark which virtual GRFs are used, and count how many. */
1416 int remap_table[this->virtual_grf_count];
1417 memset(remap_table, -1, sizeof(remap_table));
1418
1419 foreach_list(node, &this->instructions) {
1420 const fs_inst *inst = (const fs_inst *) node;
1421
1422 if (inst->dst.file == GRF)
1423 remap_table[inst->dst.reg] = 0;
1424
1425 for (int i = 0; i < 3; i++) {
1426 if (inst->src[i].file == GRF)
1427 remap_table[inst->src[i].reg] = 0;
1428 }
1429 }
1430
1431 /* In addition to registers used in instructions, fs_visitor keeps
1432 * direct references to certain special values which must be patched:
1433 */
1434 fs_reg *special[] = {
1435 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1436 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1437 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1438 &delta_x[0], &delta_x[1], &delta_x[2],
1439 &delta_x[3], &delta_x[4], &delta_x[5],
1440 &delta_y[0], &delta_y[1], &delta_y[2],
1441 &delta_y[3], &delta_y[4], &delta_y[5],
1442 };
1443 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1444 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1445
1446 /* Treat all special values as used, to be conservative */
1447 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1448 if (special[i]->file == GRF)
1449 remap_table[special[i]->reg] = 0;
1450 }
1451
1452 /* Compact the GRF arrays. */
1453 int new_index = 0;
1454 for (int i = 0; i < this->virtual_grf_count; i++) {
1455 if (remap_table[i] != -1) {
1456 remap_table[i] = new_index;
1457 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1458 if (live_intervals_valid) {
1459 virtual_grf_use[new_index] = virtual_grf_use[i];
1460 virtual_grf_def[new_index] = virtual_grf_def[i];
1461 }
1462 ++new_index;
1463 }
1464 }
1465
1466 this->virtual_grf_count = new_index;
1467
1468 /* Patch all the instructions to use the newly renumbered registers */
1469 foreach_list(node, &this->instructions) {
1470 fs_inst *inst = (fs_inst *) node;
1471
1472 if (inst->dst.file == GRF)
1473 inst->dst.reg = remap_table[inst->dst.reg];
1474
1475 for (int i = 0; i < 3; i++) {
1476 if (inst->src[i].file == GRF)
1477 inst->src[i].reg = remap_table[inst->src[i].reg];
1478 }
1479 }
1480
1481 /* Patch all the references to special values */
1482 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1483 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1484 special[i]->reg = remap_table[special[i]->reg];
1485 }
1486 }
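
/* Illustrative sketch: if only vgrfs 0, 3, and 5 of six are referenced,
 * remap_table becomes { 0, -1, -1, 1, -1, 2 }, virtual_grf_count drops to
 * 3, and every dst, src, and special value is renumbered through it.
 */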
1487
1488 bool
1489 fs_visitor::remove_dead_constants()
1490 {
1491 if (dispatch_width == 8) {
1492 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1493
1494 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1495 this->params_remap[i] = -1;
1496
1497 /* Find which params are still in use. */
1498 foreach_list(node, &this->instructions) {
1499 fs_inst *inst = (fs_inst *)node;
1500
1501 for (int i = 0; i < 3; i++) {
1502 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1503
1504 if (inst->src[i].file != UNIFORM)
1505 continue;
1506
1507 assert(constant_nr < (int)c->prog_data.nr_params);
1508
1509 /* For now, set this to non-negative. We'll give it the
1510 * actual new number in a moment, in order to keep the
1511 * register numbers nicely ordered.
1512 */
1513 this->params_remap[constant_nr] = 0;
1514 }
1515 }
1516
1517 /* Figure out what the new numbers for the params will be. At some
1518 * point when we're doing uniform array access, we're going to want
1519 * to keep the distinction between .reg and .reg_offset, but for
1520 * now we don't care.
1521 */
1522 unsigned int new_nr_params = 0;
1523 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1524 if (this->params_remap[i] != -1) {
1525 this->params_remap[i] = new_nr_params++;
1526 }
1527 }
1528
1529 /* Update the list of params to be uploaded to match our new numbering. */
1530 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1531 int remapped = this->params_remap[i];
1532
1533 if (remapped == -1)
1534 continue;
1535
1536 c->prog_data.param[remapped] = c->prog_data.param[i];
1537 }
1538
1539 c->prog_data.nr_params = new_nr_params;
1540 } else {
1541 /* This should have been generated in the 8-wide pass already. */
1542 assert(this->params_remap);
1543 }
1544
1545 /* Now do the renumbering of the shader to remove unused params. */
1546 foreach_list(node, &this->instructions) {
1547 fs_inst *inst = (fs_inst *)node;
1548
1549 for (int i = 0; i < 3; i++) {
1550 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1551
1552 if (inst->src[i].file != UNIFORM)
1553 continue;
1554
1555 assert(this->params_remap[constant_nr] != -1);
1556 inst->src[i].reg = this->params_remap[constant_nr];
1557 inst->src[i].reg_offset = 0;
1558 }
1559 }
1560
1561 return true;
1562 }
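
/* Illustrative sketch: if only params 0 and 5 of eight are read,
 * params_remap becomes { 0, -1, -1, -1, -1, 1, -1, -1 }, nr_params drops
 * to 2, and each UNIFORM source is renumbered through the table.
 */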
1563
1564 /*
1565 * Implements array access of uniforms by inserting a
1566 * PULL_CONSTANT_LOAD instruction.
1567 *
1568 * Unlike temporary GRF array access (where we don't support it due to
1569 * the difficulty of doing relative addressing on instruction
1570 * destinations), we could potentially do array access of uniforms
1571 * that were loaded in GRF space as push constants. In real-world
1572 * usage we've seen, though, the arrays being used are always larger
1573 * than we could load as push constants, so just always move all
1574 * uniform array access out to a pull constant buffer.
1575 */
1576 void
1577 fs_visitor::move_uniform_array_access_to_pull_constants()
1578 {
1579 int pull_constant_loc[c->prog_data.nr_params];
1580
1581 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1582 pull_constant_loc[i] = -1;
1583 }
1584
1585 /* Walk through and find array access of uniforms. Put a copy of that
1586 * uniform in the pull constant buffer.
1587 *
1588 * Note that we don't move constant-indexed accesses to arrays. No
1589 * testing has been done of the performance impact of this choice.
1590 */
1591 foreach_list_safe(node, &this->instructions) {
1592 fs_inst *inst = (fs_inst *)node;
1593
1594 for (int i = 0 ; i < 3; i++) {
1595 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1596 continue;
1597
1598 int uniform = inst->src[i].reg;
1599
1600 /* If this array isn't already present in the pull constant buffer,
1601 * add it.
1602 */
1603 if (pull_constant_loc[uniform] == -1) {
1604 const float **values = &c->prog_data.param[uniform];
1605
1606 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1607
1608 assert(param_size[uniform]);
1609
1610 for (int j = 0; j < param_size[uniform]; j++) {
1611 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1612 values[j];
1613 }
1614 }
1615
1616 /* Set up the annotation tracking for newly generated instructions. */
1617 base_ir = inst->ir;
1618 current_annotation = inst->annotation;
1619
1620 fs_reg offset = fs_reg(this, glsl_type::int_type);
1621 inst->insert_before(ADD(offset, *inst->src[i].reladdr,
1622 fs_reg(pull_constant_loc[uniform] +
1623 inst->src[i].reg_offset)));
1624
1625 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1626 fs_reg temp = fs_reg(this, glsl_type::float_type);
1627 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1628 surf_index, offset);
1629 inst->insert_before(&list);
1630
1631 inst->src[i].file = temp.file;
1632 inst->src[i].reg = temp.reg;
1633 inst->src[i].reg_offset = temp.reg_offset;
1634 inst->src[i].reladdr = NULL;
1635 }
1636 }
1637 }
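
/* Illustrative before/after: a reladdr source UNIFORM[u + *reladdr] turns
 * into ADD offset, *reladdr, pull_base; a VARYING_PULL_CONSTANT_LOAD of
 * temp from that offset; and the original source rewritten to read temp,
 * where pull_base is the uniform's slot in the pull constant buffer.
 */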
1638
1639 /**
1640 * Choose accesses from the UNIFORM file to demote to using the pull
1641 * constant buffer.
1642 *
1643 * We allow a fragment shader to have more than the specified minimum
1644 * maximum number of fragment shader uniform components (64). If
1645 * there are too many of these, they'd fill up all of register space.
1646 * So, this will push some of them out to the pull constant buffer and
1647 * update the program to load them.
1648 */
1649 void
1650 fs_visitor::setup_pull_constants()
1651 {
1652 /* Only allow 16 registers (128 uniform components) as push constants. */
1653 unsigned int max_uniform_components = 16 * 8;
1654 if (c->prog_data.nr_params <= max_uniform_components)
1655 return;
1656
1657 if (dispatch_width == 16) {
1658 fail("Pull constants not supported in 16-wide\n");
1659 return;
1660 }
1661
1662 /* Just demote the end of the list. We could probably do better
1663 * here, demoting things that are rarely used in the program first.
1664 */
1665 unsigned int pull_uniform_base = max_uniform_components;
1666
1667 int pull_constant_loc[c->prog_data.nr_params];
1668 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1669 if (i < pull_uniform_base) {
1670 pull_constant_loc[i] = -1;
1671 } else {
1672 pull_constant_loc[i] = -1;
1673 /* If our constant is already being uploaded for reladdr purposes,
1674 * reuse it.
1675 */
1676 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1677 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1678 pull_constant_loc[i] = j;
1679 break;
1680 }
1681 }
1682 if (pull_constant_loc[i] == -1) {
1683 int pull_index = c->prog_data.nr_pull_params++;
1684 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1685 pull_constant_loc[i] = pull_index;
1686 }
1687 }
1688 }
1689 c->prog_data.nr_params = pull_uniform_base;
1690
1691 foreach_list(node, &this->instructions) {
1692 fs_inst *inst = (fs_inst *)node;
1693
1694 for (int i = 0; i < 3; i++) {
1695 if (inst->src[i].file != UNIFORM)
1696 continue;
1697
1698 int pull_index = pull_constant_loc[inst->src[i].reg +
1699 inst->src[i].reg_offset];
1700 if (pull_index == -1)
1701 continue;
1702
1703 assert(!inst->src[i].reladdr);
1704
1705 fs_reg dst = fs_reg(this, glsl_type::float_type);
1706 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1707 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1708 fs_inst *pull =
1709 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1710 dst, index, offset);
1711 pull->ir = inst->ir;
1712 pull->annotation = inst->annotation;
1713 pull->base_mrf = 14;
1714 pull->mlen = 1;
1715
1716 inst->insert_before(pull);
1717
1718 inst->src[i].file = GRF;
1719 inst->src[i].reg = dst.reg;
1720 inst->src[i].reg_offset = 0;
1721 inst->src[i].smear = pull_index & 3;
1722 }
1723 }
1724 }
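
/* Worked example of the addressing above (illustrative): pull_index = 9
 * reads the 16-byte-aligned block at byte offset (9 * 4) & ~15 = 32, and
 * smear = 9 & 3 = 1 selects the second dword within that block.
 */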
1725
1726 bool
1727 fs_visitor::opt_algebraic()
1728 {
1729 bool progress = false;
1730
1731 foreach_list(node, &this->instructions) {
1732 fs_inst *inst = (fs_inst *)node;
1733
1734 switch (inst->opcode) {
1735 case BRW_OPCODE_MUL:
1736 if (inst->src[1].file != IMM)
1737 continue;
1738
1739 /* a * 1.0 = a */
1740 if (inst->src[1].is_one()) {
1741 inst->opcode = BRW_OPCODE_MOV;
1742 inst->src[1] = reg_undef;
1743 progress = true;
1744 break;
1745 }
1746
1747 /* a * 0.0 = 0.0 */
1748 if (inst->src[1].is_zero()) {
1749 inst->opcode = BRW_OPCODE_MOV;
1750 inst->src[0] = inst->src[1];
1751 inst->src[1] = reg_undef;
1752 progress = true;
1753 break;
1754 }
1755
1756 break;
1757 case BRW_OPCODE_ADD:
1758 if (inst->src[1].file != IMM)
1759 continue;
1760
1761 /* a + 0.0 = a */
1762 if (inst->src[1].is_zero()) {
1763 inst->opcode = BRW_OPCODE_MOV;
1764 inst->src[1] = reg_undef;
1765 progress = true;
1766 break;
1767 }
1768 break;
1769 default:
1770 break;
1771 }
1772 }
1773
1774 return progress;
1775 }
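
/* Transforms performed above, in instruction form (illustrative):
 *
 *    MUL dst, a, 1.0f  ->  MOV dst, a
 *    MUL dst, a, 0.0f  ->  MOV dst, 0.0f
 *    ADD dst, a, 0.0f  ->  MOV dst, a
 */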
1776
1777 /**
1778 * Must be called after calculate_live_intervals() to remove unused
1779 * writes to registers -- register allocation will fail otherwise
1780 * because something def'd but not used won't be considered to
1781 * interfere with other regs.
1782 */
1783 bool
1784 fs_visitor::dead_code_eliminate()
1785 {
1786 bool progress = false;
1787 int pc = 0;
1788
1789 calculate_live_intervals();
1790
1791 foreach_list_safe(node, &this->instructions) {
1792 fs_inst *inst = (fs_inst *)node;
1793
1794 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1795 inst->remove();
1796 progress = true;
1797 }
1798
1799 pc++;
1800 }
1801
1802 if (progress)
1803 live_intervals_valid = false;
1804
1805 return progress;
1806 }
1807
1808 /**
1809 * Implements a second type of register coalescing: This one checks if
1810 * the two regs involved in a raw move don't interfere, in which case
1811 * they can both by stored in the same place and the MOV removed.
1812 */
1813 bool
1814 fs_visitor::register_coalesce_2()
1815 {
1816 bool progress = false;
1817
1818 calculate_live_intervals();
1819
1820 foreach_list_safe(node, &this->instructions) {
1821 fs_inst *inst = (fs_inst *)node;
1822
1823 if (inst->opcode != BRW_OPCODE_MOV ||
1824 inst->predicate ||
1825 inst->saturate ||
1826 inst->src[0].file != GRF ||
1827 inst->src[0].negate ||
1828 inst->src[0].abs ||
1829 inst->src[0].smear != -1 ||
1830 inst->dst.file != GRF ||
1831 inst->dst.type != inst->src[0].type ||
1832 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1833 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1834 continue;
1835 }
1836
1837 int reg_from = inst->src[0].reg;
1838 assert(inst->src[0].reg_offset == 0);
1839 int reg_to = inst->dst.reg;
1840 int reg_to_offset = inst->dst.reg_offset;
1841
1842 foreach_list(node, &this->instructions) {
1843 fs_inst *scan_inst = (fs_inst *)node;
1844
1845 if (scan_inst->dst.file == GRF &&
1846 scan_inst->dst.reg == reg_from) {
1847 scan_inst->dst.reg = reg_to;
1848 scan_inst->dst.reg_offset = reg_to_offset;
1849 }
1850 for (int i = 0; i < 3; i++) {
1851 if (scan_inst->src[i].file == GRF &&
1852 scan_inst->src[i].reg == reg_from) {
1853 scan_inst->src[i].reg = reg_to;
1854 scan_inst->src[i].reg_offset = reg_to_offset;
1855 }
1856 }
1857 }
1858
1859 inst->remove();
1860
1861 /* We don't need to recalculate live intervals inside the loop despite
1862 * flagging live_intervals_valid because we only use live intervals for
1863 * the interferes test, and we must have had a situation where the
1864 * intervals were:
1865 *
1866 * from  to
1867 * ^
1868 * |
1869 * v
1870 *       ^
1871 *       |
1872 *       v
1873 *
1874 * Some register R that might get coalesced with one of these two could
1875 * only be referencing "to", otherwise "from"'s range would have been
1876 * longer. R's range could also only start at the end of "to" or later,
1877 * otherwise it will conflict with "to" when we try to coalesce "to"
1878 * into R anyway.
1879 */
1880 live_intervals_valid = false;
1881
1882 progress = true;
1883 continue;
1884 }
1885
1886 return progress;
1887 }
1888
1889 bool
1890 fs_visitor::register_coalesce()
1891 {
1892 bool progress = false;
1893 int if_depth = 0;
1894 int loop_depth = 0;
1895
1896 foreach_list_safe(node, &this->instructions) {
1897 fs_inst *inst = (fs_inst *)node;
1898
1899 /* Make sure that we dominate the instructions we're going to
1900 * scan for interfering with our coalescing, or we won't have
1901 * scanned enough to see if anything interferes with our
1902 * coalescing. We don't dominate the following instructions if
1903 * we're in a loop or an if block.
1904 */
1905 switch (inst->opcode) {
1906 case BRW_OPCODE_DO:
1907 loop_depth++;
1908 break;
1909 case BRW_OPCODE_WHILE:
1910 loop_depth--;
1911 break;
1912 case BRW_OPCODE_IF:
1913 if_depth++;
1914 break;
1915 case BRW_OPCODE_ENDIF:
1916 if_depth--;
1917 break;
1918 default:
1919 break;
1920 }
1921 if (loop_depth || if_depth)
1922 continue;
1923
1924 if (inst->opcode != BRW_OPCODE_MOV ||
1925 inst->predicate ||
1926 inst->saturate ||
1927 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1928 inst->src[0].file != UNIFORM) ||
1929 inst->dst.type != inst->src[0].type)
1930 continue;
1931
1932 bool has_source_modifiers = (inst->src[0].abs ||
1933 inst->src[0].negate ||
1934 inst->src[0].file == UNIFORM);
1935
1936 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1937 * them: check for no writes to either one until the exit of the
1938 * program.
1939 */
1940 bool interfered = false;
1941
1942 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1943 !scan_inst->is_tail_sentinel();
1944 scan_inst = (fs_inst *)scan_inst->next) {
1945 if (scan_inst->dst.file == GRF) {
1946 if (scan_inst->overwrites_reg(inst->dst) ||
1947 scan_inst->overwrites_reg(inst->src[0])) {
1948 interfered = true;
1949 break;
1950 }
1951 }
1952
1953 /* The gen6 MATH instruction can't handle source modifiers or
1954 * unusual register regions, so avoid coalescing those for
1955 * now. We should do something more specific.
1956 */
1957 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1958 interfered = true;
1959 break;
1960 }
1961
1962 /* The accumulator result appears to get used for the
1963 * conditional modifier generation. When negating a UD
1964 * value, there is a 33rd bit generated for the sign in the
1965 * accumulator value, so now you can't check, for example,
1966 * equality with a 32-bit value. See piglit fs-op-neg-uint.
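* For example (hypothetical values), negating the UD value 1 leaves
* 0x1FFFFFFFF in the 33-bit accumulator but 0xFFFFFFFF in the 32-bit
* destination, so a conditional mod computed from the accumulator can
* disagree with a comparison against the stored 32-bit result.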
1967 */
1968 if (scan_inst->conditional_mod &&
1969 inst->src[0].negate &&
1970 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1971 interfered = true;
1972 break;
1973 }
1974 }
1975 if (interfered) {
1976 continue;
1977 }
1978
1979 /* Rewrite the later usage to point at the source of the move to
1980 * be removed.
1981 */
1982 for (fs_inst *scan_inst = inst;
1983 !scan_inst->is_tail_sentinel();
1984 scan_inst = (fs_inst *)scan_inst->next) {
1985 for (int i = 0; i < 3; i++) {
1986 if (scan_inst->src[i].file == GRF &&
1987 scan_inst->src[i].reg == inst->dst.reg &&
1988 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1989 fs_reg new_src = inst->src[0];
1990 if (scan_inst->src[i].abs) {
1991 new_src.negate = 0;
1992 new_src.abs = 1;
1993 }
1994 new_src.negate ^= scan_inst->src[i].negate;
1995 scan_inst->src[i] = new_src;
1996 }
1997 }
1998 }
1999
2000 inst->remove();
2001 progress = true;
2002 }
2003
2004 if (progress)
2005 live_intervals_valid = false;
2006
2007 return progress;
2008 }
2009
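/* Sketch of the source-modifier composition performed above
 * (hypothetical IR): coalescing "mov vgrf6, -vgrf5" into a use
 * "add vgrf7, -vgrf6, vgrf2" XORs the negates and yields
 * "add vgrf7, vgrf5, vgrf2", while a use of |vgrf6| forces abs and
 * drops the inner negate, yielding "add vgrf7, |vgrf5|, vgrf2".
 */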
2010
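/* A standalone sketch (hypothetical helper, not part of the driver) of
 * the MRF-range computation that compute_to_mrf() below performs in two
 * places: a COMPR4 write touches mrf and mrf+4, a compressed SIMD16
 * write touches mrf and mrf+1, and everything else touches one MRF.
 */
static inline void
example_mrf_range(const fs_inst *inst, int dispatch_width,
                  int *mrf_low, int *mrf_high)
{
   *mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
   if (inst->dst.reg & BRW_MRF_COMPR4)
      *mrf_high = *mrf_low + 4;
   else if (dispatch_width == 16 &&
            !inst->force_uncompressed &&
            !inst->force_sechalf)
      *mrf_high = *mrf_low + 1;
   else
      *mrf_high = *mrf_low;
}
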
2011 bool
2012 fs_visitor::compute_to_mrf()
2013 {
2014 bool progress = false;
2015 int next_ip = 0;
2016
2017 calculate_live_intervals();
2018
2019 foreach_list_safe(node, &this->instructions) {
2020 fs_inst *inst = (fs_inst *)node;
2021
2022 int ip = next_ip;
2023 next_ip++;
2024
2025 if (inst->opcode != BRW_OPCODE_MOV ||
2026 inst->predicate ||
2027 inst->dst.file != MRF || inst->src[0].file != GRF ||
2028 inst->dst.type != inst->src[0].type ||
2029 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2030 continue;
2031
2032 /* Work out which hardware MRF registers are written by this
2033 * instruction.
2034 */
2035 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2036 int mrf_high;
2037 if (inst->dst.reg & BRW_MRF_COMPR4) {
2038 mrf_high = mrf_low + 4;
2039 } else if (dispatch_width == 16 &&
2040 (!inst->force_uncompressed && !inst->force_sechalf)) {
2041 mrf_high = mrf_low + 1;
2042 } else {
2043 mrf_high = mrf_low;
2044 }
2045
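/* For example (hypothetical registers): a COMPR4 write to m2 covers
 * m2 and m6, while a compressed SIMD16 write to m2 covers m2 and m3.
 */
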
2046 /* Can't compute-to-MRF this GRF if someone else was going to
2047 * read it later.
2048 */
2049 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2050 continue;
2051
/* Found a move of a GRF to an MRF. Let's see if we can rewrite
* the instruction that produced this GRF to write into the MRF
* instead.
*/
2055 fs_inst *scan_inst;
2056 for (scan_inst = (fs_inst *)inst->prev;
2057 scan_inst->prev != NULL;
2058 scan_inst = (fs_inst *)scan_inst->prev) {
2059 if (scan_inst->dst.file == GRF &&
2060 scan_inst->dst.reg == inst->src[0].reg) {
/* Found the last instruction to write the register we want to
* turn into a compute-to-MRF.
*/
2064
2070 /* If it's predicated, it (probably) didn't populate all
2071 * the channels. We might be able to rewrite everything
2072 * that writes that reg, but it would require smarter
2073 * tracking to delay the rewriting until complete success.
2074 */
2075 if (scan_inst->predicate)
2076 break;
2077
2078 /* If it's half of register setup and not the same half as
2079 * our MOV we're trying to remove, bail for now.
2080 */
2081 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2082 scan_inst->force_sechalf != inst->force_sechalf) {
2083 break;
2084 }
2085
2086 /* SEND instructions can't have MRF as a destination. */
2087 if (scan_inst->mlen)
2088 break;
2089
2090 if (intel->gen >= 6) {
2091 /* gen6 math instructions must have the destination be
2092 * GRF, so no compute-to-MRF for them.
2093 */
2094 if (scan_inst->is_math()) {
2095 break;
2096 }
2097 }
2098
2099 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2100 /* Found the creator of our MRF's source value. */
2101 scan_inst->dst.file = MRF;
2102 scan_inst->dst.reg = inst->dst.reg;
2103 scan_inst->saturate |= inst->saturate;
2104 inst->remove();
2105 progress = true;
2106 }
2107 break;
2108 }
2109
2110 /* We don't handle control flow here. Most computation of
* values that end up in MRFs happens shortly before the MRF
2112 * write anyway.
2113 */
2114 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2115 break;
2116
2117 /* You can't read from an MRF, so if someone else reads our
2118 * MRF's source GRF that we wanted to rewrite, that stops us.
2119 */
2120 bool interfered = false;
2121 for (int i = 0; i < 3; i++) {
2122 if (scan_inst->src[i].file == GRF &&
2123 scan_inst->src[i].reg == inst->src[0].reg &&
2124 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2125 interfered = true;
2126 }
2127 }
2128 if (interfered)
2129 break;
2130
2131 if (scan_inst->dst.file == MRF) {
2132 /* If somebody else writes our MRF here, we can't
2133 * compute-to-MRF before that.
2134 */
2135 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2136 int scan_mrf_high;
2137
2138 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2139 scan_mrf_high = scan_mrf_low + 4;
2140 } else if (dispatch_width == 16 &&
2141 (!scan_inst->force_uncompressed &&
2142 !scan_inst->force_sechalf)) {
2143 scan_mrf_high = scan_mrf_low + 1;
2144 } else {
2145 scan_mrf_high = scan_mrf_low;
2146 }
2147
2148 if (mrf_low == scan_mrf_low ||
2149 mrf_low == scan_mrf_high ||
2150 mrf_high == scan_mrf_low ||
2151 mrf_high == scan_mrf_high) {
2152 break;
2153 }
2154 }
2155
2156 if (scan_inst->mlen > 0) {
2157 /* Found a SEND instruction, which means that there are
2158 * live values in MRFs from base_mrf to base_mrf +
2159 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2160 * above it.
2161 */
2162 if (mrf_low >= scan_inst->base_mrf &&
2163 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2164 break;
2165 }
2166 if (mrf_high >= scan_inst->base_mrf &&
2167 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2168 break;
2169 }
2170 }
2171 }
2172 }
2173
2174 if (progress)
2175 live_intervals_valid = false;
2176
2177 return progress;
2178 }
2179
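/* Sketch of a successful compute-to-MRF (hypothetical IR): the GRF
 * temporary disappears by retargeting its producer at the MRF.
 *
 *    add vgrf8, vgrf3, vgrf4
 *    mov m2, vgrf8
 *
 * becomes
 *
 *    add m2, vgrf3, vgrf4
 */
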
2180 /**
2181 * Walks through basic blocks, looking for repeated MRF writes and
2182 * removing the later ones.
2183 */
2184 bool
2185 fs_visitor::remove_duplicate_mrf_writes()
2186 {
2187 fs_inst *last_mrf_move[16];
2188 bool progress = false;
2189
2190 /* Need to update the MRF tracking for compressed instructions. */
2191 if (dispatch_width == 16)
2192 return false;
2193
2194 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2195
2196 foreach_list_safe(node, &this->instructions) {
2197 fs_inst *inst = (fs_inst *)node;
2198
2199 if (inst->is_control_flow()) {
2200 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2201 }
2202
2203 if (inst->opcode == BRW_OPCODE_MOV &&
2204 inst->dst.file == MRF) {
2205 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2206 if (prev_inst && inst->equals(prev_inst)) {
2207 inst->remove();
2208 progress = true;
2209 continue;
2210 }
2211 }
2212
2213 /* Clear out the last-write records for MRFs that were overwritten. */
2214 if (inst->dst.file == MRF) {
2215 last_mrf_move[inst->dst.reg] = NULL;
2216 }
2217
2218 if (inst->mlen > 0) {
2219 /* Found a SEND instruction, which will include two or fewer
2220 * implied MRF writes. We could do better here.
2221 */
2222 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2223 last_mrf_move[inst->base_mrf + i] = NULL;
2224 }
2225 }
2226
2227 /* Clear out any MRF move records whose sources got overwritten. */
2228 if (inst->dst.file == GRF) {
2229 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2230 if (last_mrf_move[i] &&
2231 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2232 last_mrf_move[i] = NULL;
2233 }
2234 }
2235 }
2236
2237 if (inst->opcode == BRW_OPCODE_MOV &&
2238 inst->dst.file == MRF &&
2239 inst->src[0].file == GRF &&
2240 !inst->predicate) {
2241 last_mrf_move[inst->dst.reg] = inst;
2242 }
2243 }
2244
2245 if (progress)
2246 live_intervals_valid = false;
2247
2248 return progress;
2249 }
2250
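/* Sketch of the deduplication above (hypothetical IR): the second
 * identical MRF setup MOV within a basic block is dropped.
 *
 *    mov m3, vgrf5
 *    mov m4, vgrf6
 *    mov m3, vgrf5     <- removed: m3 already holds this value
 */
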
2251 static void
2252 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2253 int first_grf, int grf_len)
2254 {
2255 bool inst_16wide = (dispatch_width > 8 &&
2256 !inst->force_uncompressed &&
2257 !inst->force_sechalf);
2258
2259 /* Clear the flag for registers that actually got read (as expected). */
2260 for (int i = 0; i < 3; i++) {
2261 int grf;
2262 if (inst->src[i].file == GRF) {
2263 grf = inst->src[i].reg;
2264 } else if (inst->src[i].file == FIXED_HW_REG &&
2265 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2266 grf = inst->src[i].fixed_hw_reg.nr;
2267 } else {
2268 continue;
2269 }
2270
2271 if (grf >= first_grf &&
2272 grf < first_grf + grf_len) {
2273 deps[grf - first_grf] = false;
2274 if (inst_16wide)
2275 deps[grf - first_grf + 1] = false;
2276 }
2277 }
2278 }
2279
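/* For example (hypothetical registers): with first_grf = 20 and a
 * compressed SIMD16 instruction reading g21, both deps[1] and deps[2]
 * are cleared, since the read spans g21 and g22.
 */
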
2280 /**
2281 * Implements this workaround for the original 965:
2282 *
2283 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2284 * check for post destination dependencies on this instruction, software
2285 * must ensure that there is no destination hazard for the case of ‘write
2286 * followed by a posted write’ shown in the following example.
2287 *
2288 * 1. mov r3 0
2289 * 2. send r3.xy <rest of send instruction>
2290 * 3. mov r2 r3
2291 *
2292 * Due to no post-destination dependency check on the ‘send’, the above
2293 * code sequence could have two instructions (1 and 2) in flight at the
2294 * same time that both consider ‘r3’ as the target of their final writes.
2295 */
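/* With the workaround applied, a dependency-resolving MOV touching r3
 * is inserted ahead of the send (a sketch; the exact DEP_RESOLVE_MOV
 * encoding is defined elsewhere in the driver):
 *
 *    1. mov r3 0
 *       mov r3 r3      <- forces instruction 1 to complete first
 *    2. send r3.xy <rest of send instruction>
 *    3. mov r2 r3
 */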
2296 void
2297 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2298 {
2299 int write_len = inst->regs_written() * dispatch_width / 8;
2300 int first_write_grf = inst->dst.reg;
2301 bool needs_dep[BRW_MAX_MRF];
2302 assert(write_len < (int)sizeof(needs_dep) - 1);
2303
2304 memset(needs_dep, false, sizeof(needs_dep));
2305 memset(needs_dep, true, write_len);
2306
2307 clear_deps_for_inst_src(inst, dispatch_width,
2308 needs_dep, first_write_grf, write_len);
2309
2310 /* Walk backwards looking for writes to registers we're writing which
2311 * aren't read since being written. If we hit the start of the program,
2312 * we assume that there are no outstanding dependencies on entry to the
2313 * program.
2314 */
2315 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2316 scan_inst != NULL;
2317 scan_inst = (fs_inst *)scan_inst->prev) {
2318
2319 /* If we hit control flow, assume that there *are* outstanding
2320 * dependencies, and force their cleanup before our instruction.
2321 */
if (scan_inst->is_control_flow()) {
for (int i = 0; i < write_len; i++) {
if (needs_dep[i]) {
inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
}
}
/* Every outstanding dependency is now resolved; stop scanning. */
return;
}
2329
2330 bool scan_inst_16wide = (dispatch_width > 8 &&
2331 !scan_inst->force_uncompressed &&
2332 !scan_inst->force_sechalf);
2333
2334 /* We insert our reads as late as possible on the assumption that any
2335 * instruction but a MOV that might have left us an outstanding
2336 * dependency has more latency than a MOV.
2337 */
2338 if (scan_inst->dst.file == GRF &&
2339 scan_inst->dst.reg >= first_write_grf &&
2340 scan_inst->dst.reg < first_write_grf + write_len &&
2341 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2342 inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2343 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2344 if (scan_inst_16wide)
2345 needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
2346 }
2347
2348 /* Clear the flag for registers that actually got read (as expected). */
2349 clear_deps_for_inst_src(scan_inst, dispatch_width,
2350 needs_dep, first_write_grf, write_len);
2351
2352 /* Continue the loop only if we haven't resolved all the dependencies */
2353 int i;
2354 for (i = 0; i < write_len; i++) {
2355 if (needs_dep[i])
2356 break;
2357 }
2358 if (i == write_len)
2359 return;
2360 }
2361 }
2362
2363 /**
2364 * Implements this workaround for the original 965:
2365 *
2366 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2367 * used as a destination register until after it has been sourced by an
2368 * instruction with a different destination register.
2369 */
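/* Sketch (hypothetical registers): after "send r4 ...", a resolving
 * MOV that sources r4 is inserted before any later instruction that
 * would write r4 without having read it first.
 */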
2370 void
2371 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2372 {
2373 int write_len = inst->regs_written() * dispatch_width / 8;
2374 int first_write_grf = inst->dst.reg;
2375 bool needs_dep[BRW_MAX_MRF];
2376 assert(write_len < (int)sizeof(needs_dep) - 1);
2377
2378 memset(needs_dep, false, sizeof(needs_dep));
2379 memset(needs_dep, true, write_len);
2380 /* Walk forwards looking for writes to registers we're writing which aren't
2381 * read before being written.
2382 */
2383 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2384 !scan_inst->is_tail_sentinel();
2385 scan_inst = (fs_inst *)scan_inst->next) {
2386 /* If we hit control flow, force resolve all remaining dependencies. */
if (scan_inst->is_control_flow()) {
for (int i = 0; i < write_len; i++) {
if (needs_dep[i])
scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
}
/* Every outstanding dependency is now resolved; stop scanning. */
return;
}
2393
2394 /* Clear the flag for registers that actually got read (as expected). */
2395 clear_deps_for_inst_src(scan_inst, dispatch_width,
2396 needs_dep, first_write_grf, write_len);
2397
2398 /* We insert our reads as late as possible since they're reading the
2399 * result of a SEND, which has massive latency.
2400 */
2401 if (scan_inst->dst.file == GRF &&
2402 scan_inst->dst.reg >= first_write_grf &&
2403 scan_inst->dst.reg < first_write_grf + write_len &&
2404 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2405 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2406 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2407 }
2408
2409 /* Continue the loop only if we haven't resolved all the dependencies */
2410 int i;
2411 for (i = 0; i < write_len; i++) {
2412 if (needs_dep[i])
2413 break;
2414 }
2415 if (i == write_len)
2416 return;
2417 }
2418
2419 /* If we hit the end of the program, resolve all remaining dependencies out
2420 * of paranoia.
2421 */
2422 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2423 assert(last_inst->eot);
2424 for (int i = 0; i < write_len; i++) {
2425 if (needs_dep[i])
2426 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2427 }
2428 }
2429
2430 void
2431 fs_visitor::insert_gen4_send_dependency_workarounds()
2432 {
2433 if (intel->gen != 4 || intel->is_g4x)
2434 return;
2435
2436 /* Note that we're done with register allocation, so GRF fs_regs always
2437 * have a .reg_offset of 0.
2438 */
2439
2440 foreach_list_safe(node, &this->instructions) {
2441 fs_inst *inst = (fs_inst *)node;
2442
2443 if (inst->mlen != 0 && inst->dst.file == GRF) {
2444 insert_gen4_pre_send_dependency_workarounds(inst);
2445 insert_gen4_post_send_dependency_workarounds(inst);
2446 }
2447 }
2448 }
2449
2450 void
2451 fs_visitor::dump_instruction(fs_inst *inst)
2452 {
2453 if (inst->predicate) {
2454 printf("(%cf0.%d) ",
2455 inst->predicate_inverse ? '-' : '+',
2456 inst->flag_subreg);
2457 }
2458
2459 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2460 opcode_descs[inst->opcode].name) {
2461 printf("%s", opcode_descs[inst->opcode].name);
2462 } else {
2463 switch (inst->opcode) {
2464 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2465 printf("uniform_pull_const");
2466 break;
2467 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2468 printf("uniform_pull_const_gen7");
2469 break;
2470 case FS_OPCODE_SET_GLOBAL_OFFSET:
2471 printf("set_global_offset");
2472 break;
2473 default:
2474 printf("op%d", inst->opcode);
2475 break;
2476 }
2477 }
2478 if (inst->saturate)
2479 printf(".sat");
2480 if (inst->conditional_mod) {
2481 printf(".cmod");
2482 if (!inst->predicate &&
2483 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2484 inst->opcode != BRW_OPCODE_IF &&
2485 inst->opcode != BRW_OPCODE_WHILE))) {
2486 printf(".f0.%d\n", inst->flag_subreg);
2487 }
2488 }
2489 printf(" ");
2490
2491
2492 switch (inst->dst.file) {
2493 case GRF:
2494 printf("vgrf%d", inst->dst.reg);
2495 if (inst->dst.reg_offset)
2496 printf("+%d", inst->dst.reg_offset);
2497 break;
2498 case MRF:
2499 printf("m%d", inst->dst.reg);
2500 break;
2501 case BAD_FILE:
2502 printf("(null)");
2503 break;
2504 case UNIFORM:
2505 printf("***u%d***", inst->dst.reg);
2506 break;
2507 default:
2508 printf("???");
2509 break;
2510 }
2511 printf(", ");
2512
2513 for (int i = 0; i < 3; i++) {
2514 if (inst->src[i].negate)
2515 printf("-");
2516 if (inst->src[i].abs)
2517 printf("|");
2518 switch (inst->src[i].file) {
2519 case GRF:
2520 printf("vgrf%d", inst->src[i].reg);
2521 if (inst->src[i].reg_offset)
2522 printf("+%d", inst->src[i].reg_offset);
2523 break;
2524 case MRF:
2525 printf("***m%d***", inst->src[i].reg);
2526 break;
2527 case UNIFORM:
2528 printf("u%d", inst->src[i].reg);
2529 if (inst->src[i].reg_offset)
2530 printf(".%d", inst->src[i].reg_offset);
2531 break;
2532 case BAD_FILE:
2533 printf("(null)");
2534 break;
2535 case IMM:
2536 switch (inst->src[i].type) {
2537 case BRW_REGISTER_TYPE_F:
2538 printf("%ff", inst->src[i].imm.f);
2539 break;
2540 case BRW_REGISTER_TYPE_D:
2541 printf("%dd", inst->src[i].imm.i);
2542 break;
2543 case BRW_REGISTER_TYPE_UD:
2544 printf("%uu", inst->src[i].imm.u);
2545 break;
2546 default:
2547 printf("???");
2548 break;
2549 }
2550 break;
2551 default:
2552 printf("???");
2553 break;
2554 }
2555 if (inst->src[i].abs)
2556 printf("|");
2557
if (i < 2)
2559 printf(", ");
2560 }
2561
2562 printf(" ");
2563
2564 if (inst->force_uncompressed)
2565 printf("1sthalf ");
2566
2567 if (inst->force_sechalf)
2568 printf("2ndhalf ");
2569
2570 printf("\n");
2571 }
2572
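/* Sample dump_instruction() output for a hypothetical instruction:
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf3, -u2.1, (null) 2ndhalf
 */
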
2573 void
2574 fs_visitor::dump_instructions()
2575 {
2576 int ip = 0;
2577 foreach_list(node, &this->instructions) {
2578 fs_inst *inst = (fs_inst *)node;
2579 printf("%d: ", ip++);
2580 dump_instruction(inst);
2581 }
2582 }
2583
2584 /**
2585 * Possibly returns an instruction that set up @param reg.
2586 *
2587 * Sometimes we want to take the result of some expression/variable
2588 * dereference tree and rewrite the instruction generating the result
2589 * of the tree. When processing the tree, we know that the
2590 * instructions generated are all writing temporaries that are dead
2591 * outside of this tree. So, if we have some instructions that write
2592 * a temporary, we're free to point that temp write somewhere else.
2593 *
2594 * Note that this doesn't guarantee that the instruction generated
2595 * only reg -- it might be the size=4 destination of a texture instruction.
2596 */
2597 fs_inst *
2598 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2599 fs_inst *end,
2600 fs_reg reg)
2601 {
2602 if (end == start ||
2603 end->predicate ||
2604 end->force_uncompressed ||
2605 end->force_sechalf ||
2606 reg.reladdr ||
2607 !reg.equals(end->dst)) {
2608 return NULL;
2609 } else {
2610 return end;
2611 }
2612 }
2613
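/* For example (hypothetical IR): asking for the generator of vgrf9 when
 * "end" is a texture instruction writing vgrf9+0..vgrf9+3 returns that
 * instruction, even though it wrote more than the requested register.
 */
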
2614 void
2615 fs_visitor::setup_payload_gen6()
2616 {
2617 struct intel_context *intel = &brw->intel;
2618 bool uses_depth =
2619 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2620 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2621
2622 assert(intel->gen >= 6);
2623
2624 /* R0-1: masks, pixel X/Y coordinates. */
2625 c->nr_payload_regs = 2;
/* R2: only for 32-pixel dispatch. */
2627
2628 /* R3-26: barycentric interpolation coordinates. These appear in the
2629 * same order that they appear in the brw_wm_barycentric_interp_mode
2630 * enum. Each set of coordinates occupies 2 registers if dispatch width
2631 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2632 * appear if they were enabled using the "Barycentric Interpolation
2633 * Mode" bits in WM_STATE.
2634 */
2635 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2636 if (barycentric_interp_modes & (1 << i)) {
2637 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2638 c->nr_payload_regs += 2;
2639 if (dispatch_width == 16) {
2640 c->nr_payload_regs += 2;
2641 }
2642 }
2643 }
2644
2645 /* R27: interpolated depth if uses source depth */
2646 if (uses_depth) {
2647 c->source_depth_reg = c->nr_payload_regs;
2648 c->nr_payload_regs++;
2649 if (dispatch_width == 16) {
2650 /* R28: interpolated depth if not 8-wide. */
2651 c->nr_payload_regs++;
2652 }
2653 }
2654 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2655 if (uses_depth) {
2656 c->source_w_reg = c->nr_payload_regs;
2657 c->nr_payload_regs++;
2658 if (dispatch_width == 16) {
2659 /* R30: interpolated W if not 8-wide. */
2660 c->nr_payload_regs++;
2661 }
2662 }
2663 /* R31: MSAA position offsets. */
2664 /* R32-: bary for 32-pixel. */
2665 /* R58-59: interp W for 32-pixel. */
2666
2667 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2668 c->source_depth_to_render_target = true;
2669 }
2670 }
2671
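/* Worked example (hypothetical SIMD16 shader): two barycentric modes
 * enabled plus source depth and W gives 2 (R0-1) + 2*4 (barycentrics)
 * + 2 (depth) + 2 (W) = 14 payload registers.
 */
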
2672 bool
2673 fs_visitor::run()
2674 {
2675 sanity_param_count = fp->Base.Parameters->NumParameters;
2676 uint32_t orig_nr_params = c->prog_data.nr_params;
2677
2678 if (intel->gen >= 6)
2679 setup_payload_gen6();
2680 else
2681 setup_payload_gen4();
2682
2683 if (0) {
2684 emit_dummy_fs();
2685 } else {
2686 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2687 emit_shader_time_begin();
2688
2689 calculate_urb_setup();
2690 if (intel->gen < 6)
2691 emit_interpolation_setup_gen4();
2692 else
2693 emit_interpolation_setup_gen6();
2694
2695 /* We handle discards by keeping track of the still-live pixels in f0.1.
2696 * Initialize it with the dispatched pixels.
2697 */
2698 if (fp->UsesKill) {
2699 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2700 discard_init->flag_subreg = 1;
2701 }
2702
2703 /* Generate FS IR for main(). (the visitor only descends into
2704 * functions called "main").
2705 */
2706 if (shader) {
2707 foreach_list(node, &*shader->ir) {
2708 ir_instruction *ir = (ir_instruction *)node;
2709 base_ir = ir;
2710 this->result = reg_undef;
2711 ir->accept(this);
2712 }
2713 } else {
2714 emit_fragment_program_code();
2715 }
2716 base_ir = NULL;
2717 if (failed)
2718 return false;
2719
2720 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2721 emit_shader_time_end();
2722
2723 emit_fb_writes();
2724
2725 split_virtual_grfs();
2726
2727 move_uniform_array_access_to_pull_constants();
2728 setup_pull_constants();
2729
2730 bool progress;
2731 do {
2732 progress = false;
2733
2734 compact_virtual_grfs();
2735
2736 progress = remove_duplicate_mrf_writes() || progress;
2737
2738 progress = opt_algebraic() || progress;
2739 progress = opt_cse() || progress;
2740 progress = opt_copy_propagate() || progress;
2741 progress = dead_code_eliminate() || progress;
2742 progress = register_coalesce() || progress;
2743 progress = register_coalesce_2() || progress;
2744 progress = compute_to_mrf() || progress;
2745 } while (progress);
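/* Iterating to a fixed point lets each pass expose opportunities to the
 * others; for example (hypothetically), copy propagation can turn a MOV
 * into a candidate that register_coalesce() then removes.
 */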
2746
2747 remove_dead_constants();
2748
2749 schedule_instructions(false);
2750
2751 assign_curb_setup();
2752 assign_urb_setup();
2753
2754 if (0) {
2755 /* Debug of register spilling: Go spill everything. */
2756 for (int i = 0; i < virtual_grf_count; i++) {
2757 spill_reg(i);
2758 }
2759 }
2760
2761 if (0)
2762 assign_regs_trivial();
2763 else {
2764 while (!assign_regs()) {
2765 if (failed)
2766 break;
2767 }
2768 }
2769 }
2770 assert(force_uncompressed_stack == 0);
2771 assert(force_sechalf_stack == 0);
2772
2773 /* This must come after all optimization and register allocation, since
2774 * it inserts dead code that happens to have side effects, and it does
2775 * so based on the actual physical registers in use.
2776 */
2777 insert_gen4_send_dependency_workarounds();
2778
2779 if (failed)
2780 return false;
2781
2782 schedule_instructions(true);
2783
2784 if (dispatch_width == 8) {
2785 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2786 } else {
2787 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2788
2789 /* Make sure we didn't try to sneak in an extra uniform */
2790 assert(orig_nr_params == c->prog_data.nr_params);
2791 (void) orig_nr_params;
2792 }
2793
2794 /* If any state parameters were appended, then ParameterValues could have
2795 * been realloced, in which case the driver uniform storage set up by
2796 * _mesa_associate_uniform_storage() would point to freed memory. Make
2797 * sure that didn't happen.
2798 */
2799 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2800
2801 return !failed;
2802 }
2803
2804 const unsigned *
2805 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2806 struct gl_fragment_program *fp,
2807 struct gl_shader_program *prog,
2808 unsigned *final_assembly_size)
2809 {
2810 struct intel_context *intel = &brw->intel;
2811 bool start_busy = false;
2812 float start_time = 0;
2813
2814 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2815 start_busy = (intel->batch.last_bo &&
2816 drm_intel_bo_busy(intel->batch.last_bo));
2817 start_time = get_time();
2818 }
2819
2820 struct brw_shader *shader = NULL;
2821 if (prog)
2822 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2823
2824 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2825 if (shader) {
2826 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2827 _mesa_print_ir(shader->ir, NULL);
2828 printf("\n\n");
2829 } else {
2830 printf("ARB_fragment_program %d ir for native fragment shader\n",
2831 fp->Base.Id);
2832 _mesa_print_program(&fp->Base);
2833 }
2834 }
2835
2836 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2837 */
2838 fs_visitor v(brw, c, prog, fp, 8);
2839 if (!v.run()) {
2840 prog->LinkStatus = false;
2841 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2842
2843 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2844 v.fail_msg);
2845
2846 return NULL;
2847 }
2848
2849 exec_list *simd16_instructions = NULL;
2850 fs_visitor v2(brw, c, prog, fp, 16);
2851 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2852 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2853 v2.import_uniforms(&v);
2854 if (!v2.run()) {
2855 perf_debug("16-wide shader failed to compile, falling back to "
2856 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2857 } else {
2858 simd16_instructions = &v2.instructions;
2859 }
2860 }
2861
2862 c->prog_data.dispatch_width = 8;
2863
2864 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2865 const unsigned *generated = g.generate_assembly(&v.instructions,
2866 simd16_instructions,
2867 final_assembly_size);
2868
2869 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2870 if (shader->compiled_once)
2871 brw_wm_debug_recompile(brw, prog, &c->key);
2872 shader->compiled_once = true;
2873
2874 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2875 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2876 (get_time() - start_time) * 1000);
2877 }
2878 }
2879
2880 return generated;
2881 }
2882
2883 bool
2884 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2885 {
2886 struct brw_context *brw = brw_context(ctx);
2887 struct intel_context *intel = &brw->intel;
2888 struct brw_wm_prog_key key;
2889
2890 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2891 return true;
2892
2893 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2894 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2895 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2896 bool program_uses_dfdy = fp->UsesDFdy;
2897
2898 memset(&key, 0, sizeof(key));
2899
2900 if (intel->gen < 6) {
2901 if (fp->UsesKill)
2902 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2903
2904 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2905 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2906
2907 /* Just assume depth testing. */
2908 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2909 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2910 }
2911
2912 if (prog->Name != 0)
2913 key.proj_attrib_mask = 0xffffffff;
2914
2915 if (intel->gen < 6)
2916 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2917
2918 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2919 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2920 continue;
2921
2922 if (prog->Name == 0)
2923 key.proj_attrib_mask |= 1 << i;
2924
2925 if (intel->gen < 6) {
int vp_index = _mesa_frag_attrib_to_vert_result((gl_frag_attrib) i);
2927
2928 if (vp_index >= 0)
2929 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2930 }
2931 }
2932
2933 key.clamp_fragment_color = true;
2934
2935 for (int i = 0; i < MAX_SAMPLERS; i++) {
2936 if (fp->Base.ShadowSamplers & (1 << i)) {
2937 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2938 key.tex.swizzles[i] =
2939 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2940 } else {
2941 /* Color sampler: assume no swizzling. */
2942 key.tex.swizzles[i] = SWIZZLE_XYZW;
2943 }
2944 }
2945
2946 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2947 key.drawable_height = ctx->DrawBuffer->Height;
2948 }
2949
2950 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2951 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2952 }
2953
2954 key.nr_color_regions = 1;
2955
2956 key.program_string_id = bfp->id;
2957
2958 uint32_t old_prog_offset = brw->wm.prog_offset;
2959 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2960
2961 bool success = do_wm_prog(brw, prog, bfp, &key);
2962
2963 brw->wm.prog_offset = old_prog_offset;
2964 brw->wm.prog_data = old_prog_data;
2965
2966 return success;
2967 }