Replace gl_frag_attrib enum with gl_varying_slot.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 #define ALU3(op) \
150 fs_inst * \
151 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
152 { \
153 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172
173 /** Gen4 predicated IF. */
174 fs_inst *
175 fs_visitor::IF(uint32_t predicate)
176 {
177 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
178 inst->predicate = predicate;
179 return inst;
180 }
181
182 /** Gen6+ IF with embedded comparison. */
183 fs_inst *
184 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
185 {
186 assert(intel->gen >= 6);
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
188 reg_null_d, src0, src1);
189 inst->conditional_mod = condition;
190 return inst;
191 }
192
193 /**
194 * CMP: Sets the low bit of the destination channels with the result
195 * of the comparison, while the upper bits are undefined, and updates
196 * the flag register with the packed 16 bits of the result.
197 */
198 fs_inst *
199 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
200 {
201 fs_inst *inst;
202
203 /* Take the instruction:
204 *
205 * CMP null<d> src0<f> src1<f>
206 *
207 * Original gen4 does type conversion to the destination type before
208 * comparison, producing garbage results for floating point comparisons.
209 * gen5 does the comparison on the execution type (resolved source types),
210 * so dst type doesn't matter. gen6 does comparison and then uses the
211 * result as if it was the dst type with no conversion, which happens to
212 * mostly work out for float-interpreted-as-int since our comparisons are
213 * for >0, =0, <0.
214 */
215 if (intel->gen == 4) {
216 dst.type = src0.type;
217 if (dst.file == FIXED_HW_REG)
218 dst.fixed_hw_reg.type = dst.type;
219 }
220
221 resolve_ud_negate(&src0);
222 resolve_ud_negate(&src1);
223
224 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
225 inst->conditional_mod = condition;
226
227 return inst;
228 }
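/* For example, a caller of CMP() that only needs the flag result can pass
 * reg_null_d as the destination and then predicate the following
 * instructions with BRW_PREDICATE_NORMAL.
 */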
229
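/**
 * Builds the instruction sequence for a pull constant load whose offset is
 * computed at run time (uniform array access with a non-constant index).
 *
 * An exec_list is returned rather than a single instruction because pre-gen7
 * needs an extra MOV or MUL to set up the message payload; callers splice the
 * whole list into the stream, e.g. with inst->insert_before(&list).
 */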
230 exec_list
231 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
232 fs_reg offset)
233 {
234 exec_list instructions;
235 fs_inst *inst;
236
237 if (intel->gen >= 7) {
238 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
239 dst, surf_index, offset);
240 instructions.push_tail(inst);
241 } else {
242 int base_mrf = 13;
243 bool header_present = true;
244
245 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
246 mrf.type = BRW_REGISTER_TYPE_D;
247
248 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
249 * dword-aligned byte offset.
250 */
251 if (intel->gen == 6) {
252 instructions.push_tail(MOV(mrf, offset));
253 } else {
254 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
255 }
257 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
258 dst, surf_index);
259 inst->header_present = header_present;
260 inst->base_mrf = base_mrf;
261 inst->mlen = header_present + dispatch_width / 8;
262
263 instructions.push_tail(inst);
264 }
265
266 return instructions;
267 }
268
269 /**
270 * A helper for MOV generation for fixing up broken hardware SEND dependency
271 * handling.
272 */
273 fs_inst *
274 fs_visitor::DEP_RESOLVE_MOV(int grf)
275 {
276 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
277
278 inst->ir = NULL;
279 inst->annotation = "send dependency resolve";
280
281 /* The caller always wants uncompressed to emit the minimal extra
282 * dependencies, and to avoid having to deal with aligning its regs to 2.
283 */
284 inst->force_uncompressed = true;
285
286 return inst;
287 }
288
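/**
 * Field-by-field comparison: true when the two instructions match in every
 * field that affects the generated code.
 */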
289 bool
290 fs_inst::equals(fs_inst *inst)
291 {
292 return (opcode == inst->opcode &&
293 dst.equals(inst->dst) &&
294 src[0].equals(inst->src[0]) &&
295 src[1].equals(inst->src[1]) &&
296 src[2].equals(inst->src[2]) &&
297 saturate == inst->saturate &&
298 predicate == inst->predicate &&
299 conditional_mod == inst->conditional_mod &&
300 mlen == inst->mlen &&
301 base_mrf == inst->base_mrf &&
302 sampler == inst->sampler &&
303 target == inst->target &&
304 eot == inst->eot &&
305 header_present == inst->header_present &&
306 shadow_compare == inst->shadow_compare &&
307 offset == inst->offset);
308 }
309
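/**
 * Returns how many virtual GRF registers this instruction's destination
 * covers: texture messages write a 4-register response, everything else we
 * currently generate writes one.
 */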
310 int
311 fs_inst::regs_written()
312 {
313 if (is_tex())
314 return 4;
315
316 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
317 * but we don't currently use them...nor do we have an opcode for them.
318 */
319
320 return 1;
321 }
322
323 bool
324 fs_inst::overwrites_reg(const fs_reg &reg)
325 {
326 return (reg.file == dst.file &&
327 reg.reg == dst.reg &&
328 reg.reg_offset >= dst.reg_offset &&
329 reg.reg_offset < dst.reg_offset + regs_written());
330 }
331
332 bool
333 fs_inst::is_tex()
334 {
335 return (opcode == SHADER_OPCODE_TEX ||
336 opcode == FS_OPCODE_TXB ||
337 opcode == SHADER_OPCODE_TXD ||
338 opcode == SHADER_OPCODE_TXF ||
339 opcode == SHADER_OPCODE_TXF_MS ||
340 opcode == SHADER_OPCODE_TXL ||
341 opcode == SHADER_OPCODE_TXS);
342 }
343
344 bool
345 fs_inst::is_math()
346 {
347 return (opcode == SHADER_OPCODE_RCP ||
348 opcode == SHADER_OPCODE_RSQ ||
349 opcode == SHADER_OPCODE_SQRT ||
350 opcode == SHADER_OPCODE_EXP2 ||
351 opcode == SHADER_OPCODE_LOG2 ||
352 opcode == SHADER_OPCODE_SIN ||
353 opcode == SHADER_OPCODE_COS ||
354 opcode == SHADER_OPCODE_INT_QUOTIENT ||
355 opcode == SHADER_OPCODE_INT_REMAINDER ||
356 opcode == SHADER_OPCODE_POW);
357 }
358
359 bool
360 fs_inst::is_control_flow()
361 {
362 switch (opcode) {
363 case BRW_OPCODE_DO:
364 case BRW_OPCODE_WHILE:
365 case BRW_OPCODE_IF:
366 case BRW_OPCODE_ELSE:
367 case BRW_OPCODE_ENDIF:
368 case BRW_OPCODE_BREAK:
369 case BRW_OPCODE_CONTINUE:
370 return true;
371 default:
372 return false;
373 }
374 }
375
376 bool
377 fs_inst::is_send_from_grf()
378 {
379 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
380 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
381 src[1].file == GRF));
382 }
383
384 bool
385 fs_visitor::can_do_source_mods(fs_inst *inst)
386 {
387 if (intel->gen == 6 && inst->is_math())
388 return false;
389
390 if (inst->is_send_from_grf())
391 return false;
392
393 return true;
394 }
395
396 void
397 fs_reg::init()
398 {
399 memset(this, 0, sizeof(*this));
400 this->smear = -1;
401 }
402
403 /** Generic unset register constructor. */
404 fs_reg::fs_reg()
405 {
406 init();
407 this->file = BAD_FILE;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(float f)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_F;
416 this->imm.f = f;
417 }
418
419 /** Immediate value constructor. */
420 fs_reg::fs_reg(int32_t i)
421 {
422 init();
423 this->file = IMM;
424 this->type = BRW_REGISTER_TYPE_D;
425 this->imm.i = i;
426 }
427
428 /** Immediate value constructor. */
429 fs_reg::fs_reg(uint32_t u)
430 {
431 init();
432 this->file = IMM;
433 this->type = BRW_REGISTER_TYPE_UD;
434 this->imm.u = u;
435 }
436
437 /** Fixed brw_reg Immediate value constructor. */
438 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
439 {
440 init();
441 this->file = FIXED_HW_REG;
442 this->fixed_hw_reg = fixed_hw_reg;
443 this->type = fixed_hw_reg.type;
444 }
445
446 bool
447 fs_reg::equals(const fs_reg &r) const
448 {
449 return (file == r.file &&
450 reg == r.reg &&
451 reg_offset == r.reg_offset &&
452 type == r.type &&
453 negate == r.negate &&
454 abs == r.abs &&
455 !reladdr && !r.reladdr &&
456 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
457 sizeof(fixed_hw_reg)) == 0 &&
458 smear == r.smear &&
459 imm.u == r.imm.u);
460 }
461
462 bool
463 fs_reg::is_zero() const
464 {
465 if (file != IMM)
466 return false;
467
468 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
469 }
470
471 bool
472 fs_reg::is_one() const
473 {
474 if (file != IMM)
475 return false;
476
477 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
478 }
479
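/**
 * Returns the size of a GLSL type measured in scalar components, which is
 * how many contiguous virtual GRF registers the value occupies; samplers
 * take no space.
 */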
480 int
481 fs_visitor::type_size(const struct glsl_type *type)
482 {
483 unsigned int size, i;
484
485 switch (type->base_type) {
486 case GLSL_TYPE_UINT:
487 case GLSL_TYPE_INT:
488 case GLSL_TYPE_FLOAT:
489 case GLSL_TYPE_BOOL:
490 return type->components();
491 case GLSL_TYPE_ARRAY:
492 return type_size(type->fields.array) * type->length;
493 case GLSL_TYPE_STRUCT:
494 size = 0;
495 for (i = 0; i < type->length; i++) {
496 size += type_size(type->fields.structure[i].type);
497 }
498 return size;
499 case GLSL_TYPE_SAMPLER:
500 /* Samplers take up no register space, since they're baked in at
501 * link time.
502 */
503 return 0;
504 case GLSL_TYPE_VOID:
505 case GLSL_TYPE_ERROR:
506 case GLSL_TYPE_INTERFACE:
507 assert(!"not reached");
508 break;
509 }
510
511 return 0;
512 }
513
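/**
 * Reads the TIMESTAMP architecture register into a fresh virtual GRF so the
 * shader-time instrumentation below can compute elapsed GPU cycles
 * (gen7+ only).
 */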
514 fs_reg
515 fs_visitor::get_timestamp()
516 {
517 assert(intel->gen >= 7);
518
519 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
520 BRW_ARF_TIMESTAMP,
521 0),
522 BRW_REGISTER_TYPE_UD));
523
524 fs_reg dst = fs_reg(this, glsl_type::uint_type);
525
526 fs_inst *mov = emit(MOV(dst, ts));
527 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
528 * even if it's not enabled in the dispatch.
529 */
530 mov->force_writemask_all = true;
531 mov->force_uncompressed = true;
532
533 /* The caller wants the low 32 bits of the timestamp. Since it's running
534 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
535 * which is plenty of time for our purposes. It is identical across the
536 * EUs, but since it's tracking GPU core speed it will increment at a
537 * varying rate as render P-states change.
538 *
539 * The caller could also check if render P-states have changed (or anything
540 * else that might disrupt timing) by setting smear to 2 and checking if
541 * that field is != 0.
542 */
543 dst.smear = 0;
544
545 return dst;
546 }
547
548 void
549 fs_visitor::emit_shader_time_begin()
550 {
551 current_annotation = "shader time start";
552 shader_start_time = get_timestamp();
553 }
554
555 void
556 fs_visitor::emit_shader_time_end()
557 {
558 current_annotation = "shader time end";
559
560 enum shader_time_shader_type type, written_type, reset_type;
561 if (dispatch_width == 8) {
562 type = ST_FS8;
563 written_type = ST_FS8_WRITTEN;
564 reset_type = ST_FS8_RESET;
565 } else {
566 assert(dispatch_width == 16);
567 type = ST_FS16;
568 written_type = ST_FS16_WRITTEN;
569 reset_type = ST_FS16_RESET;
570 }
571
572 fs_reg shader_end_time = get_timestamp();
573
574 /* Check that there weren't any timestamp reset events (assuming these
575 * were the only two timestamp reads that happened).
576 */
577 fs_reg reset = shader_end_time;
578 reset.smear = 2;
579 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
580 test->conditional_mod = BRW_CONDITIONAL_Z;
581 emit(IF(BRW_PREDICATE_NORMAL));
582
583 push_force_uncompressed();
584 fs_reg start = shader_start_time;
585 start.negate = true;
586 fs_reg diff = fs_reg(this, glsl_type::uint_type);
587 emit(ADD(diff, start, shader_end_time));
588
589 /* If there were no instructions between the two timestamp gets, the diff
590 * is 2 cycles. Remove that overhead, so I can forget about that when
591 * trying to determine the time taken for single instructions.
592 */
593 emit(ADD(diff, diff, fs_reg(-2u)));
594
595 emit_shader_time_write(type, diff);
596 emit_shader_time_write(written_type, fs_reg(1u));
597 emit(BRW_OPCODE_ELSE);
598 emit_shader_time_write(reset_type, fs_reg(1u));
599 emit(BRW_OPCODE_ENDIF);
600
601 pop_force_uncompressed();
602 }
603
604 void
605 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
606 fs_reg value)
607 {
608 /* Choose an index in the buffer and set up tracking information for our
609 * printouts.
610 */
611 int shader_time_index = brw->shader_time.num_entries++;
612 assert(shader_time_index <= brw->shader_time.max_entries);
613 brw->shader_time.types[shader_time_index] = type;
614 if (prog) {
615 _mesa_reference_shader_program(ctx,
616 &brw->shader_time.programs[shader_time_index],
617 prog);
618 }
619
620 int base_mrf = 6;
621
622 fs_reg offset_mrf = fs_reg(MRF, base_mrf);
623 offset_mrf.type = BRW_REGISTER_TYPE_UD;
624 emit(MOV(offset_mrf, fs_reg(shader_time_index * SHADER_TIME_STRIDE)));
625
626 fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
627 time_mrf.type = BRW_REGISTER_TYPE_UD;
628 emit(MOV(time_mrf, value));
629
630 fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
631 inst->base_mrf = base_mrf;
632 inst->mlen = 2;
633 }
634
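/**
 * Records the first failure message and marks the compile as failed so the
 * caller can report it or fall back (for example, a failed 16-wide compile
 * can fall back to the 8-wide program).
 */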
635 void
636 fs_visitor::fail(const char *format, ...)
637 {
638 va_list va;
639 char *msg;
640
641 if (failed)
642 return;
643
644 failed = true;
645
646 va_start(va, format);
647 msg = ralloc_vasprintf(mem_ctx, format, va);
648 va_end(va);
649 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
650
651 this->fail_msg = msg;
652
653 if (INTEL_DEBUG & DEBUG_WM) {
654 fprintf(stderr, "%s", msg);
655 }
656 }
657
658 fs_inst *
659 fs_visitor::emit(enum opcode opcode)
660 {
661 return emit(fs_inst(opcode));
662 }
663
664 fs_inst *
665 fs_visitor::emit(enum opcode opcode, fs_reg dst)
666 {
667 return emit(fs_inst(opcode, dst));
668 }
669
670 fs_inst *
671 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
672 {
673 return emit(fs_inst(opcode, dst, src0));
674 }
675
676 fs_inst *
677 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
678 {
679 return emit(fs_inst(opcode, dst, src0, src1));
680 }
681
682 fs_inst *
683 fs_visitor::emit(enum opcode opcode, fs_reg dst,
684 fs_reg src0, fs_reg src1, fs_reg src2)
685 {
686 return emit(fs_inst(opcode, dst, src0, src1, src2));
687 }
688
689 void
690 fs_visitor::push_force_uncompressed()
691 {
692 force_uncompressed_stack++;
693 }
694
695 void
696 fs_visitor::pop_force_uncompressed()
697 {
698 force_uncompressed_stack--;
699 assert(force_uncompressed_stack >= 0);
700 }
701
702 void
703 fs_visitor::push_force_sechalf()
704 {
705 force_sechalf_stack++;
706 }
707
708 void
709 fs_visitor::pop_force_sechalf()
710 {
711 force_sechalf_stack--;
712 assert(force_sechalf_stack >= 0);
713 }
714
715 /**
716 * Returns how many MRFs an FS opcode will write over.
717 *
718 * Note that this is not the 0 or 1 implied writes in an actual gen
719 * instruction -- the FS opcodes often generate MOVs in addition.
720 */
721 int
722 fs_visitor::implied_mrf_writes(fs_inst *inst)
723 {
724 if (inst->mlen == 0)
725 return 0;
726
727 switch (inst->opcode) {
728 case SHADER_OPCODE_RCP:
729 case SHADER_OPCODE_RSQ:
730 case SHADER_OPCODE_SQRT:
731 case SHADER_OPCODE_EXP2:
732 case SHADER_OPCODE_LOG2:
733 case SHADER_OPCODE_SIN:
734 case SHADER_OPCODE_COS:
735 return 1 * dispatch_width / 8;
736 case SHADER_OPCODE_POW:
737 case SHADER_OPCODE_INT_QUOTIENT:
738 case SHADER_OPCODE_INT_REMAINDER:
739 return 2 * dispatch_width / 8;
740 case SHADER_OPCODE_TEX:
741 case FS_OPCODE_TXB:
742 case SHADER_OPCODE_TXD:
743 case SHADER_OPCODE_TXF:
744 case SHADER_OPCODE_TXF_MS:
745 case SHADER_OPCODE_TXL:
746 case SHADER_OPCODE_TXS:
747 return 1;
748 case SHADER_OPCODE_SHADER_TIME_ADD:
749 return 0;
750 case FS_OPCODE_FB_WRITE:
751 return 2;
752 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
753 case FS_OPCODE_UNSPILL:
754 return 1;
755 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
756 return inst->header_present;
757 case FS_OPCODE_SPILL:
758 return 2;
759 default:
760 assert(!"not reached");
761 return inst->mlen;
762 }
763 }
764
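/**
 * Allocates a virtual GRF of the given size in registers, growing the
 * virtual_grf_sizes array as needed, and returns the new register number.
 */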
765 int
766 fs_visitor::virtual_grf_alloc(int size)
767 {
768 if (virtual_grf_array_size <= virtual_grf_count) {
769 if (virtual_grf_array_size == 0)
770 virtual_grf_array_size = 16;
771 else
772 virtual_grf_array_size *= 2;
773 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
774 virtual_grf_array_size);
775 }
776 virtual_grf_sizes[virtual_grf_count] = size;
777 return virtual_grf_count++;
778 }
779
780 /** Fixed HW reg constructor. */
781 fs_reg::fs_reg(enum register_file file, int reg)
782 {
783 init();
784 this->file = file;
785 this->reg = reg;
786 this->type = BRW_REGISTER_TYPE_F;
787 }
788
789 /** Fixed HW reg constructor. */
790 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
791 {
792 init();
793 this->file = file;
794 this->reg = reg;
795 this->type = type;
796 }
797
798 /** Automatic reg constructor. */
799 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
800 {
801 init();
802
803 this->file = GRF;
804 this->reg = v->virtual_grf_alloc(v->type_size(type));
805 this->reg_offset = 0;
806 this->type = brw_type_for_base_type(type);
807 }
808
809 fs_reg *
810 fs_visitor::variable_storage(ir_variable *var)
811 {
812 return (fs_reg *)hash_table_find(this->variable_ht, var);
813 }
814
815 void
816 import_uniforms_callback(const void *key,
817 void *data,
818 void *closure)
819 {
820 struct hash_table *dst_ht = (struct hash_table *)closure;
821 const fs_reg *reg = (const fs_reg *)data;
822
823 if (reg->file != UNIFORM)
824 return;
825
826 hash_table_insert(dst_ht, data, key);
827 }
828
829 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 830  * This brings in those uniform definitions.
831 */
832 void
833 fs_visitor::import_uniforms(fs_visitor *v)
834 {
835 hash_table_call_foreach(v->variable_ht,
836 import_uniforms_callback,
837 variable_ht);
838 this->params_remap = v->params_remap;
839 }
840
841 /* Our support for uniforms is piggy-backed on the struct
842 * gl_fragment_program, because that's where the values actually
843 * get stored, rather than in some global gl_shader_program uniform
844 * store.
845 */
846 void
847 fs_visitor::setup_uniform_values(ir_variable *ir)
848 {
849 int namelen = strlen(ir->name);
850
851 /* The data for our (non-builtin) uniforms is stored in a series of
852 * gl_uniform_driver_storage structs for each subcomponent that
853 * glGetUniformLocation() could name. We know it's been set up in the same
854 * order we'd walk the type, so walk the list of storage and find anything
855 * with our name, or the prefix of a component that starts with our name.
856 */
857 unsigned params_before = c->prog_data.nr_params;
858 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
859 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
860
861 if (strncmp(ir->name, storage->name, namelen) != 0 ||
862 (storage->name[namelen] != 0 &&
863 storage->name[namelen] != '.' &&
864 storage->name[namelen] != '[')) {
865 continue;
866 }
867
868 unsigned slots = storage->type->component_slots();
869 if (storage->array_elements)
870 slots *= storage->array_elements;
871
872 for (unsigned i = 0; i < slots; i++) {
873 c->prog_data.param[c->prog_data.nr_params++] =
874 &storage->storage[i].f;
875 }
876 }
877
878 /* Make sure we actually initialized the right amount of stuff here. */
879 assert(params_before + ir->type->component_slots() ==
880 c->prog_data.nr_params);
881 }
882
883
884 /* Our support for builtin uniforms is even scarier than non-builtin.
885 * It sits on top of the PROG_STATE_VAR parameters that are
886 * automatically updated from GL context state.
887 */
888 void
889 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
890 {
891 const ir_state_slot *const slots = ir->state_slots;
892 assert(ir->state_slots != NULL);
893
894 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
895 /* This state reference has already been setup by ir_to_mesa, but we'll
896 * get the same index back here.
897 */
898 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
899 (gl_state_index *)slots[i].tokens);
900
901 /* Add each of the unique swizzles of the element as a parameter.
902 * This'll end up matching the expected layout of the
903 * array/matrix/structure we're trying to fill in.
904 */
905 int last_swiz = -1;
906 for (unsigned int j = 0; j < 4; j++) {
907 int swiz = GET_SWZ(slots[i].swizzle, j);
908 if (swiz == last_swiz)
909 break;
910 last_swiz = swiz;
911
912 c->prog_data.param[c->prog_data.nr_params++] =
913 &fp->Base.Parameters->ParameterValues[index][swiz].f;
914 }
915 }
916 }
917
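/**
 * Builds gl_FragCoord: x/y come from the payload pixel positions (with the
 * Y flip and half-pixel offsets applied as needed), z comes from the source
 * depth payload on gen6+ (or is interpolated on gen4/5), and w reuses the
 * already-computed wpos_w.
 */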
918 fs_reg *
919 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
920 {
921 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
922 fs_reg wpos = *reg;
923 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
924
925 /* gl_FragCoord.x */
926 if (ir->pixel_center_integer) {
927 emit(MOV(wpos, this->pixel_x));
928 } else {
929 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
930 }
931 wpos.reg_offset++;
932
933 /* gl_FragCoord.y */
934 if (!flip && ir->pixel_center_integer) {
935 emit(MOV(wpos, this->pixel_y));
936 } else {
937 fs_reg pixel_y = this->pixel_y;
938 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
939
940 if (flip) {
941 pixel_y.negate = true;
942 offset += c->key.drawable_height - 1.0;
943 }
944
945 emit(ADD(wpos, pixel_y, fs_reg(offset)));
946 }
947 wpos.reg_offset++;
948
949 /* gl_FragCoord.z */
950 if (intel->gen >= 6) {
951 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
952 } else {
953 emit(FS_OPCODE_LINTERP, wpos,
954 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
955 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
956 interp_reg(VARYING_SLOT_POS, 2));
957 }
958 wpos.reg_offset++;
959
960 /* gl_FragCoord.w: Already set up in emit_interpolation */
961 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
962
963 return reg;
964 }
965
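/**
 * Emits a LINTERP for one attribute channel, picking the barycentric
 * coordinate set (perspective vs. noperspective, pixel vs. centroid) implied
 * by the interpolation qualifier.
 */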
966 fs_inst *
967 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
968 glsl_interp_qualifier interpolation_mode,
969 bool is_centroid)
970 {
971 brw_wm_barycentric_interp_mode barycoord_mode;
972 if (is_centroid) {
973 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
974 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
975 else
976 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
977 } else {
978 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
979 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
980 else
981 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
982 }
983 return emit(FS_OPCODE_LINTERP, attr,
984 this->delta_x[barycoord_mode],
985 this->delta_y[barycoord_mode], interp);
986 }
987
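/**
 * Emits interpolation for an ordinary varying, element by element and
 * channel by channel: flat inputs read the constant slot of the setup data
 * with CINTERP, smooth and noperspective inputs use LINTERP, and gen4/5
 * follow up with a multiply by pixel_w to finish the perspective correction.
 */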
988 fs_reg *
989 fs_visitor::emit_general_interpolation(ir_variable *ir)
990 {
991 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
992 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
993 fs_reg attr = *reg;
994
995 unsigned int array_elements;
996 const glsl_type *type;
997
998 if (ir->type->is_array()) {
999 array_elements = ir->type->length;
1000 if (array_elements == 0) {
1001 fail("dereferenced array '%s' has length 0\n", ir->name);
1002 }
1003 type = ir->type->fields.array;
1004 } else {
1005 array_elements = 1;
1006 type = ir->type;
1007 }
1008
1009 glsl_interp_qualifier interpolation_mode =
1010 ir->determine_interpolation_mode(c->key.flat_shade);
1011
1012 int location = ir->location;
1013 for (unsigned int i = 0; i < array_elements; i++) {
1014 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1015 if (urb_setup[location] == -1) {
1016 /* If there's no incoming setup data for this slot, don't
1017 * emit interpolation for it.
1018 */
1019 attr.reg_offset += type->vector_elements;
1020 location++;
1021 continue;
1022 }
1023
1024 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1025 /* Constant interpolation (flat shading) case. The SF has
1026 * handed us defined values in only the constant offset
1027 * field of the setup reg.
1028 */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 struct brw_reg interp = interp_reg(location, k);
1031 interp = suboffset(interp, 3);
1032 interp.type = reg->type;
1033 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1034 attr.reg_offset++;
1035 }
1036 } else {
1037 /* Smooth/noperspective interpolation case. */
1038 for (unsigned int k = 0; k < type->vector_elements; k++) {
1039 /* FINISHME: At some point we probably want to push
1040 * this farther by giving similar treatment to the
1041 * other potentially constant components of the
1042 * attribute, as well as making brw_vs_constval.c
1043 * handle varyings other than gl_TexCoord.
1044 */
1045 if (location >= VARYING_SLOT_TEX0 &&
1046 location <= VARYING_SLOT_TEX7 &&
1047 k == 3 && !(c->key.proj_attrib_mask
1048 & BITFIELD64_BIT(location))) {
1049 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1050 } else {
1051 struct brw_reg interp = interp_reg(location, k);
1052 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1053 ir->centroid);
1054 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1055 /* Get the pixel/sample mask into f0 so that we know
1056 * which pixels are lit. Then, for each channel that is
1057 * unlit, replace the centroid data with non-centroid
1058 * data.
1059 */
1060 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1061 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1062 interpolation_mode, false);
1063 inst->predicate = BRW_PREDICATE_NORMAL;
1064 inst->predicate_inverse = true;
1065 }
1066 if (intel->gen < 6) {
1067 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1068 }
1069 }
1070 attr.reg_offset++;
1071 }
1072
1073 }
1074 location++;
1075 }
1076 }
1077
1078 return reg;
1079 }
1080
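/**
 * Computes gl_FrontFacing as 0 or 1 from the back-facing bit the hardware
 * places in the thread payload: bit 15 of g0.0 on gen6+, bit 31 of g1.6 on
 * earlier parts.
 */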
1081 fs_reg *
1082 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1083 {
1084 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1085
1086 /* The frontfacing comes in as a bit in the thread payload. */
1087 if (intel->gen >= 6) {
1088 emit(BRW_OPCODE_ASR, *reg,
1089 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1090 fs_reg(15));
1091 emit(BRW_OPCODE_NOT, *reg, *reg);
1092 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1093 } else {
1094 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1095 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1096 * us front face
1097 */
1098 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1099 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1100 }
1101
1102 return reg;
1103 }
1104
1105 fs_reg
1106 fs_visitor::fix_math_operand(fs_reg src)
1107 {
1108 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1109 * might be able to do better by doing execsize = 1 math and then
1110 * expanding that result out, but we would need to be careful with
1111 * masking.
1112 *
1113 * The hardware ignores source modifiers (negate and abs) on math
1114 * instructions, so we also move to a temp to set those up.
1115 */
1116 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1117 !src.abs && !src.negate)
1118 return src;
1119
1120 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1121 * operands to math
1122 */
1123 if (intel->gen >= 7 && src.file != IMM)
1124 return src;
1125
1126 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1127 expanded.type = src.type;
1128 emit(BRW_OPCODE_MOV, expanded, src);
1129 return expanded;
1130 }
1131
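/**
 * Emits a single-source math instruction. On gen6+ the operand is fixed up
 * for the ALU math restrictions; on gen4/5 math is a send message, so a base
 * MRF and message length are set up instead.
 */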
1132 fs_inst *
1133 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1134 {
1135 switch (opcode) {
1136 case SHADER_OPCODE_RCP:
1137 case SHADER_OPCODE_RSQ:
1138 case SHADER_OPCODE_SQRT:
1139 case SHADER_OPCODE_EXP2:
1140 case SHADER_OPCODE_LOG2:
1141 case SHADER_OPCODE_SIN:
1142 case SHADER_OPCODE_COS:
1143 break;
1144 default:
1145 assert(!"not reached: bad math opcode");
1146 return NULL;
1147 }
1148
1149 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1150 * might be able to do better by doing execsize = 1 math and then
1151 * expanding that result out, but we would need to be careful with
1152 * masking.
1153 *
1154 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1155 * instructions, so we also move to a temp to set those up.
1156 */
1157 if (intel->gen >= 6)
1158 src = fix_math_operand(src);
1159
1160 fs_inst *inst = emit(opcode, dst, src);
1161
1162 if (intel->gen < 6) {
1163 inst->base_mrf = 2;
1164 inst->mlen = dispatch_width / 8;
1165 }
1166
1167 return inst;
1168 }
1169
1170 fs_inst *
1171 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1172 {
1173 int base_mrf = 2;
1174 fs_inst *inst;
1175
1176 switch (opcode) {
1177 case SHADER_OPCODE_INT_QUOTIENT:
1178 case SHADER_OPCODE_INT_REMAINDER:
1179 if (intel->gen >= 7 && dispatch_width == 16)
1180 fail("16-wide INTDIV unsupported\n");
1181 break;
1182 case SHADER_OPCODE_POW:
1183 break;
1184 default:
1185 assert(!"not reached: unsupported binary math opcode.");
1186 return NULL;
1187 }
1188
1189 if (intel->gen >= 6) {
1190 src0 = fix_math_operand(src0);
1191 src1 = fix_math_operand(src1);
1192
1193 inst = emit(opcode, dst, src0, src1);
1194 } else {
1195 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1196 * "Message Payload":
1197 *
1198 * "Operand0[7]. For the INT DIV functions, this operand is the
1199 * denominator."
1200 * ...
1201 * "Operand1[7]. For the INT DIV functions, this operand is the
1202 * numerator."
1203 */
1204 bool is_int_div = opcode != SHADER_OPCODE_POW;
1205 fs_reg &op0 = is_int_div ? src1 : src0;
1206 fs_reg &op1 = is_int_div ? src0 : src1;
1207
1208 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1209 inst = emit(opcode, dst, op0, reg_null_f);
1210
1211 inst->base_mrf = base_mrf;
1212 inst->mlen = 2 * dispatch_width / 8;
1213 }
1214 return inst;
1215 }
1216
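/**
 * Computes the push constant (CURBE) layout and rewrites UNIFORM-file
 * sources to the fixed hardware registers that follow the thread payload.
 */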
1217 void
1218 fs_visitor::assign_curb_setup()
1219 {
1220 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1221 if (dispatch_width == 8) {
1222 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1223 } else {
1224 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1225 }
1226
1227 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1228 foreach_list(node, &this->instructions) {
1229 fs_inst *inst = (fs_inst *)node;
1230
1231 for (unsigned int i = 0; i < 3; i++) {
1232 if (inst->src[i].file == UNIFORM) {
1233 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1234 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1235 constant_nr / 8,
1236 constant_nr % 8);
1237
1238 inst->src[i].file = FIXED_HW_REG;
1239 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1240 }
1241 }
1242 }
1243 }
1244
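/**
 * Decides which incoming URB setup slot each varying read by the fragment
 * shader lands in, recording -1 in urb_setup[] for slots with no setup data.
 * Each slot used costs two registers of urb_read_length.
 */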
1245 void
1246 fs_visitor::calculate_urb_setup()
1247 {
1248 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1249 urb_setup[i] = -1;
1250 }
1251
1252 int urb_next = 0;
1253 /* Figure out where each of the incoming setup attributes lands. */
1254 if (intel->gen >= 6) {
1255 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1256 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1257 urb_setup[i] = urb_next++;
1258 }
1259 }
1260 } else {
1261 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1262 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1263 /* Point size is packed into the header, not as a general attribute */
1264 if (i == VARYING_SLOT_PSIZ)
1265 continue;
1266
1267 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1268 /* The back color slot is skipped when the front color is
1269 * also written to. In addition, some slots can be
1270 * written in the vertex shader and not read in the
1271 * fragment shader. So the register number must always be
1272 * incremented, mapped or not.
1273 */
1274 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1275 urb_setup[i] = urb_next;
1276 urb_next++;
1277 }
1278 }
1279
1280 /*
 1281     * This is an FS-only attribute, and we did the interpolation for it in the
 1282     * SF thread, so count it here, too.
1283 *
1284 * See compile_sf_prog() for more info.
1285 */
1286 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1287 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1288 }
1289
1290 /* Each attribute is 4 setup channels, each of which is half a reg. */
1291 c->prog_data.urb_read_length = urb_next * 2;
1292 }
1293
1294 void
1295 fs_visitor::assign_urb_setup()
1296 {
1297 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1298
1299 /* Offset all the urb_setup[] index by the actual position of the
1300 * setup regs, now that the location of the constants has been chosen.
1301 */
1302 foreach_list(node, &this->instructions) {
1303 fs_inst *inst = (fs_inst *)node;
1304
1305 if (inst->opcode == FS_OPCODE_LINTERP) {
1306 assert(inst->src[2].file == FIXED_HW_REG);
1307 inst->src[2].fixed_hw_reg.nr += urb_start;
1308 }
1309
1310 if (inst->opcode == FS_OPCODE_CINTERP) {
1311 assert(inst->src[0].file == FIXED_HW_REG);
1312 inst->src[0].fixed_hw_reg.nr += urb_start;
1313 }
1314 }
1315
1316 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1317 }
1318
1319 /**
1320 * Split large virtual GRFs into separate components if we can.
1321 *
1322 * This is mostly duplicated with what brw_fs_vector_splitting does,
1323 * but that's really conservative because it's afraid of doing
1324 * splitting that doesn't result in real progress after the rest of
1325 * the optimization phases, which would cause infinite looping in
1326 * optimization. We can do it once here, safely. This also has the
1327 * opportunity to split interpolated values, or maybe even uniforms,
1328 * which we don't have at the IR level.
1329 *
1330 * We want to split, because virtual GRFs are what we register
1331 * allocate and spill (due to contiguousness requirements for some
1332 * instructions), and they're what we naturally generate in the
1333 * codegen process, but most virtual GRFs don't actually need to be
1334 * contiguous sets of GRFs. If we split, we'll end up with reduced
1335 * live intervals and better dead code elimination and coalescing.
1336 */
1337 void
1338 fs_visitor::split_virtual_grfs()
1339 {
1340 int num_vars = this->virtual_grf_count;
1341 bool split_grf[num_vars];
1342 int new_virtual_grf[num_vars];
1343
 1344    /* Try to split anything larger than one register. */
1345 for (int i = 0; i < num_vars; i++) {
1346 if (this->virtual_grf_sizes[i] != 1)
1347 split_grf[i] = true;
1348 else
1349 split_grf[i] = false;
1350 }
1351
1352 if (brw->has_pln &&
1353 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1354 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1355 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1356 * Gen6, that was the only supported interpolation mode, and since Gen6,
1357 * delta_x and delta_y are in fixed hardware registers.
1358 */
1359 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1360 false;
1361 }
1362
1363 foreach_list(node, &this->instructions) {
1364 fs_inst *inst = (fs_inst *)node;
1365
1366 /* If there's a SEND message that requires contiguous destination
1367 * registers, no splitting is allowed.
1368 */
1369 if (inst->regs_written() > 1) {
1370 split_grf[inst->dst.reg] = false;
1371 }
1372 }
1373
1374 /* Allocate new space for split regs. Note that the virtual
1375 * numbers will be contiguous.
1376 */
1377 for (int i = 0; i < num_vars; i++) {
1378 if (split_grf[i]) {
1379 new_virtual_grf[i] = virtual_grf_alloc(1);
1380 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1381 int reg = virtual_grf_alloc(1);
1382 assert(reg == new_virtual_grf[i] + j - 1);
1383 (void) reg;
1384 }
1385 this->virtual_grf_sizes[i] = 1;
1386 }
1387 }
1388
1389 foreach_list(node, &this->instructions) {
1390 fs_inst *inst = (fs_inst *)node;
1391
1392 if (inst->dst.file == GRF &&
1393 split_grf[inst->dst.reg] &&
1394 inst->dst.reg_offset != 0) {
1395 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1396 inst->dst.reg_offset - 1);
1397 inst->dst.reg_offset = 0;
1398 }
1399 for (int i = 0; i < 3; i++) {
1400 if (inst->src[i].file == GRF &&
1401 split_grf[inst->src[i].reg] &&
1402 inst->src[i].reg_offset != 0) {
1403 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1404 inst->src[i].reg_offset - 1);
1405 inst->src[i].reg_offset = 0;
1406 }
1407 }
1408 }
1409 this->live_intervals_valid = false;
1410 }
1411
1412 /**
1413 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1414 *
1415 * During code generation, we create tons of temporary variables, many of
1416 * which get immediately killed and are never used again. Yet, in later
1417 * optimization and analysis passes, such as compute_live_intervals, we need
1418 * to loop over all the virtual GRFs. Compacting them can save a lot of
1419 * overhead.
1420 */
1421 void
1422 fs_visitor::compact_virtual_grfs()
1423 {
1424 /* Mark which virtual GRFs are used, and count how many. */
1425 int remap_table[this->virtual_grf_count];
1426 memset(remap_table, -1, sizeof(remap_table));
1427
1428 foreach_list(node, &this->instructions) {
1429 const fs_inst *inst = (const fs_inst *) node;
1430
1431 if (inst->dst.file == GRF)
1432 remap_table[inst->dst.reg] = 0;
1433
1434 for (int i = 0; i < 3; i++) {
1435 if (inst->src[i].file == GRF)
1436 remap_table[inst->src[i].reg] = 0;
1437 }
1438 }
1439
1440 /* In addition to registers used in instructions, fs_visitor keeps
1441 * direct references to certain special values which must be patched:
1442 */
1443 fs_reg *special[] = {
1444 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1445 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1446 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1447 &delta_x[0], &delta_x[1], &delta_x[2],
1448 &delta_x[3], &delta_x[4], &delta_x[5],
1449 &delta_y[0], &delta_y[1], &delta_y[2],
1450 &delta_y[3], &delta_y[4], &delta_y[5],
1451 };
1452 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1453 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1454
1455 /* Treat all special values as used, to be conservative */
1456 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1457 if (special[i]->file == GRF)
1458 remap_table[special[i]->reg] = 0;
1459 }
1460
1461 /* Compact the GRF arrays. */
1462 int new_index = 0;
1463 for (int i = 0; i < this->virtual_grf_count; i++) {
1464 if (remap_table[i] != -1) {
1465 remap_table[i] = new_index;
1466 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1467 if (live_intervals_valid) {
1468 virtual_grf_use[new_index] = virtual_grf_use[i];
1469 virtual_grf_def[new_index] = virtual_grf_def[i];
1470 }
1471 ++new_index;
1472 }
1473 }
1474
1475 this->virtual_grf_count = new_index;
1476
1477 /* Patch all the instructions to use the newly renumbered registers */
1478 foreach_list(node, &this->instructions) {
1479 fs_inst *inst = (fs_inst *) node;
1480
1481 if (inst->dst.file == GRF)
1482 inst->dst.reg = remap_table[inst->dst.reg];
1483
1484 for (int i = 0; i < 3; i++) {
1485 if (inst->src[i].file == GRF)
1486 inst->src[i].reg = remap_table[inst->src[i].reg];
1487 }
1488 }
1489
1490 /* Patch all the references to special values */
1491 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1492 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1493 special[i]->reg = remap_table[special[i]->reg];
1494 }
1495 }
1496
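/**
 * Drops push constant entries that no instruction reads and renumbers the
 * survivors. The remap is computed once in the 8-wide pass and reused by the
 * 16-wide pass so both programs see the same parameter layout.
 */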
1497 bool
1498 fs_visitor::remove_dead_constants()
1499 {
1500 if (dispatch_width == 8) {
1501 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1502
1503 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1504 this->params_remap[i] = -1;
1505
1506 /* Find which params are still in use. */
1507 foreach_list(node, &this->instructions) {
1508 fs_inst *inst = (fs_inst *)node;
1509
1510 for (int i = 0; i < 3; i++) {
1511 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1512
1513 if (inst->src[i].file != UNIFORM)
1514 continue;
1515
1516 assert(constant_nr < (int)c->prog_data.nr_params);
1517
1518 /* For now, set this to non-negative. We'll give it the
1519 * actual new number in a moment, in order to keep the
1520 * register numbers nicely ordered.
1521 */
1522 this->params_remap[constant_nr] = 0;
1523 }
1524 }
1525
1526 /* Figure out what the new numbers for the params will be. At some
1527 * point when we're doing uniform array access, we're going to want
1528 * to keep the distinction between .reg and .reg_offset, but for
1529 * now we don't care.
1530 */
1531 unsigned int new_nr_params = 0;
1532 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1533 if (this->params_remap[i] != -1) {
1534 this->params_remap[i] = new_nr_params++;
1535 }
1536 }
1537
1538 /* Update the list of params to be uploaded to match our new numbering. */
1539 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1540 int remapped = this->params_remap[i];
1541
1542 if (remapped == -1)
1543 continue;
1544
1545 c->prog_data.param[remapped] = c->prog_data.param[i];
1546 }
1547
1548 c->prog_data.nr_params = new_nr_params;
1549 } else {
1550 /* This should have been generated in the 8-wide pass already. */
1551 assert(this->params_remap);
1552 }
1553
1554 /* Now do the renumbering of the shader to remove unused params. */
1555 foreach_list(node, &this->instructions) {
1556 fs_inst *inst = (fs_inst *)node;
1557
1558 for (int i = 0; i < 3; i++) {
1559 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1560
1561 if (inst->src[i].file != UNIFORM)
1562 continue;
1563
1564 assert(this->params_remap[constant_nr] != -1);
1565 inst->src[i].reg = this->params_remap[constant_nr];
1566 inst->src[i].reg_offset = 0;
1567 }
1568 }
1569
1570 return true;
1571 }
1572
1573 /*
1574 * Implements array access of uniforms by inserting a
1575 * PULL_CONSTANT_LOAD instruction.
1576 *
1577 * Unlike temporary GRF array access (where we don't support it due to
1578 * the difficulty of doing relative addressing on instruction
1579 * destinations), we could potentially do array access of uniforms
1580 * that were loaded in GRF space as push constants. In real-world
1581 * usage we've seen, though, the arrays being used are always larger
1582 * than we could load as push constants, so just always move all
1583 * uniform array access out to a pull constant buffer.
1584 */
1585 void
1586 fs_visitor::move_uniform_array_access_to_pull_constants()
1587 {
1588 int pull_constant_loc[c->prog_data.nr_params];
1589
1590 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1591 pull_constant_loc[i] = -1;
1592 }
1593
1594 /* Walk through and find array access of uniforms. Put a copy of that
1595 * uniform in the pull constant buffer.
1596 *
1597 * Note that we don't move constant-indexed accesses to arrays. No
1598 * testing has been done of the performance impact of this choice.
1599 */
1600 foreach_list_safe(node, &this->instructions) {
1601 fs_inst *inst = (fs_inst *)node;
1602
1603 for (int i = 0 ; i < 3; i++) {
1604 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1605 continue;
1606
1607 int uniform = inst->src[i].reg;
1608
1609 /* If this array isn't already present in the pull constant buffer,
1610 * add it.
1611 */
1612 if (pull_constant_loc[uniform] == -1) {
1613 const float **values = &c->prog_data.param[uniform];
1614
1615 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1616
1617 assert(param_size[uniform]);
1618
1619 for (int j = 0; j < param_size[uniform]; j++) {
1620 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1621 values[j];
1622 }
1623 }
1624
1625 /* Set up the annotation tracking for new generated instructions. */
1626 base_ir = inst->ir;
1627 current_annotation = inst->annotation;
1628
1629 fs_reg offset = fs_reg(this, glsl_type::int_type);
1630 inst->insert_before(ADD(offset, *inst->src[i].reladdr,
1631 fs_reg(pull_constant_loc[uniform] +
1632 inst->src[i].reg_offset)));
1633
1634 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1635 fs_reg temp = fs_reg(this, glsl_type::float_type);
1636 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1637 surf_index, offset);
1638 inst->insert_before(&list);
1639
1640 inst->src[i].file = temp.file;
1641 inst->src[i].reg = temp.reg;
1642 inst->src[i].reg_offset = temp.reg_offset;
1643 inst->src[i].reladdr = NULL;
1644 }
1645 }
1646 }
1647
1648 /**
1649 * Choose accesses from the UNIFORM file to demote to using the pull
1650 * constant buffer.
1651 *
1652 * We allow a fragment shader to have more than the specified minimum
1653 * maximum number of fragment shader uniform components (64). If
1654 * there are too many of these, they'd fill up all of register space.
1655 * So, this will push some of them out to the pull constant buffer and
1656 * update the program to load them.
1657 */
1658 void
1659 fs_visitor::setup_pull_constants()
1660 {
1661 /* Only allow 16 registers (128 uniform components) as push constants. */
1662 unsigned int max_uniform_components = 16 * 8;
1663 if (c->prog_data.nr_params <= max_uniform_components)
1664 return;
1665
1666 if (dispatch_width == 16) {
1667 fail("Pull constants not supported in 16-wide\n");
1668 return;
1669 }
1670
1671 /* Just demote the end of the list. We could probably do better
1672 * here, demoting things that are rarely used in the program first.
1673 */
1674 unsigned int pull_uniform_base = max_uniform_components;
1675
1676 int pull_constant_loc[c->prog_data.nr_params];
1677 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1678 if (i < pull_uniform_base) {
1679 pull_constant_loc[i] = -1;
1680 } else {
1681 pull_constant_loc[i] = -1;
1682 /* If our constant is already being uploaded for reladdr purposes,
1683 * reuse it.
1684 */
1685 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1686 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1687 pull_constant_loc[i] = j;
1688 break;
1689 }
1690 }
1691 if (pull_constant_loc[i] == -1) {
1692 int pull_index = c->prog_data.nr_pull_params++;
1693 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
 1694             pull_constant_loc[i] = pull_index;
1695 }
1696 }
1697 }
1698 c->prog_data.nr_params = pull_uniform_base;
1699
1700 foreach_list(node, &this->instructions) {
1701 fs_inst *inst = (fs_inst *)node;
1702
1703 for (int i = 0; i < 3; i++) {
1704 if (inst->src[i].file != UNIFORM)
1705 continue;
1706
1707 int pull_index = pull_constant_loc[inst->src[i].reg +
1708 inst->src[i].reg_offset];
1709 if (pull_index == -1)
1710 continue;
1711
1712 assert(!inst->src[i].reladdr);
1713
1714 fs_reg dst = fs_reg(this, glsl_type::float_type);
1715 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1716 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1717 fs_inst *pull =
1718 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1719 dst, index, offset);
1720 pull->ir = inst->ir;
1721 pull->annotation = inst->annotation;
1722
1723 inst->insert_before(pull);
1724
1725 inst->src[i].file = GRF;
1726 inst->src[i].reg = dst.reg;
1727 inst->src[i].reg_offset = 0;
1728 inst->src[i].smear = pull_index & 3;
1729 }
1730 }
1731 }
1732
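/**
 * Applies trivial algebraic simplifications on immediate operands:
 * a * 1.0 -> a, a * 0.0 -> 0.0, and a + 0.0 -> a.
 */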
1733 bool
1734 fs_visitor::opt_algebraic()
1735 {
1736 bool progress = false;
1737
1738 foreach_list(node, &this->instructions) {
1739 fs_inst *inst = (fs_inst *)node;
1740
1741 switch (inst->opcode) {
1742 case BRW_OPCODE_MUL:
1743 if (inst->src[1].file != IMM)
1744 continue;
1745
1746 /* a * 1.0 = a */
1747 if (inst->src[1].is_one()) {
1748 inst->opcode = BRW_OPCODE_MOV;
1749 inst->src[1] = reg_undef;
1750 progress = true;
1751 break;
1752 }
1753
1754 /* a * 0.0 = 0.0 */
1755 if (inst->src[1].is_zero()) {
1756 inst->opcode = BRW_OPCODE_MOV;
1757 inst->src[0] = inst->src[1];
1758 inst->src[1] = reg_undef;
1759 progress = true;
1760 break;
1761 }
1762
1763 break;
1764 case BRW_OPCODE_ADD:
1765 if (inst->src[1].file != IMM)
1766 continue;
1767
1768 /* a + 0.0 = a */
1769 if (inst->src[1].is_zero()) {
1770 inst->opcode = BRW_OPCODE_MOV;
1771 inst->src[1] = reg_undef;
1772 progress = true;
1773 break;
1774 }
1775 break;
1776 default:
1777 break;
1778 }
1779 }
1780
1781 return progress;
1782 }
1783
1784 /**
 1785  * Must be called after calculate_live_intervals() to remove unused
1786 * writes to registers -- register allocation will fail otherwise
 1787  * because something that is def'd but never used won't be considered to
1788 * interfere with other regs.
1789 */
1790 bool
1791 fs_visitor::dead_code_eliminate()
1792 {
1793 bool progress = false;
1794 int pc = 0;
1795
1796 calculate_live_intervals();
1797
1798 foreach_list_safe(node, &this->instructions) {
1799 fs_inst *inst = (fs_inst *)node;
1800
1801 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1802 inst->remove();
1803 progress = true;
1804 }
1805
1806 pc++;
1807 }
1808
1809 if (progress)
1810 live_intervals_valid = false;
1811
1812 return progress;
1813 }
1814
1815 /**
1816 * Implements a second type of register coalescing: This one checks if
1817 * the two regs involved in a raw move don't interfere, in which case
1818 * they can both by stored in the same place and the MOV removed.
1819 */
1820 bool
1821 fs_visitor::register_coalesce_2()
1822 {
1823 bool progress = false;
1824
1825 calculate_live_intervals();
1826
1827 foreach_list_safe(node, &this->instructions) {
1828 fs_inst *inst = (fs_inst *)node;
1829
1830 if (inst->opcode != BRW_OPCODE_MOV ||
1831 inst->predicate ||
1832 inst->saturate ||
1833 inst->src[0].file != GRF ||
1834 inst->src[0].negate ||
1835 inst->src[0].abs ||
1836 inst->src[0].smear != -1 ||
1837 inst->dst.file != GRF ||
1838 inst->dst.type != inst->src[0].type ||
1839 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1840 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1841 continue;
1842 }
1843
1844 int reg_from = inst->src[0].reg;
1845 assert(inst->src[0].reg_offset == 0);
1846 int reg_to = inst->dst.reg;
1847 int reg_to_offset = inst->dst.reg_offset;
1848
1849 foreach_list(node, &this->instructions) {
1850 fs_inst *scan_inst = (fs_inst *)node;
1851
1852 if (scan_inst->dst.file == GRF &&
1853 scan_inst->dst.reg == reg_from) {
1854 scan_inst->dst.reg = reg_to;
1855 scan_inst->dst.reg_offset = reg_to_offset;
1856 }
1857 for (int i = 0; i < 3; i++) {
1858 if (scan_inst->src[i].file == GRF &&
1859 scan_inst->src[i].reg == reg_from) {
1860 scan_inst->src[i].reg = reg_to;
1861 scan_inst->src[i].reg_offset = reg_to_offset;
1862 }
1863 }
1864 }
1865
1866 inst->remove();
1867
1868 /* We don't need to recalculate live intervals inside the loop despite
1869 * flagging live_intervals_valid because we only use live intervals for
1870 * the interferes test, and we must have had a situation where the
1871 * intervals were:
1872 *
1873 * from to
1874 * ^
1875 * |
1876 * v
1877 * ^
1878 * |
1879 * v
1880 *
1881 * Some register R that might get coalesced with one of these two could
1882 * only be referencing "to", otherwise "from"'s range would have been
1883 * longer. R's range could also only start at the end of "to" or later,
1884 * otherwise it will conflict with "to" when we try to coalesce "to"
 1885        * into R anyway.
1886 */
1887 live_intervals_valid = false;
1888
1889 progress = true;
1890 continue;
1891 }
1892
1893 return progress;
1894 }
1895
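/**
 * Removes a plain MOV from a GRF or uniform into a GRF when neither register
 * is overwritten between the MOV and the end of the program, by rewriting
 * later reads of the destination to use the source directly. Skipped inside
 * loops and if blocks, where the forward scan wouldn't dominate all later
 * uses.
 */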
1896 bool
1897 fs_visitor::register_coalesce()
1898 {
1899 bool progress = false;
1900 int if_depth = 0;
1901 int loop_depth = 0;
1902
1903 foreach_list_safe(node, &this->instructions) {
1904 fs_inst *inst = (fs_inst *)node;
1905
1906 /* Make sure that we dominate the instructions we're going to
1907 * scan for interfering with our coalescing, or we won't have
1908 * scanned enough to see if anything interferes with our
1909 * coalescing. We don't dominate the following instructions if
1910 * we're in a loop or an if block.
1911 */
1912 switch (inst->opcode) {
1913 case BRW_OPCODE_DO:
1914 loop_depth++;
1915 break;
1916 case BRW_OPCODE_WHILE:
1917 loop_depth--;
1918 break;
1919 case BRW_OPCODE_IF:
1920 if_depth++;
1921 break;
1922 case BRW_OPCODE_ENDIF:
1923 if_depth--;
1924 break;
1925 default:
1926 break;
1927 }
1928 if (loop_depth || if_depth)
1929 continue;
1930
1931 if (inst->opcode != BRW_OPCODE_MOV ||
1932 inst->predicate ||
1933 inst->saturate ||
1934 inst->dst.file != GRF || (inst->src[0].file != GRF &&
 1935                                    inst->src[0].file != UNIFORM) ||
1936 inst->dst.type != inst->src[0].type)
1937 continue;
1938
1939 bool has_source_modifiers = (inst->src[0].abs ||
1940 inst->src[0].negate ||
1941 inst->src[0].smear != -1 ||
1942 inst->src[0].file == UNIFORM);
1943
1944 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1945 * them: check for no writes to either one until the exit of the
1946 * program.
1947 */
1948 bool interfered = false;
1949
1950 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1951 !scan_inst->is_tail_sentinel();
1952 scan_inst = (fs_inst *)scan_inst->next) {
1953 if (scan_inst->dst.file == GRF) {
1954 if (scan_inst->overwrites_reg(inst->dst) ||
1955 scan_inst->overwrites_reg(inst->src[0])) {
1956 interfered = true;
1957 break;
1958 }
1959 }
1960
1961 /* The gen6 MATH instruction can't handle source modifiers or
1962 * unusual register regions, so avoid coalescing those for
1963 * now. We should do something more specific.
1964 */
1965 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1966 interfered = true;
1967 break;
1968 }
1969
1970 /* The accumulator result appears to get used for the
1971 * conditional modifier generation. When negating a UD
1972 * value, there is a 33rd bit generated for the sign in the
1973 * accumulator value, so now you can't check, for example,
1974 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1975 */
1976 if (scan_inst->conditional_mod &&
1977 inst->src[0].negate &&
1978 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1979 interfered = true;
1980 break;
1981 }
1982 }
1983 if (interfered) {
1984 continue;
1985 }
1986
1987 /* Rewrite the later usage to point at the source of the move to
1988 * be removed.
1989 */
1990 for (fs_inst *scan_inst = inst;
1991 !scan_inst->is_tail_sentinel();
1992 scan_inst = (fs_inst *)scan_inst->next) {
1993 for (int i = 0; i < 3; i++) {
1994 if (scan_inst->src[i].file == GRF &&
1995 scan_inst->src[i].reg == inst->dst.reg &&
1996 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1997 fs_reg new_src = inst->src[0];
1998 if (scan_inst->src[i].abs) {
1999 new_src.negate = 0;
2000 new_src.abs = 1;
2001 }
2002 new_src.negate ^= scan_inst->src[i].negate;
2003 scan_inst->src[i] = new_src;
2004 }
2005 }
2006 }
2007
2008 inst->remove();
2009 progress = true;
2010 }
2011
2012 if (progress)
2013 live_intervals_valid = false;
2014
2015 return progress;
2016 }
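
/* For illustration only (hypothetical vgrf numbers): given IR along the
 * lines of
 *
 *    mul vgrf7, vgrf3, vgrf4
 *    mov vgrf9, vgrf7
 *    add vgrf12, vgrf9, vgrf5
 *
 * with no later writes to vgrf7 or vgrf9, register_coalesce() rewrites the
 * read of vgrf9 in the ADD to vgrf7 and removes the MOV, leaving roughly
 *
 *    mul vgrf7, vgrf3, vgrf4
 *    add vgrf12, vgrf7, vgrf5
 */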
2017
2018
2019 bool
2020 fs_visitor::compute_to_mrf()
2021 {
2022 bool progress = false;
2023 int next_ip = 0;
2024
2025 calculate_live_intervals();
2026
2027 foreach_list_safe(node, &this->instructions) {
2028 fs_inst *inst = (fs_inst *)node;
2029
2030 int ip = next_ip;
2031 next_ip++;
2032
2033 if (inst->opcode != BRW_OPCODE_MOV ||
2034 inst->predicate ||
2035 inst->dst.file != MRF || inst->src[0].file != GRF ||
2036 inst->dst.type != inst->src[0].type ||
2037 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2038 continue;
2039
2040 /* Work out which hardware MRF registers are written by this
2041 * instruction.
2042 */
2043 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2044 int mrf_high;
2045 if (inst->dst.reg & BRW_MRF_COMPR4) {
2046 mrf_high = mrf_low + 4;
2047 } else if (dispatch_width == 16 &&
2048 (!inst->force_uncompressed && !inst->force_sechalf)) {
2049 mrf_high = mrf_low + 1;
2050 } else {
2051 mrf_high = mrf_low;
2052 }
2053
2054 /* Can't compute-to-MRF this GRF if someone else was going to
2055 * read it later.
2056 */
2057 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2058 continue;
2059
2060 /* Found a move of a GRF to a MRF. Let's see if we can go
2061 * rewrite the thing that made this GRF to write into the MRF.
2062 */
2063 fs_inst *scan_inst;
2064 for (scan_inst = (fs_inst *)inst->prev;
2065 scan_inst->prev != NULL;
2066 scan_inst = (fs_inst *)scan_inst->prev) {
2067 if (scan_inst->dst.file == GRF &&
2068 scan_inst->dst.reg == inst->src[0].reg) {
2069 /* Found the last thing to write our reg we want to turn
2070 * into a compute-to-MRF.
2071 */
2072
2073 /* If it's predicated, it (probably) didn't populate all
2074 * the channels. We might be able to rewrite everything
2075 * that writes that reg, but it would require smarter
2076 * tracking to delay the rewriting until complete success.
2077 */
2078 if (scan_inst->predicate)
2079 break;
2080
2081 /* If it's half of register setup and not the same half as
2082 * our MOV we're trying to remove, bail for now.
2083 */
2084 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2085 scan_inst->force_sechalf != inst->force_sechalf) {
2086 break;
2087 }
2088
2089 /* SEND instructions can't have MRF as a destination. */
2090 if (scan_inst->mlen)
2091 break;
2092
2093 if (intel->gen == 6) {
2094 /* gen6 math instructions must have the destination be
2095 * GRF, so no compute-to-MRF for them.
2096 */
2097 if (scan_inst->is_math()) {
2098 break;
2099 }
2100 }
2101
2102 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2103 /* Found the creator of our MRF's source value. */
2104 scan_inst->dst.file = MRF;
2105 scan_inst->dst.reg = inst->dst.reg;
2106 scan_inst->saturate |= inst->saturate;
2107 inst->remove();
2108 progress = true;
2109 }
2110 break;
2111 }
2112
2113 /* We don't handle control flow here. Most computation of
2114 * values that end up in MRFs happens shortly before the MRF
2115 * write anyway.
2116 */
2117 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2118 break;
2119
2120 /* You can't read from an MRF, so if someone else reads our
2121 * MRF's source GRF that we wanted to rewrite, that stops us.
2122 */
2123 bool interfered = false;
2124 for (int i = 0; i < 3; i++) {
2125 if (scan_inst->src[i].file == GRF &&
2126 scan_inst->src[i].reg == inst->src[0].reg &&
2127 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2128 interfered = true;
2129 }
2130 }
2131 if (interfered)
2132 break;
2133
2134 if (scan_inst->dst.file == MRF) {
2135 /* If somebody else writes our MRF here, we can't
2136 * compute-to-MRF before that.
2137 */
2138 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2139 int scan_mrf_high;
2140
2141 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2142 scan_mrf_high = scan_mrf_low + 4;
2143 } else if (dispatch_width == 16 &&
2144 (!scan_inst->force_uncompressed &&
2145 !scan_inst->force_sechalf)) {
2146 scan_mrf_high = scan_mrf_low + 1;
2147 } else {
2148 scan_mrf_high = scan_mrf_low;
2149 }
2150
2151 if (mrf_low == scan_mrf_low ||
2152 mrf_low == scan_mrf_high ||
2153 mrf_high == scan_mrf_low ||
2154 mrf_high == scan_mrf_high) {
2155 break;
2156 }
2157 }
2158
2159 if (scan_inst->mlen > 0) {
2160 /* Found a SEND instruction, which means that there are
2161 * live values in MRFs from base_mrf to base_mrf +
2162 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2163 * above it.
2164 */
2165 if (mrf_low >= scan_inst->base_mrf &&
2166 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2167 break;
2168 }
2169 if (mrf_high >= scan_inst->base_mrf &&
2170 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2171 break;
2172 }
2173 }
2174 }
2175 }
2176
2177 if (progress)
2178 live_intervals_valid = false;
2179
2180 return progress;
2181 }
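
/* A sketch of the compute-to-MRF transform above, with made-up register
 * numbers: a sequence such as
 *
 *    add vgrf8, vgrf2, vgrf3
 *    mov m4, vgrf8
 *
 * where vgrf8 is not read again becomes
 *
 *    add m4, vgrf2, vgrf3
 *
 * so the computation writes the message register directly and the MOV is
 * removed.
 */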
2182
2183 /**
2184 * Walks through basic blocks, looking for repeated MRF writes and
2185 * removing the later ones.
2186 */
2187 bool
2188 fs_visitor::remove_duplicate_mrf_writes()
2189 {
2190 fs_inst *last_mrf_move[16];
2191 bool progress = false;
2192
2193 /* Need to update the MRF tracking for compressed instructions. */
2194 if (dispatch_width == 16)
2195 return false;
2196
2197 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2198
2199 foreach_list_safe(node, &this->instructions) {
2200 fs_inst *inst = (fs_inst *)node;
2201
2202 if (inst->is_control_flow()) {
2203 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2204 }
2205
2206 if (inst->opcode == BRW_OPCODE_MOV &&
2207 inst->dst.file == MRF) {
2208 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2209 if (prev_inst && inst->equals(prev_inst)) {
2210 inst->remove();
2211 progress = true;
2212 continue;
2213 }
2214 }
2215
2216 /* Clear out the last-write records for MRFs that were overwritten. */
2217 if (inst->dst.file == MRF) {
2218 last_mrf_move[inst->dst.reg] = NULL;
2219 }
2220
2221 if (inst->mlen > 0) {
2222 /* Found a SEND instruction, which will include two or fewer
2223 * implied MRF writes. We could do better here.
2224 */
2225 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2226 last_mrf_move[inst->base_mrf + i] = NULL;
2227 }
2228 }
2229
2230 /* Clear out any MRF move records whose sources got overwritten. */
2231 if (inst->dst.file == GRF) {
2232 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2233 if (last_mrf_move[i] &&
2234 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2235 last_mrf_move[i] = NULL;
2236 }
2237 }
2238 }
2239
2240 if (inst->opcode == BRW_OPCODE_MOV &&
2241 inst->dst.file == MRF &&
2242 inst->src[0].file == GRF &&
2243 !inst->predicate) {
2244 last_mrf_move[inst->dst.reg] = inst;
2245 }
2246 }
2247
2248 if (progress)
2249 live_intervals_valid = false;
2250
2251 return progress;
2252 }
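
/* Example of the duplicate-MRF-write removal above (hypothetical
 * registers): within a basic block containing
 *
 *    mov m2, vgrf5
 *    ... nothing that overwrites m2 or vgrf5 ...
 *    mov m2, vgrf5
 *
 * the second MOV is redundant and is deleted.
 */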
2253
2254 static void
2255 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2256 int first_grf, int grf_len)
2257 {
2258 bool inst_16wide = (dispatch_width > 8 &&
2259 !inst->force_uncompressed &&
2260 !inst->force_sechalf);
2261
2262 /* Clear the flag for registers that actually got read (as expected). */
2263 for (int i = 0; i < 3; i++) {
2264 int grf;
2265 if (inst->src[i].file == GRF) {
2266 grf = inst->src[i].reg;
2267 } else if (inst->src[i].file == FIXED_HW_REG &&
2268 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2269 grf = inst->src[i].fixed_hw_reg.nr;
2270 } else {
2271 continue;
2272 }
2273
2274 if (grf >= first_grf &&
2275 grf < first_grf + grf_len) {
2276 deps[grf - first_grf] = false;
2277 if (inst_16wide)
2278 deps[grf - first_grf + 1] = false;
2279 }
2280 }
2281 }
2282
2283 /**
2284 * Implements this workaround for the original 965:
2285 *
2286 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2287 * check for post destination dependencies on this instruction, software
2288 * must ensure that there is no destination hazard for the case of ‘write
2289 * followed by a posted write’ shown in the following example.
2290 *
2291 * 1. mov r3 0
2292 * 2. send r3.xy <rest of send instruction>
2293 * 3. mov r2 r3
2294 *
2295 * Due to no post-destination dependency check on the ‘send’, the above
2296 * code sequence could have two instructions (1 and 2) in flight at the
2297 * same time that both consider ‘r3’ as the target of their final writes.
2298 */
2299 void
2300 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2301 {
2302 int reg_size = dispatch_width / 8;
2303 int write_len = inst->regs_written() * reg_size;
2304 int first_write_grf = inst->dst.reg;
2305 bool needs_dep[BRW_MAX_MRF];
2306 assert(write_len < (int)sizeof(needs_dep) - 1);
2307
2308 memset(needs_dep, false, sizeof(needs_dep));
2309 memset(needs_dep, true, write_len);
2310
2311 clear_deps_for_inst_src(inst, dispatch_width,
2312 needs_dep, first_write_grf, write_len);
2313
2314 /* Walk backwards looking for writes to registers we're writing which
2315 * aren't read since being written. If we hit the start of the program,
2316 * we assume that there are no outstanding dependencies on entry to the
2317 * program.
2318 */
2319 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2320 scan_inst != NULL;
2321 scan_inst = (fs_inst *)scan_inst->prev) {
2322
2323 /* If we hit control flow, assume that there *are* outstanding
2324 * dependencies, and force their cleanup before our instruction.
2325 */
2326 if (scan_inst->is_control_flow()) {
2327 for (int i = 0; i < write_len; i++) {
2328 if (needs_dep[i]) {
2329 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2330 }
2331 }
2332 }
2333
2334 bool scan_inst_16wide = (dispatch_width > 8 &&
2335 !scan_inst->force_uncompressed &&
2336 !scan_inst->force_sechalf);
2337
2338 /* We insert our reads as late as possible on the assumption that any
2339 * instruction but a MOV that might have left us an outstanding
2340 * dependency has more latency than a MOV.
2341 */
2342 if (scan_inst->dst.file == GRF) {
2343 for (int i = 0; i < scan_inst->regs_written(); i++) {
2344 int reg = scan_inst->dst.reg + i * reg_size;
2345
2346 if (reg >= first_write_grf &&
2347 reg < first_write_grf + write_len &&
2348 needs_dep[reg - first_write_grf]) {
2349 inst->insert_before(DEP_RESOLVE_MOV(reg));
2350 needs_dep[reg - first_write_grf] = false;
2351 if (scan_inst_16wide)
2352 needs_dep[reg - first_write_grf + 1] = false;
2353 }
2354 }
2355 }
2356
2357 /* Clear the flag for registers that actually got read (as expected). */
2358 clear_deps_for_inst_src(scan_inst, dispatch_width,
2359 needs_dep, first_write_grf, write_len);
2360
2361 /* Continue the loop only if we haven't resolved all the dependencies */
2362 int i;
2363 for (i = 0; i < write_len; i++) {
2364 if (needs_dep[i])
2365 break;
2366 }
2367 if (i == write_len)
2368 return;
2369 }
2370 }
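
/* Illustrative fix for the hazard quoted above (register numbers are made
 * up; the exact resolve instruction is whatever DEP_RESOLVE_MOV emits):
 *
 *    1. mov g3 0
 *    2. <dependency-resolving MOV reading g3 inserted here>
 *    3. send g3.xy <rest of send instruction>
 *
 * The inserted read of g3 forces the plain write in 1 to land before the
 * send's posted write to the same register can begin.
 */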
2371
2372 /**
2373 * Implements this workaround for the original 965:
2374 *
2375 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2376 * used as a destination register until after it has been sourced by an
2377 * instruction with a different destination register.
2378 */
2379 void
2380 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2381 {
2382 int write_len = inst->regs_written() * dispatch_width / 8;
2383 int first_write_grf = inst->dst.reg;
2384 bool needs_dep[BRW_MAX_MRF];
2385 assert(write_len < (int)sizeof(needs_dep) - 1);
2386
2387 memset(needs_dep, false, sizeof(needs_dep));
2388 memset(needs_dep, true, write_len);
2389 /* Walk forwards looking for writes to registers we're writing which aren't
2390 * read before being written.
2391 */
2392 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2393 !scan_inst->is_tail_sentinel();
2394 scan_inst = (fs_inst *)scan_inst->next) {
2395 /* If we hit control flow, force resolve all remaining dependencies. */
2396 if (scan_inst->is_control_flow()) {
2397 for (int i = 0; i < write_len; i++) {
2398 if (needs_dep[i])
2399 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2400 }
2401 }
2402
2403 /* Clear the flag for registers that actually got read (as expected). */
2404 clear_deps_for_inst_src(scan_inst, dispatch_width,
2405 needs_dep, first_write_grf, write_len);
2406
2407 /* We insert our reads as late as possible since they're reading the
2408 * result of a SEND, which has massive latency.
2409 */
2410 if (scan_inst->dst.file == GRF &&
2411 scan_inst->dst.reg >= first_write_grf &&
2412 scan_inst->dst.reg < first_write_grf + write_len &&
2413 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2414 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2415 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2416 }
2417
2418 /* Continue the loop only if we haven't resolved all the dependencies */
2419 int i;
2420 for (i = 0; i < write_len; i++) {
2421 if (needs_dep[i])
2422 break;
2423 }
2424 if (i == write_len)
2425 return;
2426 }
2427
2428 /* If we hit the end of the program, resolve all remaining dependencies out
2429 * of paranoia.
2430 */
2431 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2432 assert(last_inst->eot);
2433 for (int i = 0; i < write_len; i++) {
2434 if (needs_dep[i])
2435 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2436 }
2437 }
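
/* Corresponding illustration for the post-send case (again with made-up
 * register numbers):
 *
 *    1. send g14 <...>
 *    2. <dependency-resolving MOV reading g14 inserted here>
 *    3. mov g14 g7
 *
 * The read in 2 means g14 has been sourced before instruction 3 uses it as
 * a destination again, as the erratum requires.
 */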
2438
2439 void
2440 fs_visitor::insert_gen4_send_dependency_workarounds()
2441 {
2442 if (intel->gen != 4 || intel->is_g4x)
2443 return;
2444
2445 /* Note that we're done with register allocation, so GRF fs_regs always
2446 * have a .reg_offset of 0.
2447 */
2448
2449 foreach_list_safe(node, &this->instructions) {
2450 fs_inst *inst = (fs_inst *)node;
2451
2452 if (inst->mlen != 0 && inst->dst.file == GRF) {
2453 insert_gen4_pre_send_dependency_workarounds(inst);
2454 insert_gen4_post_send_dependency_workarounds(inst);
2455 }
2456 }
2457 }
2458
2459 /**
2460 * Turns the generic expression-style uniform pull constant load instruction
2461 * into a hardware-specific series of instructions for loading a pull
2462 * constant.
2463 *
2464 * The expression style allows the CSE pass before this to optimize out
2465 * repeated loads from the same offset, and gives the pre-register-allocation
2466 * scheduling full flexibility, while the conversion to native instructions
2467 * allows the post-register-allocation scheduler the best information
2468 * possible.
2469 *
2470 * Note that execution masking for setting up pull constant loads is special:
2471 * the channels that need to be written are unrelated to the current execution
2472 * mask, since a later instruction will use one of the result channels as a
2473 * source operand for all 8 or 16 of its channels.
2474 */
2475 void
2476 fs_visitor::lower_uniform_pull_constant_loads()
2477 {
2478 foreach_list(node, &this->instructions) {
2479 fs_inst *inst = (fs_inst *)node;
2480
2481 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2482 continue;
2483
2484 if (intel->gen >= 7) {
2485 fs_reg const_offset_reg = inst->src[1];
2486 assert(const_offset_reg.file == IMM &&
2487 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2488 const_offset_reg.imm.u /= 16;
2489 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2490
2491 /* This is actually going to be a MOV, but since only the first dword
2492 * is accessed, we have a special opcode to do just that one. Note
2493 * that this needs to be an operation that will be considered a def
2494 * by live variable analysis, or register allocation will explode.
2495 */
2496 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2497 payload, const_offset_reg);
2498 setup->force_writemask_all = true;
2499
2500 setup->ir = inst->ir;
2501 setup->annotation = inst->annotation;
2502 inst->insert_before(setup);
2503
2504 /* Similarly, this will only populate the first 4 channels of the
2505 * result register (since we only use smear values from 0-3), but we
2506 * don't tell the optimizer.
2507 */
2508 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2509 inst->src[1] = payload;
2510
2511 this->live_intervals_valid = false;
2512 } else {
2513 /* Before register allocation, we didn't tell the scheduler about the
2514 * MRF we use. We know it's safe to use this MRF because nothing
2515 * else does except for register spill/unspill, which generates and
2516 * uses its MRF within a single IR instruction.
2517 */
2518 inst->base_mrf = 14;
2519 inst->mlen = 1;
2520 }
2521 }
2522 }
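
/* Schematically, on gen7 the lowering above turns (opcode names as printed
 * by dump_instruction(), register numbers made up)
 *
 *    uniform_pull_const vgrf6, <surface index>, <imm offset>
 *
 * into
 *
 *    set_global_offset vgrf9, <imm offset / 16>
 *    uniform_pull_const_gen7 vgrf6, <surface index>, vgrf9
 *
 * while on earlier gens the original opcode is kept and simply assigned
 * base_mrf 14 and mlen 1 for the generator to build the send from.
 */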
2523
2524 void
2525 fs_visitor::dump_instruction(fs_inst *inst)
2526 {
2527 if (inst->predicate) {
2528 printf("(%cf0.%d) ",
2529 inst->predicate_inverse ? '-' : '+',
2530 inst->flag_subreg);
2531 }
2532
2533 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2534 opcode_descs[inst->opcode].name) {
2535 printf("%s", opcode_descs[inst->opcode].name);
2536 } else {
2537 switch (inst->opcode) {
2538 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2539 printf("uniform_pull_const");
2540 break;
2541 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2542 printf("uniform_pull_const_gen7");
2543 break;
2544 case FS_OPCODE_SET_SIMD4X2_OFFSET:
2545 printf("set_global_offset");
2546 break;
2547 default:
2548 printf("op%d", inst->opcode);
2549 break;
2550 }
2551 }
2552 if (inst->saturate)
2553 printf(".sat");
2554 if (inst->conditional_mod) {
2555 printf(".cmod");
2556 if (!inst->predicate &&
2557 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2558 inst->opcode != BRW_OPCODE_IF &&
2559 inst->opcode != BRW_OPCODE_WHILE))) {
2560 printf(".f0.%d", inst->flag_subreg);
2561 }
2562 }
2563 printf(" ");
2564
2565
2566 switch (inst->dst.file) {
2567 case GRF:
2568 printf("vgrf%d", inst->dst.reg);
2569 if (inst->dst.reg_offset)
2570 printf("+%d", inst->dst.reg_offset);
2571 break;
2572 case MRF:
2573 printf("m%d", inst->dst.reg);
2574 break;
2575 case BAD_FILE:
2576 printf("(null)");
2577 break;
2578 case UNIFORM:
2579 printf("***u%d***", inst->dst.reg);
2580 break;
2581 default:
2582 printf("???");
2583 break;
2584 }
2585 printf(", ");
2586
2587 for (int i = 0; i < 3; i++) {
2588 if (inst->src[i].negate)
2589 printf("-");
2590 if (inst->src[i].abs)
2591 printf("|");
2592 switch (inst->src[i].file) {
2593 case GRF:
2594 printf("vgrf%d", inst->src[i].reg);
2595 if (inst->src[i].reg_offset)
2596 printf("+%d", inst->src[i].reg_offset);
2597 break;
2598 case MRF:
2599 printf("***m%d***", inst->src[i].reg);
2600 break;
2601 case UNIFORM:
2602 printf("u%d", inst->src[i].reg);
2603 if (inst->src[i].reg_offset)
2604 printf(".%d", inst->src[i].reg_offset);
2605 break;
2606 case BAD_FILE:
2607 printf("(null)");
2608 break;
2609 case IMM:
2610 switch (inst->src[i].type) {
2611 case BRW_REGISTER_TYPE_F:
2612 printf("%ff", inst->src[i].imm.f);
2613 break;
2614 case BRW_REGISTER_TYPE_D:
2615 printf("%dd", inst->src[i].imm.i);
2616 break;
2617 case BRW_REGISTER_TYPE_UD:
2618 printf("%uu", inst->src[i].imm.u);
2619 break;
2620 default:
2621 printf("???");
2622 break;
2623 }
2624 break;
2625 default:
2626 printf("???");
2627 break;
2628 }
2629 if (inst->src[i].abs)
2630 printf("|");
2631
2632 if (i < 2)
2633 printf(", ");
2634 }
2635
2636 printf(" ");
2637
2638 if (inst->force_uncompressed)
2639 printf("1sthalf ");
2640
2641 if (inst->force_sechalf)
2642 printf("2ndhalf ");
2643
2644 printf("\n");
2645 }
2646
2647 void
2648 fs_visitor::dump_instructions()
2649 {
2650 int ip = 0;
2651 foreach_list(node, &this->instructions) {
2652 fs_inst *inst = (fs_inst *)node;
2653 printf("%d: ", ip++);
2654 dump_instruction(inst);
2655 }
2656 }
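
/* With the formatting above, one line of dump_instructions() output for an
 * ADD of two virtual GRFs looks roughly like (made-up numbers):
 *
 *    12: add vgrf8, vgrf3, vgrf5, (null)
 *
 * i.e. instruction index, opcode, destination, then the three source slots.
 */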
2657
2658 /**
2659 * Possibly returns an instruction that set up @param reg.
2660 *
2661 * Sometimes we want to take the result of some expression/variable
2662 * dereference tree and rewrite the instruction generating the result
2663 * of the tree. When processing the tree, we know that the
2664 * instructions generated are all writing temporaries that are dead
2665 * outside of this tree. So, if we have some instructions that write
2666 * a temporary, we're free to point that temp write somewhere else.
2667 *
2668 * Note that this doesn't guarantee that the returned instruction wrote
2669 * only reg -- it might be the size=4 destination of a texture instruction.
2670 */
2671 fs_inst *
2672 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2673 fs_inst *end,
2674 fs_reg reg)
2675 {
2676 if (end == start ||
2677 end->predicate ||
2678 end->force_uncompressed ||
2679 end->force_sechalf ||
2680 reg.reladdr ||
2681 !reg.equals(end->dst)) {
2682 return NULL;
2683 } else {
2684 return end;
2685 }
2686 }
2687
2688 void
2689 fs_visitor::setup_payload_gen6()
2690 {
2691 struct intel_context *intel = &brw->intel;
2692 bool uses_depth =
2693 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2694 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2695
2696 assert(intel->gen >= 6);
2697
2698 /* R0-1: masks, pixel X/Y coordinates. */
2699 c->nr_payload_regs = 2;
2700 /* R2: only for 32-pixel dispatch. */
2701
2702 /* R3-26: barycentric interpolation coordinates. These appear in the
2703 * same order that they appear in the brw_wm_barycentric_interp_mode
2704 * enum. Each set of coordinates occupies 2 registers if dispatch width
2705 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2706 * appear if they were enabled using the "Barycentric Interpolation
2707 * Mode" bits in WM_STATE.
2708 */
2709 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2710 if (barycentric_interp_modes & (1 << i)) {
2711 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2712 c->nr_payload_regs += 2;
2713 if (dispatch_width == 16) {
2714 c->nr_payload_regs += 2;
2715 }
2716 }
2717 }
2718
2719 /* R27: interpolated depth if uses source depth */
2720 if (uses_depth) {
2721 c->source_depth_reg = c->nr_payload_regs;
2722 c->nr_payload_regs++;
2723 if (dispatch_width == 16) {
2724 /* R28: interpolated depth if not 8-wide. */
2725 c->nr_payload_regs++;
2726 }
2727 }
2728 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2729 if (uses_depth) {
2730 c->source_w_reg = c->nr_payload_regs;
2731 c->nr_payload_regs++;
2732 if (dispatch_width == 16) {
2733 /* R30: interpolated W if not 8-wide. */
2734 c->nr_payload_regs++;
2735 }
2736 }
2737 /* R31: MSAA position offsets. */
2738 /* R32-: bary for 32-pixel. */
2739 /* R58-59: interp W for 32-pixel. */
2740
2741 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2742 c->source_depth_to_render_target = true;
2743 }
2744 }
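
/* Worked example of the payload counting above: an 8-wide shader using a
 * single barycentric interpolation mode and neither source depth nor W
 * starts with the two mask/pixel-XY registers, places that one set of
 * barycentric coordinates at payload register 2 (occupying two registers),
 * and ends up with c->nr_payload_regs == 4.
 */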
2745
2746 bool
2747 fs_visitor::run()
2748 {
2749 sanity_param_count = fp->Base.Parameters->NumParameters;
2750 uint32_t orig_nr_params = c->prog_data.nr_params;
2751
2752 if (intel->gen >= 6)
2753 setup_payload_gen6();
2754 else
2755 setup_payload_gen4();
2756
2757 if (0) {
2758 emit_dummy_fs();
2759 } else {
2760 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2761 emit_shader_time_begin();
2762
2763 calculate_urb_setup();
2764 if (intel->gen < 6)
2765 emit_interpolation_setup_gen4();
2766 else
2767 emit_interpolation_setup_gen6();
2768
2769 /* We handle discards by keeping track of the still-live pixels in f0.1.
2770 * Initialize it with the dispatched pixels.
2771 */
2772 if (fp->UsesKill) {
2773 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2774 discard_init->flag_subreg = 1;
2775 }
2776
2777 /* Generate FS IR for main(). (the visitor only descends into
2778 * functions called "main").
2779 */
2780 if (shader) {
2781 foreach_list(node, &*shader->ir) {
2782 ir_instruction *ir = (ir_instruction *)node;
2783 base_ir = ir;
2784 this->result = reg_undef;
2785 ir->accept(this);
2786 }
2787 } else {
2788 emit_fragment_program_code();
2789 }
2790 base_ir = NULL;
2791 if (failed)
2792 return false;
2793
2794 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2795 emit_shader_time_end();
2796
2797 emit_fb_writes();
2798
2799 split_virtual_grfs();
2800
2801 move_uniform_array_access_to_pull_constants();
2802 setup_pull_constants();
2803
2804 bool progress;
2805 do {
2806 progress = false;
2807
2808 compact_virtual_grfs();
2809
2810 progress = remove_duplicate_mrf_writes() || progress;
2811
2812 progress = opt_algebraic() || progress;
2813 progress = opt_cse() || progress;
2814 progress = opt_copy_propagate() || progress;
2815 progress = dead_code_eliminate() || progress;
2816 progress = register_coalesce() || progress;
2817 progress = register_coalesce_2() || progress;
2818 progress = compute_to_mrf() || progress;
2819 } while (progress);
2820
2821 remove_dead_constants();
2822
2823 schedule_instructions(false);
2824
2825 lower_uniform_pull_constant_loads();
2826
2827 assign_curb_setup();
2828 assign_urb_setup();
2829
2830 if (0) {
2831 /* Debug of register spilling: Go spill everything. */
2832 for (int i = 0; i < virtual_grf_count; i++) {
2833 spill_reg(i);
2834 }
2835 }
2836
2837 if (0)
2838 assign_regs_trivial();
2839 else {
2840 while (!assign_regs()) {
2841 if (failed)
2842 break;
2843 }
2844 }
2845 }
2846 assert(force_uncompressed_stack == 0);
2847 assert(force_sechalf_stack == 0);
2848
2849 /* This must come after all optimization and register allocation, since
2850 * it inserts dead code that happens to have side effects, and it does
2851 * so based on the actual physical registers in use.
2852 */
2853 insert_gen4_send_dependency_workarounds();
2854
2855 if (failed)
2856 return false;
2857
2858 schedule_instructions(true);
2859
2860 if (dispatch_width == 8) {
2861 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2862 } else {
2863 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2864
2865 /* Make sure we didn't try to sneak in an extra uniform */
2866 assert(orig_nr_params == c->prog_data.nr_params);
2867 (void) orig_nr_params;
2868 }
2869
2870 /* If any state parameters were appended, then ParameterValues could have
2871 * been realloced, in which case the driver uniform storage set up by
2872 * _mesa_associate_uniform_storage() would point to freed memory. Make
2873 * sure that didn't happen.
2874 */
2875 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2876
2877 return !failed;
2878 }
2879
2880 const unsigned *
2881 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2882 struct gl_fragment_program *fp,
2883 struct gl_shader_program *prog,
2884 unsigned *final_assembly_size)
2885 {
2886 struct intel_context *intel = &brw->intel;
2887 bool start_busy = false;
2888 float start_time = 0;
2889
2890 if (unlikely(intel->perf_debug)) {
2891 start_busy = (intel->batch.last_bo &&
2892 drm_intel_bo_busy(intel->batch.last_bo));
2893 start_time = get_time();
2894 }
2895
2896 struct brw_shader *shader = NULL;
2897 if (prog)
2898 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2899
2900 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2901 if (shader) {
2902 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2903 _mesa_print_ir(shader->ir, NULL);
2904 printf("\n\n");
2905 } else {
2906 printf("ARB_fragment_program %d ir for native fragment shader\n",
2907 fp->Base.Id);
2908 _mesa_print_program(&fp->Base);
2909 }
2910 }
2911
2912 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2913 */
2914 fs_visitor v(brw, c, prog, fp, 8);
2915 if (!v.run()) {
2916 prog->LinkStatus = false;
2917 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2918
2919 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2920 v.fail_msg);
2921
2922 return NULL;
2923 }
2924
2925 exec_list *simd16_instructions = NULL;
2926 fs_visitor v2(brw, c, prog, fp, 16);
2927 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2928 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2929 v2.import_uniforms(&v);
2930 if (!v2.run()) {
2931 perf_debug("16-wide shader failed to compile, falling back to "
2932 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2933 } else {
2934 simd16_instructions = &v2.instructions;
2935 }
2936 }
2937
2938 c->prog_data.dispatch_width = 8;
2939
2940 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2941 const unsigned *generated = g.generate_assembly(&v.instructions,
2942 simd16_instructions,
2943 final_assembly_size);
2944
2945 if (unlikely(intel->perf_debug) && shader) {
2946 if (shader->compiled_once)
2947 brw_wm_debug_recompile(brw, prog, &c->key);
2948 shader->compiled_once = true;
2949
2950 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2951 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2952 (get_time() - start_time) * 1000);
2953 }
2954 }
2955
2956 return generated;
2957 }
2958
2959 bool
2960 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2961 {
2962 struct brw_context *brw = brw_context(ctx);
2963 struct intel_context *intel = &brw->intel;
2964 struct brw_wm_prog_key key;
2965
2966 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2967 return true;
2968
2969 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2970 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2971 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2972 bool program_uses_dfdy = fp->UsesDFdy;
2973
2974 memset(&key, 0, sizeof(key));
2975
2976 if (intel->gen < 6) {
2977 if (fp->UsesKill)
2978 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2979
2980 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2981 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2982
2983 /* Just assume depth testing. */
2984 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2985 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2986 }
2987
2988 if (prog->Name != 0)
2989 key.proj_attrib_mask = ~(GLbitfield64) 0;
2990
2991 if (intel->gen < 6)
2992 key.vp_outputs_written |= BITFIELD64_BIT(VARYING_SLOT_POS);
2993
2994 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
2995 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2996 continue;
2997
2998 if (prog->Name == 0)
2999 key.proj_attrib_mask |= BITFIELD64_BIT(i);
3000
3001 if (intel->gen < 6) {
3002 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3003 key.vp_outputs_written |= BITFIELD64_BIT(i);
3004 }
3005 }
3006
3007 key.clamp_fragment_color = true;
3008
3009 for (int i = 0; i < MAX_SAMPLERS; i++) {
3010 if (fp->Base.ShadowSamplers & (1 << i)) {
3011 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3012 key.tex.swizzles[i] =
3013 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3014 } else {
3015 /* Color sampler: assume no swizzling. */
3016 key.tex.swizzles[i] = SWIZZLE_XYZW;
3017 }
3018 }
3019
3020 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3021 key.drawable_height = ctx->DrawBuffer->Height;
3022 }
3023
3024 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3025 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3026 }
3027
3028 key.nr_color_regions = 1;
3029
3030 key.program_string_id = bfp->id;
3031
3032 uint32_t old_prog_offset = brw->wm.prog_offset;
3033 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3034
3035 bool success = do_wm_prog(brw, prog, bfp, &key);
3036
3037 brw->wm.prog_offset = old_prog_offset;
3038 brw->wm.prog_data = old_prog_data;
3039
3040 return success;
3041 }