i965: Add names for all instructions to dump_instruction() in FS and VS.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 #define ALU3(op) \
150 fs_inst * \
151 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
152 { \
153 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172
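/* For reference, each ALUn(op) line above stamps out one emit helper.
 * ALU2(ADD), for example, expands to (preprocessor output, reformatted):
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * so visitor code can build IR as ADD(dst, a, b) rather than spelling
 * out the fs_inst constructor each time.
 */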
173 /** Gen4 predicated IF. */
174 fs_inst *
175 fs_visitor::IF(uint32_t predicate)
176 {
177 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
178 inst->predicate = predicate;
179 return inst;
180 }
181
182 /** Gen6+ IF with embedded comparison. */
183 fs_inst *
184 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
185 {
186 assert(intel->gen >= 6);
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
188 reg_null_d, src0, src1);
189 inst->conditional_mod = condition;
190 return inst;
191 }
192
193 /**
194 * CMP: Sets the low bit of the destination channels with the result
195 * of the comparison, while the upper bits are undefined, and updates
196 * the flag register with the packed 16 bits of the result.
197 */
198 fs_inst *
199 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
200 {
201 fs_inst *inst;
202
203 /* Take the instruction:
204 *
205 * CMP null<d> src0<f> src1<f>
206 *
207 * Original gen4 does type conversion to the destination type before
208 * comparison, producing garbage results for floating point comparisons.
209 * gen5 does the comparison on the execution type (resolved source types),
210 * so dst type doesn't matter. gen6 does comparison and then uses the
211 * result as if it was the dst type with no conversion, which happens to
212 * mostly work out for float-interpreted-as-int since our comparisons are
213 * for >0, =0, <0.
214 */
215 if (intel->gen == 4) {
216 dst.type = src0.type;
217 if (dst.file == FIXED_HW_REG)
218 dst.fixed_hw_reg.type = dst.type;
219 }
220
221 resolve_ud_negate(&src0);
222 resolve_ud_negate(&src1);
223
224 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
225 inst->conditional_mod = condition;
226
227 return inst;
228 }
229
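/* Typical use of the helpers above (a sketch; a and b are hypothetical
 * registers): compare, then branch on the resulting flag with the gen4
 * predicated IF:
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 *    ...then-side instructions...
 *    emit(BRW_OPCODE_ENDIF);
 *
 * On gen6+, the two-source IF() overload above fuses the comparison
 * into the IF itself.
 */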
230 exec_list
231 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
232 fs_reg offset)
233 {
234 exec_list instructions;
235 fs_inst *inst;
236
237 if (intel->gen >= 7) {
238 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
239 dst, surf_index, offset);
240 instructions.push_tail(inst);
241 } else {
242 int base_mrf = 13;
243 bool header_present = true;
244
245 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
246 mrf.type = BRW_REGISTER_TYPE_D;
247
248 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
249 * dword-aligned byte offset.
250 */
251 if (intel->gen == 6) {
252 instructions.push_tail(MOV(mrf, offset));
253 } else {
254 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
255 }
257 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
258 dst, surf_index);
259 inst->header_present = header_present;
260 inst->base_mrf = base_mrf;
261 inst->mlen = header_present + dispatch_width / 8;
262
263 instructions.push_tail(inst);
264 }
265
266 return instructions;
267 }
268
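/* Concrete offset example: fetching dword 7 of the constant buffer sends
 * 7 itself on gen6, but 7 * 4 = 28 (the dword-aligned byte offset) on
 * gen4/5; both name the same 4-byte slot.
 */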
269 /**
270 * A helper for MOV generation for fixing up broken hardware SEND dependency
271 * handling.
272 */
273 fs_inst *
274 fs_visitor::DEP_RESOLVE_MOV(int grf)
275 {
276 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
277
278 inst->ir = NULL;
279 inst->annotation = "send dependency resolve";
280
281 /* The caller always wants uncompressed to emit the minimal extra
282 * dependencies, and to avoid having to deal with aligning its regs to 2.
283 */
284 inst->force_uncompressed = true;
285
286 return inst;
287 }
288
289 bool
290 fs_inst::equals(fs_inst *inst)
291 {
292 return (opcode == inst->opcode &&
293 dst.equals(inst->dst) &&
294 src[0].equals(inst->src[0]) &&
295 src[1].equals(inst->src[1]) &&
296 src[2].equals(inst->src[2]) &&
297 saturate == inst->saturate &&
298 predicate == inst->predicate &&
299 conditional_mod == inst->conditional_mod &&
300 mlen == inst->mlen &&
301 base_mrf == inst->base_mrf &&
302 sampler == inst->sampler &&
303 target == inst->target &&
304 eot == inst->eot &&
305 header_present == inst->header_present &&
306 shadow_compare == inst->shadow_compare &&
307 offset == inst->offset);
308 }
309
310 int
311 fs_inst::regs_written()
312 {
313 if (is_tex())
314 return 4;
315
316 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
317 * but we don't currently use them...nor do we have an opcode for them.
318 */
319
320 return 1;
321 }
322
323 bool
324 fs_inst::overwrites_reg(const fs_reg &reg)
325 {
326 return (reg.file == dst.file &&
327 reg.reg == dst.reg &&
328 reg.reg_offset >= dst.reg_offset &&
329 reg.reg_offset < dst.reg_offset + regs_written());
330 }
331
332 bool
333 fs_inst::is_tex()
334 {
335 return (opcode == SHADER_OPCODE_TEX ||
336 opcode == FS_OPCODE_TXB ||
337 opcode == SHADER_OPCODE_TXD ||
338 opcode == SHADER_OPCODE_TXF ||
339 opcode == SHADER_OPCODE_TXF_MS ||
340 opcode == SHADER_OPCODE_TXL ||
341 opcode == SHADER_OPCODE_TXS ||
342 opcode == SHADER_OPCODE_LOD);
343 }
344
345 bool
346 fs_inst::is_math()
347 {
348 return (opcode == SHADER_OPCODE_RCP ||
349 opcode == SHADER_OPCODE_RSQ ||
350 opcode == SHADER_OPCODE_SQRT ||
351 opcode == SHADER_OPCODE_EXP2 ||
352 opcode == SHADER_OPCODE_LOG2 ||
353 opcode == SHADER_OPCODE_SIN ||
354 opcode == SHADER_OPCODE_COS ||
355 opcode == SHADER_OPCODE_INT_QUOTIENT ||
356 opcode == SHADER_OPCODE_INT_REMAINDER ||
357 opcode == SHADER_OPCODE_POW);
358 }
359
360 bool
361 fs_inst::is_control_flow()
362 {
363 switch (opcode) {
364 case BRW_OPCODE_DO:
365 case BRW_OPCODE_WHILE:
366 case BRW_OPCODE_IF:
367 case BRW_OPCODE_ELSE:
368 case BRW_OPCODE_ENDIF:
369 case BRW_OPCODE_BREAK:
370 case BRW_OPCODE_CONTINUE:
371 return true;
372 default:
373 return false;
374 }
375 }
376
377 bool
378 fs_inst::is_send_from_grf()
379 {
380 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
381 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
382 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
383 src[1].file == GRF));
384 }
385
386 bool
387 fs_visitor::can_do_source_mods(fs_inst *inst)
388 {
389 if (intel->gen == 6 && inst->is_math())
390 return false;
391
392 if (inst->is_send_from_grf())
393 return false;
394
395 return true;
396 }
397
398 void
399 fs_reg::init()
400 {
401 memset(this, 0, sizeof(*this));
402 this->smear = -1;
403 }
404
405 /** Generic unset register constructor. */
406 fs_reg::fs_reg()
407 {
408 init();
409 this->file = BAD_FILE;
410 }
411
412 /** Immediate value constructor. */
413 fs_reg::fs_reg(float f)
414 {
415 init();
416 this->file = IMM;
417 this->type = BRW_REGISTER_TYPE_F;
418 this->imm.f = f;
419 }
420
421 /** Immediate value constructor. */
422 fs_reg::fs_reg(int32_t i)
423 {
424 init();
425 this->file = IMM;
426 this->type = BRW_REGISTER_TYPE_D;
427 this->imm.i = i;
428 }
429
430 /** Immediate value constructor. */
431 fs_reg::fs_reg(uint32_t u)
432 {
433 init();
434 this->file = IMM;
435 this->type = BRW_REGISTER_TYPE_UD;
436 this->imm.u = u;
437 }
438
439 /** Fixed brw_reg Immediate value constructor. */
440 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
441 {
442 init();
443 this->file = FIXED_HW_REG;
444 this->fixed_hw_reg = fixed_hw_reg;
445 this->type = fixed_hw_reg.type;
446 }
447
448 bool
449 fs_reg::equals(const fs_reg &r) const
450 {
451 return (file == r.file &&
452 reg == r.reg &&
453 reg_offset == r.reg_offset &&
454 type == r.type &&
455 negate == r.negate &&
456 abs == r.abs &&
457 !reladdr && !r.reladdr &&
458 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
459 sizeof(fixed_hw_reg)) == 0 &&
460 smear == r.smear &&
461 imm.u == r.imm.u);
462 }
463
464 bool
465 fs_reg::is_zero() const
466 {
467 if (file != IMM)
468 return false;
469
470 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
471 }
472
473 bool
474 fs_reg::is_one() const
475 {
476 if (file != IMM)
477 return false;
478
479 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
480 }
481
482 int
483 fs_visitor::type_size(const struct glsl_type *type)
484 {
485 unsigned int size, i;
486
487 switch (type->base_type) {
488 case GLSL_TYPE_UINT:
489 case GLSL_TYPE_INT:
490 case GLSL_TYPE_FLOAT:
491 case GLSL_TYPE_BOOL:
492 return type->components();
493 case GLSL_TYPE_ARRAY:
494 return type_size(type->fields.array) * type->length;
495 case GLSL_TYPE_STRUCT:
496 size = 0;
497 for (i = 0; i < type->length; i++) {
498 size += type_size(type->fields.structure[i].type);
499 }
500 return size;
501 case GLSL_TYPE_SAMPLER:
502 /* Samplers take up no register space, since they're baked in at
503 * link time.
504 */
505 return 0;
506 case GLSL_TYPE_VOID:
507 case GLSL_TYPE_ERROR:
508 case GLSL_TYPE_INTERFACE:
509 assert(!"not reached");
510 break;
511 }
512
513 return 0;
514 }
515
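/* Example sizes under this scheme: float -> 1 slot, vec4 -> 4, mat4 ->
 * 16 (components() counts all columns), float[10] -> 10, and a struct
 * of a vec3 and a float -> 3 + 1 = 4.
 */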
516 fs_reg
517 fs_visitor::get_timestamp()
518 {
519 assert(intel->gen >= 7);
520
521 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
522 BRW_ARF_TIMESTAMP,
523 0),
524 BRW_REGISTER_TYPE_UD));
525
526 fs_reg dst = fs_reg(this, glsl_type::uint_type);
527
528 fs_inst *mov = emit(MOV(dst, ts));
529 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
530 * even if it's not enabled in the dispatch.
531 */
532 mov->force_writemask_all = true;
533 mov->force_uncompressed = true;
534
535 /* The caller wants the low 32 bits of the timestamp. Since it's running
536 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
537 * which is plenty of time for our purposes. It is identical across the
538 * EUs, but since it's tracking GPU core speed it will increment at a
539 * varying rate as render P-states change.
540 *
541 * The caller could also check if render P-states have changed (or anything
542 * else that might disrupt timing) by setting smear to 2 and checking if
543 * that field is != 0.
544 */
545 dst.smear = 0;
546
547 return dst;
548 }
549
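/* Rollover arithmetic for the comment above: a 32-bit counter ticking
 * at roughly 1.2 GHz wraps after about 2^32 / 1.2e9 ~= 3.6 seconds,
 * hence "every ~3 seconds".
 */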
550 void
551 fs_visitor::emit_shader_time_begin()
552 {
553 current_annotation = "shader time start";
554 shader_start_time = get_timestamp();
555 }
556
557 void
558 fs_visitor::emit_shader_time_end()
559 {
560 current_annotation = "shader time end";
561
562 enum shader_time_shader_type type, written_type, reset_type;
563 if (dispatch_width == 8) {
564 type = ST_FS8;
565 written_type = ST_FS8_WRITTEN;
566 reset_type = ST_FS8_RESET;
567 } else {
568 assert(dispatch_width == 16);
569 type = ST_FS16;
570 written_type = ST_FS16_WRITTEN;
571 reset_type = ST_FS16_RESET;
572 }
573
574 fs_reg shader_end_time = get_timestamp();
575
576 /* Check that there weren't any timestamp reset events (assuming these
577 * were the only two timestamp reads that happened).
578 */
579 fs_reg reset = shader_end_time;
580 reset.smear = 2;
581 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
582 test->conditional_mod = BRW_CONDITIONAL_Z;
583 emit(IF(BRW_PREDICATE_NORMAL));
584
585 push_force_uncompressed();
586 fs_reg start = shader_start_time;
587 start.negate = true;
588 fs_reg diff = fs_reg(this, glsl_type::uint_type);
589 emit(ADD(diff, start, shader_end_time));
590
591 /* If there were no instructions between the two timestamp gets, the diff
592 * is 2 cycles. Remove that overhead, so I can forget about that when
593 * trying to determine the time taken for single instructions.
594 */
595 emit(ADD(diff, diff, fs_reg(-2u)));
596
597 emit_shader_time_write(type, diff);
598 emit_shader_time_write(written_type, fs_reg(1u));
599 emit(BRW_OPCODE_ELSE);
600 emit_shader_time_write(reset_type, fs_reg(1u));
601 emit(BRW_OPCODE_ENDIF);
602
603 pop_force_uncompressed();
604 }
605
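/* The negate-then-ADD pair above is how subtraction is spelled in this
 * IR: with start.negate set, ADD(diff, start, shader_end_time) computes
 * diff = end - start without a dedicated subtract opcode.
 */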
606 void
607 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
608 fs_reg value)
609 {
610 int shader_time_index = brw_get_shader_time_index(brw, prog, &fp->Base,
611 type);
612 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
613
614 fs_reg payload;
615 if (dispatch_width == 8)
616 payload = fs_reg(this, glsl_type::uvec2_type);
617 else
618 payload = fs_reg(this, glsl_type::uint_type);
619
620 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
621 fs_reg(), payload, offset, value));
622 }
623
624 void
625 fs_visitor::fail(const char *format, ...)
626 {
627 va_list va;
628 char *msg;
629
630 if (failed)
631 return;
632
633 failed = true;
634
635 va_start(va, format);
636 msg = ralloc_vasprintf(mem_ctx, format, va);
637 va_end(va);
638 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
639
640 this->fail_msg = msg;
641
642 if (INTEL_DEBUG & DEBUG_WM) {
643 fprintf(stderr, "%s", msg);
644 }
645 }
646
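/* fail() is used like printf; for example, later in this file:
 *
 *    fail("16-wide INTDIV unsupported\n");
 *
 * which records the first failure message and lets compilation bail out
 * gracefully instead of crashing.
 */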
647 fs_inst *
648 fs_visitor::emit(enum opcode opcode)
649 {
650 return emit(fs_inst(opcode));
651 }
652
653 fs_inst *
654 fs_visitor::emit(enum opcode opcode, fs_reg dst)
655 {
656 return emit(fs_inst(opcode, dst));
657 }
658
659 fs_inst *
660 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
661 {
662 return emit(fs_inst(opcode, dst, src0));
663 }
664
665 fs_inst *
666 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
667 {
668 return emit(fs_inst(opcode, dst, src0, src1));
669 }
670
671 fs_inst *
672 fs_visitor::emit(enum opcode opcode, fs_reg dst,
673 fs_reg src0, fs_reg src1, fs_reg src2)
674 {
675 return emit(fs_inst(opcode, dst, src0, src1, src2));
676 }
677
678 void
679 fs_visitor::push_force_uncompressed()
680 {
681 force_uncompressed_stack++;
682 }
683
684 void
685 fs_visitor::pop_force_uncompressed()
686 {
687 force_uncompressed_stack--;
688 assert(force_uncompressed_stack >= 0);
689 }
690
691 void
692 fs_visitor::push_force_sechalf()
693 {
694 force_sechalf_stack++;
695 }
696
697 void
698 fs_visitor::pop_force_sechalf()
699 {
700 force_sechalf_stack--;
701 assert(force_sechalf_stack >= 0);
702 }
703
704 /**
705 * Returns how many MRFs an FS opcode will write over.
706 *
707 * Note that this is not the 0 or 1 implied writes in an actual gen
708 * instruction -- the FS opcodes often generate MOVs in addition.
709 */
710 int
711 fs_visitor::implied_mrf_writes(fs_inst *inst)
712 {
713 if (inst->mlen == 0)
714 return 0;
715
716 switch (inst->opcode) {
717 case SHADER_OPCODE_RCP:
718 case SHADER_OPCODE_RSQ:
719 case SHADER_OPCODE_SQRT:
720 case SHADER_OPCODE_EXP2:
721 case SHADER_OPCODE_LOG2:
722 case SHADER_OPCODE_SIN:
723 case SHADER_OPCODE_COS:
724 return 1 * dispatch_width / 8;
725 case SHADER_OPCODE_POW:
726 case SHADER_OPCODE_INT_QUOTIENT:
727 case SHADER_OPCODE_INT_REMAINDER:
728 return 2 * dispatch_width / 8;
729 case SHADER_OPCODE_TEX:
730 case FS_OPCODE_TXB:
731 case SHADER_OPCODE_TXD:
732 case SHADER_OPCODE_TXF:
733 case SHADER_OPCODE_TXF_MS:
734 case SHADER_OPCODE_TXL:
735 case SHADER_OPCODE_TXS:
736 case SHADER_OPCODE_LOD:
737 return 1;
738 case FS_OPCODE_FB_WRITE:
739 return 2;
740 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
741 case FS_OPCODE_UNSPILL:
742 return 1;
743 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
744 return inst->header_present;
745 case FS_OPCODE_SPILL:
746 return 2;
747 default:
748 assert(!"not reached");
749 return inst->mlen;
750 }
751 }
752
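/* Worked example: a SIMD16 POW takes two operands, each occupying
 * dispatch_width / 8 = 2 MRFs, so the switch above reports 2 * 2 = 4
 * implied MRF writes.
 */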
753 int
754 fs_visitor::virtual_grf_alloc(int size)
755 {
756 if (virtual_grf_array_size <= virtual_grf_count) {
757 if (virtual_grf_array_size == 0)
758 virtual_grf_array_size = 16;
759 else
760 virtual_grf_array_size *= 2;
761 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
762 virtual_grf_array_size);
763 }
764 virtual_grf_sizes[virtual_grf_count] = size;
765 return virtual_grf_count++;
766 }
767
768 /** Fixed HW reg constructor. */
769 fs_reg::fs_reg(enum register_file file, int reg)
770 {
771 init();
772 this->file = file;
773 this->reg = reg;
774 this->type = BRW_REGISTER_TYPE_F;
775 }
776
777 /** Fixed HW reg constructor. */
778 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
779 {
780 init();
781 this->file = file;
782 this->reg = reg;
783 this->type = type;
784 }
785
786 /** Automatic reg constructor. */
787 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
788 {
789 init();
790
791 this->file = GRF;
792 this->reg = v->virtual_grf_alloc(v->type_size(type));
793 this->reg_offset = 0;
794 this->type = brw_type_for_base_type(type);
795 }
796
797 fs_reg *
798 fs_visitor::variable_storage(ir_variable *var)
799 {
800 return (fs_reg *)hash_table_find(this->variable_ht, var);
801 }
802
803 void
804 import_uniforms_callback(const void *key,
805 void *data,
806 void *closure)
807 {
808 struct hash_table *dst_ht = (struct hash_table *)closure;
809 const fs_reg *reg = (const fs_reg *)data;
810
811 if (reg->file != UNIFORM)
812 return;
813
814 hash_table_insert(dst_ht, data, key);
815 }
816
817 /* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
818  * This brings in those uniform definitions.
819 */
820 void
821 fs_visitor::import_uniforms(fs_visitor *v)
822 {
823 hash_table_call_foreach(v->variable_ht,
824 import_uniforms_callback,
825 variable_ht);
826 this->params_remap = v->params_remap;
827 }
828
829 /* Our support for uniforms is piggy-backed on the struct
830 * gl_fragment_program, because that's where the values actually
831 * get stored, rather than in some global gl_shader_program uniform
832 * store.
833 */
834 void
835 fs_visitor::setup_uniform_values(ir_variable *ir)
836 {
837 int namelen = strlen(ir->name);
838
839 /* The data for our (non-builtin) uniforms is stored in a series of
840 * gl_uniform_driver_storage structs for each subcomponent that
841 * glGetUniformLocation() could name. We know it's been set up in the same
842 * order we'd walk the type, so walk the list of storage and find anything
843 * with our name, or the prefix of a component that starts with our name.
844 */
845 unsigned params_before = c->prog_data.nr_params;
846 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
847 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
848
849 if (strncmp(ir->name, storage->name, namelen) != 0 ||
850 (storage->name[namelen] != 0 &&
851 storage->name[namelen] != '.' &&
852 storage->name[namelen] != '[')) {
853 continue;
854 }
855
856 unsigned slots = storage->type->component_slots();
857 if (storage->array_elements)
858 slots *= storage->array_elements;
859
860 for (unsigned i = 0; i < slots; i++) {
861 c->prog_data.param[c->prog_data.nr_params++] =
862 &storage->storage[i].f;
863 }
864 }
865
866 /* Make sure we actually initialized the right amount of stuff here. */
867 assert(params_before + ir->type->component_slots() ==
868 c->prog_data.nr_params);
869 }
870
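/* Sketch of the prefix test above: for ir->name == "light", storage
 * names "light", "light.position" and "light[3]" all match, while
 * "lightmap" is rejected because the character after the prefix is
 * neither NUL, '.' nor '['.
 */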
871
872 /* Our support for builtin uniforms is even scarier than non-builtin.
873 * It sits on top of the PROG_STATE_VAR parameters that are
874 * automatically updated from GL context state.
875 */
876 void
877 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
878 {
879 const ir_state_slot *const slots = ir->state_slots;
880 assert(ir->state_slots != NULL);
881
882 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
883       /* This state reference has already been set up by ir_to_mesa, but we'll
884 * get the same index back here.
885 */
886 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
887 (gl_state_index *)slots[i].tokens);
888
889 /* Add each of the unique swizzles of the element as a parameter.
890 * This'll end up matching the expected layout of the
891 * array/matrix/structure we're trying to fill in.
892 */
893 int last_swiz = -1;
894 for (unsigned int j = 0; j < 4; j++) {
895 int swiz = GET_SWZ(slots[i].swizzle, j);
896 if (swiz == last_swiz)
897 break;
898 last_swiz = swiz;
899
900 c->prog_data.param[c->prog_data.nr_params++] =
901 &fp->Base.Parameters->ParameterValues[index][swiz].f;
902 }
903 }
904 }
905
906 fs_reg *
907 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
908 {
909 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
910 fs_reg wpos = *reg;
911 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
912
913 /* gl_FragCoord.x */
914 if (ir->pixel_center_integer) {
915 emit(MOV(wpos, this->pixel_x));
916 } else {
917 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
918 }
919 wpos.reg_offset++;
920
921 /* gl_FragCoord.y */
922 if (!flip && ir->pixel_center_integer) {
923 emit(MOV(wpos, this->pixel_y));
924 } else {
925 fs_reg pixel_y = this->pixel_y;
926 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
927
928 if (flip) {
929 pixel_y.negate = true;
930 offset += c->key.drawable_height - 1.0;
931 }
932
933 emit(ADD(wpos, pixel_y, fs_reg(offset)));
934 }
935 wpos.reg_offset++;
936
937 /* gl_FragCoord.z */
938 if (intel->gen >= 6) {
939 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
940 } else {
941 emit(FS_OPCODE_LINTERP, wpos,
942 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
943 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
944 interp_reg(VARYING_SLOT_POS, 2));
945 }
946 wpos.reg_offset++;
947
948 /* gl_FragCoord.w: Already set up in emit_interpolation */
949 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
950
951 return reg;
952 }
953
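/* Flip arithmetic sketch: drawing to a window-system buffer (so flip is
 * true for the default lower-left origin), a pixel at device row y in a
 * drawable of height H gets gl_FragCoord.y = (H - 1 + center) - y,
 * which is exactly the negate + constant-offset ADD above; center is
 * 0.0 or 0.5 depending on pixel_center_integer.
 */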
954 fs_inst *
955 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
956 glsl_interp_qualifier interpolation_mode,
957 bool is_centroid)
958 {
959 brw_wm_barycentric_interp_mode barycoord_mode;
960 if (is_centroid) {
961 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
962 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
963 else
964 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
965 } else {
966 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
967 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
968 else
969 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
970 }
971 return emit(FS_OPCODE_LINTERP, attr,
972 this->delta_x[barycoord_mode],
973 this->delta_y[barycoord_mode], interp);
974 }
975
976 fs_reg *
977 fs_visitor::emit_general_interpolation(ir_variable *ir)
978 {
979 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
980 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
981 fs_reg attr = *reg;
982
983 unsigned int array_elements;
984 const glsl_type *type;
985
986 if (ir->type->is_array()) {
987 array_elements = ir->type->length;
988 if (array_elements == 0) {
989 fail("dereferenced array '%s' has length 0\n", ir->name);
990 }
991 type = ir->type->fields.array;
992 } else {
993 array_elements = 1;
994 type = ir->type;
995 }
996
997 glsl_interp_qualifier interpolation_mode =
998 ir->determine_interpolation_mode(c->key.flat_shade);
999
1000 int location = ir->location;
1001 for (unsigned int i = 0; i < array_elements; i++) {
1002 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1003 if (urb_setup[location] == -1) {
1004 /* If there's no incoming setup data for this slot, don't
1005 * emit interpolation for it.
1006 */
1007 attr.reg_offset += type->vector_elements;
1008 location++;
1009 continue;
1010 }
1011
1012 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1013 /* Constant interpolation (flat shading) case. The SF has
1014 * handed us defined values in only the constant offset
1015 * field of the setup reg.
1016 */
1017 for (unsigned int k = 0; k < type->vector_elements; k++) {
1018 struct brw_reg interp = interp_reg(location, k);
1019 interp = suboffset(interp, 3);
1020 interp.type = reg->type;
1021 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1022 attr.reg_offset++;
1023 }
1024 } else {
1025 /* Smooth/noperspective interpolation case. */
1026 for (unsigned int k = 0; k < type->vector_elements; k++) {
1027 /* FINISHME: At some point we probably want to push
1028 * this farther by giving similar treatment to the
1029 * other potentially constant components of the
1030 * attribute, as well as making brw_vs_constval.c
1031 * handle varyings other than gl_TexCoord.
1032 */
1033 if (location >= VARYING_SLOT_TEX0 &&
1034 location <= VARYING_SLOT_TEX7 &&
1035 k == 3 && !(c->key.proj_attrib_mask
1036 & BITFIELD64_BIT(location))) {
1037 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1038 } else {
1039 struct brw_reg interp = interp_reg(location, k);
1040 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1041 ir->centroid);
1042 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1043 /* Get the pixel/sample mask into f0 so that we know
1044 * which pixels are lit. Then, for each channel that is
1045 * unlit, replace the centroid data with non-centroid
1046 * data.
1047 */
1048 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1049 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1050 interpolation_mode, false);
1051 inst->predicate = BRW_PREDICATE_NORMAL;
1052 inst->predicate_inverse = true;
1053 }
1054 if (intel->gen < 6) {
1055 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1056 }
1057 }
1058 attr.reg_offset++;
1059 }
1060
1061 }
1062 location++;
1063 }
1064 }
1065
1066 return reg;
1067 }
1068
1069 fs_reg *
1070 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1071 {
1072 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1073
1074 /* The frontfacing comes in as a bit in the thread payload. */
1075 if (intel->gen >= 6) {
1076 emit(BRW_OPCODE_ASR, *reg,
1077 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1078 fs_reg(15));
1079 emit(BRW_OPCODE_NOT, *reg, *reg);
1080 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1081 } else {
1082 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1083 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1084 * us front face
1085 */
1086 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1087 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1088 }
1089
1090 return reg;
1091 }
1092
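/* Gen6+ path sketch: bit 15 of that payload dword is the back-facing
 * flag, so ASR by 15 brings it down to bit 0, NOT inverts it to mean
 * front-facing, and the final AND with 1 masks off the shifted-in high
 * bits, leaving a clean 0/1 boolean.
 */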
1093 fs_reg
1094 fs_visitor::fix_math_operand(fs_reg src)
1095 {
1096 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1097 * might be able to do better by doing execsize = 1 math and then
1098 * expanding that result out, but we would need to be careful with
1099 * masking.
1100 *
1101 * The hardware ignores source modifiers (negate and abs) on math
1102 * instructions, so we also move to a temp to set those up.
1103 */
1104 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1105 !src.abs && !src.negate)
1106 return src;
1107
1108 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1109 * operands to math
1110 */
1111 if (intel->gen >= 7 && src.file != IMM)
1112 return src;
1113
1114 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1115 expanded.type = src.type;
1116 emit(BRW_OPCODE_MOV, expanded, src);
1117 return expanded;
1118 }
1119
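/* Usage sketch: emit_math() below filters operands through this helper
 * on gen6+, so a call like emit_math(SHADER_OPCODE_POW, dst, x,
 * fs_reg(2.0f)) transparently gets its immediate copied into a
 * temporary GRF first.
 */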
1120 fs_inst *
1121 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1122 {
1123 switch (opcode) {
1124 case SHADER_OPCODE_RCP:
1125 case SHADER_OPCODE_RSQ:
1126 case SHADER_OPCODE_SQRT:
1127 case SHADER_OPCODE_EXP2:
1128 case SHADER_OPCODE_LOG2:
1129 case SHADER_OPCODE_SIN:
1130 case SHADER_OPCODE_COS:
1131 break;
1132 default:
1133 assert(!"not reached: bad math opcode");
1134 return NULL;
1135 }
1136
1137 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1138 * might be able to do better by doing execsize = 1 math and then
1139 * expanding that result out, but we would need to be careful with
1140 * masking.
1141 *
1142 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1143 * instructions, so we also move to a temp to set those up.
1144 */
1145 if (intel->gen >= 6)
1146 src = fix_math_operand(src);
1147
1148 fs_inst *inst = emit(opcode, dst, src);
1149
1150 if (intel->gen < 6) {
1151 inst->base_mrf = 2;
1152 inst->mlen = dispatch_width / 8;
1153 }
1154
1155 return inst;
1156 }
1157
1158 fs_inst *
1159 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1160 {
1161 int base_mrf = 2;
1162 fs_inst *inst;
1163
1164 switch (opcode) {
1165 case SHADER_OPCODE_INT_QUOTIENT:
1166 case SHADER_OPCODE_INT_REMAINDER:
1167 if (intel->gen >= 7 && dispatch_width == 16)
1168 fail("16-wide INTDIV unsupported\n");
1169 break;
1170 case SHADER_OPCODE_POW:
1171 break;
1172 default:
1173 assert(!"not reached: unsupported binary math opcode.");
1174 return NULL;
1175 }
1176
1177 if (intel->gen >= 6) {
1178 src0 = fix_math_operand(src0);
1179 src1 = fix_math_operand(src1);
1180
1181 inst = emit(opcode, dst, src0, src1);
1182 } else {
1183 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1184 * "Message Payload":
1185 *
1186 * "Operand0[7]. For the INT DIV functions, this operand is the
1187 * denominator."
1188 * ...
1189 * "Operand1[7]. For the INT DIV functions, this operand is the
1190 * numerator."
1191 */
1192 bool is_int_div = opcode != SHADER_OPCODE_POW;
1193 fs_reg &op0 = is_int_div ? src1 : src0;
1194 fs_reg &op1 = is_int_div ? src0 : src1;
1195
1196 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1197 inst = emit(opcode, dst, op0, reg_null_f);
1198
1199 inst->base_mrf = base_mrf;
1200 inst->mlen = 2 * dispatch_width / 8;
1201 }
1202 return inst;
1203 }
1204
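/* So for SHADER_OPCODE_INT_QUOTIENT with src0 = numerator and src1 =
 * denominator, the pre-gen6 path MOVs the numerator into MRF
 * base_mrf + 1 and hands the denominator to the instruction as
 * Operand0, matching the swapped payload layout the PRM describes.
 */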
1205 void
1206 fs_visitor::assign_curb_setup()
1207 {
1208 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1209 if (dispatch_width == 8) {
1210 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1211 } else {
1212 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1213 }
1214
1215 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1216 foreach_list(node, &this->instructions) {
1217 fs_inst *inst = (fs_inst *)node;
1218
1219 for (unsigned int i = 0; i < 3; i++) {
1220 if (inst->src[i].file == UNIFORM) {
1221 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1222 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1223 constant_nr / 8,
1224 constant_nr % 8);
1225
1226 inst->src[i].file = FIXED_HW_REG;
1227 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1228 }
1229 }
1230 }
1231 }
1232
1233 void
1234 fs_visitor::calculate_urb_setup()
1235 {
1236 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1237 urb_setup[i] = -1;
1238 }
1239
1240 int urb_next = 0;
1241 /* Figure out where each of the incoming setup attributes lands. */
1242 if (intel->gen >= 6) {
1243 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1244 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1245 urb_setup[i] = urb_next++;
1246 }
1247 }
1248 } else {
1249 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1250 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1251 /* Point size is packed into the header, not as a general attribute */
1252 if (i == VARYING_SLOT_PSIZ)
1253 continue;
1254
1255 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1256 /* The back color slot is skipped when the front color is
1257 * also written to. In addition, some slots can be
1258 * written in the vertex shader and not read in the
1259 * fragment shader. So the register number must always be
1260 * incremented, mapped or not.
1261 */
1262 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1263 urb_setup[i] = urb_next;
1264 urb_next++;
1265 }
1266 }
1267
1268 /*
1269     * It's an FS-only attribute, and we did the interpolation for it
1270     * in the SF thread.  So, count it here, too.
1271 *
1272 * See compile_sf_prog() for more info.
1273 */
1274 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1275 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1276 }
1277
1278 /* Each attribute is 4 setup channels, each of which is half a reg. */
1279 c->prog_data.urb_read_length = urb_next * 2;
1280 }
1281
1282 void
1283 fs_visitor::assign_urb_setup()
1284 {
1285 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1286
1287 /* Offset all the urb_setup[] index by the actual position of the
1288 * setup regs, now that the location of the constants has been chosen.
1289 */
1290 foreach_list(node, &this->instructions) {
1291 fs_inst *inst = (fs_inst *)node;
1292
1293 if (inst->opcode == FS_OPCODE_LINTERP) {
1294 assert(inst->src[2].file == FIXED_HW_REG);
1295 inst->src[2].fixed_hw_reg.nr += urb_start;
1296 }
1297
1298 if (inst->opcode == FS_OPCODE_CINTERP) {
1299 assert(inst->src[0].file == FIXED_HW_REG);
1300 inst->src[0].fixed_hw_reg.nr += urb_start;
1301 }
1302 }
1303
1304 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1305 }
1306
1307 /**
1308 * Split large virtual GRFs into separate components if we can.
1309 *
1310 * This is mostly duplicated with what brw_fs_vector_splitting does,
1311 * but that's really conservative because it's afraid of doing
1312 * splitting that doesn't result in real progress after the rest of
1313 * the optimization phases, which would cause infinite looping in
1314 * optimization. We can do it once here, safely. This also has the
1315 * opportunity to split interpolated values, or maybe even uniforms,
1316 * which we don't have at the IR level.
1317 *
1318 * We want to split, because virtual GRFs are what we register
1319 * allocate and spill (due to contiguousness requirements for some
1320 * instructions), and they're what we naturally generate in the
1321 * codegen process, but most virtual GRFs don't actually need to be
1322 * contiguous sets of GRFs. If we split, we'll end up with reduced
1323 * live intervals and better dead code elimination and coalescing.
1324 */
1325 void
1326 fs_visitor::split_virtual_grfs()
1327 {
1328 int num_vars = this->virtual_grf_count;
1329 bool split_grf[num_vars];
1330 int new_virtual_grf[num_vars];
1331
1332 /* Try to split anything > 0 sized. */
1333 for (int i = 0; i < num_vars; i++) {
1334 if (this->virtual_grf_sizes[i] != 1)
1335 split_grf[i] = true;
1336 else
1337 split_grf[i] = false;
1338 }
1339
1340 if (brw->has_pln &&
1341 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1342 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1343 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1344 * Gen6, that was the only supported interpolation mode, and since Gen6,
1345 * delta_x and delta_y are in fixed hardware registers.
1346 */
1347 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1348 false;
1349 }
1350
1351 foreach_list(node, &this->instructions) {
1352 fs_inst *inst = (fs_inst *)node;
1353
1354 /* If there's a SEND message that requires contiguous destination
1355 * registers, no splitting is allowed.
1356 */
1357 if (inst->regs_written() > 1) {
1358 split_grf[inst->dst.reg] = false;
1359 }
1360
1361 /* If we're sending from a GRF, don't split it, on the assumption that
1362 * the send is reading the whole thing.
1363 */
1364 if (inst->is_send_from_grf()) {
1365 split_grf[inst->src[0].reg] = false;
1366 }
1367 }
1368
1369 /* Allocate new space for split regs. Note that the virtual
1370 * numbers will be contiguous.
1371 */
1372 for (int i = 0; i < num_vars; i++) {
1373 if (split_grf[i]) {
1374 new_virtual_grf[i] = virtual_grf_alloc(1);
1375 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1376 int reg = virtual_grf_alloc(1);
1377 assert(reg == new_virtual_grf[i] + j - 1);
1378 (void) reg;
1379 }
1380 this->virtual_grf_sizes[i] = 1;
1381 }
1382 }
1383
1384 foreach_list(node, &this->instructions) {
1385 fs_inst *inst = (fs_inst *)node;
1386
1387 if (inst->dst.file == GRF &&
1388 split_grf[inst->dst.reg] &&
1389 inst->dst.reg_offset != 0) {
1390 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1391 inst->dst.reg_offset - 1);
1392 inst->dst.reg_offset = 0;
1393 }
1394 for (int i = 0; i < 3; i++) {
1395 if (inst->src[i].file == GRF &&
1396 split_grf[inst->src[i].reg] &&
1397 inst->src[i].reg_offset != 0) {
1398 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1399 inst->src[i].reg_offset - 1);
1400 inst->src[i].reg_offset = 0;
1401 }
1402 }
1403 }
1404 this->live_intervals_valid = false;
1405 }
1406
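/* Net effect sketch: a size-3 virtual GRF v addressed as v+0, v+1, v+2
 * through reg_offset is rewritten so that v itself shrinks to size 1
 * (keeping the offset-0 uses) while offsets 1 and 2 move to two freshly
 * allocated size-1 registers, giving each component its own live
 * interval.
 */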
1407 /**
1408 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1409 *
1410 * During code generation, we create tons of temporary variables, many of
1411 * which get immediately killed and are never used again. Yet, in later
1412 * optimization and analysis passes, such as compute_live_intervals, we need
1413 * to loop over all the virtual GRFs. Compacting them can save a lot of
1414 * overhead.
1415 */
1416 void
1417 fs_visitor::compact_virtual_grfs()
1418 {
1419 /* Mark which virtual GRFs are used, and count how many. */
1420 int remap_table[this->virtual_grf_count];
1421 memset(remap_table, -1, sizeof(remap_table));
1422
1423 foreach_list(node, &this->instructions) {
1424 const fs_inst *inst = (const fs_inst *) node;
1425
1426 if (inst->dst.file == GRF)
1427 remap_table[inst->dst.reg] = 0;
1428
1429 for (int i = 0; i < 3; i++) {
1430 if (inst->src[i].file == GRF)
1431 remap_table[inst->src[i].reg] = 0;
1432 }
1433 }
1434
1435 /* In addition to registers used in instructions, fs_visitor keeps
1436 * direct references to certain special values which must be patched:
1437 */
1438 fs_reg *special[] = {
1439 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1440 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1441 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1442 &delta_x[0], &delta_x[1], &delta_x[2],
1443 &delta_x[3], &delta_x[4], &delta_x[5],
1444 &delta_y[0], &delta_y[1], &delta_y[2],
1445 &delta_y[3], &delta_y[4], &delta_y[5],
1446 };
1447 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1448 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1449
1450 /* Treat all special values as used, to be conservative */
1451 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1452 if (special[i]->file == GRF)
1453 remap_table[special[i]->reg] = 0;
1454 }
1455
1456 /* Compact the GRF arrays. */
1457 int new_index = 0;
1458 for (int i = 0; i < this->virtual_grf_count; i++) {
1459 if (remap_table[i] != -1) {
1460 remap_table[i] = new_index;
1461 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1462 if (live_intervals_valid) {
1463 virtual_grf_use[new_index] = virtual_grf_use[i];
1464 virtual_grf_def[new_index] = virtual_grf_def[i];
1465 }
1466 ++new_index;
1467 }
1468 }
1469
1470 this->virtual_grf_count = new_index;
1471
1472 /* Patch all the instructions to use the newly renumbered registers */
1473 foreach_list(node, &this->instructions) {
1474 fs_inst *inst = (fs_inst *) node;
1475
1476 if (inst->dst.file == GRF)
1477 inst->dst.reg = remap_table[inst->dst.reg];
1478
1479 for (int i = 0; i < 3; i++) {
1480 if (inst->src[i].file == GRF)
1481 inst->src[i].reg = remap_table[inst->src[i].reg];
1482 }
1483 }
1484
1485 /* Patch all the references to special values */
1486 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1487 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1488 special[i]->reg = remap_table[special[i]->reg];
1489 }
1490 }
1491
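/* Remap sketch: with four virtual GRFs of which only 0 and 2 are
 * referenced, remap_table becomes {0, -1, 1, -1}, virtual_grf_count
 * drops to 2, and every surviving dst/src (and special value) is
 * renumbered through the table.
 */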
1492 bool
1493 fs_visitor::remove_dead_constants()
1494 {
1495 if (dispatch_width == 8) {
1496 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1497
1498 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1499 this->params_remap[i] = -1;
1500
1501 /* Find which params are still in use. */
1502 foreach_list(node, &this->instructions) {
1503 fs_inst *inst = (fs_inst *)node;
1504
1505 for (int i = 0; i < 3; i++) {
1506 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1507
1508 if (inst->src[i].file != UNIFORM)
1509 continue;
1510
1511 assert(constant_nr < (int)c->prog_data.nr_params);
1512
1513 /* For now, set this to non-negative. We'll give it the
1514 * actual new number in a moment, in order to keep the
1515 * register numbers nicely ordered.
1516 */
1517 this->params_remap[constant_nr] = 0;
1518 }
1519 }
1520
1521 /* Figure out what the new numbers for the params will be. At some
1522 * point when we're doing uniform array access, we're going to want
1523 * to keep the distinction between .reg and .reg_offset, but for
1524 * now we don't care.
1525 */
1526 unsigned int new_nr_params = 0;
1527 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1528 if (this->params_remap[i] != -1) {
1529 this->params_remap[i] = new_nr_params++;
1530 }
1531 }
1532
1533 /* Update the list of params to be uploaded to match our new numbering. */
1534 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1535 int remapped = this->params_remap[i];
1536
1537 if (remapped == -1)
1538 continue;
1539
1540 c->prog_data.param[remapped] = c->prog_data.param[i];
1541 }
1542
1543 c->prog_data.nr_params = new_nr_params;
1544 } else {
1545 /* This should have been generated in the 8-wide pass already. */
1546 assert(this->params_remap);
1547 }
1548
1549 /* Now do the renumbering of the shader to remove unused params. */
1550 foreach_list(node, &this->instructions) {
1551 fs_inst *inst = (fs_inst *)node;
1552
1553 for (int i = 0; i < 3; i++) {
1554 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1555
1556 if (inst->src[i].file != UNIFORM)
1557 continue;
1558
1559 assert(this->params_remap[constant_nr] != -1);
1560 inst->src[i].reg = this->params_remap[constant_nr];
1561 inst->src[i].reg_offset = 0;
1562 }
1563 }
1564
1565 return true;
1566 }
1567
1568 /*
1569 * Implements array access of uniforms by inserting a
1570 * PULL_CONSTANT_LOAD instruction.
1571 *
1572 * Unlike temporary GRF array access (where we don't support it due to
1573 * the difficulty of doing relative addressing on instruction
1574 * destinations), we could potentially do array access of uniforms
1575 * that were loaded in GRF space as push constants. In real-world
1576 * usage we've seen, though, the arrays being used are always larger
1577 * than we could load as push constants, so just always move all
1578 * uniform array access out to a pull constant buffer.
1579 */
1580 void
1581 fs_visitor::move_uniform_array_access_to_pull_constants()
1582 {
1583 int pull_constant_loc[c->prog_data.nr_params];
1584
1585 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1586 pull_constant_loc[i] = -1;
1587 }
1588
1589 /* Walk through and find array access of uniforms. Put a copy of that
1590 * uniform in the pull constant buffer.
1591 *
1592 * Note that we don't move constant-indexed accesses to arrays. No
1593 * testing has been done of the performance impact of this choice.
1594 */
1595 foreach_list_safe(node, &this->instructions) {
1596 fs_inst *inst = (fs_inst *)node;
1597
1598       for (int i = 0; i < 3; i++) {
1599 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1600 continue;
1601
1602 int uniform = inst->src[i].reg;
1603
1604 /* If this array isn't already present in the pull constant buffer,
1605 * add it.
1606 */
1607 if (pull_constant_loc[uniform] == -1) {
1608 const float **values = &c->prog_data.param[uniform];
1609
1610 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1611
1612 assert(param_size[uniform]);
1613
1614 for (int j = 0; j < param_size[uniform]; j++) {
1615 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1616 values[j];
1617 }
1618 }
1619
1620 /* Set up the annotation tracking for new generated instructions. */
1621 base_ir = inst->ir;
1622 current_annotation = inst->annotation;
1623
1624 fs_reg offset = fs_reg(this, glsl_type::int_type);
1625 inst->insert_before(ADD(offset, *inst->src[i].reladdr,
1626 fs_reg(pull_constant_loc[uniform] +
1627 inst->src[i].reg_offset)));
1628
1629 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1630 fs_reg temp = fs_reg(this, glsl_type::float_type);
1631 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1632 surf_index, offset);
1633 inst->insert_before(&list);
1634
1635 inst->src[i].file = temp.file;
1636 inst->src[i].reg = temp.reg;
1637 inst->src[i].reg_offset = temp.reg_offset;
1638 inst->src[i].reladdr = NULL;
1639 }
1640 }
1641 }
1642
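/* Example trigger (GLSL sketch): given "uniform float a[64];" and a
 * dynamically indexed read a[i], the source's reladdr is set, so the
 * pass above copies all 64 values into pull_param[] and replaces the
 * source with the result of a VARYING_PULL_CONSTANT_LOAD whose offset
 * is base + i.
 */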
1643 /**
1644 * Choose accesses from the UNIFORM file to demote to using the pull
1645 * constant buffer.
1646 *
1647 * We allow a fragment shader to have more than the specified minimum
1648 * maximum number of fragment shader uniform components (64). If
1649 * there are too many of these, they'd fill up all of register space.
1650 * So, this will push some of them out to the pull constant buffer and
1651 * update the program to load them.
1652 */
1653 void
1654 fs_visitor::setup_pull_constants()
1655 {
1656 /* Only allow 16 registers (128 uniform components) as push constants. */
1657 unsigned int max_uniform_components = 16 * 8;
1658 if (c->prog_data.nr_params <= max_uniform_components)
1659 return;
1660
1661 if (dispatch_width == 16) {
1662 fail("Pull constants not supported in 16-wide\n");
1663 return;
1664 }
1665
1666 /* Just demote the end of the list. We could probably do better
1667 * here, demoting things that are rarely used in the program first.
1668 */
1669 unsigned int pull_uniform_base = max_uniform_components;
1670
1671 int pull_constant_loc[c->prog_data.nr_params];
1672 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1673 if (i < pull_uniform_base) {
1674 pull_constant_loc[i] = -1;
1675 } else {
1676 pull_constant_loc[i] = -1;
1677 /* If our constant is already being uploaded for reladdr purposes,
1678 * reuse it.
1679 */
1680 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1681 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1682 pull_constant_loc[i] = j;
1683 break;
1684 }
1685 }
1686 if (pull_constant_loc[i] == -1) {
1687 int pull_index = c->prog_data.nr_pull_params++;
1688 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1689             pull_constant_loc[i] = pull_index;
1690 }
1691 }
1692 }
1693 c->prog_data.nr_params = pull_uniform_base;
1694
1695 foreach_list(node, &this->instructions) {
1696 fs_inst *inst = (fs_inst *)node;
1697
1698 for (int i = 0; i < 3; i++) {
1699 if (inst->src[i].file != UNIFORM)
1700 continue;
1701
1702 int pull_index = pull_constant_loc[inst->src[i].reg +
1703 inst->src[i].reg_offset];
1704 if (pull_index == -1)
1705 continue;
1706
1707 assert(!inst->src[i].reladdr);
1708
1709 fs_reg dst = fs_reg(this, glsl_type::float_type);
1710 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1711 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1712 fs_inst *pull =
1713 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1714 dst, index, offset);
1715 pull->ir = inst->ir;
1716 pull->annotation = inst->annotation;
1717
1718 inst->insert_before(pull);
1719
1720 inst->src[i].file = GRF;
1721 inst->src[i].reg = dst.reg;
1722 inst->src[i].reg_offset = 0;
1723 inst->src[i].smear = pull_index & 3;
1724 }
1725 }
1726 }
1727
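/* Budget sketch: 16 GRFs * 8 floats = 128 push-constant components.
 * Params from index 128 up are demoted above; each later read turns
 * into a UNIFORM_PULL_CONSTANT_LOAD of a 16-byte-aligned block, with
 * smear = pull_index & 3 selecting the wanted float within it.
 */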
1728 bool
1729 fs_visitor::opt_algebraic()
1730 {
1731 bool progress = false;
1732
1733 foreach_list(node, &this->instructions) {
1734 fs_inst *inst = (fs_inst *)node;
1735
1736 switch (inst->opcode) {
1737 case BRW_OPCODE_MUL:
1738 if (inst->src[1].file != IMM)
1739 continue;
1740
1741 /* a * 1.0 = a */
1742 if (inst->src[1].is_one()) {
1743 inst->opcode = BRW_OPCODE_MOV;
1744 inst->src[1] = reg_undef;
1745 progress = true;
1746 break;
1747 }
1748
1749 /* a * 0.0 = 0.0 */
1750 if (inst->src[1].is_zero()) {
1751 inst->opcode = BRW_OPCODE_MOV;
1752 inst->src[0] = inst->src[1];
1753 inst->src[1] = reg_undef;
1754 progress = true;
1755 break;
1756 }
1757
1758 break;
1759 case BRW_OPCODE_ADD:
1760 if (inst->src[1].file != IMM)
1761 continue;
1762
1763 /* a + 0.0 = a */
1764 if (inst->src[1].is_zero()) {
1765 inst->opcode = BRW_OPCODE_MOV;
1766 inst->src[1] = reg_undef;
1767 progress = true;
1768 break;
1769 }
1770 break;
1771 default:
1772 break;
1773 }
1774 }
1775
1776 return progress;
1777 }
1778
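/* e.g. "MUL dst, x, 1.0" becomes "MOV dst, x" and "MUL dst, x, 0.0"
 * becomes "MOV dst, 0.0"; both rewrites open up further copy
 * propagation and dead code elimination.
 */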
1779 /**
1780  * Must be called after calculate_live_intervals() to remove unused
1781  * writes to registers -- register allocation will fail otherwise
1782  * because something def'd but not used won't be considered to
1783 * interfere with other regs.
1784 */
1785 bool
1786 fs_visitor::dead_code_eliminate()
1787 {
1788 bool progress = false;
1789 int pc = 0;
1790
1791 calculate_live_intervals();
1792
1793 foreach_list_safe(node, &this->instructions) {
1794 fs_inst *inst = (fs_inst *)node;
1795
1796 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1797 inst->remove();
1798 progress = true;
1799 }
1800
1801 pc++;
1802 }
1803
1804 if (progress)
1805 live_intervals_valid = false;
1806
1807 return progress;
1808 }
1809
1810 /**
1811 * Implements a second type of register coalescing: This one checks if
1812 * the two regs involved in a raw move don't interfere, in which case
1813  * they can both be stored in the same place and the MOV removed.
1814 */
1815 bool
1816 fs_visitor::register_coalesce_2()
1817 {
1818 bool progress = false;
1819
1820 calculate_live_intervals();
1821
1822 foreach_list_safe(node, &this->instructions) {
1823 fs_inst *inst = (fs_inst *)node;
1824
1825 if (inst->opcode != BRW_OPCODE_MOV ||
1826 inst->predicate ||
1827 inst->saturate ||
1828 inst->src[0].file != GRF ||
1829 inst->src[0].negate ||
1830 inst->src[0].abs ||
1831 inst->src[0].smear != -1 ||
1832 inst->dst.file != GRF ||
1833 inst->dst.type != inst->src[0].type ||
1834 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1835 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1836 continue;
1837 }
1838
1839 int reg_from = inst->src[0].reg;
1840 assert(inst->src[0].reg_offset == 0);
1841 int reg_to = inst->dst.reg;
1842 int reg_to_offset = inst->dst.reg_offset;
1843
1844 foreach_list(node, &this->instructions) {
1845 fs_inst *scan_inst = (fs_inst *)node;
1846
1847 if (scan_inst->dst.file == GRF &&
1848 scan_inst->dst.reg == reg_from) {
1849 scan_inst->dst.reg = reg_to;
1850 scan_inst->dst.reg_offset = reg_to_offset;
1851 }
1852 for (int i = 0; i < 3; i++) {
1853 if (scan_inst->src[i].file == GRF &&
1854 scan_inst->src[i].reg == reg_from) {
1855 scan_inst->src[i].reg = reg_to;
1856 scan_inst->src[i].reg_offset = reg_to_offset;
1857 }
1858 }
1859 }
1860
1861 inst->remove();
1862
1863 /* We don't need to recalculate live intervals inside the loop despite
1864 * flagging live_intervals_valid because we only use live intervals for
1865 * the interferes test, and we must have had a situation where the
1866 * intervals were:
1867 *
1868 * from to
1869 * ^
1870 * |
1871 * v
1872 * ^
1873 * |
1874 * v
1875 *
1876 * Some register R that might get coalesced with one of these two could
1877 * only be referencing "to", otherwise "from"'s range would have been
1878 * longer. R's range could also only start at the end of "to" or later,
1879 * otherwise it will conflict with "to" when we try to coalesce "to"
1880 * into Rw anyway.
1881  * into R anyway.
1882 live_intervals_valid = false;
1883
1884 progress = true;
1885 continue;
1886 }
1887
1888 return progress;
1889 }
1890
1891 bool
1892 fs_visitor::register_coalesce()
1893 {
1894 bool progress = false;
1895 int if_depth = 0;
1896 int loop_depth = 0;
1897
1898 foreach_list_safe(node, &this->instructions) {
1899 fs_inst *inst = (fs_inst *)node;
1900
1901 /* Make sure that we dominate the instructions we're going to
1902 * scan for interfering with our coalescing, or we won't have
1903 * scanned enough to see if anything interferes with our
1904 * coalescing. We don't dominate the following instructions if
1905 * we're in a loop or an if block.
1906 */
1907 switch (inst->opcode) {
1908 case BRW_OPCODE_DO:
1909 loop_depth++;
1910 break;
1911 case BRW_OPCODE_WHILE:
1912 loop_depth--;
1913 break;
1914 case BRW_OPCODE_IF:
1915 if_depth++;
1916 break;
1917 case BRW_OPCODE_ENDIF:
1918 if_depth--;
1919 break;
1920 default:
1921 break;
1922 }
1923 if (loop_depth || if_depth)
1924 continue;
1925
1926 if (inst->opcode != BRW_OPCODE_MOV ||
1927 inst->predicate ||
1928 inst->saturate ||
1929 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1930                                inst->src[0].file != UNIFORM) ||
1931 inst->dst.type != inst->src[0].type)
1932 continue;
1933
1934 bool has_source_modifiers = (inst->src[0].abs ||
1935 inst->src[0].negate ||
1936 inst->src[0].smear != -1 ||
1937 inst->src[0].file == UNIFORM);
1938
1939 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1940 * them: check for no writes to either one until the exit of the
1941 * program.
1942 */
1943 bool interfered = false;
1944
1945 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1946 !scan_inst->is_tail_sentinel();
1947 scan_inst = (fs_inst *)scan_inst->next) {
1948 if (scan_inst->dst.file == GRF) {
1949 if (scan_inst->overwrites_reg(inst->dst) ||
1950 scan_inst->overwrites_reg(inst->src[0])) {
1951 interfered = true;
1952 break;
1953 }
1954 }
1955
1956 /* The gen6 MATH instruction can't handle source modifiers or
1957 * unusual register regions, so avoid coalescing those for
1958 * now. We should do something more specific.
1959 */
1960 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1961 interfered = true;
1962 break;
1963 }
1964
1965 /* The accumulator result appears to get used for the
1966 * conditional modifier generation. When negating a UD
1967 * value, there is a 33rd bit generated for the sign in the
1968 * accumulator value, so now you can't check, for example,
1969 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1970 */
1971 if (scan_inst->conditional_mod &&
1972 inst->src[0].negate &&
1973 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1974 interfered = true;
1975 break;
1976 }
1977 }
1978 if (interfered) {
1979 continue;
1980 }
1981
1982 /* Rewrite the later usage to point at the source of the move to
1983 * be removed.
1984 */
1985 for (fs_inst *scan_inst = inst;
1986 !scan_inst->is_tail_sentinel();
1987 scan_inst = (fs_inst *)scan_inst->next) {
1988 for (int i = 0; i < 3; i++) {
1989 if (scan_inst->src[i].file == GRF &&
1990 scan_inst->src[i].reg == inst->dst.reg &&
1991 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1992 fs_reg new_src = inst->src[0];
1993 if (scan_inst->src[i].abs) {
1994 new_src.negate = 0;
1995 new_src.abs = 1;
1996 }
1997 new_src.negate ^= scan_inst->src[i].negate;
1998 scan_inst->src[i] = new_src;
1999 }
2000 }
2001 }
2002
2003 inst->remove();
2004 progress = true;
2005 }
2006
2007 if (progress)
2008 live_intervals_valid = false;
2009
2010 return progress;
2011 }
2012
2013
2014 bool
2015 fs_visitor::compute_to_mrf()
2016 {
2017 bool progress = false;
2018 int next_ip = 0;
2019
2020 calculate_live_intervals();
2021
2022 foreach_list_safe(node, &this->instructions) {
2023 fs_inst *inst = (fs_inst *)node;
2024
2025 int ip = next_ip;
2026 next_ip++;
2027
2028 if (inst->opcode != BRW_OPCODE_MOV ||
2029 inst->predicate ||
2030 inst->dst.file != MRF || inst->src[0].file != GRF ||
2031 inst->dst.type != inst->src[0].type ||
2032 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2033 continue;
2034
2035 /* Work out which hardware MRF registers are written by this
2036 * instruction.
2037 */
2038 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2039 int mrf_high;
2040 if (inst->dst.reg & BRW_MRF_COMPR4) {
2041 mrf_high = mrf_low + 4;
2042 } else if (dispatch_width == 16 &&
2043 (!inst->force_uncompressed && !inst->force_sechalf)) {
2044 mrf_high = mrf_low + 1;
2045 } else {
2046 mrf_high = mrf_low;
2047 }
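      /* Note: a COMPR4 write touches mrf_low and mrf_low + 4 (the two
       * compressed halves land four registers apart), so tracking just
       * this low/high pair is sufficient.
       */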
2048
2049 /* Can't compute-to-MRF this GRF if someone else was going to
2050 * read it later.
2051 */
2052 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2053 continue;
2054
2055 /* Found a move of a GRF to a MRF. Let's see if we can go
2056 * rewrite the thing that made this GRF to write into the MRF.
2057 */
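      /* For illustration, a hypothetical sequence this converts:
       *
       *    add vgrf4, vgrf1, vgrf2
       *    mov m3, vgrf4
       *
       * becomes, when vgrf4 has no later readers:
       *
       *    add m3, vgrf1, vgrf2
       */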
2058 fs_inst *scan_inst;
2059 for (scan_inst = (fs_inst *)inst->prev;
2060 scan_inst->prev != NULL;
2061 scan_inst = (fs_inst *)scan_inst->prev) {
2062 if (scan_inst->dst.file == GRF &&
2063 scan_inst->dst.reg == inst->src[0].reg) {
2064 /* Found the last thing to write our reg we want to turn
2065 * into a compute-to-MRF.
2066 */
2067
2068 /* If it's predicated, it (probably) didn't populate all
2069 * the channels. We might be able to rewrite everything
2070 * that writes that reg, but it would require smarter
2071 * tracking to delay the rewriting until complete success.
2072 */
2073 if (scan_inst->predicate)
2074 break;
2075
2076 	    /* If it writes only half of the register, and it's not the
2077 	     * same half as the MOV we're trying to remove, bail for now.
2078 	     */
2079 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2080 scan_inst->force_sechalf != inst->force_sechalf) {
2081 break;
2082 }
2083
2084 /* SEND instructions can't have MRF as a destination. */
2085 if (scan_inst->mlen)
2086 break;
2087
2088 if (intel->gen == 6) {
2089 /* gen6 math instructions must have the destination be
2090 * GRF, so no compute-to-MRF for them.
2091 */
2092 if (scan_inst->is_math()) {
2093 break;
2094 }
2095 }
2096
2097 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2098 /* Found the creator of our MRF's source value. */
2099 scan_inst->dst.file = MRF;
2100 scan_inst->dst.reg = inst->dst.reg;
2101 scan_inst->saturate |= inst->saturate;
2102 inst->remove();
2103 progress = true;
2104 }
2105 break;
2106 }
2107
2108 /* We don't handle control flow here. Most computation of
2109        * values that end up in MRFs happens shortly before the MRF
2110 * write anyway.
2111 */
2112 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2113 break;
2114
2115 /* You can't read from an MRF, so if someone else reads our
2116 * MRF's source GRF that we wanted to rewrite, that stops us.
2117 */
2118 bool interfered = false;
2119 for (int i = 0; i < 3; i++) {
2120 if (scan_inst->src[i].file == GRF &&
2121 scan_inst->src[i].reg == inst->src[0].reg &&
2122 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2123 interfered = true;
2124 }
2125 }
2126 if (interfered)
2127 break;
2128
2129 if (scan_inst->dst.file == MRF) {
2130 /* If somebody else writes our MRF here, we can't
2131 * compute-to-MRF before that.
2132 */
2133 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2134 int scan_mrf_high;
2135
2136 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2137 scan_mrf_high = scan_mrf_low + 4;
2138 } else if (dispatch_width == 16 &&
2139 (!scan_inst->force_uncompressed &&
2140 !scan_inst->force_sechalf)) {
2141 scan_mrf_high = scan_mrf_low + 1;
2142 } else {
2143 scan_mrf_high = scan_mrf_low;
2144 }
2145
2146 if (mrf_low == scan_mrf_low ||
2147 mrf_low == scan_mrf_high ||
2148 mrf_high == scan_mrf_low ||
2149 mrf_high == scan_mrf_high) {
2150 break;
2151 }
2152 }
2153
2154 if (scan_inst->mlen > 0) {
2155 /* Found a SEND instruction, which means that there are
2156 * live values in MRFs from base_mrf to base_mrf +
2157 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2158 * above it.
2159 */
2160 if (mrf_low >= scan_inst->base_mrf &&
2161 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2162 break;
2163 }
2164 if (mrf_high >= scan_inst->base_mrf &&
2165 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2166 break;
2167 }
2168 }
2169 }
2170 }
2171
2172 if (progress)
2173 live_intervals_valid = false;
2174
2175 return progress;
2176 }
2177
2178 /**
2179 * Walks through basic blocks, looking for repeated MRF writes and
2180 * removing the later ones.
2181 */
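/* For illustration, a hypothetical pair of writes within one basic block:
 *
 *    mov m2, vgrf5
 *    ...                <- no control flow, no write to m2 or vgrf5
 *    mov m2, vgrf5
 *
 * The second MOV is redundant and is removed.
 */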
2182 bool
2183 fs_visitor::remove_duplicate_mrf_writes()
2184 {
2185 fs_inst *last_mrf_move[16];
2186 bool progress = false;
2187
2188 /* Need to update the MRF tracking for compressed instructions. */
2189 if (dispatch_width == 16)
2190 return false;
2191
2192 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2193
2194 foreach_list_safe(node, &this->instructions) {
2195 fs_inst *inst = (fs_inst *)node;
2196
2197 if (inst->is_control_flow()) {
2198 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2199 }
2200
2201 if (inst->opcode == BRW_OPCODE_MOV &&
2202 inst->dst.file == MRF) {
2203 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2204 if (prev_inst && inst->equals(prev_inst)) {
2205 inst->remove();
2206 progress = true;
2207 continue;
2208 }
2209 }
2210
2211 /* Clear out the last-write records for MRFs that were overwritten. */
2212 if (inst->dst.file == MRF) {
2213 last_mrf_move[inst->dst.reg] = NULL;
2214 }
2215
2216 if (inst->mlen > 0) {
2217 /* Found a SEND instruction, which will include two or fewer
2218 * implied MRF writes. We could do better here.
2219 */
2220 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2221 last_mrf_move[inst->base_mrf + i] = NULL;
2222 }
2223 }
2224
2225 /* Clear out any MRF move records whose sources got overwritten. */
2226 if (inst->dst.file == GRF) {
2227 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2228 if (last_mrf_move[i] &&
2229 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2230 last_mrf_move[i] = NULL;
2231 }
2232 }
2233 }
2234
2235 if (inst->opcode == BRW_OPCODE_MOV &&
2236 inst->dst.file == MRF &&
2237 inst->src[0].file == GRF &&
2238 !inst->predicate) {
2239 last_mrf_move[inst->dst.reg] = inst;
2240 }
2241 }
2242
2243 if (progress)
2244 live_intervals_valid = false;
2245
2246 return progress;
2247 }
2248
2249 static void
2250 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2251 int first_grf, int grf_len)
2252 {
2253 bool inst_16wide = (dispatch_width > 8 &&
2254 !inst->force_uncompressed &&
2255 !inst->force_sechalf);
2256
2257 /* Clear the flag for registers that actually got read (as expected). */
2258 for (int i = 0; i < 3; i++) {
2259 int grf;
2260 if (inst->src[i].file == GRF) {
2261 grf = inst->src[i].reg;
2262 } else if (inst->src[i].file == FIXED_HW_REG &&
2263 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2264 grf = inst->src[i].fixed_hw_reg.nr;
2265 } else {
2266 continue;
2267 }
2268
2269 if (grf >= first_grf &&
2270 grf < first_grf + grf_len) {
2271 deps[grf - first_grf] = false;
2272 if (inst_16wide)
2273 deps[grf - first_grf + 1] = false;
2274 }
2275 }
2276 }
2277
2278 /**
2279 * Implements this workaround for the original 965:
2280 *
2281 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2282 * check for post destination dependencies on this instruction, software
2283 * must ensure that there is no destination hazard for the case of ‘write
2284 * followed by a posted write’ shown in the following example.
2285 *
2286 * 1. mov r3 0
2287 * 2. send r3.xy <rest of send instruction>
2288 * 3. mov r2 r3
2289 *
2290 * Due to no post-destination dependency check on the ‘send’, the above
2291 * code sequence could have two instructions (1 and 2) in flight at the
2292  *      same time that both consider ‘r3’ as the target of their final writes."
2293 */
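/* The workaround below inserts MOVs that source each still-pending
 * register, forcing the hardware to resolve the outstanding write before
 * our instruction issues.  For the erratum's example, the inserted
 * resolve would sit between instructions 1 and 2 (a sketch, not the
 * actual generated code; see DEP_RESOLVE_MOV):
 *
 *    1.  mov r3 0
 *    1a. mov r3 r3      <- drains the pending write to r3
 *    2.  send r3.xy <rest of send instruction>
 */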
2294 void
2295 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2296 {
2297 int reg_size = dispatch_width / 8;
2298 int write_len = inst->regs_written() * reg_size;
2299 int first_write_grf = inst->dst.reg;
2300 bool needs_dep[BRW_MAX_MRF];
2301 assert(write_len < (int)sizeof(needs_dep) - 1);
2302
2303 memset(needs_dep, false, sizeof(needs_dep));
2304 memset(needs_dep, true, write_len);
2305
2306 clear_deps_for_inst_src(inst, dispatch_width,
2307 needs_dep, first_write_grf, write_len);
2308
2309 /* Walk backwards looking for writes to registers we're writing which
2310 * aren't read since being written. If we hit the start of the program,
2311 * we assume that there are no outstanding dependencies on entry to the
2312 * program.
2313 */
2314 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2315 scan_inst != NULL;
2316 scan_inst = (fs_inst *)scan_inst->prev) {
2317
2318 /* If we hit control flow, assume that there *are* outstanding
2319 * dependencies, and force their cleanup before our instruction.
2320 */
2321 if (scan_inst->is_control_flow()) {
2322 for (int i = 0; i < write_len; i++) {
2323 if (needs_dep[i]) {
2324 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2325 }
2326 }
2327 }
2328
2329 bool scan_inst_16wide = (dispatch_width > 8 &&
2330 !scan_inst->force_uncompressed &&
2331 !scan_inst->force_sechalf);
2332
2333 /* We insert our reads as late as possible on the assumption that any
2334 * instruction but a MOV that might have left us an outstanding
2335 * dependency has more latency than a MOV.
2336 */
2337 if (scan_inst->dst.file == GRF) {
2338 for (int i = 0; i < scan_inst->regs_written(); i++) {
2339 int reg = scan_inst->dst.reg + i * reg_size;
2340
2341 if (reg >= first_write_grf &&
2342 reg < first_write_grf + write_len &&
2343 needs_dep[reg - first_write_grf]) {
2344 inst->insert_before(DEP_RESOLVE_MOV(reg));
2345 needs_dep[reg - first_write_grf] = false;
2346 if (scan_inst_16wide)
2347 needs_dep[reg - first_write_grf + 1] = false;
2348 }
2349 }
2350 }
2351
2352 /* Clear the flag for registers that actually got read (as expected). */
2353 clear_deps_for_inst_src(scan_inst, dispatch_width,
2354 needs_dep, first_write_grf, write_len);
2355
2356 /* Continue the loop only if we haven't resolved all the dependencies */
2357 int i;
2358 for (i = 0; i < write_len; i++) {
2359 if (needs_dep[i])
2360 break;
2361 }
2362 if (i == write_len)
2363 return;
2364 }
2365 }
2366
2367 /**
2368 * Implements this workaround for the original 965:
2369 *
2370 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2371 * used as a destination register until after it has been sourced by an
2372  *      instruction with a different destination register."
2373 */
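/* Analogously to the pre-send case, the fix is a dependency-resolving MOV:
 * before anything else writes the send's destination, insert a MOV sourcing
 * it so the posted write completes first (a sketch, not the actual
 * generated code):
 *
 *    1.  send r3.xy <rest of send instruction>
 *    1a. mov r3 r3      <- sources r3, satisfying the erratum
 *    2.  mov r3 r4
 */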
2374 void
2375 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2376 {
2377 int write_len = inst->regs_written() * dispatch_width / 8;
2378 int first_write_grf = inst->dst.reg;
2379 bool needs_dep[BRW_MAX_MRF];
2380 assert(write_len < (int)sizeof(needs_dep) - 1);
2381
2382 memset(needs_dep, false, sizeof(needs_dep));
2383 memset(needs_dep, true, write_len);
2384 /* Walk forwards looking for writes to registers we're writing which aren't
2385 * read before being written.
2386 */
2387 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2388 !scan_inst->is_tail_sentinel();
2389 scan_inst = (fs_inst *)scan_inst->next) {
2390 /* If we hit control flow, force resolve all remaining dependencies. */
2391 if (scan_inst->is_control_flow()) {
2392 for (int i = 0; i < write_len; i++) {
2393 if (needs_dep[i])
2394 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2395 }
2396 }
2397
2398 /* Clear the flag for registers that actually got read (as expected). */
2399 clear_deps_for_inst_src(scan_inst, dispatch_width,
2400 needs_dep, first_write_grf, write_len);
2401
2402 /* We insert our reads as late as possible since they're reading the
2403 * result of a SEND, which has massive latency.
2404 */
2405 if (scan_inst->dst.file == GRF &&
2406 scan_inst->dst.reg >= first_write_grf &&
2407 scan_inst->dst.reg < first_write_grf + write_len &&
2408 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2409 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2410 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2411 }
2412
2413 /* Continue the loop only if we haven't resolved all the dependencies */
2414 int i;
2415 for (i = 0; i < write_len; i++) {
2416 if (needs_dep[i])
2417 break;
2418 }
2419 if (i == write_len)
2420 return;
2421 }
2422
2423 /* If we hit the end of the program, resolve all remaining dependencies out
2424 * of paranoia.
2425 */
2426 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2427 assert(last_inst->eot);
2428 for (int i = 0; i < write_len; i++) {
2429 if (needs_dep[i])
2430 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2431 }
2432 }
2433
2434 void
2435 fs_visitor::insert_gen4_send_dependency_workarounds()
2436 {
2437 if (intel->gen != 4 || intel->is_g4x)
2438 return;
2439
2440 /* Note that we're done with register allocation, so GRF fs_regs always
2441 * have a .reg_offset of 0.
2442 */
2443
2444 foreach_list_safe(node, &this->instructions) {
2445 fs_inst *inst = (fs_inst *)node;
2446
2447 if (inst->mlen != 0 && inst->dst.file == GRF) {
2448 insert_gen4_pre_send_dependency_workarounds(inst);
2449 insert_gen4_post_send_dependency_workarounds(inst);
2450 }
2451 }
2452 }
2453
2454 /**
2455 * Turns the generic expression-style uniform pull constant load instruction
2456 * into a hardware-specific series of instructions for loading a pull
2457 * constant.
2458 *
2459 * The expression style allows the CSE pass before this to optimize out
2460 * repeated loads from the same offset, and gives the pre-register-allocation
2461 * scheduling full flexibility, while the conversion to native instructions
2462 * allows the post-register-allocation scheduler the best information
2463 * possible.
2464 *
2465 * Note that execution masking for setting up pull constant loads is special:
2466 * the channels that need to be written are unrelated to the current execution
2467 * mask, since a later instruction will use one of the result channels as a
2468 * source operand for all 8 or 16 of its channels.
2469 */
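/* For illustration, a hypothetical gen7 lowering (IR sketch):
 *
 *    uniform_pull_constant_load vgrf8, surf_index, 32u
 *
 * becomes
 *
 *    set_simd4x2_offset vgrf9, 2u       <- 32 bytes / 16
 *    uniform_pull_constant_load_gen7 vgrf8, surf_index, vgrf9
 */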
2470 void
2471 fs_visitor::lower_uniform_pull_constant_loads()
2472 {
2473 foreach_list(node, &this->instructions) {
2474 fs_inst *inst = (fs_inst *)node;
2475
2476 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2477 continue;
2478
2479 if (intel->gen >= 7) {
2480 fs_reg const_offset_reg = inst->src[1];
2481 assert(const_offset_reg.file == IMM &&
2482 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2483 const_offset_reg.imm.u /= 16;
2484 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2485
2486 /* This is actually going to be a MOV, but since only the first dword
2487 * is accessed, we have a special opcode to do just that one. Note
2488 * that this needs to be an operation that will be considered a def
2489 * by live variable analysis, or register allocation will explode.
2490 */
2491 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2492 payload, const_offset_reg);
2493 setup->force_writemask_all = true;
2494
2495 setup->ir = inst->ir;
2496 setup->annotation = inst->annotation;
2497 inst->insert_before(setup);
2498
2499 /* Similarly, this will only populate the first 4 channels of the
2500 * result register (since we only use smear values from 0-3), but we
2501 * don't tell the optimizer.
2502 */
2503 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2504 inst->src[1] = payload;
2505
2506 this->live_intervals_valid = false;
2507 } else {
2508 /* Before register allocation, we didn't tell the scheduler about the
2509 * MRF we use. We know it's safe to use this MRF because nothing
2510 * else does except for register spill/unspill, which generates and
2511 * uses its MRF within a single IR instruction.
2512 */
2513 inst->base_mrf = 14;
2514 inst->mlen = 1;
2515 }
2516 }
2517 }
2518
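/**
 * Prints a single IR instruction in human-readable form, e.g.
 * (hypothetical output):
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf2, u3, (null)
 */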
2519 void
2520 fs_visitor::dump_instruction(fs_inst *inst)
2521 {
2522 if (inst->predicate) {
2523 printf("(%cf0.%d) ",
2524 inst->predicate_inverse ? '-' : '+',
2525 inst->flag_subreg);
2526 }
2527
2528 printf("%s", brw_instruction_name(inst->opcode));
2529 if (inst->saturate)
2530 printf(".sat");
2531 if (inst->conditional_mod) {
2532 printf(".cmod");
2533 if (!inst->predicate &&
2534 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2535 inst->opcode != BRW_OPCODE_IF &&
2536 inst->opcode != BRW_OPCODE_WHILE))) {
2537          printf(".f0.%d", inst->flag_subreg);
2538 }
2539 }
2540 printf(" ");
2541
2542
2543 switch (inst->dst.file) {
2544 case GRF:
2545 printf("vgrf%d", inst->dst.reg);
2546 if (inst->dst.reg_offset)
2547 printf("+%d", inst->dst.reg_offset);
2548 break;
2549 case MRF:
2550 printf("m%d", inst->dst.reg);
2551 break;
2552 case BAD_FILE:
2553 printf("(null)");
2554 break;
2555 case UNIFORM:
2556 printf("***u%d***", inst->dst.reg);
2557 break;
2558 default:
2559 printf("???");
2560 break;
2561 }
2562 printf(", ");
2563
2564 for (int i = 0; i < 3; i++) {
2565 if (inst->src[i].negate)
2566 printf("-");
2567 if (inst->src[i].abs)
2568 printf("|");
2569 switch (inst->src[i].file) {
2570 case GRF:
2571 printf("vgrf%d", inst->src[i].reg);
2572 if (inst->src[i].reg_offset)
2573 printf("+%d", inst->src[i].reg_offset);
2574 break;
2575 case MRF:
2576 printf("***m%d***", inst->src[i].reg);
2577 break;
2578 case UNIFORM:
2579 printf("u%d", inst->src[i].reg);
2580 if (inst->src[i].reg_offset)
2581 printf(".%d", inst->src[i].reg_offset);
2582 break;
2583 case BAD_FILE:
2584 printf("(null)");
2585 break;
2586 case IMM:
2587 switch (inst->src[i].type) {
2588 case BRW_REGISTER_TYPE_F:
2589 printf("%ff", inst->src[i].imm.f);
2590 break;
2591 case BRW_REGISTER_TYPE_D:
2592 printf("%dd", inst->src[i].imm.i);
2593 break;
2594 case BRW_REGISTER_TYPE_UD:
2595 printf("%uu", inst->src[i].imm.u);
2596 break;
2597 default:
2598 printf("???");
2599 break;
2600 }
2601 break;
2602 default:
2603 printf("???");
2604 break;
2605 }
2606 if (inst->src[i].abs)
2607 printf("|");
2608
2609       if (i < 2)
2610 printf(", ");
2611 }
2612
2613 printf(" ");
2614
2615 if (inst->force_uncompressed)
2616 printf("1sthalf ");
2617
2618 if (inst->force_sechalf)
2619 printf("2ndhalf ");
2620
2621 printf("\n");
2622 }
2623
2624 void
2625 fs_visitor::dump_instructions()
2626 {
2627 int ip = 0;
2628 foreach_list(node, &this->instructions) {
2629 fs_inst *inst = (fs_inst *)node;
2630 printf("%d: ", ip++);
2631 dump_instruction(inst);
2632 }
2633 }
2634
2635 /**
2636 * Possibly returns an instruction that set up @param reg.
2637 *
2638 * Sometimes we want to take the result of some expression/variable
2639 * dereference tree and rewrite the instruction generating the result
2640 * of the tree. When processing the tree, we know that the
2641 * instructions generated are all writing temporaries that are dead
2642 * outside of this tree. So, if we have some instructions that write
2643 * a temporary, we're free to point that temp write somewhere else.
2644 *
2645  * Note that this doesn't guarantee that the instruction wrote only
2646  * reg -- it might be the size=4 destination of a texture instruction.
2647 */
2648 fs_inst *
2649 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2650 fs_inst *end,
2651 fs_reg reg)
2652 {
2653 if (end == start ||
2654 end->predicate ||
2655 end->force_uncompressed ||
2656 end->force_sechalf ||
2657 reg.reladdr ||
2658 !reg.equals(end->dst)) {
2659 return NULL;
2660 } else {
2661 return end;
2662 }
2663 }
2664
2665 void
2666 fs_visitor::setup_payload_gen6()
2667 {
2668 struct intel_context *intel = &brw->intel;
2669 bool uses_depth =
2670 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2671 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2672
2673 assert(intel->gen >= 6);
2674
2675 /* R0-1: masks, pixel X/Y coordinates. */
2676 c->nr_payload_regs = 2;
2677    /* R2: only for 32-pixel dispatch. */
2678
2679 /* R3-26: barycentric interpolation coordinates. These appear in the
2680 * same order that they appear in the brw_wm_barycentric_interp_mode
2681 * enum. Each set of coordinates occupies 2 registers if dispatch width
2682 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2683 * appear if they were enabled using the "Barycentric Interpolation
2684 * Mode" bits in WM_STATE.
2685 */
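   /* Worked example (hypothetical): a 16-wide shader with two barycentric
    * modes enabled consumes 2 modes * (2 + 2) = 8 payload registers in
    * this block, on top of the 2 reserved above.
    */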
2686 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2687 if (barycentric_interp_modes & (1 << i)) {
2688 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2689 c->nr_payload_regs += 2;
2690 if (dispatch_width == 16) {
2691 c->nr_payload_regs += 2;
2692 }
2693 }
2694 }
2695
2696 /* R27: interpolated depth if uses source depth */
2697 if (uses_depth) {
2698 c->source_depth_reg = c->nr_payload_regs;
2699 c->nr_payload_regs++;
2700 if (dispatch_width == 16) {
2701 /* R28: interpolated depth if not 8-wide. */
2702 c->nr_payload_regs++;
2703 }
2704 }
2705 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2706 if (uses_depth) {
2707 c->source_w_reg = c->nr_payload_regs;
2708 c->nr_payload_regs++;
2709 if (dispatch_width == 16) {
2710 /* R30: interpolated W if not 8-wide. */
2711 c->nr_payload_regs++;
2712 }
2713 }
2714 /* R31: MSAA position offsets. */
2715 /* R32-: bary for 32-pixel. */
2716 /* R58-59: interp W for 32-pixel. */
2717
2718 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2719 c->source_depth_to_render_target = true;
2720 }
2721 }
2722
2723 bool
2724 fs_visitor::run()
2725 {
2726 sanity_param_count = fp->Base.Parameters->NumParameters;
2727 uint32_t orig_nr_params = c->prog_data.nr_params;
2728
2729 if (intel->gen >= 6)
2730 setup_payload_gen6();
2731 else
2732 setup_payload_gen4();
2733
2734 if (0) {
2735 emit_dummy_fs();
2736 } else {
2737 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2738 emit_shader_time_begin();
2739
2740 calculate_urb_setup();
2741 if (intel->gen < 6)
2742 emit_interpolation_setup_gen4();
2743 else
2744 emit_interpolation_setup_gen6();
2745
2746 /* We handle discards by keeping track of the still-live pixels in f0.1.
2747 * Initialize it with the dispatched pixels.
2748 */
2749 if (fp->UsesKill) {
2750 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2751 discard_init->flag_subreg = 1;
2752 }
2753
2754 /* Generate FS IR for main(). (the visitor only descends into
2755 * functions called "main").
2756 */
2757 if (shader) {
2758 foreach_list(node, &*shader->ir) {
2759 ir_instruction *ir = (ir_instruction *)node;
2760 base_ir = ir;
2761 this->result = reg_undef;
2762 ir->accept(this);
2763 }
2764 } else {
2765 emit_fragment_program_code();
2766 }
2767 base_ir = NULL;
2768 if (failed)
2769 return false;
2770
2771 emit_fb_writes();
2772
2773 split_virtual_grfs();
2774
2775 move_uniform_array_access_to_pull_constants();
2776 setup_pull_constants();
2777
2778 bool progress;
2779 do {
2780 progress = false;
2781
2782 compact_virtual_grfs();
2783
2784 progress = remove_duplicate_mrf_writes() || progress;
2785
2786 progress = opt_algebraic() || progress;
2787 progress = opt_cse() || progress;
2788 progress = opt_copy_propagate() || progress;
2789 progress = dead_code_eliminate() || progress;
2790 progress = register_coalesce() || progress;
2791 progress = register_coalesce_2() || progress;
2792 progress = compute_to_mrf() || progress;
2793 } while (progress);
2794
2795 remove_dead_constants();
2796
2797 schedule_instructions(false);
2798
2799 lower_uniform_pull_constant_loads();
2800
2801 assign_curb_setup();
2802 assign_urb_setup();
2803
2804 if (0) {
2805 /* Debug of register spilling: Go spill everything. */
2806 for (int i = 0; i < virtual_grf_count; i++) {
2807 spill_reg(i);
2808 }
2809 }
2810
2811 if (0)
2812 assign_regs_trivial();
2813 else {
2814 while (!assign_regs()) {
2815 if (failed)
2816 break;
2817 }
2818 }
2819 }
2820 assert(force_uncompressed_stack == 0);
2821 assert(force_sechalf_stack == 0);
2822
2823 /* This must come after all optimization and register allocation, since
2824 * it inserts dead code that happens to have side effects, and it does
2825 * so based on the actual physical registers in use.
2826 */
2827 insert_gen4_send_dependency_workarounds();
2828
2829 if (failed)
2830 return false;
2831
2832 schedule_instructions(true);
2833
2834 if (dispatch_width == 8) {
2835 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2836 } else {
2837 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2838
2839 /* Make sure we didn't try to sneak in an extra uniform */
2840 assert(orig_nr_params == c->prog_data.nr_params);
2841 (void) orig_nr_params;
2842 }
2843
2844 /* If any state parameters were appended, then ParameterValues could have
2845 * been realloced, in which case the driver uniform storage set up by
2846 * _mesa_associate_uniform_storage() would point to freed memory. Make
2847 * sure that didn't happen.
2848 */
2849 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2850
2851 return !failed;
2852 }
2853
2854 const unsigned *
2855 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2856 struct gl_fragment_program *fp,
2857 struct gl_shader_program *prog,
2858 unsigned *final_assembly_size)
2859 {
2860 struct intel_context *intel = &brw->intel;
2861 bool start_busy = false;
2862 float start_time = 0;
2863
2864 if (unlikely(intel->perf_debug)) {
2865 start_busy = (intel->batch.last_bo &&
2866 drm_intel_bo_busy(intel->batch.last_bo));
2867 start_time = get_time();
2868 }
2869
2870 struct brw_shader *shader = NULL;
2871 if (prog)
2872 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2873
2874 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2875 if (shader) {
2876 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2877 _mesa_print_ir(shader->ir, NULL);
2878 printf("\n\n");
2879 } else {
2880 printf("ARB_fragment_program %d ir for native fragment shader\n",
2881 fp->Base.Id);
2882 _mesa_print_program(&fp->Base);
2883 }
2884 }
2885
2886 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2887 */
2888 fs_visitor v(brw, c, prog, fp, 8);
2889 if (!v.run()) {
2890 prog->LinkStatus = false;
2891 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2892
2893 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2894 v.fail_msg);
2895
2896 return NULL;
2897 }
2898
2899 exec_list *simd16_instructions = NULL;
2900 fs_visitor v2(brw, c, prog, fp, 16);
2901 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2902 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2903 v2.import_uniforms(&v);
2904 if (!v2.run()) {
2905 perf_debug("16-wide shader failed to compile, falling back to "
2906 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2907 } else {
2908 simd16_instructions = &v2.instructions;
2909 }
2910 }
2911
2912 c->prog_data.dispatch_width = 8;
2913
2914 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2915 const unsigned *generated = g.generate_assembly(&v.instructions,
2916 simd16_instructions,
2917 final_assembly_size);
2918
2919 if (unlikely(intel->perf_debug) && shader) {
2920 if (shader->compiled_once)
2921 brw_wm_debug_recompile(brw, prog, &c->key);
2922 shader->compiled_once = true;
2923
2924 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2925 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2926 (get_time() - start_time) * 1000);
2927 }
2928 }
2929
2930 return generated;
2931 }
2932
2933 bool
2934 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2935 {
2936 struct brw_context *brw = brw_context(ctx);
2937 struct intel_context *intel = &brw->intel;
2938 struct brw_wm_prog_key key;
2939
2940 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2941 return true;
2942
2943 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2944 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2945 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2946 bool program_uses_dfdy = fp->UsesDFdy;
2947
2948 memset(&key, 0, sizeof(key));
2949
2950 if (intel->gen < 6) {
2951 if (fp->UsesKill)
2952 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2953
2954 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2955 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2956
2957 /* Just assume depth testing. */
2958 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2959 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2960 }
2961
2962 if (prog->Name != 0)
2963 key.proj_attrib_mask = ~(GLbitfield64) 0;
2964 else {
2965 /* Bit VARYING_BIT_POS of key.proj_attrib_mask is never used, so to
2966 * avoid unnecessary recompiles, always set it to 1.
2967 */
2968 key.proj_attrib_mask |= VARYING_BIT_POS;
2969 }
2970
2971 if (intel->gen < 6)
2972 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
2973
2974 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
2975 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2976 continue;
2977
2978 if (prog->Name == 0)
2979 key.proj_attrib_mask |= BITFIELD64_BIT(i);
2980
2981 if (intel->gen < 6) {
2982 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
2983 key.input_slots_valid |= BITFIELD64_BIT(i);
2984 }
2985 }
2986
2987 key.clamp_fragment_color = true;
2988
2989 for (int i = 0; i < MAX_SAMPLERS; i++) {
2990 if (fp->Base.ShadowSamplers & (1 << i)) {
2991 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2992 key.tex.swizzles[i] =
2993 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2994 } else {
2995 /* Color sampler: assume no swizzling. */
2996 key.tex.swizzles[i] = SWIZZLE_XYZW;
2997 }
2998 }
2999
3000 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3001 key.drawable_height = ctx->DrawBuffer->Height;
3002 }
3003
3004 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3005 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3006 }
3007
3008 key.nr_color_regions = 1;
3009
3010 key.program_string_id = bfp->id;
3011
3012 uint32_t old_prog_offset = brw->wm.prog_offset;
3013 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3014
3015 bool success = do_wm_prog(brw, prog, bfp, &key);
3016
3017 brw->wm.prog_offset = old_prog_offset;
3018 brw->wm.prog_data = old_prog_data;
3019
3020 return success;
3021 }