i965: Make the fragment shader pull constants index by dwords, not vec4s.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 #define ALU3(op) \
150 fs_inst * \
151 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
152 { \
153 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
154 }
155
156 ALU1(NOT)
157 ALU1(MOV)
158 ALU1(FRC)
159 ALU1(RNDD)
160 ALU1(RNDE)
161 ALU1(RNDZ)
162 ALU2(ADD)
163 ALU2(MUL)
164 ALU2(MACH)
165 ALU2(AND)
166 ALU2(OR)
167 ALU2(XOR)
168 ALU2(SHL)
169 ALU2(SHR)
170 ALU2(ASR)
171 ALU3(LRP)
172
173 /** Gen4 predicated IF. */
174 fs_inst *
175 fs_visitor::IF(uint32_t predicate)
176 {
177 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
178 inst->predicate = predicate;
179 return inst;
180 }
181
182 /** Gen6+ IF with embedded comparison. */
183 fs_inst *
184 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
185 {
186 assert(intel->gen >= 6);
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
188 reg_null_d, src0, src1);
189 inst->conditional_mod = condition;
190 return inst;
191 }
192
193 /**
194 * CMP: Sets the low bit of the destination channels with the result
195 * of the comparison, while the upper bits are undefined, and updates
196 * the flag register with the packed 16 bits of the result.
197 */
198 fs_inst *
199 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
200 {
201 fs_inst *inst;
202
203 /* Take the instruction:
204 *
205 * CMP null<d> src0<f> src1<f>
206 *
207 * Original gen4 does type conversion to the destination type before
208 * comparison, producing garbage results for floating point comparisons.
209 * gen5 does the comparison on the execution type (resolved source types),
210 * so dst type doesn't matter. gen6 does comparison and then uses the
211 * result as if it was the dst type with no conversion, which happens to
212 * mostly work out for float-interpreted-as-int since our comparisons are
213 * for >0, =0, <0.
214 */
215 if (intel->gen == 4) {
216 dst.type = src0.type;
217 if (dst.file == FIXED_HW_REG)
218 dst.fixed_hw_reg.type = dst.type;
219 }
220
221 resolve_ud_negate(&src0);
222 resolve_ud_negate(&src1);
223
224 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
225 inst->conditional_mod = condition;
226
227 return inst;
228 }
229
230 exec_list
231 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
232 fs_reg varying_offset,
233 uint32_t const_offset)
234 {
235 exec_list instructions;
236 fs_inst *inst;
237
238 fs_reg offset = fs_reg(this, glsl_type::uint_type);
239 instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));
240
241 if (intel->gen >= 7) {
242 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
243 dst, surf_index, offset);
244 instructions.push_tail(inst);
245 } else {
246 int base_mrf = 13;
247 bool header_present = true;
248
249 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
250 mrf.type = BRW_REGISTER_TYPE_D;
251
252 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
253 * dword-aligned byte offset.
254 */
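      /* For example (illustrative numbers): a pull constant at dword index 10
       * is passed through unchanged on gen6, while the MUL below turns it
       * into the byte offset 40 for gen4/5.
       */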
255 if (intel->gen == 6) {
256 instructions.push_tail(MOV(mrf, offset));
257 } else {
258 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
259 }
260 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
261 dst, surf_index);
262 inst->header_present = header_present;
263 inst->base_mrf = base_mrf;
264 inst->mlen = header_present + dispatch_width / 8;
265
266 instructions.push_tail(inst);
267 }
268
269 return instructions;
270 }
271
272 /**
273 * A helper for MOV generation for fixing up broken hardware SEND dependency
274 * handling.
275 */
276 fs_inst *
277 fs_visitor::DEP_RESOLVE_MOV(int grf)
278 {
279 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
280
281 inst->ir = NULL;
282 inst->annotation = "send dependency resolve";
283
284 /* The caller always wants uncompressed to emit the minimal extra
285 * dependencies, and to avoid having to deal with aligning its regs to 2.
286 */
287 inst->force_uncompressed = true;
288
289 return inst;
290 }
291
292 bool
293 fs_inst::equals(fs_inst *inst)
294 {
295 return (opcode == inst->opcode &&
296 dst.equals(inst->dst) &&
297 src[0].equals(inst->src[0]) &&
298 src[1].equals(inst->src[1]) &&
299 src[2].equals(inst->src[2]) &&
300 saturate == inst->saturate &&
301 predicate == inst->predicate &&
302 conditional_mod == inst->conditional_mod &&
303 mlen == inst->mlen &&
304 base_mrf == inst->base_mrf &&
305 sampler == inst->sampler &&
306 target == inst->target &&
307 eot == inst->eot &&
308 header_present == inst->header_present &&
309 shadow_compare == inst->shadow_compare &&
310 offset == inst->offset);
311 }
312
313 int
314 fs_inst::regs_written()
315 {
316 if (is_tex())
317 return 4;
318
319 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
320 * but we don't currently use them...nor do we have an opcode for them.
321 */
322
323 return 1;
324 }
325
326 bool
327 fs_inst::overwrites_reg(const fs_reg &reg)
328 {
329 return (reg.file == dst.file &&
330 reg.reg == dst.reg &&
331 reg.reg_offset >= dst.reg_offset &&
332 reg.reg_offset < dst.reg_offset + regs_written());
333 }
334
335 bool
336 fs_inst::is_tex()
337 {
338 return (opcode == SHADER_OPCODE_TEX ||
339 opcode == FS_OPCODE_TXB ||
340 opcode == SHADER_OPCODE_TXD ||
341 opcode == SHADER_OPCODE_TXF ||
342 opcode == SHADER_OPCODE_TXF_MS ||
343 opcode == SHADER_OPCODE_TXL ||
344 opcode == SHADER_OPCODE_TXS ||
345 opcode == SHADER_OPCODE_LOD);
346 }
347
348 bool
349 fs_inst::is_math()
350 {
351 return (opcode == SHADER_OPCODE_RCP ||
352 opcode == SHADER_OPCODE_RSQ ||
353 opcode == SHADER_OPCODE_SQRT ||
354 opcode == SHADER_OPCODE_EXP2 ||
355 opcode == SHADER_OPCODE_LOG2 ||
356 opcode == SHADER_OPCODE_SIN ||
357 opcode == SHADER_OPCODE_COS ||
358 opcode == SHADER_OPCODE_INT_QUOTIENT ||
359 opcode == SHADER_OPCODE_INT_REMAINDER ||
360 opcode == SHADER_OPCODE_POW);
361 }
362
363 bool
364 fs_inst::is_control_flow()
365 {
366 switch (opcode) {
367 case BRW_OPCODE_DO:
368 case BRW_OPCODE_WHILE:
369 case BRW_OPCODE_IF:
370 case BRW_OPCODE_ELSE:
371 case BRW_OPCODE_ENDIF:
372 case BRW_OPCODE_BREAK:
373 case BRW_OPCODE_CONTINUE:
374 return true;
375 default:
376 return false;
377 }
378 }
379
380 bool
381 fs_inst::is_send_from_grf()
382 {
383 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
384 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
385 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
386 src[1].file == GRF));
387 }
388
389 bool
390 fs_visitor::can_do_source_mods(fs_inst *inst)
391 {
392 if (intel->gen == 6 && inst->is_math())
393 return false;
394
395 if (inst->is_send_from_grf())
396 return false;
397
398 return true;
399 }
400
401 void
402 fs_reg::init()
403 {
404 memset(this, 0, sizeof(*this));
405 this->smear = -1;
406 }
407
408 /** Generic unset register constructor. */
409 fs_reg::fs_reg()
410 {
411 init();
412 this->file = BAD_FILE;
413 }
414
415 /** Immediate value constructor. */
416 fs_reg::fs_reg(float f)
417 {
418 init();
419 this->file = IMM;
420 this->type = BRW_REGISTER_TYPE_F;
421 this->imm.f = f;
422 }
423
424 /** Immediate value constructor. */
425 fs_reg::fs_reg(int32_t i)
426 {
427 init();
428 this->file = IMM;
429 this->type = BRW_REGISTER_TYPE_D;
430 this->imm.i = i;
431 }
432
433 /** Immediate value constructor. */
434 fs_reg::fs_reg(uint32_t u)
435 {
436 init();
437 this->file = IMM;
438 this->type = BRW_REGISTER_TYPE_UD;
439 this->imm.u = u;
440 }
441
442 /** Fixed brw_reg Immediate value constructor. */
443 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
444 {
445 init();
446 this->file = FIXED_HW_REG;
447 this->fixed_hw_reg = fixed_hw_reg;
448 this->type = fixed_hw_reg.type;
449 }
450
451 bool
452 fs_reg::equals(const fs_reg &r) const
453 {
454 return (file == r.file &&
455 reg == r.reg &&
456 reg_offset == r.reg_offset &&
457 type == r.type &&
458 negate == r.negate &&
459 abs == r.abs &&
460 !reladdr && !r.reladdr &&
461 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
462 sizeof(fixed_hw_reg)) == 0 &&
463 smear == r.smear &&
464 imm.u == r.imm.u);
465 }
466
467 bool
468 fs_reg::is_zero() const
469 {
470 if (file != IMM)
471 return false;
472
473 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
474 }
475
476 bool
477 fs_reg::is_one() const
478 {
479 if (file != IMM)
480 return false;
481
482 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
483 }
484
485 int
486 fs_visitor::type_size(const struct glsl_type *type)
487 {
488 unsigned int size, i;
489
490 switch (type->base_type) {
491 case GLSL_TYPE_UINT:
492 case GLSL_TYPE_INT:
493 case GLSL_TYPE_FLOAT:
494 case GLSL_TYPE_BOOL:
495 return type->components();
496 case GLSL_TYPE_ARRAY:
497 return type_size(type->fields.array) * type->length;
498 case GLSL_TYPE_STRUCT:
499 size = 0;
500 for (i = 0; i < type->length; i++) {
501 size += type_size(type->fields.structure[i].type);
502 }
503 return size;
504 case GLSL_TYPE_SAMPLER:
505 /* Samplers take up no register space, since they're baked in at
506 * link time.
507 */
508 return 0;
509 case GLSL_TYPE_VOID:
510 case GLSL_TYPE_ERROR:
511 case GLSL_TYPE_INTERFACE:
512 assert(!"not reached");
513 break;
514 }
515
516 return 0;
517 }
518
519 fs_reg
520 fs_visitor::get_timestamp()
521 {
522 assert(intel->gen >= 7);
523
524 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
525 BRW_ARF_TIMESTAMP,
526 0),
527 BRW_REGISTER_TYPE_UD));
528
529 fs_reg dst = fs_reg(this, glsl_type::uint_type);
530
531 fs_inst *mov = emit(MOV(dst, ts));
532 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
533 * even if it's not enabled in the dispatch.
534 */
535 mov->force_writemask_all = true;
536 mov->force_uncompressed = true;
537
538 /* The caller wants the low 32 bits of the timestamp. Since it's running
539 * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
540 * which is plenty of time for our purposes. It is identical across the
541 * EUs, but since it's tracking GPU core speed it will increment at a
542 * varying rate as render P-states change.
543 *
544 * The caller could also check if render P-states have changed (or anything
545 * else that might disrupt timing) by setting smear to 2 and checking if
546 * that field is != 0.
547 */
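   /* (emit_shader_time_end() below performs that reset check, reading the
    * field via smear == 2.)
    */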
548 dst.smear = 0;
549
550 return dst;
551 }
552
553 void
554 fs_visitor::emit_shader_time_begin()
555 {
556 current_annotation = "shader time start";
557 shader_start_time = get_timestamp();
558 }
559
560 void
561 fs_visitor::emit_shader_time_end()
562 {
563 current_annotation = "shader time end";
564
565 enum shader_time_shader_type type, written_type, reset_type;
566 if (dispatch_width == 8) {
567 type = ST_FS8;
568 written_type = ST_FS8_WRITTEN;
569 reset_type = ST_FS8_RESET;
570 } else {
571 assert(dispatch_width == 16);
572 type = ST_FS16;
573 written_type = ST_FS16_WRITTEN;
574 reset_type = ST_FS16_RESET;
575 }
576
577 fs_reg shader_end_time = get_timestamp();
578
579 /* Check that there weren't any timestamp reset events (assuming these
580 * were the only two timestamp reads that happened).
581 */
582 fs_reg reset = shader_end_time;
583 reset.smear = 2;
584 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
585 test->conditional_mod = BRW_CONDITIONAL_Z;
586 emit(IF(BRW_PREDICATE_NORMAL));
587
588 push_force_uncompressed();
589 fs_reg start = shader_start_time;
590 start.negate = true;
591 fs_reg diff = fs_reg(this, glsl_type::uint_type);
592 emit(ADD(diff, start, shader_end_time));
593
594 /* If there were no instructions between the two timestamp gets, the diff
595 * is 2 cycles. Remove that overhead, so I can forget about that when
596 * trying to determine the time taken for single instructions.
597 */
598 emit(ADD(diff, diff, fs_reg(-2u)));
599
600 emit_shader_time_write(type, diff);
601 emit_shader_time_write(written_type, fs_reg(1u));
602 emit(BRW_OPCODE_ELSE);
603 emit_shader_time_write(reset_type, fs_reg(1u));
604 emit(BRW_OPCODE_ENDIF);
605
606 pop_force_uncompressed();
607 }
608
609 void
610 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
611 fs_reg value)
612 {
613 int shader_time_index = brw_get_shader_time_index(brw, prog, &fp->Base,
614 type);
615 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
616
617 fs_reg payload;
618 if (dispatch_width == 8)
619 payload = fs_reg(this, glsl_type::uvec2_type);
620 else
621 payload = fs_reg(this, glsl_type::uint_type);
622
623 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
624 fs_reg(), payload, offset, value));
625 }
626
627 void
628 fs_visitor::fail(const char *format, ...)
629 {
630 va_list va;
631 char *msg;
632
633 if (failed)
634 return;
635
636 failed = true;
637
638 va_start(va, format);
639 msg = ralloc_vasprintf(mem_ctx, format, va);
640 va_end(va);
641 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
642
643 this->fail_msg = msg;
644
645 if (INTEL_DEBUG & DEBUG_WM) {
646 fprintf(stderr, "%s", msg);
647 }
648 }
649
650 fs_inst *
651 fs_visitor::emit(enum opcode opcode)
652 {
653 return emit(fs_inst(opcode));
654 }
655
656 fs_inst *
657 fs_visitor::emit(enum opcode opcode, fs_reg dst)
658 {
659 return emit(fs_inst(opcode, dst));
660 }
661
662 fs_inst *
663 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
664 {
665 return emit(fs_inst(opcode, dst, src0));
666 }
667
668 fs_inst *
669 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
670 {
671 return emit(fs_inst(opcode, dst, src0, src1));
672 }
673
674 fs_inst *
675 fs_visitor::emit(enum opcode opcode, fs_reg dst,
676 fs_reg src0, fs_reg src1, fs_reg src2)
677 {
678 return emit(fs_inst(opcode, dst, src0, src1, src2));
679 }
680
681 void
682 fs_visitor::push_force_uncompressed()
683 {
684 force_uncompressed_stack++;
685 }
686
687 void
688 fs_visitor::pop_force_uncompressed()
689 {
690 force_uncompressed_stack--;
691 assert(force_uncompressed_stack >= 0);
692 }
693
694 void
695 fs_visitor::push_force_sechalf()
696 {
697 force_sechalf_stack++;
698 }
699
700 void
701 fs_visitor::pop_force_sechalf()
702 {
703 force_sechalf_stack--;
704 assert(force_sechalf_stack >= 0);
705 }
706
707 /**
708 * Returns how many MRFs an FS opcode will write over.
709 *
710 * Note that this is not the 0 or 1 implied writes in an actual gen
711 * instruction -- the FS opcodes often generate MOVs in addition.
712 */
713 int
714 fs_visitor::implied_mrf_writes(fs_inst *inst)
715 {
716 if (inst->mlen == 0)
717 return 0;
718
719 switch (inst->opcode) {
720 case SHADER_OPCODE_RCP:
721 case SHADER_OPCODE_RSQ:
722 case SHADER_OPCODE_SQRT:
723 case SHADER_OPCODE_EXP2:
724 case SHADER_OPCODE_LOG2:
725 case SHADER_OPCODE_SIN:
726 case SHADER_OPCODE_COS:
727 return 1 * dispatch_width / 8;
728 case SHADER_OPCODE_POW:
729 case SHADER_OPCODE_INT_QUOTIENT:
730 case SHADER_OPCODE_INT_REMAINDER:
731 return 2 * dispatch_width / 8;
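   /* e.g., the two-operand math opcodes above send two operands of
    * dispatch_width / 8 MRFs each: 2 MRFs in SIMD8, 4 MRFs in SIMD16.
    */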
732 case SHADER_OPCODE_TEX:
733 case FS_OPCODE_TXB:
734 case SHADER_OPCODE_TXD:
735 case SHADER_OPCODE_TXF:
736 case SHADER_OPCODE_TXF_MS:
737 case SHADER_OPCODE_TXL:
738 case SHADER_OPCODE_TXS:
739 case SHADER_OPCODE_LOD:
740 return 1;
741 case FS_OPCODE_FB_WRITE:
742 return 2;
743 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
744 case FS_OPCODE_UNSPILL:
745 return 1;
746 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
747 return inst->header_present;
748 case FS_OPCODE_SPILL:
749 return 2;
750 default:
751 assert(!"not reached");
752 return inst->mlen;
753 }
754 }
755
756 int
757 fs_visitor::virtual_grf_alloc(int size)
758 {
759 if (virtual_grf_array_size <= virtual_grf_count) {
760 if (virtual_grf_array_size == 0)
761 virtual_grf_array_size = 16;
762 else
763 virtual_grf_array_size *= 2;
764 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
765 virtual_grf_array_size);
766 }
767 virtual_grf_sizes[virtual_grf_count] = size;
768 return virtual_grf_count++;
769 }
770
771 /** Fixed HW reg constructor. */
772 fs_reg::fs_reg(enum register_file file, int reg)
773 {
774 init();
775 this->file = file;
776 this->reg = reg;
777 this->type = BRW_REGISTER_TYPE_F;
778 }
779
780 /** Fixed HW reg constructor. */
781 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
782 {
783 init();
784 this->file = file;
785 this->reg = reg;
786 this->type = type;
787 }
788
789 /** Automatic reg constructor. */
790 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
791 {
792 init();
793
794 this->file = GRF;
795 this->reg = v->virtual_grf_alloc(v->type_size(type));
796 this->reg_offset = 0;
797 this->type = brw_type_for_base_type(type);
798 }
799
800 fs_reg *
801 fs_visitor::variable_storage(ir_variable *var)
802 {
803 return (fs_reg *)hash_table_find(this->variable_ht, var);
804 }
805
806 void
807 import_uniforms_callback(const void *key,
808 void *data,
809 void *closure)
810 {
811 struct hash_table *dst_ht = (struct hash_table *)closure;
812 const fs_reg *reg = (const fs_reg *)data;
813
814 if (reg->file != UNIFORM)
815 return;
816
817 hash_table_insert(dst_ht, data, key);
818 }
819
820 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
821 * This brings in those uniform definitions.
822 */
823 void
824 fs_visitor::import_uniforms(fs_visitor *v)
825 {
826 hash_table_call_foreach(v->variable_ht,
827 import_uniforms_callback,
828 variable_ht);
829 this->params_remap = v->params_remap;
830 }
831
832 /* Our support for uniforms is piggy-backed on the struct
833 * gl_fragment_program, because that's where the values actually
834 * get stored, rather than in some global gl_shader_program uniform
835 * store.
836 */
837 void
838 fs_visitor::setup_uniform_values(ir_variable *ir)
839 {
840 int namelen = strlen(ir->name);
841
842 /* The data for our (non-builtin) uniforms is stored in a series of
843 * gl_uniform_driver_storage structs for each subcomponent that
844 * glGetUniformLocation() could name. We know it's been set up in the same
845 * order we'd walk the type, so walk the list of storage and find anything
846 * with our name, or the prefix of a component that starts with our name.
847 */
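   /* Illustrative example (hypothetical uniform): for
    * "uniform struct { vec4 diffuse; float shininess; } mat;" the storage list
    * holds entries named "mat.diffuse" and "mat.shininess", both of which
    * match the prefix test below.
    */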
848 unsigned params_before = c->prog_data.nr_params;
849 for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
850 struct gl_uniform_storage *storage = &prog->UniformStorage[u];
851
852 if (strncmp(ir->name, storage->name, namelen) != 0 ||
853 (storage->name[namelen] != 0 &&
854 storage->name[namelen] != '.' &&
855 storage->name[namelen] != '[')) {
856 continue;
857 }
858
859 unsigned slots = storage->type->component_slots();
860 if (storage->array_elements)
861 slots *= storage->array_elements;
862
863 for (unsigned i = 0; i < slots; i++) {
864 c->prog_data.param[c->prog_data.nr_params++] =
865 &storage->storage[i].f;
866 }
867 }
868
869 /* Make sure we actually initialized the right amount of stuff here. */
870 assert(params_before + ir->type->component_slots() ==
871 c->prog_data.nr_params);
872 }
873
874
875 /* Our support for builtin uniforms is even scarier than non-builtin.
876 * It sits on top of the PROG_STATE_VAR parameters that are
877 * automatically updated from GL context state.
878 */
879 void
880 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
881 {
882 const ir_state_slot *const slots = ir->state_slots;
883 assert(ir->state_slots != NULL);
884
885 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
886 /* This state reference has already been setup by ir_to_mesa, but we'll
887 * get the same index back here.
888 */
889 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
890 (gl_state_index *)slots[i].tokens);
891
892 /* Add each of the unique swizzles of the element as a parameter.
893 * This'll end up matching the expected layout of the
894 * array/matrix/structure we're trying to fill in.
895 */
896 int last_swiz = -1;
897 for (unsigned int j = 0; j < 4; j++) {
898 int swiz = GET_SWZ(slots[i].swizzle, j);
899 if (swiz == last_swiz)
900 break;
901 last_swiz = swiz;
902
903 c->prog_data.param[c->prog_data.nr_params++] =
904 &fp->Base.Parameters->ParameterValues[index][swiz].f;
905 }
906 }
907 }
908
909 fs_reg *
910 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
911 {
912 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
913 fs_reg wpos = *reg;
914 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
915
916 /* gl_FragCoord.x */
917 if (ir->pixel_center_integer) {
918 emit(MOV(wpos, this->pixel_x));
919 } else {
920 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
921 }
922 wpos.reg_offset++;
923
924 /* gl_FragCoord.y */
925 if (!flip && ir->pixel_center_integer) {
926 emit(MOV(wpos, this->pixel_y));
927 } else {
928 fs_reg pixel_y = this->pixel_y;
929 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
930
931 if (flip) {
932 pixel_y.negate = true;
933 offset += c->key.drawable_height - 1.0;
934 }
935
936 emit(ADD(wpos, pixel_y, fs_reg(offset)));
937 }
938 wpos.reg_offset++;
939
940 /* gl_FragCoord.z */
941 if (intel->gen >= 6) {
942 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
943 } else {
944 emit(FS_OPCODE_LINTERP, wpos,
945 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
946 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
947 interp_reg(VARYING_SLOT_POS, 2));
948 }
949 wpos.reg_offset++;
950
951 /* gl_FragCoord.w: Already set up in emit_interpolation */
952 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
953
954 return reg;
955 }
956
957 fs_inst *
958 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
959 glsl_interp_qualifier interpolation_mode,
960 bool is_centroid)
961 {
962 brw_wm_barycentric_interp_mode barycoord_mode;
963 if (is_centroid) {
964 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
965 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
966 else
967 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
968 } else {
969 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
970 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
971 else
972 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
973 }
974 return emit(FS_OPCODE_LINTERP, attr,
975 this->delta_x[barycoord_mode],
976 this->delta_y[barycoord_mode], interp);
977 }
978
979 fs_reg *
980 fs_visitor::emit_general_interpolation(ir_variable *ir)
981 {
982 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
983 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
984 fs_reg attr = *reg;
985
986 unsigned int array_elements;
987 const glsl_type *type;
988
989 if (ir->type->is_array()) {
990 array_elements = ir->type->length;
991 if (array_elements == 0) {
992 fail("dereferenced array '%s' has length 0\n", ir->name);
993 }
994 type = ir->type->fields.array;
995 } else {
996 array_elements = 1;
997 type = ir->type;
998 }
999
1000 glsl_interp_qualifier interpolation_mode =
1001 ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003 int location = ir->location;
1004 for (unsigned int i = 0; i < array_elements; i++) {
1005 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006 if (urb_setup[location] == -1) {
1007 /* If there's no incoming setup data for this slot, don't
1008 * emit interpolation for it.
1009 */
1010 attr.reg_offset += type->vector_elements;
1011 location++;
1012 continue;
1013 }
1014
1015 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016 /* Constant interpolation (flat shading) case. The SF has
1017 * handed us defined values in only the constant offset
1018 * field of the setup reg.
1019 */
1020 for (unsigned int k = 0; k < type->vector_elements; k++) {
1021 struct brw_reg interp = interp_reg(location, k);
1022 interp = suboffset(interp, 3);
1023 interp.type = reg->type;
1024 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025 attr.reg_offset++;
1026 }
1027 } else {
1028 /* Smooth/noperspective interpolation case. */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 /* FINISHME: At some point we probably want to push
1031 * this farther by giving similar treatment to the
1032 * other potentially constant components of the
1033 * attribute, as well as making brw_vs_constval.c
1034 * handle varyings other than gl_TexCoord.
1035 */
1036 if (location >= VARYING_SLOT_TEX0 &&
1037 location <= VARYING_SLOT_TEX7 &&
1038 k == 3 && !(c->key.proj_attrib_mask
1039 & BITFIELD64_BIT(location))) {
1040 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1041 } else {
1042 struct brw_reg interp = interp_reg(location, k);
1043 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1044 ir->centroid);
1045 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1046 /* Get the pixel/sample mask into f0 so that we know
1047 * which pixels are lit. Then, for each channel that is
1048 * unlit, replace the centroid data with non-centroid
1049 * data.
1050 */
1051 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1052 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1053 interpolation_mode, false);
1054 inst->predicate = BRW_PREDICATE_NORMAL;
1055 inst->predicate_inverse = true;
1056 }
1057 if (intel->gen < 6) {
1058 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1059 }
1060 }
1061 attr.reg_offset++;
1062 }
1063
1064 }
1065 location++;
1066 }
1067 }
1068
1069 return reg;
1070 }
1071
1072 fs_reg *
1073 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1074 {
1075 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1076
1077 /* The frontfacing comes in as a bit in the thread payload. */
1078 if (intel->gen >= 6) {
1079 emit(BRW_OPCODE_ASR, *reg,
1080 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1081 fs_reg(15));
1082 emit(BRW_OPCODE_NOT, *reg, *reg);
1083 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1084 } else {
1085 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1086 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1087 * us front face
1088 */
1089 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1090 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1091 }
1092
1093 return reg;
1094 }
1095
1096 fs_reg
1097 fs_visitor::fix_math_operand(fs_reg src)
1098 {
1099 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1100 * might be able to do better by doing execsize = 1 math and then
1101 * expanding that result out, but we would need to be careful with
1102 * masking.
1103 *
1104 * The hardware ignores source modifiers (negate and abs) on math
1105 * instructions, so we also move to a temp to set those up.
1106 */
1107 if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1108 !src.abs && !src.negate)
1109 return src;
1110
1111 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1112 * operands to math
1113 */
1114 if (intel->gen >= 7 && src.file != IMM)
1115 return src;
1116
1117 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1118 expanded.type = src.type;
1119 emit(BRW_OPCODE_MOV, expanded, src);
1120 return expanded;
1121 }
1122
1123 fs_inst *
1124 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1125 {
1126 switch (opcode) {
1127 case SHADER_OPCODE_RCP:
1128 case SHADER_OPCODE_RSQ:
1129 case SHADER_OPCODE_SQRT:
1130 case SHADER_OPCODE_EXP2:
1131 case SHADER_OPCODE_LOG2:
1132 case SHADER_OPCODE_SIN:
1133 case SHADER_OPCODE_COS:
1134 break;
1135 default:
1136 assert(!"not reached: bad math opcode");
1137 return NULL;
1138 }
1139
1140 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1141 * might be able to do better by doing execsize = 1 math and then
1142 * expanding that result out, but we would need to be careful with
1143 * masking.
1144 *
1145 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1146 * instructions, so we also move to a temp to set those up.
1147 */
1148 if (intel->gen >= 6)
1149 src = fix_math_operand(src);
1150
1151 fs_inst *inst = emit(opcode, dst, src);
1152
1153 if (intel->gen < 6) {
1154 inst->base_mrf = 2;
1155 inst->mlen = dispatch_width / 8;
1156 }
1157
1158 return inst;
1159 }
1160
1161 fs_inst *
1162 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1163 {
1164 int base_mrf = 2;
1165 fs_inst *inst;
1166
1167 switch (opcode) {
1168 case SHADER_OPCODE_INT_QUOTIENT:
1169 case SHADER_OPCODE_INT_REMAINDER:
1170 if (intel->gen >= 7 && dispatch_width == 16)
1171 fail("16-wide INTDIV unsupported\n");
1172 break;
1173 case SHADER_OPCODE_POW:
1174 break;
1175 default:
1176 assert(!"not reached: unsupported binary math opcode.");
1177 return NULL;
1178 }
1179
1180 if (intel->gen >= 6) {
1181 src0 = fix_math_operand(src0);
1182 src1 = fix_math_operand(src1);
1183
1184 inst = emit(opcode, dst, src0, src1);
1185 } else {
1186 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1187 * "Message Payload":
1188 *
1189 * "Operand0[7]. For the INT DIV functions, this operand is the
1190 * denominator."
1191 * ...
1192 * "Operand1[7]. For the INT DIV functions, this operand is the
1193 * numerator."
1194 */
1195 bool is_int_div = opcode != SHADER_OPCODE_POW;
1196 fs_reg &op0 = is_int_div ? src1 : src0;
1197 fs_reg &op1 = is_int_div ? src0 : src1;
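      /* i.e., for INT DIV the numerator (src0) lands in the second message
       * register as operand 1, and the denominator (src1) becomes the
       * instruction's source (operand 0); POW keeps src0/src1 in order.
       */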
1198
1199 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1200 inst = emit(opcode, dst, op0, reg_null_f);
1201
1202 inst->base_mrf = base_mrf;
1203 inst->mlen = 2 * dispatch_width / 8;
1204 }
1205 return inst;
1206 }
1207
1208 void
1209 fs_visitor::assign_curb_setup()
1210 {
1211 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1212 if (dispatch_width == 8) {
1213 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1214 } else {
1215 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1216 }
1217
1218 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1219 foreach_list(node, &this->instructions) {
1220 fs_inst *inst = (fs_inst *)node;
1221
1222 for (unsigned int i = 0; i < 3; i++) {
1223 if (inst->src[i].file == UNIFORM) {
1224 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1225 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1226 constant_nr / 8,
1227 constant_nr % 8);
1228
1229 inst->src[i].file = FIXED_HW_REG;
1230 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1231 }
1232 }
1233 }
1234 }
1235
1236 void
1237 fs_visitor::calculate_urb_setup()
1238 {
1239 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240 urb_setup[i] = -1;
1241 }
1242
1243 int urb_next = 0;
1244 /* Figure out where each of the incoming setup attributes lands. */
1245 if (intel->gen >= 6) {
1246 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1248 urb_setup[i] = urb_next++;
1249 }
1250 }
1251 } else {
1252 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1253 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1254 /* Point size is packed into the header, not as a general attribute */
1255 if (i == VARYING_SLOT_PSIZ)
1256 continue;
1257
1258 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1259 /* The back color slot is skipped when the front color is
1260 * also written to. In addition, some slots can be
1261 * written in the vertex shader and not read in the
1262 * fragment shader. So the register number must always be
1263 * incremented, mapped or not.
1264 */
1265 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1266 urb_setup[i] = urb_next;
1267 urb_next++;
1268 }
1269 }
1270
1271 /*
1272 * It's an FS-only attribute, and we did interpolation for this attribute
1273 * in the SF thread. So, count it here, too.
1274 *
1275 * See compile_sf_prog() for more info.
1276 */
1277 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1278 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1279 }
1280
1281 /* Each attribute is 4 setup channels, each of which is half a reg. */
1282 c->prog_data.urb_read_length = urb_next * 2;
1283 }
1284
1285 void
1286 fs_visitor::assign_urb_setup()
1287 {
1288 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1289
1290 /* Offset all the urb_setup[] index by the actual position of the
1291 * setup regs, now that the location of the constants has been chosen.
1292 */
1293 foreach_list(node, &this->instructions) {
1294 fs_inst *inst = (fs_inst *)node;
1295
1296 if (inst->opcode == FS_OPCODE_LINTERP) {
1297 assert(inst->src[2].file == FIXED_HW_REG);
1298 inst->src[2].fixed_hw_reg.nr += urb_start;
1299 }
1300
1301 if (inst->opcode == FS_OPCODE_CINTERP) {
1302 assert(inst->src[0].file == FIXED_HW_REG);
1303 inst->src[0].fixed_hw_reg.nr += urb_start;
1304 }
1305 }
1306
1307 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1308 }
1309
1310 /**
1311 * Split large virtual GRFs into separate components if we can.
1312 *
1313 * This is mostly duplicated with what brw_fs_vector_splitting does,
1314 * but that's really conservative because it's afraid of doing
1315 * splitting that doesn't result in real progress after the rest of
1316 * the optimization phases, which would cause infinite looping in
1317 * optimization. We can do it once here, safely. This also has the
1318 * opportunity to split interpolated values, or maybe even uniforms,
1319 * which we don't have at the IR level.
1320 *
1321 * We want to split, because virtual GRFs are what we register
1322 * allocate and spill (due to contiguousness requirements for some
1323 * instructions), and they're what we naturally generate in the
1324 * codegen process, but most virtual GRFs don't actually need to be
1325 * contiguous sets of GRFs. If we split, we'll end up with reduced
1326 * live intervals and better dead code elimination and coalescing.
1327 */
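/* For example, a virtual GRF holding a vec4 temporary (4 contiguous registers)
 * can usually be split into four independent 1-register GRFs, each with its
 * own live interval.
 */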
1328 void
1329 fs_visitor::split_virtual_grfs()
1330 {
1331 int num_vars = this->virtual_grf_count;
1332 bool split_grf[num_vars];
1333 int new_virtual_grf[num_vars];
1334
1335 /* Try to split anything > 0 sized. */
1336 for (int i = 0; i < num_vars; i++) {
1337 if (this->virtual_grf_sizes[i] != 1)
1338 split_grf[i] = true;
1339 else
1340 split_grf[i] = false;
1341 }
1342
1343 if (brw->has_pln &&
1344 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1345 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1346 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1347 * Gen6, that was the only supported interpolation mode, and since Gen6,
1348 * delta_x and delta_y are in fixed hardware registers.
1349 */
1350 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1351 false;
1352 }
1353
1354 foreach_list(node, &this->instructions) {
1355 fs_inst *inst = (fs_inst *)node;
1356
1357 /* If there's a SEND message that requires contiguous destination
1358 * registers, no splitting is allowed.
1359 */
1360 if (inst->regs_written() > 1) {
1361 split_grf[inst->dst.reg] = false;
1362 }
1363
1364 /* If we're sending from a GRF, don't split it, on the assumption that
1365 * the send is reading the whole thing.
1366 */
1367 if (inst->is_send_from_grf()) {
1368 split_grf[inst->src[0].reg] = false;
1369 }
1370 }
1371
1372 /* Allocate new space for split regs. Note that the virtual
1373 * numbers will be contiguous.
1374 */
1375 for (int i = 0; i < num_vars; i++) {
1376 if (split_grf[i]) {
1377 new_virtual_grf[i] = virtual_grf_alloc(1);
1378 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1379 int reg = virtual_grf_alloc(1);
1380 assert(reg == new_virtual_grf[i] + j - 1);
1381 (void) reg;
1382 }
1383 this->virtual_grf_sizes[i] = 1;
1384 }
1385 }
1386
1387 foreach_list(node, &this->instructions) {
1388 fs_inst *inst = (fs_inst *)node;
1389
1390 if (inst->dst.file == GRF &&
1391 split_grf[inst->dst.reg] &&
1392 inst->dst.reg_offset != 0) {
1393 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1394 inst->dst.reg_offset - 1);
1395 inst->dst.reg_offset = 0;
1396 }
1397 for (int i = 0; i < 3; i++) {
1398 if (inst->src[i].file == GRF &&
1399 split_grf[inst->src[i].reg] &&
1400 inst->src[i].reg_offset != 0) {
1401 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1402 inst->src[i].reg_offset - 1);
1403 inst->src[i].reg_offset = 0;
1404 }
1405 }
1406 }
1407 this->live_intervals_valid = false;
1408 }
1409
1410 /**
1411 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1412 *
1413 * During code generation, we create tons of temporary variables, many of
1414 * which get immediately killed and are never used again. Yet, in later
1415 * optimization and analysis passes, such as compute_live_intervals, we need
1416 * to loop over all the virtual GRFs. Compacting them can save a lot of
1417 * overhead.
1418 */
1419 void
1420 fs_visitor::compact_virtual_grfs()
1421 {
1422 /* Mark which virtual GRFs are used, and count how many. */
1423 int remap_table[this->virtual_grf_count];
1424 memset(remap_table, -1, sizeof(remap_table));
1425
1426 foreach_list(node, &this->instructions) {
1427 const fs_inst *inst = (const fs_inst *) node;
1428
1429 if (inst->dst.file == GRF)
1430 remap_table[inst->dst.reg] = 0;
1431
1432 for (int i = 0; i < 3; i++) {
1433 if (inst->src[i].file == GRF)
1434 remap_table[inst->src[i].reg] = 0;
1435 }
1436 }
1437
1438 /* In addition to registers used in instructions, fs_visitor keeps
1439 * direct references to certain special values which must be patched:
1440 */
1441 fs_reg *special[] = {
1442 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1443 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1444 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1445 &delta_x[0], &delta_x[1], &delta_x[2],
1446 &delta_x[3], &delta_x[4], &delta_x[5],
1447 &delta_y[0], &delta_y[1], &delta_y[2],
1448 &delta_y[3], &delta_y[4], &delta_y[5],
1449 };
1450 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1451 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1452
1453 /* Treat all special values as used, to be conservative */
1454 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1455 if (special[i]->file == GRF)
1456 remap_table[special[i]->reg] = 0;
1457 }
1458
1459 /* Compact the GRF arrays. */
1460 int new_index = 0;
1461 for (int i = 0; i < this->virtual_grf_count; i++) {
1462 if (remap_table[i] != -1) {
1463 remap_table[i] = new_index;
1464 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1465 if (live_intervals_valid) {
1466 virtual_grf_use[new_index] = virtual_grf_use[i];
1467 virtual_grf_def[new_index] = virtual_grf_def[i];
1468 }
1469 ++new_index;
1470 }
1471 }
1472
1473 this->virtual_grf_count = new_index;
1474
1475 /* Patch all the instructions to use the newly renumbered registers */
1476 foreach_list(node, &this->instructions) {
1477 fs_inst *inst = (fs_inst *) node;
1478
1479 if (inst->dst.file == GRF)
1480 inst->dst.reg = remap_table[inst->dst.reg];
1481
1482 for (int i = 0; i < 3; i++) {
1483 if (inst->src[i].file == GRF)
1484 inst->src[i].reg = remap_table[inst->src[i].reg];
1485 }
1486 }
1487
1488 /* Patch all the references to special values */
1489 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1490 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1491 special[i]->reg = remap_table[special[i]->reg];
1492 }
1493 }
1494
1495 bool
1496 fs_visitor::remove_dead_constants()
1497 {
1498 if (dispatch_width == 8) {
1499 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1500
1501 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1502 this->params_remap[i] = -1;
1503
1504 /* Find which params are still in use. */
1505 foreach_list(node, &this->instructions) {
1506 fs_inst *inst = (fs_inst *)node;
1507
1508 for (int i = 0; i < 3; i++) {
1509 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1510
1511 if (inst->src[i].file != UNIFORM)
1512 continue;
1513
1514 assert(constant_nr < (int)c->prog_data.nr_params);
1515
1516 /* For now, set this to non-negative. We'll give it the
1517 * actual new number in a moment, in order to keep the
1518 * register numbers nicely ordered.
1519 */
1520 this->params_remap[constant_nr] = 0;
1521 }
1522 }
1523
1524 /* Figure out what the new numbers for the params will be. At some
1525 * point when we're doing uniform array access, we're going to want
1526 * to keep the distinction between .reg and .reg_offset, but for
1527 * now we don't care.
1528 */
1529 unsigned int new_nr_params = 0;
1530 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1531 if (this->params_remap[i] != -1) {
1532 this->params_remap[i] = new_nr_params++;
1533 }
1534 }
1535
1536 /* Update the list of params to be uploaded to match our new numbering. */
1537 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1538 int remapped = this->params_remap[i];
1539
1540 if (remapped == -1)
1541 continue;
1542
1543 c->prog_data.param[remapped] = c->prog_data.param[i];
1544 }
1545
1546 c->prog_data.nr_params = new_nr_params;
1547 } else {
1548 /* This should have been generated in the 8-wide pass already. */
1549 assert(this->params_remap);
1550 }
1551
1552 /* Now do the renumbering of the shader to remove unused params. */
1553 foreach_list(node, &this->instructions) {
1554 fs_inst *inst = (fs_inst *)node;
1555
1556 for (int i = 0; i < 3; i++) {
1557 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1558
1559 if (inst->src[i].file != UNIFORM)
1560 continue;
1561
1562 assert(this->params_remap[constant_nr] != -1);
1563 inst->src[i].reg = this->params_remap[constant_nr];
1564 inst->src[i].reg_offset = 0;
1565 }
1566 }
1567
1568 return true;
1569 }
1570
1571 /*
1572 * Implements array access of uniforms by inserting a
1573 * PULL_CONSTANT_LOAD instruction.
1574 *
1575 * Unlike temporary GRF array access (where we don't support it due to
1576 * the difficulty of doing relative addressing on instruction
1577 * destinations), we could potentially do array access of uniforms
1578 * that were loaded in GRF space as push constants. In real-world
1579 * usage we've seen, though, the arrays being used are always larger
1580 * than we could load as push constants, so just always move all
1581 * uniform array access out to a pull constant buffer.
1582 * uniform array access out to a pull constant buffer.  */
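/* Illustrative example (hypothetical shader): for "uniform vec4 colors[32];"
 * accessed as "colors[i]", the whole array is appended to pull_param and each
 * reladdr access below is replaced with a VARYING_PULL_CONSTANT_LOAD indexed
 * by i.
 */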
1583 void
1584 fs_visitor::move_uniform_array_access_to_pull_constants()
1585 {
1586 int pull_constant_loc[c->prog_data.nr_params];
1587
1588 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1589 pull_constant_loc[i] = -1;
1590 }
1591
1592 /* Walk through and find array access of uniforms. Put a copy of that
1593 * uniform in the pull constant buffer.
1594 *
1595 * Note that we don't move constant-indexed accesses to arrays. No
1596 * testing has been done of the performance impact of this choice.
1597 */
1598 foreach_list_safe(node, &this->instructions) {
1599 fs_inst *inst = (fs_inst *)node;
1600
1601 for (int i = 0 ; i < 3; i++) {
1602 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1603 continue;
1604
1605 int uniform = inst->src[i].reg;
1606
1607 /* If this array isn't already present in the pull constant buffer,
1608 * add it.
1609 */
1610 if (pull_constant_loc[uniform] == -1) {
1611 const float **values = &c->prog_data.param[uniform];
1612
1613 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1614
1615 assert(param_size[uniform]);
1616
1617 for (int j = 0; j < param_size[uniform]; j++) {
1618 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1619 values[j];
1620 }
1621 }
1622
1623 /* Set up the annotation tracking for new generated instructions. */
1624 base_ir = inst->ir;
1625 current_annotation = inst->annotation;
1626
1627 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1628 fs_reg temp = fs_reg(this, glsl_type::float_type);
1629 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1630 surf_index,
1631 *inst->src[i].reladdr,
1632 pull_constant_loc[uniform] +
1633 inst->src[i].reg_offset);
1634 inst->insert_before(&list);
1635
1636 inst->src[i].file = temp.file;
1637 inst->src[i].reg = temp.reg;
1638 inst->src[i].reg_offset = temp.reg_offset;
1639 inst->src[i].reladdr = NULL;
1640 }
1641 }
1642 }
1643
1644 /**
1645 * Choose accesses from the UNIFORM file to demote to using the pull
1646 * constant buffer.
1647 *
1648 * We allow a fragment shader to use more than the GL-specified minimum
1649 * for the maximum number of fragment shader uniform components (64). If
1650 * there are too many of them, they'd fill up all of register space.
1651 * So, this will push some of them out to the pull constant buffer and
1652 * update the program to load them.
1653 */
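/* For example (illustrative count): with 200 float uniform components, the
 * first 128 stay as push constants and the remaining 72 are appended to
 * pull_param; each demoted access below becomes a UNIFORM_PULL_CONSTANT_LOAD
 * whose smear (pull_index & 3) selects the dword within the fetched block.
 */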
1654 void
1655 fs_visitor::setup_pull_constants()
1656 {
1657 /* Only allow 16 registers (128 uniform components) as push constants. */
1658 unsigned int max_uniform_components = 16 * 8;
1659 if (c->prog_data.nr_params <= max_uniform_components)
1660 return;
1661
1662 if (dispatch_width == 16) {
1663 fail("Pull constants not supported in 16-wide\n");
1664 return;
1665 }
1666
1667 /* Just demote the end of the list. We could probably do better
1668 * here, demoting things that are rarely used in the program first.
1669 */
1670 unsigned int pull_uniform_base = max_uniform_components;
1671
1672 int pull_constant_loc[c->prog_data.nr_params];
1673 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1674 if (i < pull_uniform_base) {
1675 pull_constant_loc[i] = -1;
1676 } else {
1677 pull_constant_loc[i] = -1;
1678 /* If our constant is already being uploaded for reladdr purposes,
1679 * reuse it.
1680 */
1681 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1682 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1683 pull_constant_loc[i] = j;
1684 break;
1685 }
1686 }
1687 if (pull_constant_loc[i] == -1) {
1688 int pull_index = c->prog_data.nr_pull_params++;
1689 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1690 pull_constant_loc[i] = pull_index;
1691 }
1692 }
1693 }
1694 c->prog_data.nr_params = pull_uniform_base;
1695
1696 foreach_list(node, &this->instructions) {
1697 fs_inst *inst = (fs_inst *)node;
1698
1699 for (int i = 0; i < 3; i++) {
1700 if (inst->src[i].file != UNIFORM)
1701 continue;
1702
1703 int pull_index = pull_constant_loc[inst->src[i].reg +
1704 inst->src[i].reg_offset];
1705 if (pull_index == -1)
1706 continue;
1707
1708 assert(!inst->src[i].reladdr);
1709
1710 fs_reg dst = fs_reg(this, glsl_type::float_type);
1711 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1712 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1713 fs_inst *pull =
1714 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1715 dst, index, offset);
1716 pull->ir = inst->ir;
1717 pull->annotation = inst->annotation;
1718
1719 inst->insert_before(pull);
1720
1721 inst->src[i].file = GRF;
1722 inst->src[i].reg = dst.reg;
1723 inst->src[i].reg_offset = 0;
1724 inst->src[i].smear = pull_index & 3;
1725 }
1726 }
1727 }
1728
1729 bool
1730 fs_visitor::opt_algebraic()
1731 {
1732 bool progress = false;
1733
1734 foreach_list(node, &this->instructions) {
1735 fs_inst *inst = (fs_inst *)node;
1736
1737 switch (inst->opcode) {
1738 case BRW_OPCODE_MUL:
1739 if (inst->src[1].file != IMM)
1740 continue;
1741
1742 /* a * 1.0 = a */
1743 if (inst->src[1].is_one()) {
1744 inst->opcode = BRW_OPCODE_MOV;
1745 inst->src[1] = reg_undef;
1746 progress = true;
1747 break;
1748 }
1749
1750 /* a * 0.0 = 0.0 */
1751 if (inst->src[1].is_zero()) {
1752 inst->opcode = BRW_OPCODE_MOV;
1753 inst->src[0] = inst->src[1];
1754 inst->src[1] = reg_undef;
1755 progress = true;
1756 break;
1757 }
1758
1759 break;
1760 case BRW_OPCODE_ADD:
1761 if (inst->src[1].file != IMM)
1762 continue;
1763
1764 /* a + 0.0 = a */
1765 if (inst->src[1].is_zero()) {
1766 inst->opcode = BRW_OPCODE_MOV;
1767 inst->src[1] = reg_undef;
1768 progress = true;
1769 break;
1770 }
1771 break;
1772 default:
1773 break;
1774 }
1775 }
1776
1777 return progress;
1778 }
1779
1780 /**
1781 * Must be called after calculate_live_intervals() to remove unused
1782 * writes to registers -- register allocation will fail otherwise
1783 * because something def'd but not used won't be considered to
1784 * interfere with other regs.
1785 */
1786 bool
1787 fs_visitor::dead_code_eliminate()
1788 {
1789 bool progress = false;
1790 int pc = 0;
1791
1792 calculate_live_intervals();
1793
1794 foreach_list_safe(node, &this->instructions) {
1795 fs_inst *inst = (fs_inst *)node;
1796
1797 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1798 inst->remove();
1799 progress = true;
1800 }
1801
1802 pc++;
1803 }
1804
1805 if (progress)
1806 live_intervals_valid = false;
1807
1808 return progress;
1809 }
1810
1811 /**
1812 * Implements a second type of register coalescing: This one checks if
1813 * the two regs involved in a raw move don't interfere, in which case
1814 * they can both by stored in the same place and the MOV removed.
1815 * they can both be stored in the same place and the MOV removed.
1816 bool
1817 fs_visitor::register_coalesce_2()
1818 {
1819 bool progress = false;
1820
1821 calculate_live_intervals();
1822
1823 foreach_list_safe(node, &this->instructions) {
1824 fs_inst *inst = (fs_inst *)node;
1825
1826 if (inst->opcode != BRW_OPCODE_MOV ||
1827 inst->predicate ||
1828 inst->saturate ||
1829 inst->src[0].file != GRF ||
1830 inst->src[0].negate ||
1831 inst->src[0].abs ||
1832 inst->src[0].smear != -1 ||
1833 inst->dst.file != GRF ||
1834 inst->dst.type != inst->src[0].type ||
1835 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1836 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1837 continue;
1838 }
1839
1840 int reg_from = inst->src[0].reg;
1841 assert(inst->src[0].reg_offset == 0);
1842 int reg_to = inst->dst.reg;
1843 int reg_to_offset = inst->dst.reg_offset;
1844
1845 foreach_list(node, &this->instructions) {
1846 fs_inst *scan_inst = (fs_inst *)node;
1847
1848 if (scan_inst->dst.file == GRF &&
1849 scan_inst->dst.reg == reg_from) {
1850 scan_inst->dst.reg = reg_to;
1851 scan_inst->dst.reg_offset = reg_to_offset;
1852 }
1853 for (int i = 0; i < 3; i++) {
1854 if (scan_inst->src[i].file == GRF &&
1855 scan_inst->src[i].reg == reg_from) {
1856 scan_inst->src[i].reg = reg_to;
1857 scan_inst->src[i].reg_offset = reg_to_offset;
1858 }
1859 }
1860 }
1861
1862 inst->remove();
1863
1864 /* We don't need to recalculate live intervals inside the loop despite
1865 * flagging live_intervals_valid because we only use live intervals for
1866 * the interferes test, and we must have had a situation where the
1867 * intervals were:
1868 *
1869 *    from   to
1870 *     ^
1871 *     |
1872 *     v
1873 *            ^
1874 *            |
1875 *            v
1876 *
1877 * Some register R that might get coalesced with one of these two could
1878 * only be referencing "to", otherwise "from"'s range would have been
1879 * longer. R's range could also only start at the end of "to" or later,
1880 * otherwise it will conflict with "to" when we try to coalesce "to"
1881 * into R anyway.
1882 */
1883 live_intervals_valid = false;
1884
1885 progress = true;
1886 continue;
1887 }
1888
1889 return progress;
1890 }
1891
1892 bool
1893 fs_visitor::register_coalesce()
1894 {
1895 bool progress = false;
1896 int if_depth = 0;
1897 int loop_depth = 0;
1898
1899 foreach_list_safe(node, &this->instructions) {
1900 fs_inst *inst = (fs_inst *)node;
1901
1902 /* Make sure that we dominate the instructions we're going to
1903 * scan for interfering with our coalescing, or we won't have
1904 * scanned enough to see if anything interferes with our
1905 * coalescing. We don't dominate the following instructions if
1906 * we're in a loop or an if block.
1907 */
1908 switch (inst->opcode) {
1909 case BRW_OPCODE_DO:
1910 loop_depth++;
1911 break;
1912 case BRW_OPCODE_WHILE:
1913 loop_depth--;
1914 break;
1915 case BRW_OPCODE_IF:
1916 if_depth++;
1917 break;
1918 case BRW_OPCODE_ENDIF:
1919 if_depth--;
1920 break;
1921 default:
1922 break;
1923 }
1924 if (loop_depth || if_depth)
1925 continue;
1926
1927 if (inst->opcode != BRW_OPCODE_MOV ||
1928 inst->predicate ||
1929 inst->saturate ||
1930 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1931 inst->src[0].file != UNIFORM) ||
1932 inst->dst.type != inst->src[0].type)
1933 continue;
1934
1935 bool has_source_modifiers = (inst->src[0].abs ||
1936 inst->src[0].negate ||
1937 inst->src[0].smear != -1 ||
1938 inst->src[0].file == UNIFORM);
1939
1940 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1941 * them: check for no writes to either one until the exit of the
1942 * program.
1943 */
1944 bool interfered = false;
1945
1946 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1947 !scan_inst->is_tail_sentinel();
1948 scan_inst = (fs_inst *)scan_inst->next) {
1949 if (scan_inst->dst.file == GRF) {
1950 if (scan_inst->overwrites_reg(inst->dst) ||
1951 scan_inst->overwrites_reg(inst->src[0])) {
1952 interfered = true;
1953 break;
1954 }
1955 }
1956
1957 /* The gen6 MATH instruction can't handle source modifiers or
1958 * unusual register regions, so avoid coalescing those for
1959 * now. We should do something more specific.
1960 */
1961 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1962 interfered = true;
1963 break;
1964 }
1965
1966 /* The accumulator result appears to get used for the
1967 * conditional modifier generation. When negating a UD
1968 * value, there is a 33rd bit generated for the sign in the
1969 * accumulator value, so now you can't check, for example,
1970 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1971 */
1972 if (scan_inst->conditional_mod &&
1973 inst->src[0].negate &&
1974 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1975 interfered = true;
1976 break;
1977 }
1978 }
1979 if (interfered) {
1980 continue;
1981 }
1982
1983 /* Rewrite the later usage to point at the source of the move to
1984 * be removed.
1985 */
1986 for (fs_inst *scan_inst = inst;
1987 !scan_inst->is_tail_sentinel();
1988 scan_inst = (fs_inst *)scan_inst->next) {
1989 for (int i = 0; i < 3; i++) {
1990 if (scan_inst->src[i].file == GRF &&
1991 scan_inst->src[i].reg == inst->dst.reg &&
1992 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1993 fs_reg new_src = inst->src[0];
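               /* Fold the modifiers together: abs on the use discards any
                * negate carried by the moved value, and the use's own negate
                * is then applied on top of whatever remains.
                */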
1994 if (scan_inst->src[i].abs) {
1995 new_src.negate = 0;
1996 new_src.abs = 1;
1997 }
1998 new_src.negate ^= scan_inst->src[i].negate;
1999 scan_inst->src[i] = new_src;
2000 }
2001 }
2002 }
2003
2004 inst->remove();
2005 progress = true;
2006 }
2007
2008 if (progress)
2009 live_intervals_valid = false;
2010
2011 return progress;
2012 }
2013
2014
2015 bool
2016 fs_visitor::compute_to_mrf()
2017 {
2018 bool progress = false;
2019 int next_ip = 0;
2020
2021 calculate_live_intervals();
2022
2023 foreach_list_safe(node, &this->instructions) {
2024 fs_inst *inst = (fs_inst *)node;
2025
2026 int ip = next_ip;
2027 next_ip++;
2028
2029 if (inst->opcode != BRW_OPCODE_MOV ||
2030 inst->predicate ||
2031 inst->dst.file != MRF || inst->src[0].file != GRF ||
2032 inst->dst.type != inst->src[0].type ||
2033 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2034 continue;
2035
2036 /* Work out which hardware MRF registers are written by this
2037 * instruction.
2038 */
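      /* A COMPR4 message writes mrf_low for its first half and mrf_low + 4
       * for the second; a plain compressed (SIMD16) write covers mrf_low and
       * mrf_low + 1.
       */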
2039 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2040 int mrf_high;
2041 if (inst->dst.reg & BRW_MRF_COMPR4) {
2042 mrf_high = mrf_low + 4;
2043 } else if (dispatch_width == 16 &&
2044 (!inst->force_uncompressed && !inst->force_sechalf)) {
2045 mrf_high = mrf_low + 1;
2046 } else {
2047 mrf_high = mrf_low;
2048 }
2049
2050 /* Can't compute-to-MRF this GRF if someone else was going to
2051 * read it later.
2052 */
2053 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2054 continue;
2055
 2056       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite the
 2057        * instruction that produced this GRF so it writes into the MRF instead.
2058 */
2059 fs_inst *scan_inst;
2060 for (scan_inst = (fs_inst *)inst->prev;
2061 scan_inst->prev != NULL;
2062 scan_inst = (fs_inst *)scan_inst->prev) {
2063 if (scan_inst->dst.file == GRF &&
2064 scan_inst->dst.reg == inst->src[0].reg) {
 2065          /* Found the last instruction to write the register we want to
 2066           * turn into a compute-to-MRF.
2067 */
2068
2069 /* If it's predicated, it (probably) didn't populate all
2070 * the channels. We might be able to rewrite everything
2071 * that writes that reg, but it would require smarter
2072 * tracking to delay the rewriting until complete success.
2073 */
2074 if (scan_inst->predicate)
2075 break;
2076
2077 /* If it's half of register setup and not the same half as
2078 * our MOV we're trying to remove, bail for now.
2079 */
2080 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2081 scan_inst->force_sechalf != inst->force_sechalf) {
2082 break;
2083 }
2084
2085 /* SEND instructions can't have MRF as a destination. */
2086 if (scan_inst->mlen)
2087 break;
2088
2089 if (intel->gen == 6) {
2090 /* gen6 math instructions must have the destination be
2091 * GRF, so no compute-to-MRF for them.
2092 */
2093 if (scan_inst->is_math()) {
2094 break;
2095 }
2096 }
2097
2098 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2099 /* Found the creator of our MRF's source value. */
2100 scan_inst->dst.file = MRF;
2101 scan_inst->dst.reg = inst->dst.reg;
2102 scan_inst->saturate |= inst->saturate;
2103 inst->remove();
2104 progress = true;
2105 }
2106 break;
2107 }
2108
 2109          /* We don't handle control flow here.  Most values that end up in
 2110           * MRFs are computed shortly before the MRF write anyway.
2112 */
2113 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2114 break;
2115
2116 /* You can't read from an MRF, so if someone else reads our
2117 * MRF's source GRF that we wanted to rewrite, that stops us.
2118 */
2119 bool interfered = false;
2120 for (int i = 0; i < 3; i++) {
2121 if (scan_inst->src[i].file == GRF &&
2122 scan_inst->src[i].reg == inst->src[0].reg &&
2123 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2124 interfered = true;
2125 }
2126 }
2127 if (interfered)
2128 break;
2129
2130 if (scan_inst->dst.file == MRF) {
2131 /* If somebody else writes our MRF here, we can't
2132 * compute-to-MRF before that.
2133 */
2134 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2135 int scan_mrf_high;
2136
2137 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2138 scan_mrf_high = scan_mrf_low + 4;
2139 } else if (dispatch_width == 16 &&
2140 (!scan_inst->force_uncompressed &&
2141 !scan_inst->force_sechalf)) {
2142 scan_mrf_high = scan_mrf_low + 1;
2143 } else {
2144 scan_mrf_high = scan_mrf_low;
2145 }
2146
2147 if (mrf_low == scan_mrf_low ||
2148 mrf_low == scan_mrf_high ||
2149 mrf_high == scan_mrf_low ||
2150 mrf_high == scan_mrf_high) {
2151 break;
2152 }
2153 }
2154
2155 if (scan_inst->mlen > 0) {
2156 /* Found a SEND instruction, which means that there are
2157 * live values in MRFs from base_mrf to base_mrf +
2158 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2159 * above it.
2160 */
2161 if (mrf_low >= scan_inst->base_mrf &&
2162 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2163 break;
2164 }
2165 if (mrf_high >= scan_inst->base_mrf &&
2166 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2167 break;
2168 }
2169 }
2170 }
2171 }
2172
2173 if (progress)
2174 live_intervals_valid = false;
2175
2176 return progress;
2177 }
2178
2179 /**
2180 * Walks through basic blocks, looking for repeated MRF writes and
2181 * removing the later ones.
2182 */
2183 bool
2184 fs_visitor::remove_duplicate_mrf_writes()
2185 {
2186 fs_inst *last_mrf_move[16];
2187 bool progress = false;
2188
 2189    /* The MRF tracking below doesn't yet handle both registers written by
          * compressed (SIMD16) instructions, so skip this pass for 16-wide.
          */
2190 if (dispatch_width == 16)
2191 return false;
2192
2193 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2194
2195 foreach_list_safe(node, &this->instructions) {
2196 fs_inst *inst = (fs_inst *)node;
2197
2198 if (inst->is_control_flow()) {
2199 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2200 }
2201
2202 if (inst->opcode == BRW_OPCODE_MOV &&
2203 inst->dst.file == MRF) {
2204 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2205 if (prev_inst && inst->equals(prev_inst)) {
2206 inst->remove();
2207 progress = true;
2208 continue;
2209 }
2210 }
2211
2212 /* Clear out the last-write records for MRFs that were overwritten. */
2213 if (inst->dst.file == MRF) {
2214 last_mrf_move[inst->dst.reg] = NULL;
2215 }
2216
2217 if (inst->mlen > 0) {
2218 /* Found a SEND instruction, which will include two or fewer
2219 * implied MRF writes. We could do better here.
2220 */
2221 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2222 last_mrf_move[inst->base_mrf + i] = NULL;
2223 }
2224 }
2225
2226 /* Clear out any MRF move records whose sources got overwritten. */
2227 if (inst->dst.file == GRF) {
2228 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2229 if (last_mrf_move[i] &&
2230 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2231 last_mrf_move[i] = NULL;
2232 }
2233 }
2234 }
2235
2236 if (inst->opcode == BRW_OPCODE_MOV &&
2237 inst->dst.file == MRF &&
2238 inst->src[0].file == GRF &&
2239 !inst->predicate) {
2240 last_mrf_move[inst->dst.reg] = inst;
2241 }
2242 }
2243
2244 if (progress)
2245 live_intervals_valid = false;
2246
2247 return progress;
2248 }
2249
2250 static void
2251 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2252 int first_grf, int grf_len)
2253 {
2254 bool inst_16wide = (dispatch_width > 8 &&
2255 !inst->force_uncompressed &&
2256 !inst->force_sechalf);
2257
2258 /* Clear the flag for registers that actually got read (as expected). */
2259 for (int i = 0; i < 3; i++) {
2260 int grf;
2261 if (inst->src[i].file == GRF) {
2262 grf = inst->src[i].reg;
2263 } else if (inst->src[i].file == FIXED_HW_REG &&
2264 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2265 grf = inst->src[i].fixed_hw_reg.nr;
2266 } else {
2267 continue;
2268 }
2269
2270 if (grf >= first_grf &&
2271 grf < first_grf + grf_len) {
2272 deps[grf - first_grf] = false;
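             /* A 16-wide source spans two adjacent GRFs, so the following
              * register was read as well.
              */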
2273 if (inst_16wide)
2274 deps[grf - first_grf + 1] = false;
2275 }
2276 }
2277 }
2278
2279 /**
2280 * Implements this workaround for the original 965:
2281 *
2282 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2283 * check for post destination dependencies on this instruction, software
2284 * must ensure that there is no destination hazard for the case of ‘write
2285 * followed by a posted write’ shown in the following example.
2286 *
2287 * 1. mov r3 0
2288 * 2. send r3.xy <rest of send instruction>
2289 * 3. mov r2 r3
2290 *
2291 * Due to no post-destination dependency check on the ‘send’, the above
2292 * code sequence could have two instructions (1 and 2) in flight at the
2293 * same time that both consider ‘r3’ as the target of their final writes.
2294 */
2295 void
2296 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2297 {
2298 int reg_size = dispatch_width / 8;
2299 int write_len = inst->regs_written() * reg_size;
2300 int first_write_grf = inst->dst.reg;
2301 bool needs_dep[BRW_MAX_MRF];
2302 assert(write_len < (int)sizeof(needs_dep) - 1);
2303
2304 memset(needs_dep, false, sizeof(needs_dep));
2305 memset(needs_dep, true, write_len);
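       /* needs_dep[i] is true while GRF first_write_grf + i may still have an
        * outstanding write in flight that inst's send could race with.
        */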
2306
2307 clear_deps_for_inst_src(inst, dispatch_width,
2308 needs_dep, first_write_grf, write_len);
2309
2310 /* Walk backwards looking for writes to registers we're writing which
2311 * aren't read since being written. If we hit the start of the program,
2312 * we assume that there are no outstanding dependencies on entry to the
2313 * program.
2314 */
2315 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2316 scan_inst != NULL;
2317 scan_inst = (fs_inst *)scan_inst->prev) {
2318
2319 /* If we hit control flow, assume that there *are* outstanding
2320 * dependencies, and force their cleanup before our instruction.
2321 */
2322 if (scan_inst->is_control_flow()) {
2323 for (int i = 0; i < write_len; i++) {
2324 if (needs_dep[i]) {
2325 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2326 }
2327 }
2328 }
2329
2330 bool scan_inst_16wide = (dispatch_width > 8 &&
2331 !scan_inst->force_uncompressed &&
2332 !scan_inst->force_sechalf);
2333
2334 /* We insert our reads as late as possible on the assumption that any
2335 * instruction but a MOV that might have left us an outstanding
2336 * dependency has more latency than a MOV.
2337 */
2338 if (scan_inst->dst.file == GRF) {
2339 for (int i = 0; i < scan_inst->regs_written(); i++) {
2340 int reg = scan_inst->dst.reg + i * reg_size;
2341
2342 if (reg >= first_write_grf &&
2343 reg < first_write_grf + write_len &&
2344 needs_dep[reg - first_write_grf]) {
2345 inst->insert_before(DEP_RESOLVE_MOV(reg));
2346 needs_dep[reg - first_write_grf] = false;
2347 if (scan_inst_16wide)
2348 needs_dep[reg - first_write_grf + 1] = false;
2349 }
2350 }
2351 }
2352
2353 /* Clear the flag for registers that actually got read (as expected). */
2354 clear_deps_for_inst_src(scan_inst, dispatch_width,
2355 needs_dep, first_write_grf, write_len);
2356
2357 /* Continue the loop only if we haven't resolved all the dependencies */
2358 int i;
2359 for (i = 0; i < write_len; i++) {
2360 if (needs_dep[i])
2361 break;
2362 }
2363 if (i == write_len)
2364 return;
2365 }
2366 }
2367
2368 /**
2369 * Implements this workaround for the original 965:
2370 *
2371 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2372 * used as a destination register until after it has been sourced by an
2373 * instruction with a different destination register.
2374 */
2375 void
2376 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2377 {
2378 int write_len = inst->regs_written() * dispatch_width / 8;
2379 int first_write_grf = inst->dst.reg;
2380 bool needs_dep[BRW_MAX_MRF];
2381 assert(write_len < (int)sizeof(needs_dep) - 1);
2382
2383 memset(needs_dep, false, sizeof(needs_dep));
2384 memset(needs_dep, true, write_len);
2385 /* Walk forwards looking for writes to registers we're writing which aren't
2386 * read before being written.
2387 */
2388 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2389 !scan_inst->is_tail_sentinel();
2390 scan_inst = (fs_inst *)scan_inst->next) {
2391 /* If we hit control flow, force resolve all remaining dependencies. */
2392 if (scan_inst->is_control_flow()) {
2393 for (int i = 0; i < write_len; i++) {
2394 if (needs_dep[i])
2395 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2396 }
2397 }
2398
2399 /* Clear the flag for registers that actually got read (as expected). */
2400 clear_deps_for_inst_src(scan_inst, dispatch_width,
2401 needs_dep, first_write_grf, write_len);
2402
2403 /* We insert our reads as late as possible since they're reading the
2404 * result of a SEND, which has massive latency.
2405 */
2406 if (scan_inst->dst.file == GRF &&
2407 scan_inst->dst.reg >= first_write_grf &&
2408 scan_inst->dst.reg < first_write_grf + write_len &&
2409 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2410 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2411 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2412 }
2413
2414 /* Continue the loop only if we haven't resolved all the dependencies */
2415 int i;
2416 for (i = 0; i < write_len; i++) {
2417 if (needs_dep[i])
2418 break;
2419 }
2420 if (i == write_len)
2421 return;
2422 }
2423
2424 /* If we hit the end of the program, resolve all remaining dependencies out
2425 * of paranoia.
2426 */
2427 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2428 assert(last_inst->eot);
2429 for (int i = 0; i < write_len; i++) {
2430 if (needs_dep[i])
2431 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2432 }
2433 }
2434
2435 void
2436 fs_visitor::insert_gen4_send_dependency_workarounds()
2437 {
2438 if (intel->gen != 4 || intel->is_g4x)
2439 return;
2440
2441 /* Note that we're done with register allocation, so GRF fs_regs always
2442 * have a .reg_offset of 0.
2443 */
2444
2445 foreach_list_safe(node, &this->instructions) {
2446 fs_inst *inst = (fs_inst *)node;
2447
2448 if (inst->mlen != 0 && inst->dst.file == GRF) {
2449 insert_gen4_pre_send_dependency_workarounds(inst);
2450 insert_gen4_post_send_dependency_workarounds(inst);
2451 }
2452 }
2453 }
2454
2455 /**
2456 * Turns the generic expression-style uniform pull constant load instruction
2457 * into a hardware-specific series of instructions for loading a pull
2458 * constant.
2459 *
2460 * The expression style allows the CSE pass before this to optimize out
2461 * repeated loads from the same offset, and gives the pre-register-allocation
2462 * scheduling full flexibility, while the conversion to native instructions
2463 * allows the post-register-allocation scheduler the best information
2464 * possible.
2465 *
2466 * Note that execution masking for setting up pull constant loads is special:
2467 * the channels that need to be written are unrelated to the current execution
2468 * mask, since a later instruction will use one of the result channels as a
2469 * source operand for all 8 or 16 of its channels.
2470 */
2471 void
2472 fs_visitor::lower_uniform_pull_constant_loads()
2473 {
2474 foreach_list(node, &this->instructions) {
2475 fs_inst *inst = (fs_inst *)node;
2476
2477 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2478 continue;
2479
2480 if (intel->gen >= 7) {
2481 /* The offset arg before was a vec4-aligned byte offset. We need to
2482 * turn it into a dword offset.
2483 */
2484 fs_reg const_offset_reg = inst->src[1];
2485 assert(const_offset_reg.file == IMM &&
2486 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2487 const_offset_reg.imm.u /= 4;
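             /* For example, a vec4-aligned byte offset of 16 becomes dword
              * offset 4.
              */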
2488 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2489
2490 /* This is actually going to be a MOV, but since only the first dword
2491 * is accessed, we have a special opcode to do just that one. Note
2492 * that this needs to be an operation that will be considered a def
2493 * by live variable analysis, or register allocation will explode.
2494 */
2495 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2496 payload, const_offset_reg);
2497 setup->force_writemask_all = true;
2498
2499 setup->ir = inst->ir;
2500 setup->annotation = inst->annotation;
2501 inst->insert_before(setup);
2502
2503 /* Similarly, this will only populate the first 4 channels of the
2504 * result register (since we only use smear values from 0-3), but we
2505 * don't tell the optimizer.
2506 */
2507 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2508 inst->src[1] = payload;
2509
2510 this->live_intervals_valid = false;
2511 } else {
2512 /* Before register allocation, we didn't tell the scheduler about the
2513 * MRF we use. We know it's safe to use this MRF because nothing
2514 * else does except for register spill/unspill, which generates and
2515 * uses its MRF within a single IR instruction.
2516 */
2517 inst->base_mrf = 14;
2518 inst->mlen = 1;
2519 }
2520 }
2521 }
2522
2523 void
2524 fs_visitor::dump_instruction(fs_inst *inst)
2525 {
2526 if (inst->predicate) {
2527 printf("(%cf0.%d) ",
2528 inst->predicate_inverse ? '-' : '+',
2529 inst->flag_subreg);
2530 }
2531
2532 printf("%s", brw_instruction_name(inst->opcode));
2533 if (inst->saturate)
2534 printf(".sat");
2535 if (inst->conditional_mod) {
2536 printf(".cmod");
2537 if (!inst->predicate &&
2538 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2539 inst->opcode != BRW_OPCODE_IF &&
2540 inst->opcode != BRW_OPCODE_WHILE))) {
 2541          printf(".f0.%d", inst->flag_subreg);
2542 }
2543 }
2544 printf(" ");
2545
2546
2547 switch (inst->dst.file) {
2548 case GRF:
2549 printf("vgrf%d", inst->dst.reg);
2550 if (inst->dst.reg_offset)
2551 printf("+%d", inst->dst.reg_offset);
2552 break;
2553 case MRF:
2554 printf("m%d", inst->dst.reg);
2555 break;
2556 case BAD_FILE:
2557 printf("(null)");
2558 break;
2559 case UNIFORM:
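       /* Uniforms are never valid destinations; the asterisks flag the anomaly. */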
2560 printf("***u%d***", inst->dst.reg);
2561 break;
2562 default:
2563 printf("???");
2564 break;
2565 }
2566 printf(", ");
2567
2568 for (int i = 0; i < 3; i++) {
2569 if (inst->src[i].negate)
2570 printf("-");
2571 if (inst->src[i].abs)
2572 printf("|");
2573 switch (inst->src[i].file) {
2574 case GRF:
2575 printf("vgrf%d", inst->src[i].reg);
2576 if (inst->src[i].reg_offset)
2577 printf("+%d", inst->src[i].reg_offset);
2578 break;
2579 case MRF:
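          /* MRFs can't be read as sources; the asterisks flag the anomaly. */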
2580 printf("***m%d***", inst->src[i].reg);
2581 break;
2582 case UNIFORM:
2583 printf("u%d", inst->src[i].reg);
2584 if (inst->src[i].reg_offset)
2585 printf(".%d", inst->src[i].reg_offset);
2586 break;
2587 case BAD_FILE:
2588 printf("(null)");
2589 break;
2590 case IMM:
2591 switch (inst->src[i].type) {
2592 case BRW_REGISTER_TYPE_F:
2593 printf("%ff", inst->src[i].imm.f);
2594 break;
2595 case BRW_REGISTER_TYPE_D:
2596 printf("%dd", inst->src[i].imm.i);
2597 break;
2598 case BRW_REGISTER_TYPE_UD:
2599 printf("%uu", inst->src[i].imm.u);
2600 break;
2601 default:
2602 printf("???");
2603 break;
2604 }
2605 break;
2606 default:
2607 printf("???");
2608 break;
2609 }
2610 if (inst->src[i].abs)
2611 printf("|");
2612
 2613       if (i < 2)
2614 printf(", ");
2615 }
2616
2617 printf(" ");
2618
2619 if (inst->force_uncompressed)
2620 printf("1sthalf ");
2621
2622 if (inst->force_sechalf)
2623 printf("2ndhalf ");
2624
2625 printf("\n");
2626 }
2627
2628 void
2629 fs_visitor::dump_instructions()
2630 {
2631 int ip = 0;
2632 foreach_list(node, &this->instructions) {
2633 fs_inst *inst = (fs_inst *)node;
2634 printf("%d: ", ip++);
2635 dump_instruction(inst);
2636 }
2637 }
2638
2639 /**
2640 * Possibly returns an instruction that set up @param reg.
2641 *
2642 * Sometimes we want to take the result of some expression/variable
2643 * dereference tree and rewrite the instruction generating the result
2644 * of the tree. When processing the tree, we know that the
2645 * instructions generated are all writing temporaries that are dead
2646 * outside of this tree. So, if we have some instructions that write
2647 * a temporary, we're free to point that temp write somewhere else.
2648 *
 2649  * Note that this doesn't guarantee that the returned instruction wrote
2650 * only reg -- it might be the size=4 destination of a texture instruction.
2651 */
2652 fs_inst *
2653 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2654 fs_inst *end,
2655 fs_reg reg)
2656 {
2657 if (end == start ||
2658 end->predicate ||
2659 end->force_uncompressed ||
2660 end->force_sechalf ||
2661 reg.reladdr ||
2662 !reg.equals(end->dst)) {
2663 return NULL;
2664 } else {
2665 return end;
2666 }
2667 }
2668
2669 void
2670 fs_visitor::setup_payload_gen6()
2671 {
2672 struct intel_context *intel = &brw->intel;
2673 bool uses_depth =
2674 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2675 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2676
2677 assert(intel->gen >= 6);
2678
2679 /* R0-1: masks, pixel X/Y coordinates. */
2680 c->nr_payload_regs = 2;
 2681    /* R2: only for 32-pixel dispatch. */
2682
2683 /* R3-26: barycentric interpolation coordinates. These appear in the
2684 * same order that they appear in the brw_wm_barycentric_interp_mode
2685 * enum. Each set of coordinates occupies 2 registers if dispatch width
2686 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2687 * appear if they were enabled using the "Barycentric Interpolation
2688 * Mode" bits in WM_STATE.
2689 */
2690 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2691 if (barycentric_interp_modes & (1 << i)) {
2692 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2693 c->nr_payload_regs += 2;
2694 if (dispatch_width == 16) {
2695 c->nr_payload_regs += 2;
2696 }
2697 }
2698 }
2699
2700 /* R27: interpolated depth if uses source depth */
2701 if (uses_depth) {
2702 c->source_depth_reg = c->nr_payload_regs;
2703 c->nr_payload_regs++;
2704 if (dispatch_width == 16) {
2705 /* R28: interpolated depth if not 8-wide. */
2706 c->nr_payload_regs++;
2707 }
2708 }
2709 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2710 if (uses_depth) {
2711 c->source_w_reg = c->nr_payload_regs;
2712 c->nr_payload_regs++;
2713 if (dispatch_width == 16) {
2714 /* R30: interpolated W if not 8-wide. */
2715 c->nr_payload_regs++;
2716 }
2717 }
2718 /* R31: MSAA position offsets. */
2719 /* R32-: bary for 32-pixel. */
2720 /* R58-59: interp W for 32-pixel. */
2721
2722 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2723 c->source_depth_to_render_target = true;
2724 }
2725 }
2726
2727 bool
2728 fs_visitor::run()
2729 {
2730 sanity_param_count = fp->Base.Parameters->NumParameters;
2731 uint32_t orig_nr_params = c->prog_data.nr_params;
2732
2733 if (intel->gen >= 6)
2734 setup_payload_gen6();
2735 else
2736 setup_payload_gen4();
2737
2738 if (0) {
2739 emit_dummy_fs();
2740 } else {
2741 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2742 emit_shader_time_begin();
2743
2744 calculate_urb_setup();
2745 if (intel->gen < 6)
2746 emit_interpolation_setup_gen4();
2747 else
2748 emit_interpolation_setup_gen6();
2749
2750 /* We handle discards by keeping track of the still-live pixels in f0.1.
2751 * Initialize it with the dispatched pixels.
2752 */
2753 if (fp->UsesKill) {
2754 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2755 discard_init->flag_subreg = 1;
2756 }
2757
2758 /* Generate FS IR for main(). (the visitor only descends into
2759 * functions called "main").
2760 */
2761 if (shader) {
2762 foreach_list(node, &*shader->ir) {
2763 ir_instruction *ir = (ir_instruction *)node;
2764 base_ir = ir;
2765 this->result = reg_undef;
2766 ir->accept(this);
2767 }
2768 } else {
2769 emit_fragment_program_code();
2770 }
2771 base_ir = NULL;
2772 if (failed)
2773 return false;
2774
2775 emit(FS_OPCODE_PLACEHOLDER_HALT);
2776
2777 emit_fb_writes();
2778
2779 split_virtual_grfs();
2780
2781 move_uniform_array_access_to_pull_constants();
2782 setup_pull_constants();
2783
2784 bool progress;
2785 do {
2786 progress = false;
2787
2788 compact_virtual_grfs();
2789
2790 progress = remove_duplicate_mrf_writes() || progress;
2791
2792 progress = opt_algebraic() || progress;
2793 progress = opt_cse() || progress;
2794 progress = opt_copy_propagate() || progress;
2795 progress = dead_code_eliminate() || progress;
2796 progress = register_coalesce() || progress;
2797 progress = register_coalesce_2() || progress;
2798 progress = compute_to_mrf() || progress;
2799 } while (progress);
2800
2801 remove_dead_constants();
2802
2803 schedule_instructions(false);
2804
2805 lower_uniform_pull_constant_loads();
2806
2807 assign_curb_setup();
2808 assign_urb_setup();
2809
2810 if (0) {
2811 /* Debug of register spilling: Go spill everything. */
2812 for (int i = 0; i < virtual_grf_count; i++) {
2813 spill_reg(i);
2814 }
2815 }
2816
2817 if (0)
2818 assign_regs_trivial();
2819 else {
2820 while (!assign_regs()) {
2821 if (failed)
2822 break;
2823 }
2824 }
2825 }
2826 assert(force_uncompressed_stack == 0);
2827 assert(force_sechalf_stack == 0);
2828
2829 /* This must come after all optimization and register allocation, since
2830 * it inserts dead code that happens to have side effects, and it does
2831 * so based on the actual physical registers in use.
2832 */
2833 insert_gen4_send_dependency_workarounds();
2834
2835 if (failed)
2836 return false;
2837
2838 schedule_instructions(true);
2839
2840 if (dispatch_width == 8) {
2841 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2842 } else {
2843 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2844
2845 /* Make sure we didn't try to sneak in an extra uniform */
2846 assert(orig_nr_params == c->prog_data.nr_params);
2847 (void) orig_nr_params;
2848 }
2849
2850 /* If any state parameters were appended, then ParameterValues could have
2851 * been realloced, in which case the driver uniform storage set up by
2852 * _mesa_associate_uniform_storage() would point to freed memory. Make
2853 * sure that didn't happen.
2854 */
2855 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2856
2857 return !failed;
2858 }
2859
2860 const unsigned *
2861 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2862 struct gl_fragment_program *fp,
2863 struct gl_shader_program *prog,
2864 unsigned *final_assembly_size)
2865 {
2866 struct intel_context *intel = &brw->intel;
2867 bool start_busy = false;
2868 float start_time = 0;
2869
2870 if (unlikely(intel->perf_debug)) {
2871 start_busy = (intel->batch.last_bo &&
2872 drm_intel_bo_busy(intel->batch.last_bo));
2873 start_time = get_time();
2874 }
2875
2876 struct brw_shader *shader = NULL;
2877 if (prog)
2878 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2879
2880 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2881 if (shader) {
2882 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2883 _mesa_print_ir(shader->ir, NULL);
2884 printf("\n\n");
2885 } else {
2886 printf("ARB_fragment_program %d ir for native fragment shader\n",
2887 fp->Base.Id);
2888 _mesa_print_program(&fp->Base);
2889 }
2890 }
2891
2892 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2893 */
2894 fs_visitor v(brw, c, prog, fp, 8);
2895 if (!v.run()) {
2896 prog->LinkStatus = false;
2897 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2898
2899 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2900 v.fail_msg);
2901
2902 return NULL;
2903 }
2904
2905 exec_list *simd16_instructions = NULL;
2906 fs_visitor v2(brw, c, prog, fp, 16);
2907 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2908 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2909 v2.import_uniforms(&v);
2910 if (!v2.run()) {
2911 perf_debug("16-wide shader failed to compile, falling back to "
2912 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2913 } else {
2914 simd16_instructions = &v2.instructions;
2915 }
2916 }
2917
2918 c->prog_data.dispatch_width = 8;
2919
2920 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2921 const unsigned *generated = g.generate_assembly(&v.instructions,
2922 simd16_instructions,
2923 final_assembly_size);
2924
2925 if (unlikely(intel->perf_debug) && shader) {
2926 if (shader->compiled_once)
2927 brw_wm_debug_recompile(brw, prog, &c->key);
2928 shader->compiled_once = true;
2929
2930 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2931 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2932 (get_time() - start_time) * 1000);
2933 }
2934 }
2935
2936 return generated;
2937 }
2938
2939 bool
2940 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2941 {
2942 struct brw_context *brw = brw_context(ctx);
2943 struct intel_context *intel = &brw->intel;
2944 struct brw_wm_prog_key key;
2945
2946 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2947 return true;
2948
2949 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2950 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2951 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2952 bool program_uses_dfdy = fp->UsesDFdy;
2953
2954 memset(&key, 0, sizeof(key));
2955
2956 if (intel->gen < 6) {
2957 if (fp->UsesKill)
2958 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2959
2960 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2961 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2962
2963 /* Just assume depth testing. */
2964 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2965 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2966 }
2967
2968 if (prog->Name != 0)
2969 key.proj_attrib_mask = ~(GLbitfield64) 0;
2970 else {
2971 /* Bit VARYING_BIT_POS of key.proj_attrib_mask is never used, so to
2972 * avoid unnecessary recompiles, always set it to 1.
2973 */
2974 key.proj_attrib_mask |= VARYING_BIT_POS;
2975 }
2976
2977 if (intel->gen < 6)
2978 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
2979
2980 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
2981 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2982 continue;
2983
2984 if (prog->Name == 0)
2985 key.proj_attrib_mask |= BITFIELD64_BIT(i);
2986
2987 if (intel->gen < 6) {
2988 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
2989 key.input_slots_valid |= BITFIELD64_BIT(i);
2990 }
2991 }
2992
2993 key.clamp_fragment_color = true;
2994
2995 for (int i = 0; i < MAX_SAMPLERS; i++) {
2996 if (fp->Base.ShadowSamplers & (1 << i)) {
2997 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2998 key.tex.swizzles[i] =
2999 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3000 } else {
3001 /* Color sampler: assume no swizzling. */
3002 key.tex.swizzles[i] = SWIZZLE_XYZW;
3003 }
3004 }
3005
3006 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3007 key.drawable_height = ctx->DrawBuffer->Height;
3008 }
3009
3010 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3011 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3012 }
3013
3014 key.nr_color_regions = 1;
3015
3016 key.program_string_id = bfp->id;
3017
3018 uint32_t old_prog_offset = brw->wm.prog_offset;
3019 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3020
3021 bool success = do_wm_prog(brw, prog, bfp, &key);
3022
3023 brw->wm.prog_offset = old_prog_offset;
3024 brw->wm.prog_data = old_prog_data;
3025
3026 return success;
3027 }