i965/fs: Improve performance of varying-index uniform loads on IVB.
src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

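/**
 * Zero-initializes an instruction and sets the few fields with non-zero
 * defaults; every fs_inst constructor below starts with this.
 */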
void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

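/* Convenience builders for the common ALU opcodes.  Each macro expands to a
 * fs_visitor method that constructs a new fs_inst out of mem_ctx; note that
 * these only build the instruction -- the caller still has to emit() it.
 */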
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg varying_offset,
                                       uint32_t const_offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      /* We have our constant surface use a pitch of 4 bytes, so our index can
       * be any component of a vector, and then we load 4 contiguous
       * components starting from that.
       *
       * We break down the const_offset to a portion added to the variable
       * offset and a portion done using reg_offset, which means that if you
       * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
       * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
       * CSE can later notice that those loads are all the same and eliminate
       * the redundant ones.
       */
      fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
      instructions.push_tail(ADD(vec4_offset,
                                 varying_offset, const_offset & ~3));

      fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4), dst.type);
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  vec4_result, surf_index, vec4_offset);
      instructions.push_tail(inst);

      vec4_result.reg_offset += const_offset & 3;
      instructions.push_tail(MOV(dst, vec4_result));
   } else {
      fs_reg offset = fs_reg(this, glsl_type::uint_type);
      instructions.push_tail(ADD(offset, varying_offset, fs_reg(const_offset)));

      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

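/** Field-by-field comparison, used to detect identical instructions. */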
bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex() || opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7)
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXF_MS ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS ||
           opcode == SHADER_OPCODE_LOD);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_control_flow()
{
   switch (opcode) {
   case BRW_OPCODE_DO:
   case BRW_OPCODE_WHILE:
   case BRW_OPCODE_IF:
   case BRW_OPCODE_ELSE:
   case BRW_OPCODE_ENDIF:
   case BRW_OPCODE_BREAK:
   case BRW_OPCODE_CONTINUE:
      return true;
   default:
      return false;
   }
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}

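/**
 * Reads the GPU timestamp architecture register into a GRF and returns the
 * register, with smear set so callers see the low 32 bits.
 */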
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

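/**
 * Emits the SHADER_TIME_ADD that accumulates /value/ into the shader-time
 * buffer slot for /type/, at byte offset
 * shader_time_index * SHADER_TIME_STRIDE.
 */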
void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   int shader_time_index = brw_get_shader_time_index(brw, prog, &fp->Base,
                                                     type);
   fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);

   fs_reg payload;
   if (dispatch_width == 8)
      payload = fs_reg(this, glsl_type::uvec2_type);
   else
      payload = fs_reg(this, glsl_type::uint_type);

   emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
                fs_reg(), payload, offset, value));
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

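/* Convenience emit() overloads: each constructs an fs_inst with the given
 * operands and hands it to the main emit(), which appends it to the
 * instruction list.
 */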
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
   case SHADER_OPCODE_LOD:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

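/**
 * Allocates a new virtual GRF of /size/ registers and returns its index,
 * growing the virtual_grf_sizes array geometrically (16, 32, 64, ...) as
 * needed.
 */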
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

927
928 fs_reg *
929 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
930 {
931 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
932 fs_reg wpos = *reg;
933 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
934
935 /* gl_FragCoord.x */
936 if (ir->pixel_center_integer) {
937 emit(MOV(wpos, this->pixel_x));
938 } else {
939 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
940 }
941 wpos.reg_offset++;
942
943 /* gl_FragCoord.y */
944 if (!flip && ir->pixel_center_integer) {
945 emit(MOV(wpos, this->pixel_y));
946 } else {
947 fs_reg pixel_y = this->pixel_y;
948 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
949
950 if (flip) {
951 pixel_y.negate = true;
952 offset += c->key.drawable_height - 1.0;
953 }
954
955 emit(ADD(wpos, pixel_y, fs_reg(offset)));
956 }
957 wpos.reg_offset++;
958
959 /* gl_FragCoord.z */
960 if (intel->gen >= 6) {
961 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
962 } else {
963 emit(FS_OPCODE_LINTERP, wpos,
964 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
965 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
966 interp_reg(VARYING_SLOT_POS, 2));
967 }
968 wpos.reg_offset++;
969
970 /* gl_FragCoord.w: Already set up in emit_interpolation */
971 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
972
973 return reg;
974 }
975
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= VARYING_SLOT_TEX0 &&
                   location <= VARYING_SLOT_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask
                               & BITFIELD64_BIT(location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

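/* Builds the urb_setup[] mapping from varying slots to incoming setup-reg
 * slots, and computes the resulting urb_read_length.
 */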
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VARYING_SLOT_PSIZ)
            continue;

         if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
               urb_setup[i] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
         urb_setup[VARYING_SLOT_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }

      /* If we're sending from a GRF, don't split it, on the assumption that
       * the send is reading the whole thing.
       */
      if (inst->is_send_from_grf()) {
         split_grf[inst->src[0].reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

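/**
 * Drops push-constant params that no instruction reads and renumbers the
 * UNIFORM-file registers to match.  The remap table is built during the
 * 8-wide compile and reused by the 16-wide one.
 */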
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index,
                                                     *inst->src[i].reladdr,
                                                     pull_constant_loc[uniform] +
                                                     inst->src[i].reg_offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

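/** Local algebraic simplification: rewrites MUL/ADD with a trivial
 * immediate (a * 1.0, a * 0.0, a + 0.0) into a MOV.
 */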
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}

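/**
 * Removes raw MOVs by rewriting everything after the MOV to use its source
 * instead.  Only runs outside of control flow, since the forward scan below
 * assumes the MOV dominates all later instructions.
 */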
1911 bool
1912 fs_visitor::register_coalesce()
1913 {
1914 bool progress = false;
1915 int if_depth = 0;
1916 int loop_depth = 0;
1917
1918 foreach_list_safe(node, &this->instructions) {
1919 fs_inst *inst = (fs_inst *)node;
1920
1921 /* Make sure that we dominate the instructions we're going to
1922 * scan for interfering with our coalescing, or we won't have
1923 * scanned enough to see if anything interferes with our
1924 * coalescing. We don't dominate the following instructions if
1925 * we're in a loop or an if block.
1926 */
1927 switch (inst->opcode) {
1928 case BRW_OPCODE_DO:
1929 loop_depth++;
1930 break;
1931 case BRW_OPCODE_WHILE:
1932 loop_depth--;
1933 break;
1934 case BRW_OPCODE_IF:
1935 if_depth++;
1936 break;
1937 case BRW_OPCODE_ENDIF:
1938 if_depth--;
1939 break;
1940 default:
1941 break;
1942 }
1943 if (loop_depth || if_depth)
1944 continue;
1945
1946 if (inst->opcode != BRW_OPCODE_MOV ||
1947 inst->predicate ||
1948 inst->saturate ||
1949 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1950 inst->src[0].file != UNIFORM)||
1951 inst->dst.type != inst->src[0].type)
1952 continue;
1953
1954 bool has_source_modifiers = (inst->src[0].abs ||
1955 inst->src[0].negate ||
1956 inst->src[0].smear != -1 ||
1957 inst->src[0].file == UNIFORM);
1958
1959 /* Found a move of a GRF (or uniform) to a GRF. Let's see if we can
1960 * coalesce them: check for no writes to either one until the exit of
1961 * the program.
1962 */
1963 bool interfered = false;
1964
1965 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1966 !scan_inst->is_tail_sentinel();
1967 scan_inst = (fs_inst *)scan_inst->next) {
1968 if (scan_inst->dst.file == GRF) {
1969 if (scan_inst->overwrites_reg(inst->dst) ||
1970 scan_inst->overwrites_reg(inst->src[0])) {
1971 interfered = true;
1972 break;
1973 }
1974 }
1975
1976 /* The gen6 MATH instruction can't handle source modifiers or
1977 * unusual register regions, so avoid coalescing those for
1978 * now. We should do something more specific.
1979 */
1980 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1981 interfered = true;
1982 break;
1983 }
1984
1985 /* The accumulator result appears to get used for the
1986 * conditional modifier generation. When negating a UD
1987 * value, there is a 33rd bit generated for the sign in the
1988 * accumulator value, so now you can't check, for example,
1989 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1990 */
1991 if (scan_inst->conditional_mod &&
1992 inst->src[0].negate &&
1993 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1994 interfered = true;
1995 break;
1996 }
1997 }
1998 if (interfered) {
1999 continue;
2000 }
2001
2002 /* Rewrite the later usage to point at the source of the move to
2003 * be removed.
2004 */
2005 for (fs_inst *scan_inst = inst;
2006 !scan_inst->is_tail_sentinel();
2007 scan_inst = (fs_inst *)scan_inst->next) {
2008 for (int i = 0; i < 3; i++) {
2009 if (scan_inst->src[i].file == GRF &&
2010 scan_inst->src[i].reg == inst->dst.reg &&
2011 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2012 fs_reg new_src = inst->src[0];
2013 if (scan_inst->src[i].abs) {
2014 new_src.negate = 0;
2015 new_src.abs = 1;
2016 }
2017 new_src.negate ^= scan_inst->src[i].negate;
2018 scan_inst->src[i] = new_src;
2019 }
2020 }
2021 }
2022
2023 inst->remove();
2024 progress = true;
2025 }
2026
2027 if (progress)
2028 live_intervals_valid = false;
2029
2030 return progress;
2031 }
2032
2033
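/**
 * Attempts to rewrite the instruction that computed a GRF value so that it
 * writes directly into the MRF that a following raw MOV was copying it to,
 * letting the MOV be removed.
 *
 * A sketch of the rewrite, with hypothetical register numbers:
 *
 *    add vgrf3, vgrf1, vgrf2
 *    mov m4, vgrf3             becomes   add m4, vgrf1, vgrf2
 */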
2034 bool
2035 fs_visitor::compute_to_mrf()
2036 {
2037 bool progress = false;
2038 int next_ip = 0;
2039
2040 calculate_live_intervals();
2041
2042 foreach_list_safe(node, &this->instructions) {
2043 fs_inst *inst = (fs_inst *)node;
2044
2045 int ip = next_ip;
2046 next_ip++;
2047
2048 if (inst->opcode != BRW_OPCODE_MOV ||
2049 inst->predicate ||
2050 inst->dst.file != MRF || inst->src[0].file != GRF ||
2051 inst->dst.type != inst->src[0].type ||
2052 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2053 continue;
2054
2055 /* Work out which hardware MRF registers are written by this
2056 * instruction.
2057 */
2058 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2059 int mrf_high;
2060 if (inst->dst.reg & BRW_MRF_COMPR4) {
2061 mrf_high = mrf_low + 4;
2062 } else if (dispatch_width == 16 &&
2063 (!inst->force_uncompressed && !inst->force_sechalf)) {
2064 mrf_high = mrf_low + 1;
2065 } else {
2066 mrf_high = mrf_low;
2067 }
2068
2069 /* Can't compute-to-MRF this GRF if someone else was going to
2070 * read it later.
2071 */
2072 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2073 continue;
2074
2075 /* Found a move of a GRF to a MRF. Let's see if we can go
2076 * rewrite the thing that made this GRF to write into the MRF.
2077 */
2078 fs_inst *scan_inst;
2079 for (scan_inst = (fs_inst *)inst->prev;
2080 scan_inst->prev != NULL;
2081 scan_inst = (fs_inst *)scan_inst->prev) {
2082 if (scan_inst->dst.file == GRF &&
2083 scan_inst->dst.reg == inst->src[0].reg) {
2084 /* Found the last thing to write our reg we want to turn
2085 * into a compute-to-MRF.
2086 */
2087
2088 /* If it's predicated, it (probably) didn't populate all
2089 * the channels. We might be able to rewrite everything
2090 * that writes that reg, but it would require smarter
2091 * tracking to delay the rewriting until complete success.
2092 */
2093 if (scan_inst->predicate)
2094 break;
2095
2096 /* If it's half of register setup and not the same half as
2097 * our MOV we're trying to remove, bail for now.
2098 */
2099 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2100 scan_inst->force_sechalf != inst->force_sechalf) {
2101 break;
2102 }
2103
2104 /* Things returning more than one register would need us to
2105 * understand coalescing out more than one MOV at a time.
2106 */
2107 if (scan_inst->regs_written() > 1)
2108 break;
2109
2110 /* SEND instructions can't have MRF as a destination. */
2111 if (scan_inst->mlen)
2112 break;
2113
2114 if (intel->gen == 6) {
2115 /* gen6 math instructions must have the destination be
2116 * GRF, so no compute-to-MRF for them.
2117 */
2118 if (scan_inst->is_math()) {
2119 break;
2120 }
2121 }
2122
2123 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2124 /* Found the creator of our MRF's source value. */
2125 scan_inst->dst.file = MRF;
2126 scan_inst->dst.reg = inst->dst.reg;
2127 scan_inst->saturate |= inst->saturate;
2128 inst->remove();
2129 progress = true;
2130 }
2131 break;
2132 }
2133
2134 /* We don't handle control flow here. Most computation of
2135 * values that end up in MRFs happens shortly before the MRF
2136 * write anyway.
2137 */
2138 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2139 break;
2140
2141 /* You can't read from an MRF, so if someone else reads our
2142 * MRF's source GRF that we wanted to rewrite, that stops us.
2143 */
2144 bool interfered = false;
2145 for (int i = 0; i < 3; i++) {
2146 if (scan_inst->src[i].file == GRF &&
2147 scan_inst->src[i].reg == inst->src[0].reg &&
2148 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2149 interfered = true;
2150 }
2151 }
2152 if (interfered)
2153 break;
2154
2155 if (scan_inst->dst.file == MRF) {
2156 /* If somebody else writes our MRF here, we can't
2157 * compute-to-MRF before that.
2158 */
2159 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2160 int scan_mrf_high;
2161
2162 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2163 scan_mrf_high = scan_mrf_low + 4;
2164 } else if (dispatch_width == 16 &&
2165 (!scan_inst->force_uncompressed &&
2166 !scan_inst->force_sechalf)) {
2167 scan_mrf_high = scan_mrf_low + 1;
2168 } else {
2169 scan_mrf_high = scan_mrf_low;
2170 }
2171
2172 if (mrf_low == scan_mrf_low ||
2173 mrf_low == scan_mrf_high ||
2174 mrf_high == scan_mrf_low ||
2175 mrf_high == scan_mrf_high) {
2176 break;
2177 }
2178 }
2179
2180 if (scan_inst->mlen > 0) {
2181 /* Found a SEND instruction, which means that there are
2182 * live values in MRFs from base_mrf to base_mrf +
2183 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2184 * above it.
2185 */
2186 if (mrf_low >= scan_inst->base_mrf &&
2187 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2188 break;
2189 }
2190 if (mrf_high >= scan_inst->base_mrf &&
2191 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2192 break;
2193 }
2194 }
2195 }
2196 }
2197
2198 if (progress)
2199 live_intervals_valid = false;
2200
2201 return progress;
2202 }
2203
2204 /**
2205 * Walks through basic blocks, looking for repeated MRF writes and
2206 * removing the later ones.
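*
* For example (hypothetical registers), the second MOV below is redundant
* and gets removed, as long as neither m2 nor vgrf3 is written in between:
*
*    mov m2, vgrf3
*    ...
*    mov m2, vgrf3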
2207 */
2208 bool
2209 fs_visitor::remove_duplicate_mrf_writes()
2210 {
2211 fs_inst *last_mrf_move[16];
2212 bool progress = false;
2213
2214 /* Need to update the MRF tracking for compressed instructions. */
2215 if (dispatch_width == 16)
2216 return false;
2217
2218 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2219
2220 foreach_list_safe(node, &this->instructions) {
2221 fs_inst *inst = (fs_inst *)node;
2222
2223 if (inst->is_control_flow()) {
2224 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2225 }
2226
2227 if (inst->opcode == BRW_OPCODE_MOV &&
2228 inst->dst.file == MRF) {
2229 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2230 if (prev_inst && inst->equals(prev_inst)) {
2231 inst->remove();
2232 progress = true;
2233 continue;
2234 }
2235 }
2236
2237 /* Clear out the last-write records for MRFs that were overwritten. */
2238 if (inst->dst.file == MRF) {
2239 last_mrf_move[inst->dst.reg] = NULL;
2240 }
2241
2242 if (inst->mlen > 0) {
2243 /* Found a SEND instruction, which will include two or fewer
2244 * implied MRF writes. We could do better here.
2245 */
2246 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2247 last_mrf_move[inst->base_mrf + i] = NULL;
2248 }
2249 }
2250
2251 /* Clear out any MRF move records whose sources got overwritten. */
2252 if (inst->dst.file == GRF) {
2253 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2254 if (last_mrf_move[i] &&
2255 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2256 last_mrf_move[i] = NULL;
2257 }
2258 }
2259 }
2260
2261 if (inst->opcode == BRW_OPCODE_MOV &&
2262 inst->dst.file == MRF &&
2263 inst->src[0].file == GRF &&
2264 !inst->predicate) {
2265 last_mrf_move[inst->dst.reg] = inst;
2266 }
2267 }
2268
2269 if (progress)
2270 live_intervals_valid = false;
2271
2272 return progress;
2273 }
2274
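/**
 * Clears the dependency-tracking flags for any GRFs in the range
 * [first_grf, first_grf + grf_len) that @inst reads, since a read resolves
 * the outstanding dependency for the gen4 SEND workarounds below. A
 * 16-wide instruction reads a pair of registers per source, so the
 * following register's flag is cleared as well.
 */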
2275 static void
2276 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2277 int first_grf, int grf_len)
2278 {
2279 bool inst_16wide = (dispatch_width > 8 &&
2280 !inst->force_uncompressed &&
2281 !inst->force_sechalf);
2282
2283 /* Clear the flag for registers that actually got read (as expected). */
2284 for (int i = 0; i < 3; i++) {
2285 int grf;
2286 if (inst->src[i].file == GRF) {
2287 grf = inst->src[i].reg;
2288 } else if (inst->src[i].file == FIXED_HW_REG &&
2289 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2290 grf = inst->src[i].fixed_hw_reg.nr;
2291 } else {
2292 continue;
2293 }
2294
2295 if (grf >= first_grf &&
2296 grf < first_grf + grf_len) {
2297 deps[grf - first_grf] = false;
2298 if (inst_16wide)
2299 deps[grf - first_grf + 1] = false;
2300 }
2301 }
2302 }
2303
2304 /**
2305 * Implements this workaround for the original 965:
2306 *
2307 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2308 * check for post destination dependencies on this instruction, software
2309 * must ensure that there is no destination hazard for the case of ‘write
2310 * followed by a posted write’ shown in the following example.
2311 *
2312 * 1. mov r3 0
2313 * 2. send r3.xy <rest of send instruction>
2314 * 3. mov r2 r3
2315 *
2316 * Due to no post-destination dependency check on the ‘send’, the above
2317 * code sequence could have two instructions (1 and 2) in flight at the
2318 * same time that both consider ‘r3’ as the target of their final writes."
2319 */
2320 void
2321 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2322 {
2323 int reg_size = dispatch_width / 8;
2324 int write_len = inst->regs_written() * reg_size;
2325 int first_write_grf = inst->dst.reg;
2326 bool needs_dep[BRW_MAX_MRF];
2327 assert(write_len < (int)sizeof(needs_dep) - 1);
2328
2329 memset(needs_dep, false, sizeof(needs_dep));
2330 memset(needs_dep, true, write_len);
2331
2332 clear_deps_for_inst_src(inst, dispatch_width,
2333 needs_dep, first_write_grf, write_len);
2334
2335 /* Walk backwards looking for writes to registers we're writing which
2336 * aren't read since being written. If we hit the start of the program,
2337 * we assume that there are no outstanding dependencies on entry to the
2338 * program.
2339 */
2340 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2341 scan_inst != NULL;
2342 scan_inst = (fs_inst *)scan_inst->prev) {
2343
2344 /* If we hit control flow, assume that there *are* outstanding
2345 * dependencies, and force their cleanup before our instruction.
2346 */
2347 if (scan_inst->is_control_flow()) {
2348 for (int i = 0; i < write_len; i++) {
2349 if (needs_dep[i]) {
2350 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2351 }
2352 }
/* Every outstanding dependency was just resolved; don't keep scanning. */
return;
2353 }
2354
2355 bool scan_inst_16wide = (dispatch_width > 8 &&
2356 !scan_inst->force_uncompressed &&
2357 !scan_inst->force_sechalf);
2358
2359 /* We insert our reads as late as possible on the assumption that any
2360 * instruction but a MOV that might have left us an outstanding
2361 * dependency has more latency than a MOV.
2362 */
2363 if (scan_inst->dst.file == GRF) {
2364 for (int i = 0; i < scan_inst->regs_written(); i++) {
2365 int reg = scan_inst->dst.reg + i * reg_size;
2366
2367 if (reg >= first_write_grf &&
2368 reg < first_write_grf + write_len &&
2369 needs_dep[reg - first_write_grf]) {
2370 inst->insert_before(DEP_RESOLVE_MOV(reg));
2371 needs_dep[reg - first_write_grf] = false;
2372 if (scan_inst_16wide)
2373 needs_dep[reg - first_write_grf + 1] = false;
2374 }
2375 }
2376 }
2377
2378 /* Clear the flag for registers that actually got read (as expected). */
2379 clear_deps_for_inst_src(scan_inst, dispatch_width,
2380 needs_dep, first_write_grf, write_len);
2381
2382 /* Continue the loop only if we haven't resolved all the dependencies */
2383 int i;
2384 for (i = 0; i < write_len; i++) {
2385 if (needs_dep[i])
2386 break;
2387 }
2388 if (i == write_len)
2389 return;
2390 }
2391 }
2392
2393 /**
2394 * Implements this workaround for the original 965:
2395 *
2396 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2397 * used as a destination register until after it has been sourced by an
2398 * instruction with a different destination register."
2399 */
2400 void
2401 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2402 {
2403 int write_len = inst->regs_written() * dispatch_width / 8;
2404 int first_write_grf = inst->dst.reg;
2405 bool needs_dep[BRW_MAX_MRF];
2406 assert(write_len < (int)sizeof(needs_dep) - 1);
2407
2408 memset(needs_dep, false, sizeof(needs_dep));
2409 memset(needs_dep, true, write_len);
2410 /* Walk forwards looking for writes to the registers we wrote that occur
2411 * before those registers have been read.
2412 */
2413 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2414 !scan_inst->is_tail_sentinel();
2415 scan_inst = (fs_inst *)scan_inst->next) {
2416 /* If we hit control flow, force resolve all remaining dependencies. */
2417 if (scan_inst->is_control_flow()) {
2418 for (int i = 0; i < write_len; i++) {
2419 if (needs_dep[i])
2420 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2421 }
/* Every outstanding dependency was just resolved; don't keep scanning. */
return;
2422 }
2423
2424 /* Clear the flag for registers that actually got read (as expected). */
2425 clear_deps_for_inst_src(scan_inst, dispatch_width,
2426 needs_dep, first_write_grf, write_len);
2427
2428 /* We insert our reads as late as possible since they're reading the
2429 * result of a SEND, which has massive latency.
2430 */
2431 if (scan_inst->dst.file == GRF &&
2432 scan_inst->dst.reg >= first_write_grf &&
2433 scan_inst->dst.reg < first_write_grf + write_len &&
2434 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2435 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2436 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2437 }
2438
2439 /* Continue the loop only if we haven't resolved all the dependencies */
2440 int i;
2441 for (i = 0; i < write_len; i++) {
2442 if (needs_dep[i])
2443 break;
2444 }
2445 if (i == write_len)
2446 return;
2447 }
2448
2449 /* If we hit the end of the program, resolve all remaining dependencies out
2450 * of paranoia.
2451 */
2452 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2453 assert(last_inst->eot);
2454 for (int i = 0; i < write_len; i++) {
2455 if (needs_dep[i])
2456 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2457 }
2458 }
2459
2460 void
2461 fs_visitor::insert_gen4_send_dependency_workarounds()
2462 {
2463 if (intel->gen != 4 || intel->is_g4x)
2464 return;
2465
2466 /* Note that we're done with register allocation, so GRF fs_regs always
2467 * have a .reg_offset of 0.
2468 */
2469
2470 foreach_list_safe(node, &this->instructions) {
2471 fs_inst *inst = (fs_inst *)node;
2472
2473 if (inst->mlen != 0 && inst->dst.file == GRF) {
2474 insert_gen4_pre_send_dependency_workarounds(inst);
2475 insert_gen4_post_send_dependency_workarounds(inst);
2476 }
2477 }
2478 }
2479
2480 /**
2481 * Turns the generic expression-style uniform pull constant load instruction
2482 * into a hardware-specific series of instructions for loading a pull
2483 * constant.
2484 *
2485 * The expression style allows the CSE pass before this to optimize out
2486 * repeated loads from the same offset, and gives the pre-register-allocation
2487 * scheduling full flexibility, while the conversion to native instructions
2488 * allows the post-register-allocation scheduler the best information
2489 * possible.
2490 *
2491 * Note that execution masking for setting up pull constant loads is special:
2492 * the channels that need to be written are unrelated to the current execution
2493 * mask, since a later instruction will use one of the result channels as a
2494 * source operand for all 8 or 16 of its channels.
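*
* A rough sketch of the gen7 lowering done below (illustrative, not a
* literal instruction dump):
*
*    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD dst, surface, byte_offset
* becomes
*    FS_OPCODE_SET_SIMD4X2_OFFSET payload, byte_offset / 4
*    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 dst, surface, payload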
2495 */
2496 void
2497 fs_visitor::lower_uniform_pull_constant_loads()
2498 {
2499 foreach_list(node, &this->instructions) {
2500 fs_inst *inst = (fs_inst *)node;
2501
2502 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2503 continue;
2504
2505 if (intel->gen >= 7) {
2506 /* The offset arg before was a vec4-aligned byte offset. We need to
2507 * turn it into a dword offset.
2508 */
2509 fs_reg const_offset_reg = inst->src[1];
2510 assert(const_offset_reg.file == IMM &&
2511 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2512 const_offset_reg.imm.u /= 4;
2513 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2514
2515 /* This is actually going to be a MOV, but since only the first dword
2516 * is accessed, we have a special opcode to do just that one. Note
2517 * that this needs to be an operation that will be considered a def
2518 * by live variable analysis, or register allocation will explode.
2519 */
2520 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2521 payload, const_offset_reg);
2522 setup->force_writemask_all = true;
2523
2524 setup->ir = inst->ir;
2525 setup->annotation = inst->annotation;
2526 inst->insert_before(setup);
2527
2528 /* Similarly, this will only populate the first 4 channels of the
2529 * result register (since we only use smear values from 0-3), but we
2530 * don't tell the optimizer.
2531 */
2532 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2533 inst->src[1] = payload;
2534
2535 this->live_intervals_valid = false;
2536 } else {
2537 /* Before register allocation, we didn't tell the scheduler about the
2538 * MRF we use. We know it's safe to use this MRF because nothing
2539 * else does except for register spill/unspill, which generates and
2540 * uses its MRF within a single IR instruction.
2541 */
2542 inst->base_mrf = 14;
2543 inst->mlen = 1;
2544 }
2545 }
2546 }
2547
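/**
 * Prints one instruction in a human-readable form, e.g. (an illustrative,
 * hypothetical line):
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf3, u2, (null)
 */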
2548 void
2549 fs_visitor::dump_instruction(fs_inst *inst)
2550 {
2551 if (inst->predicate) {
2552 printf("(%cf0.%d) ",
2553 inst->predicate_inverse ? '-' : '+',
2554 inst->flag_subreg);
2555 }
2556
2557 printf("%s", brw_instruction_name(inst->opcode));
2558 if (inst->saturate)
2559 printf(".sat");
2560 if (inst->conditional_mod) {
2561 printf(".cmod");
2562 if (!inst->predicate &&
2563 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2564 inst->opcode != BRW_OPCODE_IF &&
2565 inst->opcode != BRW_OPCODE_WHILE))) {
2566 printf(".f0.%d", inst->flag_subreg);
2567 }
2568 }
2569 printf(" ");
2570
2571
2572 switch (inst->dst.file) {
2573 case GRF:
2574 printf("vgrf%d", inst->dst.reg);
2575 if (inst->dst.reg_offset)
2576 printf("+%d", inst->dst.reg_offset);
2577 break;
2578 case MRF:
2579 printf("m%d", inst->dst.reg);
2580 break;
2581 case BAD_FILE:
2582 printf("(null)");
2583 break;
2584 case UNIFORM:
2585 printf("***u%d***", inst->dst.reg);
2586 break;
2587 default:
2588 printf("???");
2589 break;
2590 }
2591 printf(", ");
2592
2593 for (int i = 0; i < 3; i++) {
2594 if (inst->src[i].negate)
2595 printf("-");
2596 if (inst->src[i].abs)
2597 printf("|");
2598 switch (inst->src[i].file) {
2599 case GRF:
2600 printf("vgrf%d", inst->src[i].reg);
2601 if (inst->src[i].reg_offset)
2602 printf("+%d", inst->src[i].reg_offset);
2603 break;
2604 case MRF:
2605 printf("***m%d***", inst->src[i].reg);
2606 break;
2607 case UNIFORM:
2608 printf("u%d", inst->src[i].reg);
2609 if (inst->src[i].reg_offset)
2610 printf(".%d", inst->src[i].reg_offset);
2611 break;
2612 case BAD_FILE:
2613 printf("(null)");
2614 break;
2615 case IMM:
2616 switch (inst->src[i].type) {
2617 case BRW_REGISTER_TYPE_F:
2618 printf("%ff", inst->src[i].imm.f);
2619 break;
2620 case BRW_REGISTER_TYPE_D:
2621 printf("%dd", inst->src[i].imm.i);
2622 break;
2623 case BRW_REGISTER_TYPE_UD:
2624 printf("%uu", inst->src[i].imm.u);
2625 break;
2626 default:
2627 printf("???");
2628 break;
2629 }
2630 break;
2631 default:
2632 printf("???");
2633 break;
2634 }
2635 if (inst->src[i].abs)
2636 printf("|");
2637
2638 if (i < 2)
2639 printf(", ");
2640 }
2641
2642 printf(" ");
2643
2644 if (inst->force_uncompressed)
2645 printf("1sthalf ");
2646
2647 if (inst->force_sechalf)
2648 printf("2ndhalf ");
2649
2650 printf("\n");
2651 }
2652
2653 void
2654 fs_visitor::dump_instructions()
2655 {
2656 int ip = 0;
2657 foreach_list(node, &this->instructions) {
2658 fs_inst *inst = (fs_inst *)node;
2659 printf("%d: ", ip++);
2660 dump_instruction(inst);
2661 }
2662 }
2663
2664 /**
2665 * Possibly returns an instruction that set up @param reg.
2666 *
2667 * Sometimes we want to take the result of some expression/variable
2668 * dereference tree and rewrite the instruction generating the result
2669 * of the tree. When processing the tree, we know that the
2670 * instructions generated are all writing temporaries that are dead
2671 * outside of this tree. So, if we have some instructions that write
2672 * a temporary, we're free to point that temp write somewhere else.
2673 *
2674 * Note that this doesn't guarantee that the returned instruction wrote
2675 * only reg -- it might be the size=4 destination of a texture instruction.
2676 */
2677 fs_inst *
2678 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2679 fs_inst *end,
2680 fs_reg reg)
2681 {
2682 if (end == start ||
2683 end->predicate ||
2684 end->force_uncompressed ||
2685 end->force_sechalf ||
2686 reg.reladdr ||
2687 !reg.equals(end->dst)) {
2688 return NULL;
2689 } else {
2690 return end;
2691 }
2692 }
2693
2694 void
2695 fs_visitor::setup_payload_gen6()
2696 {
2697 struct intel_context *intel = &brw->intel;
2698 bool uses_depth =
2699 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2700 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2701
2702 assert(intel->gen >= 6);
2703
2704 /* R0-1: masks, pixel X/Y coordinates. */
2705 c->nr_payload_regs = 2;
2706 /* R2: only for 32-pixel dispatch. */
2707
2708 /* R3-26: barycentric interpolation coordinates. These appear in the
2709 * same order that they appear in the brw_wm_barycentric_interp_mode
2710 * enum. Each set of coordinates occupies 2 registers if dispatch width
2711 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2712 * appear if they were enabled using the "Barycentric Interpolation
2713 * Mode" bits in WM_STATE.
2714 */
2715 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2716 if (barycentric_interp_modes & (1 << i)) {
2717 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2718 c->nr_payload_regs += 2;
2719 if (dispatch_width == 16) {
2720 c->nr_payload_regs += 2;
2721 }
2722 }
2723 }
2724
2725 /* R27: interpolated depth if uses source depth */
2726 if (uses_depth) {
2727 c->source_depth_reg = c->nr_payload_regs;
2728 c->nr_payload_regs++;
2729 if (dispatch_width == 16) {
2730 /* R28: interpolated depth if not 8-wide. */
2731 c->nr_payload_regs++;
2732 }
2733 }
2734 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2735 if (uses_depth) {
2736 c->source_w_reg = c->nr_payload_regs;
2737 c->nr_payload_regs++;
2738 if (dispatch_width == 16) {
2739 /* R30: interpolated W if not 8-wide. */
2740 c->nr_payload_regs++;
2741 }
2742 }
2743 /* R31: MSAA position offsets. */
2744 /* R32-: bary for 32-pixel. */
2745 /* R58-59: interp W for 32-pixel. */
2746
2747 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2748 c->source_depth_to_render_target = true;
2749 }
2750 }
2751
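/**
 * Runs the full fragment shader compile pipeline for one dispatch width:
 * payload setup, IR-to-FS-IR translation, the optimization loop,
 * scheduling, and register allocation (retrying, with spilling, until it
 * succeeds or fails hard). Returns false on failure.
 */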
2752 bool
2753 fs_visitor::run()
2754 {
2755 sanity_param_count = fp->Base.Parameters->NumParameters;
2756 uint32_t orig_nr_params = c->prog_data.nr_params;
2757
2758 if (intel->gen >= 6)
2759 setup_payload_gen6();
2760 else
2761 setup_payload_gen4();
2762
2763 if (0) {
2764 emit_dummy_fs();
2765 } else {
2766 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2767 emit_shader_time_begin();
2768
2769 calculate_urb_setup();
2770 if (intel->gen < 6)
2771 emit_interpolation_setup_gen4();
2772 else
2773 emit_interpolation_setup_gen6();
2774
2775 /* We handle discards by keeping track of the still-live pixels in f0.1.
2776 * Initialize it with the dispatched pixels.
2777 */
2778 if (fp->UsesKill) {
2779 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2780 discard_init->flag_subreg = 1;
2781 }
2782
2783 /* Generate FS IR for main(). (The visitor only descends into
2784 * functions called "main".)
2785 */
2786 if (shader) {
2787 foreach_list(node, &*shader->ir) {
2788 ir_instruction *ir = (ir_instruction *)node;
2789 base_ir = ir;
2790 this->result = reg_undef;
2791 ir->accept(this);
2792 }
2793 } else {
2794 emit_fragment_program_code();
2795 }
2796 base_ir = NULL;
2797 if (failed)
2798 return false;
2799
2800 emit(FS_OPCODE_PLACEHOLDER_HALT);
2801
2802 emit_fb_writes();
2803
2804 split_virtual_grfs();
2805
2806 move_uniform_array_access_to_pull_constants();
2807 setup_pull_constants();
2808
2809 bool progress;
2810 do {
2811 progress = false;
2812
2813 compact_virtual_grfs();
2814
2815 progress = remove_duplicate_mrf_writes() || progress;
2816
2817 progress = opt_algebraic() || progress;
2818 progress = opt_cse() || progress;
2819 progress = opt_copy_propagate() || progress;
2820 progress = dead_code_eliminate() || progress;
2821 progress = register_coalesce() || progress;
2822 progress = register_coalesce_2() || progress;
2823 progress = compute_to_mrf() || progress;
2824 } while (progress);
2825
2826 remove_dead_constants();
2827
2828 schedule_instructions(false);
2829
2830 lower_uniform_pull_constant_loads();
2831
2832 assign_curb_setup();
2833 assign_urb_setup();
2834
2835 if (0) {
2836 /* Debug of register spilling: Go spill everything. */
2837 for (int i = 0; i < virtual_grf_count; i++) {
2838 spill_reg(i);
2839 }
2840 }
2841
2842 if (0)
2843 assign_regs_trivial();
2844 else {
2845 while (!assign_regs()) {
2846 if (failed)
2847 break;
2848 }
2849 }
2850 }
2851 assert(force_uncompressed_stack == 0);
2852 assert(force_sechalf_stack == 0);
2853
2854 /* This must come after all optimization and register allocation, since
2855 * it inserts dead code that happens to have side effects, and it does
2856 * so based on the actual physical registers in use.
2857 */
2858 insert_gen4_send_dependency_workarounds();
2859
2860 if (failed)
2861 return false;
2862
2863 schedule_instructions(true);
2864
2865 if (dispatch_width == 8) {
2866 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2867 } else {
2868 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2869
2870 /* Make sure we didn't try to sneak in an extra uniform */
2871 assert(orig_nr_params == c->prog_data.nr_params);
2872 (void) orig_nr_params;
2873 }
2874
2875 /* If any state parameters were appended, then ParameterValues could have
2876 * been realloced, in which case the driver uniform storage set up by
2877 * _mesa_associate_uniform_storage() would point to freed memory. Make
2878 * sure that didn't happen.
2879 */
2880 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2881
2882 return !failed;
2883 }
2884
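/**
 * Top-level entry point for compiling a fragment program: always builds an
 * 8-wide program, additionally attempts a 16-wide variant when the
 * hardware and program allow it (gen5+, no pull parameters, and DEBUG_NO16
 * unset), and hands both instruction lists to the generator.
 */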
2885 const unsigned *
2886 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2887 struct gl_fragment_program *fp,
2888 struct gl_shader_program *prog,
2889 unsigned *final_assembly_size)
2890 {
2891 struct intel_context *intel = &brw->intel;
2892 bool start_busy = false;
2893 float start_time = 0;
2894
2895 if (unlikely(intel->perf_debug)) {
2896 start_busy = (intel->batch.last_bo &&
2897 drm_intel_bo_busy(intel->batch.last_bo));
2898 start_time = get_time();
2899 }
2900
2901 struct brw_shader *shader = NULL;
2902 if (prog)
2903 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2904
2905 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2906 if (shader) {
2907 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2908 _mesa_print_ir(shader->ir, NULL);
2909 printf("\n\n");
2910 } else {
2911 printf("ARB_fragment_program %d ir for native fragment shader\n",
2912 fp->Base.Id);
2913 _mesa_print_program(&fp->Base);
2914 }
2915 }
2916
2917 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2918 */
2919 fs_visitor v(brw, c, prog, fp, 8);
2920 if (!v.run()) {
2921 prog->LinkStatus = false;
2922 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2923
2924 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2925 v.fail_msg);
2926
2927 return NULL;
2928 }
2929
2930 exec_list *simd16_instructions = NULL;
2931 fs_visitor v2(brw, c, prog, fp, 16);
2932 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2933 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2934 v2.import_uniforms(&v);
2935 if (!v2.run()) {
2936 perf_debug("16-wide shader failed to compile, falling back to "
2937 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2938 } else {
2939 simd16_instructions = &v2.instructions;
2940 }
2941 }
2942
2943 c->prog_data.dispatch_width = 8;
2944
2945 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2946 const unsigned *generated = g.generate_assembly(&v.instructions,
2947 simd16_instructions,
2948 final_assembly_size);
2949
2950 if (unlikely(intel->perf_debug) && shader) {
2951 if (shader->compiled_once)
2952 brw_wm_debug_recompile(brw, prog, &c->key);
2953 shader->compiled_once = true;
2954
2955 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2956 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2957 (get_time() - start_time) * 1000);
2958 }
2959 }
2960
2961 return generated;
2962 }
2963
2964 bool
2965 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2966 {
2967 struct brw_context *brw = brw_context(ctx);
2968 struct intel_context *intel = &brw->intel;
2969 struct brw_wm_prog_key key;
2970
2971 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2972 return true;
2973
2974 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2975 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2976 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2977 bool program_uses_dfdy = fp->UsesDFdy;
2978
2979 memset(&key, 0, sizeof(key));
2980
2981 if (intel->gen < 6) {
2982 if (fp->UsesKill)
2983 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2984
2985 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2986 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2987
2988 /* Just assume depth testing. */
2989 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2990 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2991 }
2992
2993 if (prog->Name != 0)
2994 key.proj_attrib_mask = ~(GLbitfield64) 0;
2995 else {
2996 /* Bit VARYING_BIT_POS of key.proj_attrib_mask is never used, so to
2997 * avoid unnecessary recompiles, always set it to 1.
2998 */
2999 key.proj_attrib_mask |= VARYING_BIT_POS;
3000 }
3001
3002 if (intel->gen < 6)
3003 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3004
3005 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3006 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3007 continue;
3008
3009 if (prog->Name == 0)
3010 key.proj_attrib_mask |= BITFIELD64_BIT(i);
3011
3012 if (intel->gen < 6) {
3013 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3014 key.input_slots_valid |= BITFIELD64_BIT(i);
3015 }
3016 }
3017
3018 key.clamp_fragment_color = true;
3019
3020 for (int i = 0; i < MAX_SAMPLERS; i++) {
3021 if (fp->Base.ShadowSamplers & (1 << i)) {
3022 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3023 key.tex.swizzles[i] =
3024 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3025 } else {
3026 /* Color sampler: assume no swizzling. */
3027 key.tex.swizzles[i] = SWIZZLE_XYZW;
3028 }
3029 }
3030
3031 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3032 key.drawable_height = ctx->DrawBuffer->Height;
3033 }
3034
3035 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3036 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3037 }
3038
3039 key.nr_color_regions = 1;
3040
3041 key.program_string_id = bfp->id;
3042
3043 uint32_t old_prog_offset = brw->wm.prog_offset;
3044 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3045
3046 bool success = do_wm_prog(brw, prog, bfp, &key);
3047
3048 brw->wm.prog_offset = old_prog_offset;
3049 brw->wm.prog_data = old_prog_data;
3050
3051 return success;
3052 }