8e57eb0fd098407cede7ea1a4e90ca2466492c22
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
/* Convenience emitter factories.  Each ALU1/ALU2 invocation below expands
 * into an fs_visitor method (e.g. fs_visitor::MOV) that allocates a new
 * one- or two-source fs_inst of the matching BRW opcode out of mem_ctx and
 * returns it without emitting it; callers typically hand the result to
 * emit().
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

/* Single-source ALU operations. */
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
/* Two-source ALU operations. */
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      /* Keep the fixed-HW-reg view of the destination in sync with the
       * type override above.
       */
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   /* Negated UD sources are rewritten first — see resolve_ud_negate(). */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
221
222 exec_list
223 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
224 fs_reg offset)
225 {
226 exec_list instructions;
227 fs_inst *inst;
228
229 if (intel->gen >= 7) {
230 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
231 dst, surf_index, offset);
232 instructions.push_tail(inst);
233 } else {
234 int base_mrf = 13;
235 bool header_present = true;
236
237 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
238 mrf.type = BRW_REGISTER_TYPE_D;
239
240 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
241 * dword-aligned byte offset.
242 */
243 if (intel->gen == 6) {
244 instructions.push_tail(MOV(mrf, offset));
245 } else {
246 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
247 }
248 inst = MOV(mrf, offset);
249 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
250 dst, surf_index);
251 inst->header_present = header_present;
252 inst->base_mrf = base_mrf;
253 inst->mlen = header_present + dispatch_width / 8;
254
255 instructions.push_tail(inst);
256 }
257
258 return instructions;
259 }
260
261 bool
262 fs_inst::equals(fs_inst *inst)
263 {
264 return (opcode == inst->opcode &&
265 dst.equals(inst->dst) &&
266 src[0].equals(inst->src[0]) &&
267 src[1].equals(inst->src[1]) &&
268 src[2].equals(inst->src[2]) &&
269 saturate == inst->saturate &&
270 predicate == inst->predicate &&
271 conditional_mod == inst->conditional_mod &&
272 mlen == inst->mlen &&
273 base_mrf == inst->base_mrf &&
274 sampler == inst->sampler &&
275 target == inst->target &&
276 eot == inst->eot &&
277 header_present == inst->header_present &&
278 shadow_compare == inst->shadow_compare &&
279 offset == inst->offset);
280 }
281
/**
 * Returns how many virtual GRF registers this instruction's destination
 * spans: 4 for texture operations (a full vec4 result), 1 otherwise.
 */
int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}
294
295 bool
296 fs_inst::overwrites_reg(const fs_reg &reg)
297 {
298 return (reg.file == dst.file &&
299 reg.reg == dst.reg &&
300 reg.reg_offset >= dst.reg_offset &&
301 reg.reg_offset < dst.reg_offset + regs_written());
302 }
303
304 bool
305 fs_inst::is_tex()
306 {
307 return (opcode == SHADER_OPCODE_TEX ||
308 opcode == FS_OPCODE_TXB ||
309 opcode == SHADER_OPCODE_TXD ||
310 opcode == SHADER_OPCODE_TXF ||
311 opcode == SHADER_OPCODE_TXL ||
312 opcode == SHADER_OPCODE_TXS);
313 }
314
315 bool
316 fs_inst::is_math()
317 {
318 return (opcode == SHADER_OPCODE_RCP ||
319 opcode == SHADER_OPCODE_RSQ ||
320 opcode == SHADER_OPCODE_SQRT ||
321 opcode == SHADER_OPCODE_EXP2 ||
322 opcode == SHADER_OPCODE_LOG2 ||
323 opcode == SHADER_OPCODE_SIN ||
324 opcode == SHADER_OPCODE_COS ||
325 opcode == SHADER_OPCODE_INT_QUOTIENT ||
326 opcode == SHADER_OPCODE_INT_REMAINDER ||
327 opcode == SHADER_OPCODE_POW);
328 }
329
330 bool
331 fs_inst::is_send_from_grf()
332 {
333 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
334 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
335 src[1].file == GRF));
336 }
337
338 bool
339 fs_visitor::can_do_source_mods(fs_inst *inst)
340 {
341 if (intel->gen == 6 && inst->is_math())
342 return false;
343
344 if (inst->is_send_from_grf())
345 return false;
346
347 return true;
348 }
349
/**
 * Resets this register to a zeroed default state.  smear is set to -1,
 * the "no single channel selected" sentinel (see get_timestamp(), which
 * sets smear >= 0 to pick out one channel).
 */
void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}
356
/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor for 32-bit floats. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor for signed 32-bit integers. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor for unsigned 32-bit integers. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Constructor wrapping a fixed, fully-specified hardware brw_reg. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}
399
/**
 * Field-by-field register equality.  Two registers with any reladdr
 * (indirect addressing) never compare equal, even to themselves.
 */
bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           /* Byte-compare the embedded brw_reg rather than its fields. */
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}
415
416 bool
417 fs_reg::is_zero() const
418 {
419 if (file != IMM)
420 return false;
421
422 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
423 }
424
425 bool
426 fs_reg::is_one() const
427 {
428 if (file != IMM)
429 return false;
430
431 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
432 }
433
434 int
435 fs_visitor::type_size(const struct glsl_type *type)
436 {
437 unsigned int size, i;
438
439 switch (type->base_type) {
440 case GLSL_TYPE_UINT:
441 case GLSL_TYPE_INT:
442 case GLSL_TYPE_FLOAT:
443 case GLSL_TYPE_BOOL:
444 return type->components();
445 case GLSL_TYPE_ARRAY:
446 return type_size(type->fields.array) * type->length;
447 case GLSL_TYPE_STRUCT:
448 size = 0;
449 for (i = 0; i < type->length; i++) {
450 size += type_size(type->fields.structure[i].type);
451 }
452 return size;
453 case GLSL_TYPE_SAMPLER:
454 /* Samplers take up no register space, since they're baked in at
455 * link time.
456 */
457 return 0;
458 case GLSL_TYPE_VOID:
459 case GLSL_TYPE_ERROR:
460 case GLSL_TYPE_INTERFACE:
461 assert(!"not reached");
462 break;
463 }
464
465 return 0;
466 }
467
/**
 * Reads the TIMESTAMP architectural register into a fresh virtual GRF and
 * returns it, smeared to channel 0 (the low 32 bits).  Gen7+ only.
 */
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}
501
/**
 * Records the starting timestamp for shader-time profiling; paired with
 * emit_shader_time_end(), which computes and stores the elapsed time.
 */
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}
508
/**
 * Emits the shader-time epilogue: reads the timestamp again, checks for a
 * timestamp reset between the two reads, and records either the elapsed
 * time (plus a "written" mark) or a "reset" mark in the shader-time buffer.
 */
void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   /* Pick the 8-wide or 16-wide variants of the bookkeeping slots. */
   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   /* diff = end + (-start), computed via a negate source modifier. */
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}
557
/**
 * Emits a SHADER_TIME_ADD message that accumulates \p value into a freshly
 * allocated slot of the shader-time buffer, and records the slot's
 * type/program for later printout.
 */
void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   /* NOTE(review): this allows index == max_entries, which looks like an
    * off-by-one for a post-increment index — confirm whether max_entries is
    * a capacity or a last-valid-index.
    */
   assert(shader_time_index <= brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   /* MRF payload: dword offset into the buffer, then the value to add. */
   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}
588
589 void
590 fs_visitor::fail(const char *format, ...)
591 {
592 va_list va;
593 char *msg;
594
595 if (failed)
596 return;
597
598 failed = true;
599
600 va_start(va, format);
601 msg = ralloc_vasprintf(mem_ctx, format, va);
602 va_end(va);
603 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
604
605 this->fail_msg = msg;
606
607 if (INTEL_DEBUG & DEBUG_WM) {
608 fprintf(stderr, "%s", msg);
609 }
610 }
611
/* Convenience emit() overloads: each builds a temporary fs_inst with the
 * given opcode/operands and forwards it to the fs_inst-taking emit().
 */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}
642
/* Nesting counters for instruction-emission modes.  While
 * force_uncompressed_stack is nonzero, newly emitted instructions run
 * 8-wide (uncompressed); while force_sechalf_stack is nonzero they act on
 * the second half of a 16-wide dispatch.  The pop asserts catch
 * mismatched push/pop pairs.
 */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}
668
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   /* Unary math: one operand's worth of payload, scaled by dispatch width. */
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   /* Binary math: two operands' worth of payload. */
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      /* Only the optional header MRF is written here. */
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
717
718 int
719 fs_visitor::virtual_grf_alloc(int size)
720 {
721 if (virtual_grf_array_size <= virtual_grf_count) {
722 if (virtual_grf_array_size == 0)
723 virtual_grf_array_size = 16;
724 else
725 virtual_grf_array_size *= 2;
726 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
727 virtual_grf_array_size);
728 }
729 virtual_grf_sizes[virtual_grf_count] = size;
730 return virtual_grf_count++;
731 }
732
/** Register file/number constructor, defaulting the type to float. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Register file/number constructor with an explicit register type. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}
750
751 /** Automatic reg constructor. */
752 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
753 {
754 init();
755
756 this->file = GRF;
757 this->reg = v->virtual_grf_alloc(v->type_size(type));
758 this->reg_offset = 0;
759 this->type = brw_type_for_base_type(type);
760 }
761
762 fs_reg *
763 fs_visitor::variable_storage(ir_variable *var)
764 {
765 return (fs_reg *)hash_table_find(this->variable_ht, var);
766 }
767
768 void
769 import_uniforms_callback(const void *key,
770 void *data,
771 void *closure)
772 {
773 struct hash_table *dst_ht = (struct hash_table *)closure;
774 const fs_reg *reg = (const fs_reg *)data;
775
776 if (reg->file != UNIFORM)
777 return;
778
779 hash_table_insert(dst_ht, data, key);
780 }
781
/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   /* Copy every UNIFORM-file mapping from the 8-wide visitor's table. */
   hash_table_call_foreach(v->variable_ht,
			   import_uniforms_callback,
			   variable_ht);
   /* Share (not copy) the 8-wide visitor's parameter remap table. */
   this->params_remap = v->params_remap;
}
793
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      /* Match "name" exactly, or "name." / "name[" prefixes; reject other
       * uniforms that merely share the prefix (e.g. "name2").
       */
      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      /* Point prog_data.param at each scalar slot of the driver storage. */
      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}
835
836
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 /* A repeated swizzle component marks the end of the unique ones. */
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 c->prog_data.param[c->prog_data.nr_params++] =
	    &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}
870
/**
 * Emits the setup code for gl_FragCoord and returns the register holding
 * its four components.  Handles the pixel-center-integer and
 * origin-upper-left layout qualifiers, flipping Y for FBO rendering.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   /* Window-space Y must be flipped when the requested origin and the
    * render target's orientation disagree.
    */
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
	 /* y' = (height - 1) - y, expressed as -y + (height - 1 + offset). */
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      /* Pre-gen6: interpolate Z from the setup data. */
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
918
919 fs_inst *
920 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
921 glsl_interp_qualifier interpolation_mode,
922 bool is_centroid)
923 {
924 brw_wm_barycentric_interp_mode barycoord_mode;
925 if (is_centroid) {
926 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
927 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
928 else
929 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
930 } else {
931 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
932 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
933 else
934 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
935 }
936 return emit(FS_OPCODE_LINTERP, attr,
937 this->delta_x[barycoord_mode],
938 this->delta_y[barycoord_mode], interp);
939 }
940
/**
 * Emits interpolation code (flat, smooth, or noperspective) for a varying
 * input, walking array elements and matrix columns one URB slot at a
 * time, and returns the destination register.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       interp.type = reg->type;
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Smooth/noperspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       /* FINISHME: At some point we probably want to push
		* this farther by giving similar treatment to the
		* other potentially constant components of the
		* attribute, as well as making brw_vs_constval.c
		* handle varyings other than gl_TexCoord.
		*/
	       if (location >= FRAG_ATTRIB_TEX0 &&
		   location <= FRAG_ATTRIB_TEX7 &&
		   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
		  /* Non-projected texcoord: .w is known to be 1.0. */
		  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
	       } else {
		  struct brw_reg interp = interp_reg(location, k);
		  emit_linterp(attr, fs_reg(interp), interpolation_mode,
			       ir->centroid);
		  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
		     /* Get the pixel/sample mask into f0 so that we know
		      * which pixels are lit.  Then, for each channel that is
		      * unlit, replace the centroid data with non-centroid
		      * data.
		      */
		     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
		     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
						  interpolation_mode, false);
		     inst->predicate = BRW_PREDICATE_NORMAL;
		     inst->predicate_inverse = true;
		  }
		  if (intel->gen < 6) {
		     /* Pre-gen6 interpolation yields attr/w; undo the
		      * perspective divide by multiplying back by w.
		      */
		     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
		  }
	       }
	       attr.reg_offset++;
	    }

	 }
	 location++;
      }
   }

   return reg;
}
1032
/**
 * Emits code that materializes gl_FrontFacing as a 0/1 value in a new
 * register and returns that register.
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      /* Shift the sign bit down, invert, and mask to get 1 for front faces
       * and 0 for back faces.
       */
      emit(BRW_OPCODE_ASR, *reg,
	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
	   fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
1056
/**
 * Copies a math-instruction source into a temporary when the hardware
 * can't consume it directly, returning the register to use in its place
 * (possibly the original, unchanged).
 */
fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   /* Expand into a float-typed temporary, retyped to match the source. */
   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}
1083
/**
 * Emits a unary math-unit instruction, handling per-generation operand
 * restrictions and the pre-gen6 MRF-based message setup.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   /* Pre-gen6 math is a send to the math unit; the operand goes in MRFs. */
   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}
1121
/**
 * Emits a binary math-unit instruction (POW or integer divide/remainder),
 * handling per-generation operand restrictions and the pre-gen6 MRF-based
 * message setup with its swapped INT DIV operand order.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
	 fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* The second operand is staged into the MRF payload; the first is a
       * regular source on the send.
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
1168
1169 void
1170 fs_visitor::assign_curb_setup()
1171 {
1172 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1173 if (dispatch_width == 8) {
1174 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1175 } else {
1176 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1177 }
1178
1179 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1180 foreach_list(node, &this->instructions) {
1181 fs_inst *inst = (fs_inst *)node;
1182
1183 for (unsigned int i = 0; i < 3; i++) {
1184 if (inst->src[i].file == UNIFORM) {
1185 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1186 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1187 constant_nr / 8,
1188 constant_nr % 8);
1189
1190 inst->src[i].file = FIXED_HW_REG;
1191 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1192 }
1193 }
1194 }
1195 }
1196
1197 void
1198 fs_visitor::calculate_urb_setup()
1199 {
1200 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1201 urb_setup[i] = -1;
1202 }
1203
1204 int urb_next = 0;
1205 /* Figure out where each of the incoming setup attributes lands. */
1206 if (intel->gen >= 6) {
1207 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1208 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1209 urb_setup[i] = urb_next++;
1210 }
1211 }
1212 } else {
1213 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1214 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1215 /* Point size is packed into the header, not as a general attribute */
1216 if (i == VERT_RESULT_PSIZ)
1217 continue;
1218
1219 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1220 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1221
1222 /* The back color slot is skipped when the front color is
1223 * also written to. In addition, some slots can be
1224 * written in the vertex shader and not read in the
1225 * fragment shader. So the register number must always be
1226 * incremented, mapped or not.
1227 */
1228 if (fp_index >= 0)
1229 urb_setup[fp_index] = urb_next;
1230 urb_next++;
1231 }
1232 }
1233
1234 /*
1235 * It's a FS only attribute, and we did interpolation for this attribute
1236 * in SF thread. So, count it here, too.
1237 *
1238 * See compile_sf_prog() for more info.
1239 */
1240 if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1241 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1242 }
1243
1244 /* Each attribute is 4 setup channels, each of which is half a reg. */
1245 c->prog_data.urb_read_length = urb_next * 2;
1246 }
1247
1248 void
1249 fs_visitor::assign_urb_setup()
1250 {
1251 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1252
1253 /* Offset all the urb_setup[] index by the actual position of the
1254 * setup regs, now that the location of the constants has been chosen.
1255 */
1256 foreach_list(node, &this->instructions) {
1257 fs_inst *inst = (fs_inst *)node;
1258
1259 if (inst->opcode == FS_OPCODE_LINTERP) {
1260 assert(inst->src[2].file == FIXED_HW_REG);
1261 inst->src[2].fixed_hw_reg.nr += urb_start;
1262 }
1263
1264 if (inst->opcode == FS_OPCODE_CINTERP) {
1265 assert(inst->src[0].file == FIXED_HW_REG);
1266 inst->src[0].fixed_hw_reg.nr += urb_start;
1267 }
1268 }
1269
1270 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1271 }
1272
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];      /* whether vgrf i may be split */
   int new_virtual_grf[num_vars]; /* vgrf holding i's offset-1 component */

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.  Offset 0 keeps vgrf i itself; offsets
    * 1..size-1 map to the new single-register vgrfs allocated here (the
    * assert checks they really are handed out contiguously).
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   /* Rewrite every non-zero-offset access of a split vgrf to the new
    * single-register vgrf; offset-0 accesses keep the original number.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
1365
/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many.  -1 means
    * "never referenced"; 0 means "used, new index not yet assigned".
    */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   /* Keep the hand-written list above in sync with the array sizes. */
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays, assigning each live vgrf its final index and
    * sliding its per-register metadata down to match.
    */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}
1450
1451 bool
1452 fs_visitor::remove_dead_constants()
1453 {
1454 if (dispatch_width == 8) {
1455 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1456
1457 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1458 this->params_remap[i] = -1;
1459
1460 /* Find which params are still in use. */
1461 foreach_list(node, &this->instructions) {
1462 fs_inst *inst = (fs_inst *)node;
1463
1464 for (int i = 0; i < 3; i++) {
1465 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1466
1467 if (inst->src[i].file != UNIFORM)
1468 continue;
1469
1470 assert(constant_nr < (int)c->prog_data.nr_params);
1471
1472 /* For now, set this to non-negative. We'll give it the
1473 * actual new number in a moment, in order to keep the
1474 * register numbers nicely ordered.
1475 */
1476 this->params_remap[constant_nr] = 0;
1477 }
1478 }
1479
1480 /* Figure out what the new numbers for the params will be. At some
1481 * point when we're doing uniform array access, we're going to want
1482 * to keep the distinction between .reg and .reg_offset, but for
1483 * now we don't care.
1484 */
1485 unsigned int new_nr_params = 0;
1486 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1487 if (this->params_remap[i] != -1) {
1488 this->params_remap[i] = new_nr_params++;
1489 }
1490 }
1491
1492 /* Update the list of params to be uploaded to match our new numbering. */
1493 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1494 int remapped = this->params_remap[i];
1495
1496 if (remapped == -1)
1497 continue;
1498
1499 c->prog_data.param[remapped] = c->prog_data.param[i];
1500 }
1501
1502 c->prog_data.nr_params = new_nr_params;
1503 } else {
1504 /* This should have been generated in the 8-wide pass already. */
1505 assert(this->params_remap);
1506 }
1507
1508 /* Now do the renumbering of the shader to remove unused params. */
1509 foreach_list(node, &this->instructions) {
1510 fs_inst *inst = (fs_inst *)node;
1511
1512 for (int i = 0; i < 3; i++) {
1513 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1514
1515 if (inst->src[i].file != UNIFORM)
1516 continue;
1517
1518 assert(this->params_remap[constant_nr] != -1);
1519 inst->src[i].reg = this->params_remap[constant_nr];
1520 inst->src[i].reg_offset = 0;
1521 }
1522 }
1523
1524 return true;
1525 }
1526
/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   /* pull_constant_loc[u]: index of uniform u's first element in the pull
    * buffer, or -1 if it hasn't been copied there yet.
    */
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         /* Only relative-addressed uniform reads are rewritten. */
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         /* offset = reladdr + pull-buffer location of this element. */
         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         /* Load the value into a fresh temporary and make this source read
          * the temporary instead of the UNIFORM file.
          */
         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
1601
1602 /**
1603 * Choose accesses from the UNIFORM file to demote to using the pull
1604 * constant buffer.
1605 *
1606 * We allow a fragment shader to have more than the specified minimum
1607 * maximum number of fragment shader uniform components (64). If
1608 * there are too many of these, they'd fill up all of register space.
1609 * So, this will push some of them out to the pull constant buffer and
1610 * update the program to load them.
1611 */
1612 void
1613 fs_visitor::setup_pull_constants()
1614 {
1615 /* Only allow 16 registers (128 uniform components) as push constants. */
1616 unsigned int max_uniform_components = 16 * 8;
1617 if (c->prog_data.nr_params <= max_uniform_components)
1618 return;
1619
1620 if (dispatch_width == 16) {
1621 fail("Pull constants not supported in 16-wide\n");
1622 return;
1623 }
1624
1625 /* Just demote the end of the list. We could probably do better
1626 * here, demoting things that are rarely used in the program first.
1627 */
1628 unsigned int pull_uniform_base = max_uniform_components;
1629
1630 int pull_constant_loc[c->prog_data.nr_params];
1631 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1632 if (i < pull_uniform_base) {
1633 pull_constant_loc[i] = -1;
1634 } else {
1635 pull_constant_loc[i] = -1;
1636 /* If our constant is already being uploaded for reladdr purposes,
1637 * reuse it.
1638 */
1639 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1640 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1641 pull_constant_loc[i] = j;
1642 break;
1643 }
1644 }
1645 if (pull_constant_loc[i] == -1) {
1646 int pull_index = c->prog_data.nr_pull_params++;
1647 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1648 pull_constant_loc[i] = pull_index;;
1649 }
1650 }
1651 }
1652 c->prog_data.nr_params = pull_uniform_base;
1653
1654 foreach_list(node, &this->instructions) {
1655 fs_inst *inst = (fs_inst *)node;
1656
1657 for (int i = 0; i < 3; i++) {
1658 if (inst->src[i].file != UNIFORM)
1659 continue;
1660
1661 int pull_index = pull_constant_loc[inst->src[i].reg +
1662 inst->src[i].reg_offset];
1663 if (pull_index == -1)
1664 continue;
1665
1666 assert(!inst->src[i].reladdr);
1667
1668 fs_reg dst = fs_reg(this, glsl_type::float_type);
1669 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1670 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1671 fs_inst *pull =
1672 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1673 dst, index, offset);
1674 pull->ir = inst->ir;
1675 pull->annotation = inst->annotation;
1676 pull->base_mrf = 14;
1677 pull->mlen = 1;
1678
1679 inst->insert_before(pull);
1680
1681 inst->src[i].file = GRF;
1682 inst->src[i].reg = dst.reg;
1683 inst->src[i].reg_offset = 0;
1684 inst->src[i].smear = pull_index & 3;
1685 }
1686 }
1687 }
1688
1689 bool
1690 fs_visitor::opt_algebraic()
1691 {
1692 bool progress = false;
1693
1694 foreach_list(node, &this->instructions) {
1695 fs_inst *inst = (fs_inst *)node;
1696
1697 switch (inst->opcode) {
1698 case BRW_OPCODE_MUL:
1699 if (inst->src[1].file != IMM)
1700 continue;
1701
1702 /* a * 1.0 = a */
1703 if (inst->src[1].is_one()) {
1704 inst->opcode = BRW_OPCODE_MOV;
1705 inst->src[1] = reg_undef;
1706 progress = true;
1707 break;
1708 }
1709
1710 /* a * 0.0 = 0.0 */
1711 if (inst->src[1].is_zero()) {
1712 inst->opcode = BRW_OPCODE_MOV;
1713 inst->src[0] = inst->src[1];
1714 inst->src[1] = reg_undef;
1715 progress = true;
1716 break;
1717 }
1718
1719 break;
1720 case BRW_OPCODE_ADD:
1721 if (inst->src[1].file != IMM)
1722 continue;
1723
1724 /* a + 0.0 = a */
1725 if (inst->src[1].is_zero()) {
1726 inst->opcode = BRW_OPCODE_MOV;
1727 inst->src[1] = reg_undef;
1728 progress = true;
1729 break;
1730 }
1731 break;
1732 default:
1733 break;
1734 }
1735 }
1736
1737 return progress;
1738 }
1739
1740 /**
1741 * Must be called after calculate_live_intervales() to remove unused
1742 * writes to registers -- register allocation will fail otherwise
1743 * because something deffed but not used won't be considered to
1744 * interfere with other regs.
1745 */
1746 bool
1747 fs_visitor::dead_code_eliminate()
1748 {
1749 bool progress = false;
1750 int pc = 0;
1751
1752 calculate_live_intervals();
1753
1754 foreach_list_safe(node, &this->instructions) {
1755 fs_inst *inst = (fs_inst *)node;
1756
1757 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1758 inst->remove();
1759 progress = true;
1760 }
1761
1762 pc++;
1763 }
1764
1765 if (progress)
1766 live_intervals_valid = false;
1767
1768 return progress;
1769 }
1770
/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 *
 * Returns true if any MOV was eliminated.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Only a plain, unmodified, single-register GRF->GRF MOV whose
       * source and destination live ranges don't interfere can be
       * coalesced this way.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      /* Rewrite every reference to reg_from in the whole shader to use
       * reg_to at the MOV's destination offset.
       */
      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *   ^
       *   |
       *   v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or
       * later, otherwise it will conflict with "to" when we try to coalesce
       * "to" into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}
1851
/**
 * Coalesces GRF-to-GRF (or UNIFORM-to-GRF) moves by rewriting later uses of
 * the MOV's destination to read the MOV's source directly, then deleting
 * the MOV.
 *
 * Returns true if any instruction was removed.
 */
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      /* Candidate: an unpredicated, unsaturated MOV into a GRF whose
       * source type matches the destination type.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM)||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               /* Fold the use's own modifiers into the replacement:
                * abs discards the MOV's negate, and negates compose.
                */
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1972
1973
1974 bool
1975 fs_visitor::compute_to_mrf()
1976 {
1977 bool progress = false;
1978 int next_ip = 0;
1979
1980 calculate_live_intervals();
1981
1982 foreach_list_safe(node, &this->instructions) {
1983 fs_inst *inst = (fs_inst *)node;
1984
1985 int ip = next_ip;
1986 next_ip++;
1987
1988 if (inst->opcode != BRW_OPCODE_MOV ||
1989 inst->predicate ||
1990 inst->dst.file != MRF || inst->src[0].file != GRF ||
1991 inst->dst.type != inst->src[0].type ||
1992 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1993 continue;
1994
1995 /* Work out which hardware MRF registers are written by this
1996 * instruction.
1997 */
1998 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1999 int mrf_high;
2000 if (inst->dst.reg & BRW_MRF_COMPR4) {
2001 mrf_high = mrf_low + 4;
2002 } else if (dispatch_width == 16 &&
2003 (!inst->force_uncompressed && !inst->force_sechalf)) {
2004 mrf_high = mrf_low + 1;
2005 } else {
2006 mrf_high = mrf_low;
2007 }
2008
2009 /* Can't compute-to-MRF this GRF if someone else was going to
2010 * read it later.
2011 */
2012 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2013 continue;
2014
2015 /* Found a move of a GRF to a MRF. Let's see if we can go
2016 * rewrite the thing that made this GRF to write into the MRF.
2017 */
2018 fs_inst *scan_inst;
2019 for (scan_inst = (fs_inst *)inst->prev;
2020 scan_inst->prev != NULL;
2021 scan_inst = (fs_inst *)scan_inst->prev) {
2022 if (scan_inst->dst.file == GRF &&
2023 scan_inst->dst.reg == inst->src[0].reg) {
2024 /* Found the last thing to write our reg we want to turn
2025 * into a compute-to-MRF.
2026 */
2027
2028 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2029 if (scan_inst->mlen) {
2030 break;
2031 }
2032
2033 /* If it's predicated, it (probably) didn't populate all
2034 * the channels. We might be able to rewrite everything
2035 * that writes that reg, but it would require smarter
2036 * tracking to delay the rewriting until complete success.
2037 */
2038 if (scan_inst->predicate)
2039 break;
2040
2041 /* If it's half of register setup and not the same half as
2042 * our MOV we're trying to remove, bail for now.
2043 */
2044 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2045 scan_inst->force_sechalf != inst->force_sechalf) {
2046 break;
2047 }
2048
2049 /* SEND instructions can't have MRF as a destination. */
2050 if (scan_inst->mlen)
2051 break;
2052
2053 if (intel->gen >= 6) {
2054 /* gen6 math instructions must have the destination be
2055 * GRF, so no compute-to-MRF for them.
2056 */
2057 if (scan_inst->is_math()) {
2058 break;
2059 }
2060 }
2061
2062 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2063 /* Found the creator of our MRF's source value. */
2064 scan_inst->dst.file = MRF;
2065 scan_inst->dst.reg = inst->dst.reg;
2066 scan_inst->saturate |= inst->saturate;
2067 inst->remove();
2068 progress = true;
2069 }
2070 break;
2071 }
2072
2073 /* We don't handle flow control here. Most computation of
2074 * values that end up in MRFs are shortly before the MRF
2075 * write anyway.
2076 */
2077 if (scan_inst->opcode == BRW_OPCODE_DO ||
2078 scan_inst->opcode == BRW_OPCODE_WHILE ||
2079 scan_inst->opcode == BRW_OPCODE_ELSE ||
2080 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2081 break;
2082 }
2083
2084 /* You can't read from an MRF, so if someone else reads our
2085 * MRF's source GRF that we wanted to rewrite, that stops us.
2086 */
2087 bool interfered = false;
2088 for (int i = 0; i < 3; i++) {
2089 if (scan_inst->src[i].file == GRF &&
2090 scan_inst->src[i].reg == inst->src[0].reg &&
2091 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2092 interfered = true;
2093 }
2094 }
2095 if (interfered)
2096 break;
2097
2098 if (scan_inst->dst.file == MRF) {
2099 /* If somebody else writes our MRF here, we can't
2100 * compute-to-MRF before that.
2101 */
2102 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2103 int scan_mrf_high;
2104
2105 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2106 scan_mrf_high = scan_mrf_low + 4;
2107 } else if (dispatch_width == 16 &&
2108 (!scan_inst->force_uncompressed &&
2109 !scan_inst->force_sechalf)) {
2110 scan_mrf_high = scan_mrf_low + 1;
2111 } else {
2112 scan_mrf_high = scan_mrf_low;
2113 }
2114
2115 if (mrf_low == scan_mrf_low ||
2116 mrf_low == scan_mrf_high ||
2117 mrf_high == scan_mrf_low ||
2118 mrf_high == scan_mrf_high) {
2119 break;
2120 }
2121 }
2122
2123 if (scan_inst->mlen > 0) {
2124 /* Found a SEND instruction, which means that there are
2125 * live values in MRFs from base_mrf to base_mrf +
2126 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2127 * above it.
2128 */
2129 if (mrf_low >= scan_inst->base_mrf &&
2130 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2131 break;
2132 }
2133 if (mrf_high >= scan_inst->base_mrf &&
2134 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2135 break;
2136 }
2137 }
2138 }
2139 }
2140
2141 if (progress)
2142 live_intervals_valid = false;
2143
2144 return progress;
2145 }
2146
/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 *
 * Returns true if any write was removed.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* last_mrf_move[m]: most recent still-valid unpredicated GRF->MRF MOV
    * into MRF m, or NULL.
    */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Control flow ends the basic block: forget everything tracked. */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      /* If this MOV is identical to the one that already populated the
       * MRF, it's redundant — drop it.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      /* Record this MOV as the current known contents of its MRF. */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicate) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
2225
2226 void
2227 fs_visitor::dump_instruction(fs_inst *inst)
2228 {
2229 if (inst->predicate) {
2230 printf("(%cf0.%d) ",
2231 inst->predicate_inverse ? '-' : '+',
2232 inst->flag_subreg);
2233 }
2234
2235 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2236 opcode_descs[inst->opcode].name) {
2237 printf("%s", opcode_descs[inst->opcode].name);
2238 } else {
2239 printf("op%d", inst->opcode);
2240 }
2241 if (inst->saturate)
2242 printf(".sat");
2243 if (inst->conditional_mod) {
2244 printf(".cmod");
2245 if (!inst->predicate &&
2246 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2247 inst->opcode != BRW_OPCODE_IF &&
2248 inst->opcode != BRW_OPCODE_WHILE))) {
2249 printf(".f0.%d\n", inst->flag_subreg);
2250 }
2251 }
2252 printf(" ");
2253
2254
2255 switch (inst->dst.file) {
2256 case GRF:
2257 printf("vgrf%d", inst->dst.reg);
2258 if (inst->dst.reg_offset)
2259 printf("+%d", inst->dst.reg_offset);
2260 break;
2261 case MRF:
2262 printf("m%d", inst->dst.reg);
2263 break;
2264 case BAD_FILE:
2265 printf("(null)");
2266 break;
2267 case UNIFORM:
2268 printf("***u%d***", inst->dst.reg);
2269 break;
2270 default:
2271 printf("???");
2272 break;
2273 }
2274 printf(", ");
2275
2276 for (int i = 0; i < 3; i++) {
2277 if (inst->src[i].negate)
2278 printf("-");
2279 if (inst->src[i].abs)
2280 printf("|");
2281 switch (inst->src[i].file) {
2282 case GRF:
2283 printf("vgrf%d", inst->src[i].reg);
2284 if (inst->src[i].reg_offset)
2285 printf("+%d", inst->src[i].reg_offset);
2286 break;
2287 case MRF:
2288 printf("***m%d***", inst->src[i].reg);
2289 break;
2290 case UNIFORM:
2291 printf("u%d", inst->src[i].reg);
2292 if (inst->src[i].reg_offset)
2293 printf(".%d", inst->src[i].reg_offset);
2294 break;
2295 case BAD_FILE:
2296 printf("(null)");
2297 break;
2298 default:
2299 printf("???");
2300 break;
2301 }
2302 if (inst->src[i].abs)
2303 printf("|");
2304
2305 if (i < 3)
2306 printf(", ");
2307 }
2308
2309 printf(" ");
2310
2311 if (inst->force_uncompressed)
2312 printf("1sthalf ");
2313
2314 if (inst->force_sechalf)
2315 printf("2ndhalf ");
2316
2317 printf("\n");
2318 }
2319
2320 void
2321 fs_visitor::dump_instructions()
2322 {
2323 int ip = 0;
2324 foreach_list(node, &this->instructions) {
2325 fs_inst *inst = (fs_inst *)node;
2326 printf("%d: ", ip++);
2327 dump_instruction(inst);
2328 }
2329 }
2330
2331 /**
2332 * Possibly returns an instruction that set up @param reg.
2333 *
2334 * Sometimes we want to take the result of some expression/variable
2335 * dereference tree and rewrite the instruction generating the result
2336 * of the tree. When processing the tree, we know that the
2337 * instructions generated are all writing temporaries that are dead
2338 * outside of this tree. So, if we have some instructions that write
2339 * a temporary, we're free to point that temp write somewhere else.
2340 *
2341 * Note that this doesn't guarantee that the instruction generated
2342 * only reg -- it might be the size=4 destination of a texture instruction.
2343 */
2344 fs_inst *
2345 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2346 fs_inst *end,
2347 fs_reg reg)
2348 {
2349 if (end == start ||
2350 end->predicate ||
2351 end->force_uncompressed ||
2352 end->force_sechalf ||
2353 reg.reladdr ||
2354 !reg.equals(end->dst)) {
2355 return NULL;
2356 } else {
2357 return end;
2358 }
2359 }
2360
/* Lays out the gen6+ fragment shader thread payload: records how many
 * payload registers precede the GRFs we can allocate, and where the
 * barycentric coordinates, source depth, and source W live within them.
 * Results are written into c->nr_payload_regs and friends.
 */
void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   /* "uses depth" here means the shader reads gl_FragCoord (WPOS). */
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch.*/

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   /* NOTE(review): source W is keyed off the same uses_depth condition as
    * source depth above — presumably WM_STATE enables both together when
    * WPOS is read; confirm against the state setup code.
    */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}
2418
/* Top-level driver for one FS compile at this visitor's dispatch width:
 * emits the FS IR for the program, runs the optimization loop, and
 * register-allocates.  Returns false (with fail_msg set) on failure.
 */
bool
fs_visitor::run()
{
   /* Snapshot the parameter count so we can assert below that no state
    * parameters were appended during compilation (which would have
    * realloced ParameterValues out from under the uniform storage).
    */
   sanity_param_count = fp->Base.Parameters->NumParameters;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      /* Debug path: replace the program with a trivial solid-color FS. */
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         /* GLSL path: walk the linked shader's IR. */
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         /* Fixed-function / ARB_fragment_program path. */
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
	 return false;

      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();

      emit_fb_writes();

      split_virtual_grfs();

      move_uniform_array_access_to_pull_constants();
      setup_pull_constants();

      /* Run the optimization passes to a fixed point: keep iterating as
       * long as any pass makes progress.
       */
      bool progress;
      do {
	 progress = false;

         compact_virtual_grfs();

	 progress = remove_duplicate_mrf_writes() || progress;

	 progress = opt_algebraic() || progress;
	 progress = opt_cse() || progress;
	 progress = opt_copy_propagate() || progress;
	 progress = dead_code_eliminate() || progress;
	 progress = register_coalesce() || progress;
	 progress = register_coalesce_2() || progress;
	 progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      /* Pre-register-allocation scheduling pass. */
      schedule_instructions(false);

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
	 /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
	 assign_regs_trivial();
      else {
         /* Retry allocation until it succeeds; each failed attempt spills
          * a register, and failed is set if spilling itself fails.
          */
	 while (!assign_regs()) {
	    if (failed)
	       break;
	 }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   /* Post-register-allocation scheduling pass. */
   schedule_instructions(true);

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == fp->Base.Parameters->NumParameters);

   return !failed;
}
2544
/* Compiles the fragment program to native code: runs the 8-wide compile
 * (mandatory), attempts an optional 16-wide compile, and hands both
 * instruction lists to the generator.  Returns the generated assembly,
 * or NULL (with InfoLog set) on failure.
 */
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   /* For DEBUG_PERF: note whether the GPU was busy before we started, so
    * we can report a compile that stalled rendering.
    */
   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   /* prog is NULL for ARB_fragment_program / fixed function. */
   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      /* The 8-wide compile is required; its failure fails the link. */
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   /* The 16-wide compile is opportunistic: skipped on gen < 5, when pull
    * parameters are in use, or under INTEL_DEBUG=no16, and a compile
    * failure here just falls back to the 8-wide program.
    */
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   bool no16 = INTEL_DEBUG & DEBUG_NO16;
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}
2623
2624 bool
2625 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2626 {
2627 struct brw_context *brw = brw_context(ctx);
2628 struct intel_context *intel = &brw->intel;
2629 struct brw_wm_prog_key key;
2630
2631 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2632 return true;
2633
2634 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2635 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2636 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2637 bool program_uses_dfdy = fp->UsesDFdy;
2638
2639 memset(&key, 0, sizeof(key));
2640
2641 if (intel->gen < 6) {
2642 if (fp->UsesKill)
2643 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2644
2645 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2646 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2647
2648 /* Just assume depth testing. */
2649 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2650 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2651 }
2652
2653 if (prog->Name != 0)
2654 key.proj_attrib_mask = 0xffffffff;
2655
2656 if (intel->gen < 6)
2657 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2658
2659 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2660 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2661 continue;
2662
2663 if (prog->Name == 0)
2664 key.proj_attrib_mask |= 1 << i;
2665
2666 if (intel->gen < 6) {
2667 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2668
2669 if (vp_index >= 0)
2670 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2671 }
2672 }
2673
2674 key.clamp_fragment_color = true;
2675
2676 for (int i = 0; i < MAX_SAMPLERS; i++) {
2677 if (fp->Base.ShadowSamplers & (1 << i)) {
2678 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2679 key.tex.swizzles[i] =
2680 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2681 } else {
2682 /* Color sampler: assume no swizzling. */
2683 key.tex.swizzles[i] = SWIZZLE_XYZW;
2684 }
2685 }
2686
2687 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2688 key.drawable_height = ctx->DrawBuffer->Height;
2689 }
2690
2691 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2692 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2693 }
2694
2695 key.nr_color_regions = 1;
2696
2697 key.program_string_id = bfp->id;
2698
2699 uint32_t old_prog_offset = brw->wm.prog_offset;
2700 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2701
2702 bool success = do_wm_prog(brw, prog, bfp, &key);
2703
2704 brw->wm.prog_offset = old_prog_offset;
2705 brw->wm.prog_data = old_prog_data;
2706
2707 return success;
2708 }