i965/fs: Rewrite discards to use a flag subreg to track discarded pixels.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

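/* Zero all fields, then establish the defaults every constructor relies on:
 * a NOP opcode, no conditional modifier, and undefined dst/src registers.
 */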
void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

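/* Convenience constructors for single- and dual-source ALU instructions.
 * Each expansion defines a method that allocates (but does not emit) a new
 * fs_inst, so e.g. ADD(dst, a, b) can be passed to emit() or insert_before().
 */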
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

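/* Build the instruction sequence for loading from the pull constant buffer
 * at a varying (per-channel) offset.  On gen7 this is a single send from a
 * GRF payload; on earlier generations the offset is first staged in an MRF,
 * converted from a dword index to a byte offset on gen4/5.
 */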
exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

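/* Field-by-field comparison of two instructions, covering everything that
 * affects the generated code, so passes can tell when two instructions are
 * interchangeable.
 */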
bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_send_from_grf()
{
   return opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

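/* Read the TIMESTAMP architecture register into a virtual GRF, for use by
 * the shader_time instrumentation below.  Only wired up for gen7+.
 */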
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type;
   if (dispatch_width == 8) {
      type = ST_FS8;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
   }

   emit_shader_time_write(type, shader_start_time, get_timestamp());
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg start, fs_reg end)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = end;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, end));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, diff));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;

   pop_force_uncompressed();

   emit(BRW_OPCODE_ENDIF);
}

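/* Mark this compile as failed and record the reason.  Only the first
 * failure message is kept, and it is printed when DEBUG_WM is set.
 */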
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

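/* Allocate a virtual GRF of the given size in full registers, growing the
 * virtual_grf_sizes array as needed, and return its index.
 */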
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}

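/* Set up gl_FragCoord: x/y come from the pixel position payload, adjusted
 * for pixel-center and origin conventions (including the Y flip for FBO
 * rendering), z is read from the payload on gen6+ or interpolated on older
 * parts, and w reuses the wpos_w value computed during interpolation setup.
 */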
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

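/* Emit a LINTERP for one attribute channel, selecting the barycentric
 * coordinate set that matches the interpolation qualifier (perspective vs.
 * noperspective) and centroid mode.
 */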
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

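/* Emit a unary math instruction, handling the per-generation quirks: gen6
 * math can't use uniforms or source modifiers directly, and pre-gen6 math
 * is a send that needs MRF space reserved for its payload.
 */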
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 7) {
      inst = emit(opcode, dst, src0, src1);
   } else if (intel->gen == 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src0.type;
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src1.type;
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}

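/* Rewrite UNIFORM-file sources to the fixed payload registers where the
 * CURBE (push constant) data is loaded, right after the thread payload.
 */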
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

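/* Decide which URB setup slot each fragment shader input lands in: gen6+
 * packs them in InputsRead order, while earlier generations must match the
 * layout produced by the SF thread.
 */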
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

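/* Drop push constant parameters that no instruction actually reads and
 * renumber the survivors.  The remap table is built during the 8-wide
 * compile and reused by the 16-wide compile so both agree on the layout.
 */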
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         /* We've already done setup_paramvalues_refs() so no need to worry
          * about param_index and param_offset.
          */
         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

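/* Local algebraic simplifications on immediate operands: a * 1.0 becomes a
 * MOV of a, a * 0.0 becomes a MOV of 0.0, and a + 0.0 becomes a MOV of a.
 */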
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something defined but never used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list_safe(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}

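/* Coalesce raw GRF-to-GRF (or uniform-to-GRF) moves by rewriting later
 * readers of the destination to read the source instead.  Only runs on
 * straight-line code: inside a loop or if block, scanning forward from the
 * MOV wouldn't see every instruction that could interfere.
 */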
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

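/* Turn a MOV from a GRF into an MRF into a direct computation: find the
 * instruction that produced the GRF value and, when it's safe, retarget its
 * destination to the MRF so the intermediate MOV can be removed.
 */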
1970 bool
1971 fs_visitor::compute_to_mrf()
1972 {
1973 bool progress = false;
1974 int next_ip = 0;
1975
1976 calculate_live_intervals();
1977
1978 foreach_list_safe(node, &this->instructions) {
1979 fs_inst *inst = (fs_inst *)node;
1980
1981 int ip = next_ip;
1982 next_ip++;
1983
1984 if (inst->opcode != BRW_OPCODE_MOV ||
1985 inst->predicate ||
1986 inst->dst.file != MRF || inst->src[0].file != GRF ||
1987 inst->dst.type != inst->src[0].type ||
1988 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1989 continue;
1990
1991 /* Work out which hardware MRF registers are written by this
1992 * instruction.
1993 */
1994 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1995 int mrf_high;
1996 if (inst->dst.reg & BRW_MRF_COMPR4) {
1997 mrf_high = mrf_low + 4;
1998 } else if (dispatch_width == 16 &&
1999 (!inst->force_uncompressed && !inst->force_sechalf)) {
2000 mrf_high = mrf_low + 1;
2001 } else {
2002 mrf_high = mrf_low;
2003 }
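/* Worked example of the rules above (MRF numbers invented): a COMPR4
 * write with base m2 touches m2 and m6, so mrf_high is mrf_low + 4; a
 * full 16-wide (compressed) write to m2 covers m2..m3, so mrf_high is
 * mrf_low + 1.
 */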
2004
2005 /* Can't compute-to-MRF this GRF if someone else was going to
2006 * read it later.
2007 */
2008 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2009 continue;
2010
2011 /* Found a move of a GRF to an MRF.  Let's see if we can rewrite the
2012 * instruction that produced this GRF to write into the MRF directly.
2013 */
2014 fs_inst *scan_inst;
2015 for (scan_inst = (fs_inst *)inst->prev;
2016 scan_inst->prev != NULL;
2017 scan_inst = (fs_inst *)scan_inst->prev) {
2018 if (scan_inst->dst.file == GRF &&
2019 scan_inst->dst.reg == inst->src[0].reg) {
2020 /* Found the last thing to write our reg we want to turn
2021 * into a compute-to-MRF.
2022 */
2023
2024 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2025 if (scan_inst->mlen) {
2026 break;
2027 }
2028
2029 /* If it's predicated, it (probably) didn't populate all
2030 * the channels. We might be able to rewrite everything
2031 * that writes that reg, but it would require smarter
2032 * tracking to delay the rewriting until complete success.
2033 */
2034 if (scan_inst->predicate)
2035 break;
2036
2037 /* If it's half of register setup and not the same half as
2038 * our MOV we're trying to remove, bail for now.
2039 */
2040 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2041 scan_inst->force_sechalf != inst->force_sechalf) {
2042 break;
2043 }
2044
2049 if (intel->gen >= 6) {
2050 /* gen6 math instructions must have the destination be
2051 * GRF, so no compute-to-MRF for them.
2052 */
2053 if (scan_inst->is_math()) {
2054 break;
2055 }
2056 }
2057
2058 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2059 /* Found the creator of our MRF's source value. */
2060 scan_inst->dst.file = MRF;
2061 scan_inst->dst.reg = inst->dst.reg;
2062 scan_inst->saturate |= inst->saturate;
2063 inst->remove();
2064 progress = true;
2065 }
2066 break;
2067 }
2068
2069 /* We don't handle flow control here.  Most computation of
2070 * values that end up in MRFs happens shortly before the MRF
2071 * write anyway.
2072 */
2073 if (scan_inst->opcode == BRW_OPCODE_DO ||
2074 scan_inst->opcode == BRW_OPCODE_WHILE ||
2075 scan_inst->opcode == BRW_OPCODE_ELSE ||
2076 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2077 break;
2078 }
2079
2080 /* MRFs can't be read, so if another instruction reads the source
2081 * GRF we wanted to rewrite into the MRF, we have to stop here.
2082 */
2083 bool interfered = false;
2084 for (int i = 0; i < 3; i++) {
2085 if (scan_inst->src[i].file == GRF &&
2086 scan_inst->src[i].reg == inst->src[0].reg &&
2087 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2088 interfered = true;
2089 }
2090 }
2091 if (interfered)
2092 break;
2093
2094 if (scan_inst->dst.file == MRF) {
2095 /* If somebody else writes our MRF here, we can't
2096 * compute-to-MRF before that.
2097 */
2098 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2099 int scan_mrf_high;
2100
2101 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2102 scan_mrf_high = scan_mrf_low + 4;
2103 } else if (dispatch_width == 16 &&
2104 (!scan_inst->force_uncompressed &&
2105 !scan_inst->force_sechalf)) {
2106 scan_mrf_high = scan_mrf_low + 1;
2107 } else {
2108 scan_mrf_high = scan_mrf_low;
2109 }
2110
2111 if (mrf_low == scan_mrf_low ||
2112 mrf_low == scan_mrf_high ||
2113 mrf_high == scan_mrf_low ||
2114 mrf_high == scan_mrf_high) {
2115 break;
2116 }
2117 }
2118
2119 if (scan_inst->mlen > 0) {
2120 /* Found a SEND instruction, which means that there are
2121 * live values in MRFs from base_mrf to base_mrf +
2122 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2123 * above it.
2124 */
2125 if (mrf_low >= scan_inst->base_mrf &&
2126 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2127 break;
2128 }
2129 if (mrf_high >= scan_inst->base_mrf &&
2130 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2131 break;
2132 }
2133 }
2134 }
2135 }
2136
2137 if (progress)
2138 live_intervals_valid = false;
2139
2140 return progress;
2141 }
2142
2143 /**
2144 * Walks through basic blocks, looking for repeated MRF writes and
2145 * removing the later ones.
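*
* For example (hypothetical registers), if "mov m3, vgrf2" appears twice
* in a block with no intervening write to m3 or vgrf2 and no SEND that
* uses m3, the second copy is redundant and gets removed.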
2146 */
2147 bool
2148 fs_visitor::remove_duplicate_mrf_writes()
2149 {
2150 fs_inst *last_mrf_move[16];
2151 bool progress = false;
2152
2153 /* Bail on SIMD16: the MRF tracking below can't yet handle compressed instructions. */
2154 if (dispatch_width == 16)
2155 return false;
2156
2157 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2158
2159 foreach_list_safe(node, &this->instructions) {
2160 fs_inst *inst = (fs_inst *)node;
2161
2162 switch (inst->opcode) {
2163 case BRW_OPCODE_DO:
2164 case BRW_OPCODE_WHILE:
2165 case BRW_OPCODE_IF:
2166 case BRW_OPCODE_ELSE:
2167 case BRW_OPCODE_ENDIF:
2168 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2169 continue;
2170 default:
2171 break;
2172 }
2173
2174 if (inst->opcode == BRW_OPCODE_MOV &&
2175 inst->dst.file == MRF) {
2176 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2177 if (prev_inst && inst->equals(prev_inst)) {
2178 inst->remove();
2179 progress = true;
2180 continue;
2181 }
2182 }
2183
2184 /* Clear out the last-write records for MRFs that were overwritten. */
2185 if (inst->dst.file == MRF) {
2186 last_mrf_move[inst->dst.reg] = NULL;
2187 }
2188
2189 if (inst->mlen > 0) {
2190 /* Found a SEND instruction, which will include two or fewer
2191 * implied MRF writes. We could do better here.
2192 */
2193 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2194 last_mrf_move[inst->base_mrf + i] = NULL;
2195 }
2196 }
2197
2198 /* Clear out any MRF move records whose sources got overwritten. */
2199 if (inst->dst.file == GRF) {
2200 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2201 if (last_mrf_move[i] &&
2202 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2203 last_mrf_move[i] = NULL;
2204 }
2205 }
2206 }
2207
2208 if (inst->opcode == BRW_OPCODE_MOV &&
2209 inst->dst.file == MRF &&
2210 inst->src[0].file == GRF &&
2211 !inst->predicate) {
2212 last_mrf_move[inst->dst.reg] = inst;
2213 }
2214 }
2215
2216 if (progress)
2217 live_intervals_valid = false;
2218
2219 return progress;
2220 }
2221
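/* Debug helper: prints one instruction in a compact form.  A dumped line
 * looks roughly like this (illustrative output, not captured from a run):
 *
 *    (+f0.1) add.sat vgrf7+1, vgrf3, u2, (null)
 *
 * i.e. optional predicate, opcode plus .sat/.cmod suffixes, destination,
 * then the three sources.
 */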
2222 void
2223 fs_visitor::dump_instruction(fs_inst *inst)
2224 {
2225 if (inst->predicate) {
2226 printf("(%cf0.%d) ",
2227 inst->predicate_inverse ? '-' : '+',
2228 inst->flag_subreg);
2229 }
2230
2231 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2232 opcode_descs[inst->opcode].name) {
2233 printf("%s", opcode_descs[inst->opcode].name);
2234 } else {
2235 printf("op%d", inst->opcode);
2236 }
2237 if (inst->saturate)
2238 printf(".sat");
2239 if (inst->conditional_mod) {
2240 printf(".cmod");
2241 if (!inst->predicate &&
2242 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2243 inst->opcode != BRW_OPCODE_IF &&
2244 inst->opcode != BRW_OPCODE_WHILE))) {
2245 printf(".f0.%d", inst->flag_subreg);
2246 }
2247 }
2248 printf(" ");
2249
2251 switch (inst->dst.file) {
2252 case GRF:
2253 printf("vgrf%d", inst->dst.reg);
2254 if (inst->dst.reg_offset)
2255 printf("+%d", inst->dst.reg_offset);
2256 break;
2257 case MRF:
2258 printf("m%d", inst->dst.reg);
2259 break;
2260 case BAD_FILE:
2261 printf("(null)");
2262 break;
2263 case UNIFORM:
2264 printf("***u%d***", inst->dst.reg);
2265 break;
2266 default:
2267 printf("???");
2268 break;
2269 }
2270 printf(", ");
2271
2272 for (int i = 0; i < 3; i++) {
2273 if (inst->src[i].negate)
2274 printf("-");
2275 if (inst->src[i].abs)
2276 printf("|");
2277 switch (inst->src[i].file) {
2278 case GRF:
2279 printf("vgrf%d", inst->src[i].reg);
2280 if (inst->src[i].reg_offset)
2281 printf("+%d", inst->src[i].reg_offset);
2282 break;
2283 case MRF:
2284 printf("***m%d***", inst->src[i].reg);
2285 break;
2286 case UNIFORM:
2287 printf("u%d", inst->src[i].reg);
2288 if (inst->src[i].reg_offset)
2289 printf(".%d", inst->src[i].reg_offset);
2290 break;
2291 case BAD_FILE:
2292 printf("(null)");
2293 break;
2294 default:
2295 printf("???");
2296 break;
2297 }
2298 if (inst->src[i].abs)
2299 printf("|");
2300
2301 if (i < 2)
2302 printf(", ");
2303 }
2304
2305 printf(" ");
2306
2307 if (inst->force_uncompressed)
2308 printf("1sthalf ");
2309
2310 if (inst->force_sechalf)
2311 printf("2ndhalf ");
2312
2313 printf("\n");
2314 }
2315
2316 void
2317 fs_visitor::dump_instructions()
2318 {
2319 int ip = 0;
2320 foreach_list(node, &this->instructions) {
2321 fs_inst *inst = (fs_inst *)node;
2322 printf("%d: ", ip++);
2323 dump_instruction(inst);
2324 }
2325 }
2326
2327 /**
2328 * Possibly returns the instruction that set up @reg.
2329 *
2330 * Sometimes we want to take the result of some expression/variable
2331 * dereference tree and rewrite the instruction generating the result
2332 * of the tree. When processing the tree, we know that the
2333 * instructions generated are all writing temporaries that are dead
2334 * outside of this tree. So, if we have some instructions that write
2335 * a temporary, we're free to point that temp write somewhere else.
2336 *
2337 * Note that this doesn't guarantee that the instruction wrote
2338 * only @reg -- it might be the size=4 destination of a texture instruction.
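*
* Hypothetical use: after emitting the instructions for an expression, a
* caller can pass the last instruction as @end; if its destination is
* exactly @reg (and it's unpredicated and uncompressed), it gets the
* instruction back and may retarget that destination.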
2339 */
2340 fs_inst *
2341 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2342 fs_inst *end,
2343 fs_reg reg)
2344 {
2345 if (end == start ||
2346 end->predicate ||
2347 end->force_uncompressed ||
2348 end->force_sechalf ||
2349 reg.reladdr ||
2350 !reg.equals(end->dst)) {
2351 return NULL;
2352 } else {
2353 return end;
2354 }
2355 }
2356
2357 void
2358 fs_visitor::setup_payload_gen6()
2359 {
2360 struct intel_context *intel = &brw->intel;
2361 bool uses_depth =
2362 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2363 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2364
2365 assert(intel->gen >= 6);
2366
2367 /* R0-1: masks, pixel X/Y coordinates. */
2368 c->nr_payload_regs = 2;
2369 /* R2: only for 32-pixel dispatch. */
2370
2371 /* R3-26: barycentric interpolation coordinates. These appear in the
2372 * same order that they appear in the brw_wm_barycentric_interp_mode
2373 * enum. Each set of coordinates occupies 2 registers if dispatch width
2374 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2375 * appear if they were enabled using the "Barycentric Interpolation
2376 * Mode" bits in WM_STATE.
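*
* Worked example from the rules above: with two barycentric modes
* enabled at dispatch width 16, the loop below advances
* c->nr_payload_regs by 2 * 4 = 8 registers.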
2377 */
2378 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2379 if (barycentric_interp_modes & (1 << i)) {
2380 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2381 c->nr_payload_regs += 2;
2382 if (dispatch_width == 16) {
2383 c->nr_payload_regs += 2;
2384 }
2385 }
2386 }
2387
2388 /* R27: interpolated depth if uses source depth */
2389 if (uses_depth) {
2390 c->source_depth_reg = c->nr_payload_regs;
2391 c->nr_payload_regs++;
2392 if (dispatch_width == 16) {
2393 /* R28: interpolated depth if not 8-wide. */
2394 c->nr_payload_regs++;
2395 }
2396 }
2397 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2398 if (uses_depth) {
2399 c->source_w_reg = c->nr_payload_regs;
2400 c->nr_payload_regs++;
2401 if (dispatch_width == 16) {
2402 /* R30: interpolated W if not 8-wide. */
2403 c->nr_payload_regs++;
2404 }
2405 }
2406 /* R31: MSAA position offsets. */
2407 /* R32-: bary for 32-pixel. */
2408 /* R58-59: interp W for 32-pixel. */
2409
2410 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2411 c->source_depth_to_render_target = true;
2412 }
2413 }
2414
2415 bool
2416 fs_visitor::run()
2417 {
2418 uint32_t orig_nr_params = c->prog_data.nr_params;
2419
2420 if (intel->gen >= 6)
2421 setup_payload_gen6();
2422 else
2423 setup_payload_gen4();
2424
2425 if (0) {
2426 emit_dummy_fs();
2427 } else {
2428 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2429 emit_shader_time_begin();
2430
2431 calculate_urb_setup();
2432 if (intel->gen < 6)
2433 emit_interpolation_setup_gen4();
2434 else
2435 emit_interpolation_setup_gen6();
2436
2437 /* We handle discards by keeping track of the still-live pixels in f0.1.
2438 * Initialize it with the dispatched pixels.
2439 */
2440 if (fp->UsesKill) {
2441 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2442 discard_init->flag_subreg = 1;
2443 }
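/* From here on, every discard is expected to clear the f0.1 bits of the
 * killed channels, so f0.1 always describes the still-live pixels.  (A
 * description of the scheme; the exact instructions are emitted by the
 * visitor.)
 */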
2444
2445 /* Generate FS IR for main().  (The visitor only descends into
2446 * functions called "main".)
2447 */
2448 if (shader) {
2449 foreach_list(node, &*shader->ir) {
2450 ir_instruction *ir = (ir_instruction *)node;
2451 base_ir = ir;
2452 this->result = reg_undef;
2453 ir->accept(this);
2454 }
2455 } else {
2456 emit_fragment_program_code();
2457 }
2458 base_ir = NULL;
2459 if (failed)
2460 return false;
2461
2462 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2463 emit_shader_time_end();
2464
2465 emit_fb_writes();
2466
2467 split_virtual_grfs();
2468
2469 setup_paramvalues_refs();
2470 move_uniform_array_access_to_pull_constants();
2471 setup_pull_constants();
2472
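/* Run the optimization passes to a fixed point: each pass can expose new
 * opportunities for the others (copy propagation can create dead code,
 * for instance), so loop until a full round makes no progress.
 */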
2473 bool progress;
2474 do {
2475 progress = false;
2476
2477 compact_virtual_grfs();
2478
2479 progress = remove_duplicate_mrf_writes() || progress;
2480
2481 progress = opt_algebraic() || progress;
2482 progress = opt_cse() || progress;
2483 progress = opt_copy_propagate() || progress;
2484 progress = dead_code_eliminate() || progress;
2485 progress = register_coalesce() || progress;
2486 progress = register_coalesce_2() || progress;
2487 progress = compute_to_mrf() || progress;
2488 } while (progress);
2489
2490 remove_dead_constants();
2491
2492 schedule_instructions();
2493
2494 assign_curb_setup();
2495 assign_urb_setup();
2496
2497 if (0) {
2498 /* Debug of register spilling: Go spill everything. */
2499 for (int i = 0; i < virtual_grf_count; i++) {
2500 spill_reg(i);
2501 }
2502 }
2503
2504 if (0)
2505 assign_regs_trivial();
2506 else {
2507 while (!assign_regs()) {
2508 if (failed)
2509 break;
2510 }
2511 }
2512 }
2513 assert(force_uncompressed_stack == 0);
2514 assert(force_sechalf_stack == 0);
2515
2516 if (failed)
2517 return false;
2518
2519 if (dispatch_width == 8) {
2520 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2521 } else {
2522 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2523
2524 /* Make sure we didn't try to sneak in an extra uniform */
2525 assert(orig_nr_params == c->prog_data.nr_params);
2526 (void) orig_nr_params;
2527 }
2528
2529 return !failed;
2530 }
2531
2532 const unsigned *
2533 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2534 struct gl_fragment_program *fp,
2535 struct gl_shader_program *prog,
2536 unsigned *final_assembly_size)
2537 {
2538 struct intel_context *intel = &brw->intel;
2539 bool start_busy = false;
2540 float start_time = 0;
2541
2542 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2543 start_busy = (intel->batch.last_bo &&
2544 drm_intel_bo_busy(intel->batch.last_bo));
2545 start_time = get_time();
2546 }
2547
2548 struct brw_shader *shader = NULL;
2549 if (prog)
2550 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2551
2552 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2553 if (shader) {
2554 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2555 _mesa_print_ir(shader->ir, NULL);
2556 printf("\n\n");
2557 } else {
2558 printf("ARB_fragment_program %d ir for native fragment shader\n",
2559 fp->Base.Id);
2560 _mesa_print_program(&fp->Base);
2561 }
2562 }
2563
2564 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2565 */
2566 fs_visitor v(brw, c, prog, fp, 8);
2567 if (!v.run()) {
2568 prog->LinkStatus = false;
2569 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2570
2571 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2572 v.fail_msg);
2573
2574 return NULL;
2575 }
2576
2577 exec_list *simd16_instructions = NULL;
2578 fs_visitor v2(brw, c, prog, fp, 16);
2579 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2580 v2.import_uniforms(&v);
2581 if (!v2.run()) {
2582 perf_debug("16-wide shader failed to compile, falling back to "
2583 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2584 } else {
2585 simd16_instructions = &v2.instructions;
2586 }
2587 }
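/* v holds the 8-wide program; when the 16-wide compile succeeded,
 * simd16_instructions points at v2's IR and the generator emits both
 * variants into a single program.  (A high-level sketch of the flow.)
 */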
2588
2589 c->prog_data.dispatch_width = 8;
2590
2591 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2592 const unsigned *generated = g.generate_assembly(&v.instructions,
2593 simd16_instructions,
2594 final_assembly_size);
2595
2596 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2597 if (shader->compiled_once)
2598 brw_wm_debug_recompile(brw, prog, &c->key);
2599 shader->compiled_once = true;
2600
2601 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2602 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2603 (get_time() - start_time) * 1000);
2604 }
2605 }
2606
2607 return generated;
2608 }
2609
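/* Link-time precompile: construct a best-guess program key for this
 * fragment program so that a likely variant is compiled before draw time.
 * The guesses below are heuristics, not guarantees; the previous program
 * pointers are restored afterwards, so this only warms the cache.
 */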
2610 bool
2611 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2612 {
2613 struct brw_context *brw = brw_context(ctx);
2614 struct intel_context *intel = &brw->intel;
2615 struct brw_wm_prog_key key;
2616
2617 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2618 return true;
2619
2620 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2621 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2622 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2623 bool program_uses_dfdy = fp->UsesDFdy;
2624
2625 memset(&key, 0, sizeof(key));
2626
2627 if (intel->gen < 6) {
2628 if (fp->UsesKill)
2629 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2630
2631 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2632 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2633
2634 /* Just assume depth testing. */
2635 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2636 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2637 }
2638
2639 if (prog->Name != 0)
2640 key.proj_attrib_mask = 0xffffffff;
2641
2642 if (intel->gen < 6)
2643 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2644
2645 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2646 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2647 continue;
2648
2649 if (prog->Name == 0)
2650 key.proj_attrib_mask |= 1 << i;
2651
2652 if (intel->gen < 6) {
2653 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2654
2655 if (vp_index >= 0)
2656 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2657 }
2658 }
2659
2660 key.clamp_fragment_color = true;
2661
2662 for (int i = 0; i < MAX_SAMPLERS; i++) {
2663 if (fp->Base.ShadowSamplers & (1 << i)) {
2664 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2665 key.tex.swizzles[i] =
2666 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2667 } else {
2668 /* Color sampler: assume no swizzling. */
2669 key.tex.swizzles[i] = SWIZZLE_XYZW;
2670 }
2671 }
2672
2673 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2674 key.drawable_height = ctx->DrawBuffer->Height;
2675 }
2676
2677 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2678 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2679 }
2680
2681 key.nr_color_regions = 1;
2682
2683 key.program_string_id = bfp->id;
2684
2685 uint32_t old_prog_offset = brw->wm.prog_offset;
2686 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2687
2688 bool success = do_wm_prog(brw, prog, bfp, &key);
2689
2690 brw->wm.prog_offset = old_prog_offset;
2691 brw->wm.prog_data = old_prog_data;
2692
2693 return success;
2694 }