glsl: Replace most default cases in switches on GLSL type
src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

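/* Zero the whole instruction and give every field a safe default: a NOP
 * opcode, no conditional modifier, and undefined destination and sources.
 */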
void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

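/* Convenience constructors for common one- and two-source opcodes. Each
 * macro expands to an fs_visitor method that allocates the instruction out
 * of mem_ctx and returns it; the caller still has to emit() it.
 */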
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter. gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

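/** Returns true if the two instructions are identical in every field that
 * affects code generation.
 */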
bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

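/** Returns the number of consecutive virtual GRF registers this
 * instruction writes.
 */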
int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

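/** Returns true if this instruction's destination writes over any part of
 * the given register.
 */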
bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

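/** Returns true for SEND messages whose payload lives in the GRF rather
 * than in MRFs.
 */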
bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

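/* Source modifiers (negate/abs) can't be used on gen6 math, which ignores
 * them, or on sends that read their payload straight from the GRF.
 */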
bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

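/**
 * Returns how many components of register space a variable of the given
 * GLSL type takes up. The switch enumerates every GLSL_TYPE_* instead of
 * using a default case, so the compiler can warn when a new base type is
 * added without this function being updated.
 */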
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
      assert(!"not reached");
      break;
   }

   return 0;
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp. Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes. It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

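/* Shader-time instrumentation: record a timestamp at the top of the
 * program; emit_shader_time_end() reads another, subtracts, and accumulates
 * the delta into a buffer slot for this shader type via
 * emit_shader_time_write().
 */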
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles. Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

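/**
 * Marks the compile as failed and records the first failure message;
 * subsequent calls are ignored. The message is printed to stderr when
 * DEBUG_WM is set.
 */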
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

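/** Allocates a new virtual GRF of the given size (in registers) and returns
 * its index, growing the size-tracking array geometrically as needed.
 */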
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name. We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

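/* Emit a LINTERP of the given setup register using the delta_x/delta_y
 * barycentric coordinates that match the requested interpolation qualifier
 * and centroid mode.
 */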
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit. Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

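/* Emit a unary math instruction. On pre-gen6 hardware, math is a send that
 * takes its operand from the MRF, so base_mrf/mlen are set up here.
 */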
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7]. For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7]. For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

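/* Map UNIFORM-file registers onto the push-constant (CURB) space that is
 * loaded into the thread payload right after the fixed payload registers.
 */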
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

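/* Decide which URB slot each incoming fragment attribute lands in and
 * compute the resulting urb_read_length.
 */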
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to. In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader. So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization. We can do it once here, safely. This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs. If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything larger than one register. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous. We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs. Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again. Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs. Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

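/* Drop push-constant entries that no instruction still references,
 * compacting prog_data.param[] and renumbering the UNIFORM sources to
 * match. The remap is computed during the 8-wide compile and reused for
 * 16-wide.
 */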
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative. We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be. At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants. In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms. Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays. No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64). If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list. We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

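/** Local algebraic simplifications: a * 1.0 -> a, a * 0.0 -> 0.0, and
 * a + 0.0 -> a.
 */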
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer. R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
1841 */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}

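/* First coalescing pass: remove raw GRF-to-GRF moves by rewriting later
 * readers of the destination to read the source instead, when nothing in
 * the straight-line code that follows interferes.
 */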
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing. We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF. Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now. We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation. When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value. See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}


1973 bool
1974 fs_visitor::compute_to_mrf()
1975 {
1976 bool progress = false;
1977 int next_ip = 0;
1978
1979 calculate_live_intervals();
1980
1981 foreach_list_safe(node, &this->instructions) {
1982 fs_inst *inst = (fs_inst *)node;
1983
1984 int ip = next_ip;
1985 next_ip++;
1986
1987 if (inst->opcode != BRW_OPCODE_MOV ||
1988 inst->predicate ||
1989 inst->dst.file != MRF || inst->src[0].file != GRF ||
1990 inst->dst.type != inst->src[0].type ||
1991 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1992 continue;
1993
1994 /* Work out which hardware MRF registers are written by this
1995 * instruction.
1996 */
1997 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1998 int mrf_high;
1999 if (inst->dst.reg & BRW_MRF_COMPR4) {
2000 mrf_high = mrf_low + 4;
2001 } else if (dispatch_width == 16 &&
2002 (!inst->force_uncompressed && !inst->force_sechalf)) {
2003 mrf_high = mrf_low + 1;
2004 } else {
2005 mrf_high = mrf_low;
2006 }
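/* e.g. (illustrative): a COMPR4 write to m4 also writes m8, giving
 * mrf_low 4 and mrf_high 8; a plain 16-wide write to m4 also writes
 * m5, giving mrf_high 5.
 */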
2007
2008 /* Can't compute-to-MRF this GRF if someone else was going to
2009 * read it later.
2010 */
2011 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2012 continue;
2013
2014 /* Found a move of a GRF to an MRF. Let's see if we can rewrite
2015 * the generating instruction to write into the MRF directly.
2016 */
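/* For example (illustrative IR, assuming vgrf8 has no other uses):
 *
 *    ADD vgrf8, vgrf2, vgrf3
 *    MOV m4, vgrf8
 *
 * can become a single ADD m4, vgrf2, vgrf3.
 */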
2017 fs_inst *scan_inst;
2018 for (scan_inst = (fs_inst *)inst->prev;
2019 scan_inst->prev != NULL;
2020 scan_inst = (fs_inst *)scan_inst->prev) {
2021 if (scan_inst->dst.file == GRF &&
2022 scan_inst->dst.reg == inst->src[0].reg) {
2023 /* Found the last thing to write our reg we want to turn
2024 * into a compute-to-MRF.
2025 */
2026
2027 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2028 if (scan_inst->mlen) {
2029 break;
2030 }
2031
2032 /* If it's predicated, it (probably) didn't populate all
2033 * the channels. We might be able to rewrite everything
2034 * that writes that reg, but it would require smarter
2035 * tracking to delay the rewriting until complete success.
2036 */
2037 if (scan_inst->predicate)
2038 break;
2039
2040 /* If it's half of register setup and not the same half as
2041 * our MOV we're trying to remove, bail for now.
2042 */
2043 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2044 scan_inst->force_sechalf != inst->force_sechalf) {
2045 break;
2046 }
2047
2052 if (intel->gen >= 6) {
2053 /* gen6 math instructions must have the destination be
2054 * GRF, so no compute-to-MRF for them.
2055 */
2056 if (scan_inst->is_math()) {
2057 break;
2058 }
2059 }
2060
2061 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2062 /* Found the creator of our MRF's source value. */
2063 scan_inst->dst.file = MRF;
2064 scan_inst->dst.reg = inst->dst.reg;
2065 scan_inst->saturate |= inst->saturate;
2066 inst->remove();
2067 progress = true;
2068 }
2069 break;
2070 }
2071
2072 /* We don't handle flow control here. Most computation of
2073 * values that end up in MRFs happens shortly before the MRF
2074 * write anyway.
2075 */
2076 if (scan_inst->opcode == BRW_OPCODE_DO ||
2077 scan_inst->opcode == BRW_OPCODE_WHILE ||
2078 scan_inst->opcode == BRW_OPCODE_ELSE ||
2079 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2080 break;
2081 }
2082
2083 /* You can't read from an MRF, so if something else reads the
2084 * source GRF we wanted to rewrite into the MRF, that stops us.
2085 */
2086 bool interfered = false;
2087 for (int i = 0; i < 3; i++) {
2088 if (scan_inst->src[i].file == GRF &&
2089 scan_inst->src[i].reg == inst->src[0].reg &&
2090 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2091 interfered = true;
2092 }
2093 }
2094 if (interfered)
2095 break;
2096
2097 if (scan_inst->dst.file == MRF) {
2098 /* If somebody else writes our MRF here, we can't
2099 * compute-to-MRF before that.
2100 */
2101 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2102 int scan_mrf_high;
2103
2104 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2105 scan_mrf_high = scan_mrf_low + 4;
2106 } else if (dispatch_width == 16 &&
2107 (!scan_inst->force_uncompressed &&
2108 !scan_inst->force_sechalf)) {
2109 scan_mrf_high = scan_mrf_low + 1;
2110 } else {
2111 scan_mrf_high = scan_mrf_low;
2112 }
2113
2114 if (mrf_low == scan_mrf_low ||
2115 mrf_low == scan_mrf_high ||
2116 mrf_high == scan_mrf_low ||
2117 mrf_high == scan_mrf_high) {
2118 break;
2119 }
2120 }
2121
2122 if (scan_inst->mlen > 0) {
2123 /* Found a SEND instruction, which means that there are
2124 * live values in MRFs from base_mrf to base_mrf +
2125 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2126 * above it.
2127 */
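/* e.g. (illustrative): a SEND with base_mrf 2 and mlen 3 has m2-m4
 * live, so a candidate write to m3 or m4 can't be hoisted above it.
 */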
2128 if (mrf_low >= scan_inst->base_mrf &&
2129 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2130 break;
2131 }
2132 if (mrf_high >= scan_inst->base_mrf &&
2133 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2134 break;
2135 }
2136 }
2137 }
2138 }
2139
2140 if (progress)
2141 live_intervals_valid = false;
2142
2143 return progress;
2144 }
2145
2146 /**
2147 * Walks through basic blocks, looking for repeated MRF writes and
2148 * removing the later ones.
2149 */
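/* For example (illustrative IR):
 *
 *    MOV m2, vgrf5
 *    MOV m3, vgrf6
 *    MOV m2, vgrf5   <- m2 already holds this value; removed
 */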
2150 bool
2151 fs_visitor::remove_duplicate_mrf_writes()
2152 {
2153 fs_inst *last_mrf_move[16];
2154 bool progress = false;
2155
2156 /* The MRF tracking below doesn't yet handle compressed (16-wide) instructions. */
2157 if (dispatch_width == 16)
2158 return false;
2159
2160 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2161
2162 foreach_list_safe(node, &this->instructions) {
2163 fs_inst *inst = (fs_inst *)node;
2164
2165 switch (inst->opcode) {
2166 case BRW_OPCODE_DO:
2167 case BRW_OPCODE_WHILE:
2168 case BRW_OPCODE_IF:
2169 case BRW_OPCODE_ELSE:
2170 case BRW_OPCODE_ENDIF:
2171 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2172 continue;
2173 default:
2174 break;
2175 }
2176
2177 if (inst->opcode == BRW_OPCODE_MOV &&
2178 inst->dst.file == MRF) {
2179 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2180 if (prev_inst && inst->equals(prev_inst)) {
2181 inst->remove();
2182 progress = true;
2183 continue;
2184 }
2185 }
2186
2187 /* Clear out the last-write records for MRFs that were overwritten. */
2188 if (inst->dst.file == MRF) {
2189 last_mrf_move[inst->dst.reg] = NULL;
2190 }
2191
2192 if (inst->mlen > 0) {
2193 /* Found a SEND instruction, which will include two or fewer
2194 * implied MRF writes. We could do better here.
2195 */
2196 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2197 last_mrf_move[inst->base_mrf + i] = NULL;
2198 }
2199 }
2200
2201 /* Clear out any MRF move records whose sources got overwritten. */
2202 if (inst->dst.file == GRF) {
2203 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2204 if (last_mrf_move[i] &&
2205 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2206 last_mrf_move[i] = NULL;
2207 }
2208 }
2209 }
2210
2211 if (inst->opcode == BRW_OPCODE_MOV &&
2212 inst->dst.file == MRF &&
2213 inst->src[0].file == GRF &&
2214 !inst->predicate) {
2215 last_mrf_move[inst->dst.reg] = inst;
2216 }
2217 }
2218
2219 if (progress)
2220 live_intervals_valid = false;
2221
2222 return progress;
2223 }
2224
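/**
 * Print one instruction in a human-readable form for debugging.
 *
 * Illustrative output (not from a real shader):
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf5, u2, (null)
 */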
2225 void
2226 fs_visitor::dump_instruction(fs_inst *inst)
2227 {
2228 if (inst->predicate) {
2229 printf("(%cf0.%d) ",
2230 inst->predicate_inverse ? '-' : '+',
2231 inst->flag_subreg);
2232 }
2233
2234 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2235 opcode_descs[inst->opcode].name) {
2236 printf("%s", opcode_descs[inst->opcode].name);
2237 } else {
2238 printf("op%d", inst->opcode);
2239 }
2240 if (inst->saturate)
2241 printf(".sat");
2242 if (inst->conditional_mod) {
2243 printf(".cmod");
2244 if (!inst->predicate &&
2245 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2246 inst->opcode != BRW_OPCODE_IF &&
2247 inst->opcode != BRW_OPCODE_WHILE))) {
2248 printf(".f0.%d\n", inst->flag_subreg);
2249 }
2250 }
2251 printf(" ");
2252
2254 switch (inst->dst.file) {
2255 case GRF:
2256 printf("vgrf%d", inst->dst.reg);
2257 if (inst->dst.reg_offset)
2258 printf("+%d", inst->dst.reg_offset);
2259 break;
2260 case MRF:
2261 printf("m%d", inst->dst.reg);
2262 break;
2263 case BAD_FILE:
2264 printf("(null)");
2265 break;
2266 case UNIFORM:
2267 printf("***u%d***", inst->dst.reg);
2268 break;
2269 default:
2270 printf("???");
2271 break;
2272 }
2273 printf(", ");
2274
2275 for (int i = 0; i < 3; i++) {
2276 if (inst->src[i].negate)
2277 printf("-");
2278 if (inst->src[i].abs)
2279 printf("|");
2280 switch (inst->src[i].file) {
2281 case GRF:
2282 printf("vgrf%d", inst->src[i].reg);
2283 if (inst->src[i].reg_offset)
2284 printf("+%d", inst->src[i].reg_offset);
2285 break;
2286 case MRF:
2287 printf("***m%d***", inst->src[i].reg);
2288 break;
2289 case UNIFORM:
2290 printf("u%d", inst->src[i].reg);
2291 if (inst->src[i].reg_offset)
2292 printf(".%d", inst->src[i].reg_offset);
2293 break;
2294 case BAD_FILE:
2295 printf("(null)");
2296 break;
2297 default:
2298 printf("???");
2299 break;
2300 }
2301 if (inst->src[i].abs)
2302 printf("|");
2303
2304 if (i < 2)
2305 printf(", ");
2306 }
2307
2308 printf(" ");
2309
2310 if (inst->force_uncompressed)
2311 printf("1sthalf ");
2312
2313 if (inst->force_sechalf)
2314 printf("2ndhalf ");
2315
2316 printf("\n");
2317 }
2318
2319 void
2320 fs_visitor::dump_instructions()
2321 {
2322 int ip = 0;
2323 foreach_list(node, &this->instructions) {
2324 fs_inst *inst = (fs_inst *)node;
2325 printf("%d: ", ip++);
2326 dump_instruction(inst);
2327 }
2328 }
2329
2330 /**
2331 * Possibly returns an instruction that set up @param reg.
2332 *
2333 * Sometimes we want to take the result of some expression/variable
2334 * dereference tree and rewrite the instruction generating the result
2335 * of the tree. When processing the tree, we know that the
2336 * instructions generated are all writing temporaries that are dead
2337 * outside of this tree. So, if we have some instructions that write
2338 * a temporary, we're free to point that temp write somewhere else.
2339 *
2340 * Note that this doesn't guarantee that the returned instruction
2341 * wrote only reg -- it might be the size=4 destination of a texture instruction.
2342 */
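/* e.g. (illustrative): after emitting IR for "a + b", passing the
 * last emitted instruction and the result register here lets the
 * caller retarget that ADD's destination instead of emitting an
 * extra MOV.
 */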
2343 fs_inst *
2344 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2345 fs_inst *end,
2346 fs_reg reg)
2347 {
2348 if (end == start ||
2349 end->predicate ||
2350 end->force_uncompressed ||
2351 end->force_sechalf ||
2352 reg.reladdr ||
2353 !reg.equals(end->dst)) {
2354 return NULL;
2355 } else {
2356 return end;
2357 }
2358 }
2359
2360 void
2361 fs_visitor::setup_payload_gen6()
2362 {
2363 struct intel_context *intel = &brw->intel;
2364 bool uses_depth =
2365 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2366 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2367
2368 assert(intel->gen >= 6);
2369
2370 /* R0-1: masks, pixel X/Y coordinates. */
2371 c->nr_payload_regs = 2;
2372 /* R2: only for 32-pixel dispatch. */
2373
2374 /* R3-26: barycentric interpolation coordinates. These appear in the
2375 * same order that they appear in the brw_wm_barycentric_interp_mode
2376 * enum. Each set of coordinates occupies 2 registers if dispatch width
2377 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2378 * appear if they were enabled using the "Barycentric Interpolation
2379 * Mode" bits in WM_STATE.
2380 */
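/* e.g. (illustrative): with two barycentric modes enabled, an
 * 8-wide shader reserves 2 + 2 payload registers here and a
 * 16-wide shader reserves 4 + 4.
 */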
2381 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2382 if (barycentric_interp_modes & (1 << i)) {
2383 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2384 c->nr_payload_regs += 2;
2385 if (dispatch_width == 16) {
2386 c->nr_payload_regs += 2;
2387 }
2388 }
2389 }
2390
2391 /* R27: interpolated depth if uses source depth */
2392 if (uses_depth) {
2393 c->source_depth_reg = c->nr_payload_regs;
2394 c->nr_payload_regs++;
2395 if (dispatch_width == 16) {
2396 /* R28: interpolated depth if not 8-wide. */
2397 c->nr_payload_regs++;
2398 }
2399 }
2400 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2401 if (uses_depth) {
2402 c->source_w_reg = c->nr_payload_regs;
2403 c->nr_payload_regs++;
2404 if (dispatch_width == 16) {
2405 /* R30: interpolated W if not 8-wide. */
2406 c->nr_payload_regs++;
2407 }
2408 }
2409 /* R31: MSAA position offsets. */
2410 /* R32-: bary for 32-pixel. */
2411 /* R58-59: interp W for 32-pixel. */
2412
2413 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2414 c->source_depth_to_render_target = true;
2415 }
2416 }
2417
2418 bool
2419 fs_visitor::run()
2420 {
2421 sanity_param_count = fp->Base.Parameters->NumParameters;
2422 uint32_t orig_nr_params = c->prog_data.nr_params;
2423
2424 if (intel->gen >= 6)
2425 setup_payload_gen6();
2426 else
2427 setup_payload_gen4();
2428
2429 if (0) {
2430 emit_dummy_fs();
2431 } else {
2432 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2433 emit_shader_time_begin();
2434
2435 calculate_urb_setup();
2436 if (intel->gen < 6)
2437 emit_interpolation_setup_gen4();
2438 else
2439 emit_interpolation_setup_gen6();
2440
2441 /* We handle discards by keeping track of the still-live pixels in f0.1.
2442 * Initialize it with the dispatched pixels.
2443 */
2444 if (fp->UsesKill) {
2445 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2446 discard_init->flag_subreg = 1;
2447 }
2448
2449 /* Generate FS IR for main(). (The visitor only descends into
2450 * functions called "main".)
2451 */
2452 if (shader) {
2453 foreach_list(node, &*shader->ir) {
2454 ir_instruction *ir = (ir_instruction *)node;
2455 base_ir = ir;
2456 this->result = reg_undef;
2457 ir->accept(this);
2458 }
2459 } else {
2460 emit_fragment_program_code();
2461 }
2462 base_ir = NULL;
2463 if (failed)
2464 return false;
2465
2466 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2467 emit_shader_time_end();
2468
2469 emit_fb_writes();
2470
2471 split_virtual_grfs();
2472
2473 move_uniform_array_access_to_pull_constants();
2474 setup_pull_constants();
2475
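/* Run the optimization passes to a fixed point: the passes feed
 * each other (e.g. copy propagation can turn a MOV into dead code
 * for dead_code_eliminate() to remove), so one pass making progress
 * can expose more work for the rest.
 */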
2476 bool progress;
2477 do {
2478 progress = false;
2479
2480 compact_virtual_grfs();
2481
2482 progress = remove_duplicate_mrf_writes() || progress;
2483
2484 progress = opt_algebraic() || progress;
2485 progress = opt_cse() || progress;
2486 progress = opt_copy_propagate() || progress;
2487 progress = dead_code_eliminate() || progress;
2488 progress = register_coalesce() || progress;
2489 progress = register_coalesce_2() || progress;
2490 progress = compute_to_mrf() || progress;
2491 } while (progress);
2492
2493 remove_dead_constants();
2494
2495 schedule_instructions(false);
2496
2497 assign_curb_setup();
2498 assign_urb_setup();
2499
2500 if (0) {
2501 /* Debug of register spilling: Go spill everything. */
2502 for (int i = 0; i < virtual_grf_count; i++) {
2503 spill_reg(i);
2504 }
2505 }
2506
2507 if (0)
2508 assign_regs_trivial();
2509 else {
2510 while (!assign_regs()) {
2511 if (failed)
2512 break;
2513 }
2514 }
2515 }
2516 assert(force_uncompressed_stack == 0);
2517 assert(force_sechalf_stack == 0);
2518
2519 if (failed)
2520 return false;
2521
2522 schedule_instructions(true);
2523
2524 if (dispatch_width == 8) {
2525 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2526 } else {
2527 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2528
2529 /* Make sure we didn't try to sneak in an extra uniform */
2530 assert(orig_nr_params == c->prog_data.nr_params);
2531 (void) orig_nr_params;
2532 }
2533
2534 /* If any state parameters were appended, then ParameterValues could have
2535 * been realloced, in which case the driver uniform storage set up by
2536 * _mesa_associate_uniform_storage() would point to freed memory. Make
2537 * sure that didn't happen.
2538 */
2539 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2540
2541 return !failed;
2542 }
2543
2544 const unsigned *
2545 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2546 struct gl_fragment_program *fp,
2547 struct gl_shader_program *prog,
2548 unsigned *final_assembly_size)
2549 {
2550 struct intel_context *intel = &brw->intel;
2551 bool start_busy = false;
2552 float start_time = 0;
2553
2554 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2555 start_busy = (intel->batch.last_bo &&
2556 drm_intel_bo_busy(intel->batch.last_bo));
2557 start_time = get_time();
2558 }
2559
2560 struct brw_shader *shader = NULL;
2561 if (prog)
2562 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2563
2564 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2565 if (shader) {
2566 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2567 _mesa_print_ir(shader->ir, NULL);
2568 printf("\n\n");
2569 } else {
2570 printf("ARB_fragment_program %d ir for native fragment shader\n",
2571 fp->Base.Id);
2572 _mesa_print_program(&fp->Base);
2573 }
2574 }
2575
2576 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2577 */
2578 fs_visitor v(brw, c, prog, fp, 8);
2579 if (!v.run()) {
2580 prog->LinkStatus = false;
2581 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2582
2583 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2584 v.fail_msg);
2585
2586 return NULL;
2587 }
2588
2589 exec_list *simd16_instructions = NULL;
2590 fs_visitor v2(brw, c, prog, fp, 16);
2591 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2592 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2593 v2.import_uniforms(&v);
2594 if (!v2.run()) {
2595 perf_debug("16-wide shader failed to compile, falling back to "
2596 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2597 } else {
2598 simd16_instructions = &v2.instructions;
2599 }
2600 }
2601
2602 c->prog_data.dispatch_width = 8;
2603
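/* generate_assembly() emits the 8-wide program and then, when one
 * compiled successfully, the 16-wide program into the single
 * assembly blob returned below.
 */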
2604 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2605 const unsigned *generated = g.generate_assembly(&v.instructions,
2606 simd16_instructions,
2607 final_assembly_size);
2608
2609 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2610 if (shader->compiled_once)
2611 brw_wm_debug_recompile(brw, prog, &c->key);
2612 shader->compiled_once = true;
2613
2614 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2615 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2616 (get_time() - start_time) * 1000);
2617 }
2618 }
2619
2620 return generated;
2621 }
2622
2623 bool
2624 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2625 {
2626 struct brw_context *brw = brw_context(ctx);
2627 struct intel_context *intel = &brw->intel;
2628 struct brw_wm_prog_key key;
2629
2630 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2631 return true;
2632
2633 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2634 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2635 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2636 bool program_uses_dfdy = fp->UsesDFdy;
2637
2638 memset(&key, 0, sizeof(key));
2639
2640 if (intel->gen < 6) {
2641 if (fp->UsesKill)
2642 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2643
2644 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2645 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2646
2647 /* Just assume depth testing. */
2648 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2649 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2650 }
2651
2652 if (prog->Name != 0)
2653 key.proj_attrib_mask = 0xffffffff;
2654
2655 if (intel->gen < 6)
2656 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2657
2658 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2659 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2660 continue;
2661
2662 if (prog->Name == 0)
2663 key.proj_attrib_mask |= 1 << i;
2664
2665 if (intel->gen < 6) {
2666 int vp_index = _mesa_frag_attrib_to_vert_result((gl_frag_attrib) i);
2667
2668 if (vp_index >= 0)
2669 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2670 }
2671 }
2672
2673 key.clamp_fragment_color = true;
2674
2675 for (int i = 0; i < MAX_SAMPLERS; i++) {
2676 if (fp->Base.ShadowSamplers & (1 << i)) {
2677 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2678 key.tex.swizzles[i] =
2679 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2680 } else {
2681 /* Color sampler: assume no swizzling. */
2682 key.tex.swizzles[i] = SWIZZLE_XYZW;
2683 }
2684 }
2685
2686 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2687 key.drawable_height = ctx->DrawBuffer->Height;
2688 }
2689
2690 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2691 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2692 }
2693
2694 key.nr_color_regions = 1;
2695
2696 key.program_string_id = bfp->id;
2697
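/* do_wm_prog() installs the freshly compiled program as a side
 * effect, so save the current program state here and restore it
 * once the precompile is done.
 */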
2698 uint32_t old_prog_offset = brw->wm.prog_offset;
2699 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2700
2701 bool success = do_wm_prog(brw, prog, bfp, &key);
2702
2703 brw->wm.prog_offset = old_prog_offset;
2704 brw->wm.prog_data = old_prog_data;
2705
2706 return success;
2707 }