i965/fs: Reference the core GL uniform storage for non-builtin uniforms.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                      \
   fs_inst *                                                          \
   fs_visitor::op(fs_reg dst, fs_reg src0)                            \
   {                                                                  \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);        \
   }

#define ALU2(op)                                                      \
   fs_inst *                                                          \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)               \
   {                                                                  \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);  \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
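   /* Illustrative arithmetic (an addition, not from the original comment):
    * at a nominal 1.2 GHz clock, a 32-bit cycle counter wraps after
    *
    *    2^32 cycles / 1.2e9 cycles/sec ~= 3.58 seconds,
    *
    * which is where the "roll over every ~3 seconds" figure above comes
    * from.
    */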
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index <= brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
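   /* Illustrative example (an addition, not from the original comment):
    * given
    *
    *    uniform struct { vec4 color; float scale; } s;
    *
    * the storage list holds separate entries named "s.color" and "s.scale".
    * Both match a lookup for ir->name == "s": the prefix compares equal and
    * the character just past the match is '.'.  The '[' case covers entries
    * like "a[0].color" for arrays of structs.
    */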
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
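/* Illustrative example (an addition, not from the original comment): a
 * vec4 temporary occupies one size-4 virtual GRF.  After splitting, an
 * access at reg_offset 2 is rewritten to its own size-1 virtual GRF, so
 * the register allocator no longer needs four contiguous hardware
 * registers for the whole vector.
 */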
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
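/* Illustrative example (an addition, not from the original comment): if
 * only virtual GRFs 0, 3, and 7 are referenced, remap_table becomes
 * {0, -1, -1, 1, -1, -1, -1, 2} and later passes iterate over 3
 * registers instead of 8.
 */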
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
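/* Illustrative example (an addition, not from the original comment):
 * GLSL like
 *
 *    uniform vec4 colors[32];
 *    ... = colors[idx];
 *
 * reaches this pass as a UNIFORM source whose reladdr register holds idx.
 * The pass copies the array into the pull constant buffer, emits an ADD
 * to form the element offset, and replaces the source with the result of
 * a VARYING_PULL_CONSTANT_LOAD into a temporary GRF.
 */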
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
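/* Illustrative arithmetic (an addition, not from the original comment):
 * the push constant budget below is 16 registers * 8 floats = 128
 * uniform components, so a shader with 200 float params keeps the first
 * 128 as push constants and demotes the remaining 72 to the pull
 * constant buffer.
 */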
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
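/* Illustrative example (an addition, not from the original comment):
 * for
 *
 *    MOV tmp2, tmp1
 *
 * where tmp1 and tmp2 are size-1 virtual GRFs whose live ranges don't
 * interfere, every def and use of tmp1 is renamed to tmp2 and the MOV
 * itself is deleted.
 */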
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *   ^
       *   |
       *   v
       *         ^
       *         |
       *         v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
1838 */
1839 live_intervals_valid = false;
1840
1841 progress = true;
1842 continue;
1843 }
1844
1845 return progress;
1846 }
1847
1848 bool
1849 fs_visitor::register_coalesce()
1850 {
1851 bool progress = false;
1852 int if_depth = 0;
1853 int loop_depth = 0;
1854
1855 foreach_list_safe(node, &this->instructions) {
1856 fs_inst *inst = (fs_inst *)node;
1857
1858 /* Make sure that we dominate the instructions we're going to
1859 * scan for interfering with our coalescing, or we won't have
1860 * scanned enough to see if anything interferes with our
1861 * coalescing. We don't dominate the following instructions if
1862 * we're in a loop or an if block.
1863 */
1864 switch (inst->opcode) {
1865 case BRW_OPCODE_DO:
1866 loop_depth++;
1867 break;
1868 case BRW_OPCODE_WHILE:
1869 loop_depth--;
1870 break;
1871 case BRW_OPCODE_IF:
1872 if_depth++;
1873 break;
1874 case BRW_OPCODE_ENDIF:
1875 if_depth--;
1876 break;
1877 default:
1878 break;
1879 }
1880 if (loop_depth || if_depth)
1881 continue;
1882
1883 if (inst->opcode != BRW_OPCODE_MOV ||
1884 inst->predicate ||
1885 inst->saturate ||
1886 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1887 inst->src[0].file != UNIFORM)||
1888 inst->dst.type != inst->src[0].type)
1889 continue;
1890
1891 bool has_source_modifiers = (inst->src[0].abs ||
1892 inst->src[0].negate ||
1893 inst->src[0].file == UNIFORM);
1894
1895 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1896 * them: check for no writes to either one until the exit of the
1897 * program.
1898 */
1899 bool interfered = false;
1900
1901 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1902 !scan_inst->is_tail_sentinel();
1903 scan_inst = (fs_inst *)scan_inst->next) {
1904 if (scan_inst->dst.file == GRF) {
1905 if (scan_inst->overwrites_reg(inst->dst) ||
1906 scan_inst->overwrites_reg(inst->src[0])) {
1907 interfered = true;
1908 break;
1909 }
1910 }
1911
1912 /* The gen6 MATH instruction can't handle source modifiers or
1913 * unusual register regions, so avoid coalescing those for
1914 * now. We should do something more specific.
1915 */
1916 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1917 interfered = true;
1918 break;
1919 }
1920
1921 /* The accumulator result appears to get used for the
1922 * conditional modifier generation. When negating a UD
1923 * value, there is a 33rd bit generated for the sign in the
1924 * accumulator value, so now you can't check, for example,
1925 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1926 */
1927 if (scan_inst->conditional_mod &&
1928 inst->src[0].negate &&
1929 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1930 interfered = true;
1931 break;
1932 }
1933 }
1934 if (interfered) {
1935 continue;
1936 }
1937
1938 /* Rewrite the later usage to point at the source of the move to
1939 * be removed.
1940 */
1941 for (fs_inst *scan_inst = inst;
1942 !scan_inst->is_tail_sentinel();
1943 scan_inst = (fs_inst *)scan_inst->next) {
1944 for (int i = 0; i < 3; i++) {
1945 if (scan_inst->src[i].file == GRF &&
1946 scan_inst->src[i].reg == inst->dst.reg &&
1947 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1948 fs_reg new_src = inst->src[0];
1949 if (scan_inst->src[i].abs) {
1950 new_src.negate = 0;
1951 new_src.abs = 1;
1952 }
1953 new_src.negate ^= scan_inst->src[i].negate;
1954 scan_inst->src[i] = new_src;
1955 }
1956 }
1957 }
1958
1959 inst->remove();
1960 progress = true;
1961 }
1962
1963 if (progress)
1964 live_intervals_valid = false;
1965
1966 return progress;
1967 }
1968
1969
1970 bool
1971 fs_visitor::compute_to_mrf()
1972 {
1973 bool progress = false;
1974 int next_ip = 0;
1975
1976 calculate_live_intervals();
1977
1978 foreach_list_safe(node, &this->instructions) {
1979 fs_inst *inst = (fs_inst *)node;
1980
1981 int ip = next_ip;
1982 next_ip++;
1983
1984 if (inst->opcode != BRW_OPCODE_MOV ||
1985 inst->predicate ||
1986 inst->dst.file != MRF || inst->src[0].file != GRF ||
1987 inst->dst.type != inst->src[0].type ||
1988 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1989 continue;
1990
1991 /* Work out which hardware MRF registers are written by this
1992 * instruction.
1993 */
1994 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1995 int mrf_high;
1996 if (inst->dst.reg & BRW_MRF_COMPR4) {
1997 mrf_high = mrf_low + 4;
1998 } else if (dispatch_width == 16 &&
1999 (!inst->force_uncompressed && !inst->force_sechalf)) {
2000 mrf_high = mrf_low + 1;
2001 } else {
2002 mrf_high = mrf_low;
2003 }
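      /* E.g. (a sketch, assuming COMPR4 routes the second half to
       * reg + 4): a 16-wide write to m2 covers m2..m3 (mrf_low = 2,
       * mrf_high = 3), while a write to m2 | BRW_MRF_COMPR4 lands in m2
       * and m6 (mrf_low = 2, mrf_high = 6).
       */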
2004
2005 /* Can't compute-to-MRF this GRF if someone else was going to
2006 * read it later.
2007 */
2008 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2009 continue;
2010
2011       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2012        * the instruction that generated this GRF to write into the MRF directly.
2013 */
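      /* A hypothetical before/after sketch:
       *
       *    ADD vgrf3, vgrf1, vgrf2
       *    MOV m4, vgrf3             <- removed
       * becomes
       *    ADD m4, vgrf1, vgrf2
       */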
2014 fs_inst *scan_inst;
2015 for (scan_inst = (fs_inst *)inst->prev;
2016 scan_inst->prev != NULL;
2017 scan_inst = (fs_inst *)scan_inst->prev) {
2018 if (scan_inst->dst.file == GRF &&
2019 scan_inst->dst.reg == inst->src[0].reg) {
2020 	 /* Found the last instruction to write the register we want
2021 	  * to turn into a compute-to-MRF.
2022 */
2023
2024 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2025 if (scan_inst->mlen) {
2026 break;
2027 }
2028
2029 /* If it's predicated, it (probably) didn't populate all
2030 * the channels. We might be able to rewrite everything
2031 * that writes that reg, but it would require smarter
2032 * tracking to delay the rewriting until complete success.
2033 */
2034 if (scan_inst->predicate)
2035 break;
2036
2037 	 /* If it only writes half of the register, and it's not the
2038 	  * same half as the MOV we're trying to remove, bail for now.
2039 */
2040 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2041 scan_inst->force_sechalf != inst->force_sechalf) {
2042 break;
2043 }
2044
2049 if (intel->gen >= 6) {
2050 /* gen6 math instructions must have the destination be
2051 * GRF, so no compute-to-MRF for them.
2052 */
2053 if (scan_inst->is_math()) {
2054 break;
2055 }
2056 }
2057
2058 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2059 /* Found the creator of our MRF's source value. */
2060 scan_inst->dst.file = MRF;
2061 scan_inst->dst.reg = inst->dst.reg;
2062 scan_inst->saturate |= inst->saturate;
2063 inst->remove();
2064 progress = true;
2065 }
2066 break;
2067 }
2068
2069       /* We don't handle flow control here.  Most values that end up
2070        * in MRFs are computed shortly before the MRF write anyway.
2071        */
2073 if (scan_inst->opcode == BRW_OPCODE_DO ||
2074 scan_inst->opcode == BRW_OPCODE_WHILE ||
2075 scan_inst->opcode == BRW_OPCODE_ELSE ||
2076 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2077 break;
2078 }
2079
2080 /* You can't read from an MRF, so if someone else reads our
2081 * MRF's source GRF that we wanted to rewrite, that stops us.
2082 */
2083 bool interfered = false;
2084 for (int i = 0; i < 3; i++) {
2085 if (scan_inst->src[i].file == GRF &&
2086 scan_inst->src[i].reg == inst->src[0].reg &&
2087 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2088 interfered = true;
2089 }
2090 }
2091 if (interfered)
2092 break;
2093
2094 if (scan_inst->dst.file == MRF) {
2095 /* If somebody else writes our MRF here, we can't
2096 * compute-to-MRF before that.
2097 */
2098 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2099 int scan_mrf_high;
2100
2101 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2102 scan_mrf_high = scan_mrf_low + 4;
2103 } else if (dispatch_width == 16 &&
2104 (!scan_inst->force_uncompressed &&
2105 !scan_inst->force_sechalf)) {
2106 scan_mrf_high = scan_mrf_low + 1;
2107 } else {
2108 scan_mrf_high = scan_mrf_low;
2109 }
2110
2111 if (mrf_low == scan_mrf_low ||
2112 mrf_low == scan_mrf_high ||
2113 mrf_high == scan_mrf_low ||
2114 mrf_high == scan_mrf_high) {
2115 break;
2116 }
2117 }
2118
2119 if (scan_inst->mlen > 0) {
2120 /* Found a SEND instruction, which means that there are
2121 * live values in MRFs from base_mrf to base_mrf +
2122 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2123 * above it.
2124 */
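	 /* E.g. (illustrative numbers) a SEND with base_mrf == 2 and
	  * mlen == 3 reads m2..m4; hoisting a write of m3 above it would
	  * clobber the message payload.
	  */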
2125 if (mrf_low >= scan_inst->base_mrf &&
2126 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2127 break;
2128 }
2129 if (mrf_high >= scan_inst->base_mrf &&
2130 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2131 break;
2132 }
2133 }
2134 }
2135 }
2136
2137 if (progress)
2138 live_intervals_valid = false;
2139
2140 return progress;
2141 }
2142
2143 /**
2144 * Walks through basic blocks, looking for repeated MRF writes and
2145 * removing the later ones.
2146 */
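/* A hypothetical sketch of what gets removed:
 *
 *    MOV m4, vgrf2
 *    ADD vgrf5, vgrf6, vgrf7
 *    MOV m4, vgrf2             <- redundant, removed
 *
 * The second MOV goes away because neither m4 nor vgrf2 was written in
 * between.
 */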
2147 bool
2148 fs_visitor::remove_duplicate_mrf_writes()
2149 {
2150 fs_inst *last_mrf_move[16];
2151 bool progress = false;
2152
2153    /* The MRF tracking below doesn't yet handle compressed instructions. */
2154 if (dispatch_width == 16)
2155 return false;
2156
2157 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2158
2159 foreach_list_safe(node, &this->instructions) {
2160 fs_inst *inst = (fs_inst *)node;
2161
2162 switch (inst->opcode) {
2163 case BRW_OPCODE_DO:
2164 case BRW_OPCODE_WHILE:
2165 case BRW_OPCODE_IF:
2166 case BRW_OPCODE_ELSE:
2167 case BRW_OPCODE_ENDIF:
2168 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2169 continue;
2170 default:
2171 break;
2172 }
2173
2174 if (inst->opcode == BRW_OPCODE_MOV &&
2175 inst->dst.file == MRF) {
2176 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2177 if (prev_inst && inst->equals(prev_inst)) {
2178 inst->remove();
2179 progress = true;
2180 continue;
2181 }
2182 }
2183
2184 /* Clear out the last-write records for MRFs that were overwritten. */
2185 if (inst->dst.file == MRF) {
2186 last_mrf_move[inst->dst.reg] = NULL;
2187 }
2188
2189 if (inst->mlen > 0) {
2190 /* Found a SEND instruction, which will include two or fewer
2191 * implied MRF writes. We could do better here.
2192 */
2193 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2194 last_mrf_move[inst->base_mrf + i] = NULL;
2195 }
2196 }
2197
2198 /* Clear out any MRF move records whose sources got overwritten. */
2199 if (inst->dst.file == GRF) {
2200 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2201 if (last_mrf_move[i] &&
2202 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2203 last_mrf_move[i] = NULL;
2204 }
2205 }
2206 }
2207
2208 if (inst->opcode == BRW_OPCODE_MOV &&
2209 inst->dst.file == MRF &&
2210 inst->src[0].file == GRF &&
2211 !inst->predicate) {
2212 last_mrf_move[inst->dst.reg] = inst;
2213 }
2214 }
2215
2216 if (progress)
2217 live_intervals_valid = false;
2218
2219 return progress;
2220 }
2221
2222 void
2223 fs_visitor::dump_instruction(fs_inst *inst)
2224 {
2225 if (inst->predicate) {
2226 printf("(%cf0.%d) ",
2227 inst->predicate_inverse ? '-' : '+',
2228 inst->flag_subreg);
2229 }
2230
2231 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2232 opcode_descs[inst->opcode].name) {
2233 printf("%s", opcode_descs[inst->opcode].name);
2234 } else {
2235 printf("op%d", inst->opcode);
2236 }
2237 if (inst->saturate)
2238 printf(".sat");
2239 if (inst->conditional_mod) {
2240 printf(".cmod");
2241 if (!inst->predicate &&
2242 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2243 inst->opcode != BRW_OPCODE_IF &&
2244 inst->opcode != BRW_OPCODE_WHILE))) {
2245 printf(".f0.%d\n", inst->flag_subreg);
2246 }
2247 }
2248 printf(" ");
2249
2251 switch (inst->dst.file) {
2252 case GRF:
2253 printf("vgrf%d", inst->dst.reg);
2254 if (inst->dst.reg_offset)
2255 printf("+%d", inst->dst.reg_offset);
2256 break;
2257 case MRF:
2258 printf("m%d", inst->dst.reg);
2259 break;
2260 case BAD_FILE:
2261 printf("(null)");
2262 break;
2263 case UNIFORM:
2264 printf("***u%d***", inst->dst.reg);
2265 break;
2266 default:
2267 printf("???");
2268 break;
2269 }
2270 printf(", ");
2271
2272 for (int i = 0; i < 3; i++) {
2273 if (inst->src[i].negate)
2274 printf("-");
2275 if (inst->src[i].abs)
2276 printf("|");
2277 switch (inst->src[i].file) {
2278 case GRF:
2279 printf("vgrf%d", inst->src[i].reg);
2280 if (inst->src[i].reg_offset)
2281 printf("+%d", inst->src[i].reg_offset);
2282 break;
2283 case MRF:
2284 printf("***m%d***", inst->src[i].reg);
2285 break;
2286 case UNIFORM:
2287 printf("u%d", inst->src[i].reg);
2288 if (inst->src[i].reg_offset)
2289 printf(".%d", inst->src[i].reg_offset);
2290 break;
2291 case BAD_FILE:
2292 printf("(null)");
2293 break;
2294 default:
2295 printf("???");
2296 break;
2297 }
2298 if (inst->src[i].abs)
2299 printf("|");
2300
2301       if (i < 2)
2302 printf(", ");
2303 }
2304
2305 printf(" ");
2306
2307 if (inst->force_uncompressed)
2308 printf("1sthalf ");
2309
2310 if (inst->force_sechalf)
2311 printf("2ndhalf ");
2312
2313 printf("\n");
2314 }
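
/* With the above, a dump line looks roughly like (values illustrative):
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf3, -|vgrf4|, (null) 1sthalf
 */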
2315
2316 void
2317 fs_visitor::dump_instructions()
2318 {
2319 int ip = 0;
2320 foreach_list(node, &this->instructions) {
2321 fs_inst *inst = (fs_inst *)node;
2322 printf("%d: ", ip++);
2323 dump_instruction(inst);
2324 }
2325 }
2326
2327 /**
2328 * Possibly returns an instruction that set up @param reg.
2329 *
2330 * Sometimes we want to take the result of some expression/variable
2331 * dereference tree and rewrite the instruction generating the result
2332 * of the tree. When processing the tree, we know that the
2333 * instructions generated are all writing temporaries that are dead
2334 * outside of this tree. So, if we have some instructions that write
2335 * a temporary, we're free to point that temp write somewhere else.
2336 *
2337  * Note that this doesn't guarantee that the returned instruction wrote
2338  * only reg -- it might be the size=4 destination of a texture instruction.
2339 */
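/* E.g. (a sketch): if "end" is a texture instruction whose result spans
 * four registers and reg names only the first of them, redirecting
 * end->dst also moves the other three; callers must account for that.
 */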
2340 fs_inst *
2341 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2342 fs_inst *end,
2343 fs_reg reg)
2344 {
2345 if (end == start ||
2346 end->predicate ||
2347 end->force_uncompressed ||
2348 end->force_sechalf ||
2349 reg.reladdr ||
2350 !reg.equals(end->dst)) {
2351 return NULL;
2352 } else {
2353 return end;
2354 }
2355 }
2356
2357 void
2358 fs_visitor::setup_payload_gen6()
2359 {
2360 struct intel_context *intel = &brw->intel;
2361 bool uses_depth =
2362 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2363 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2364
2365 assert(intel->gen >= 6);
2366
2367 /* R0-1: masks, pixel X/Y coordinates. */
2368 c->nr_payload_regs = 2;
2369    /* R2: only for 32-pixel dispatch. */
2370
2371 /* R3-26: barycentric interpolation coordinates. These appear in the
2372 * same order that they appear in the brw_wm_barycentric_interp_mode
2373 * enum. Each set of coordinates occupies 2 registers if dispatch width
2374 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2375 * appear if they were enabled using the "Barycentric Interpolation
2376 * Mode" bits in WM_STATE.
2377 */
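   /* For example (a sketch): in SIMD8 with two interpolation modes
    * enabled, each mode takes 2 payload registers, so nr_payload_regs
    * advances from 2 to 6; in SIMD16 each mode takes 4, advancing it
    * to 10.
    */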
2378 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2379 if (barycentric_interp_modes & (1 << i)) {
2380 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2381 c->nr_payload_regs += 2;
2382 if (dispatch_width == 16) {
2383 c->nr_payload_regs += 2;
2384 }
2385 }
2386 }
2387
2388    /* R27: interpolated depth if the shader uses source depth. */
2389 if (uses_depth) {
2390 c->source_depth_reg = c->nr_payload_regs;
2391 c->nr_payload_regs++;
2392 if (dispatch_width == 16) {
2393 /* R28: interpolated depth if not 8-wide. */
2394 c->nr_payload_regs++;
2395 }
2396 }
2397 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2398 if (uses_depth) {
2399 c->source_w_reg = c->nr_payload_regs;
2400 c->nr_payload_regs++;
2401 if (dispatch_width == 16) {
2402 /* R30: interpolated W if not 8-wide. */
2403 c->nr_payload_regs++;
2404 }
2405 }
2406 /* R31: MSAA position offsets. */
2407 /* R32-: bary for 32-pixel. */
2408 /* R58-59: interp W for 32-pixel. */
2409
2410 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2411 c->source_depth_to_render_target = true;
2412 }
2413 }
2414
2415 bool
2416 fs_visitor::run()
2417 {
2418 sanity_param_count = fp->Base.Parameters->NumParameters;
2419 uint32_t orig_nr_params = c->prog_data.nr_params;
2420
2421 if (intel->gen >= 6)
2422 setup_payload_gen6();
2423 else
2424 setup_payload_gen4();
2425
2426 if (0) {
2427 emit_dummy_fs();
2428 } else {
2429 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2430 emit_shader_time_begin();
2431
2432 calculate_urb_setup();
2433 if (intel->gen < 6)
2434 emit_interpolation_setup_gen4();
2435 else
2436 emit_interpolation_setup_gen6();
2437
2438 /* We handle discards by keeping track of the still-live pixels in f0.1.
2439 * Initialize it with the dispatched pixels.
2440 */
2441 if (fp->UsesKill) {
2442 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2443 discard_init->flag_subreg = 1;
2444 }
2445
2446       /* Generate FS IR for main().  (The visitor only descends into
2447        * functions called "main".)
2448 */
2449 if (shader) {
2450 foreach_list(node, &*shader->ir) {
2451 ir_instruction *ir = (ir_instruction *)node;
2452 base_ir = ir;
2453 this->result = reg_undef;
2454 ir->accept(this);
2455 }
2456 } else {
2457 emit_fragment_program_code();
2458 }
2459 base_ir = NULL;
2460 if (failed)
2461 return false;
2462
2463 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2464 emit_shader_time_end();
2465
2466 emit_fb_writes();
2467
2468 split_virtual_grfs();
2469
2470 move_uniform_array_access_to_pull_constants();
2471 setup_pull_constants();
2472
2473 bool progress;
2474 do {
2475 progress = false;
2476
2477 compact_virtual_grfs();
2478
2479 progress = remove_duplicate_mrf_writes() || progress;
2480
2481 progress = opt_algebraic() || progress;
2482 progress = opt_cse() || progress;
2483 progress = opt_copy_propagate() || progress;
2484 progress = dead_code_eliminate() || progress;
2485 progress = register_coalesce() || progress;
2486 progress = register_coalesce_2() || progress;
2487 progress = compute_to_mrf() || progress;
2488 } while (progress);
2489
2490 remove_dead_constants();
2491
2492 schedule_instructions(false);
2493
2494 assign_curb_setup();
2495 assign_urb_setup();
2496
2497 if (0) {
2498 /* Debug of register spilling: Go spill everything. */
2499 for (int i = 0; i < virtual_grf_count; i++) {
2500 spill_reg(i);
2501 }
2502 }
2503
2504 if (0)
2505 assign_regs_trivial();
2506 else {
2507 while (!assign_regs()) {
2508 if (failed)
2509 break;
2510 }
2511 }
2512 }
2513 assert(force_uncompressed_stack == 0);
2514 assert(force_sechalf_stack == 0);
2515
2516 if (failed)
2517 return false;
2518
2519 schedule_instructions(true);
2520
2521 if (dispatch_width == 8) {
2522 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2523 } else {
2524 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2525
2526 /* Make sure we didn't try to sneak in an extra uniform */
2527 assert(orig_nr_params == c->prog_data.nr_params);
2528 (void) orig_nr_params;
2529 }
2530
2531 /* If any state parameters were appended, then ParameterValues could have
2532 * been realloced, in which case the driver uniform storage set up by
2533 * _mesa_associate_uniform_storage() would point to freed memory. Make
2534 * sure that didn't happen.
2535 */
2536 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2537
2538 return !failed;
2539 }
2540
2541 const unsigned *
2542 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2543 struct gl_fragment_program *fp,
2544 struct gl_shader_program *prog,
2545 unsigned *final_assembly_size)
2546 {
2547 struct intel_context *intel = &brw->intel;
2548 bool start_busy = false;
2549 float start_time = 0;
2550
2551 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2552 start_busy = (intel->batch.last_bo &&
2553 drm_intel_bo_busy(intel->batch.last_bo));
2554 start_time = get_time();
2555 }
2556
2557 struct brw_shader *shader = NULL;
2558 if (prog)
2559 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2560
2561 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2562 if (shader) {
2563 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2564 _mesa_print_ir(shader->ir, NULL);
2565 printf("\n\n");
2566 } else {
2567 printf("ARB_fragment_program %d ir for native fragment shader\n",
2568 fp->Base.Id);
2569 _mesa_print_program(&fp->Base);
2570 }
2571 }
2572
2573 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2574 */
2575 fs_visitor v(brw, c, prog, fp, 8);
2576 if (!v.run()) {
2577 prog->LinkStatus = false;
2578 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2579
2580 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2581 v.fail_msg);
2582
2583 return NULL;
2584 }
2585
2586 exec_list *simd16_instructions = NULL;
2587 fs_visitor v2(brw, c, prog, fp, 16);
2588 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2589 v2.import_uniforms(&v);
2590 if (!v2.run()) {
2591 perf_debug("16-wide shader failed to compile, falling back to "
2592 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2593 } else {
2594 simd16_instructions = &v2.instructions;
2595 }
2596 }
2597
2598 c->prog_data.dispatch_width = 8;
2599
2600 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2601 const unsigned *generated = g.generate_assembly(&v.instructions,
2602 simd16_instructions,
2603 final_assembly_size);
2604
2605 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2606 if (shader->compiled_once)
2607 brw_wm_debug_recompile(brw, prog, &c->key);
2608 shader->compiled_once = true;
2609
2610 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2611 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2612 (get_time() - start_time) * 1000);
2613 }
2614 }
2615
2616 return generated;
2617 }
2618
2619 bool
2620 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2621 {
2622 struct brw_context *brw = brw_context(ctx);
2623 struct intel_context *intel = &brw->intel;
2624 struct brw_wm_prog_key key;
2625
2626 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2627 return true;
2628
2629 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2630 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2631 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2632 bool program_uses_dfdy = fp->UsesDFdy;
2633
2634 memset(&key, 0, sizeof(key));
2635
2636 if (intel->gen < 6) {
2637 if (fp->UsesKill)
2638 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2639
2640 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2641 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2642
2643 /* Just assume depth testing. */
2644 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2645 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2646 }
2647
2648 if (prog->Name != 0)
2649 key.proj_attrib_mask = 0xffffffff;
2650
2651 if (intel->gen < 6)
2652 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2653
2654 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2655 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2656 continue;
2657
2658 if (prog->Name == 0)
2659 key.proj_attrib_mask |= 1 << i;
2660
2661 if (intel->gen < 6) {
2662 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2663
2664 if (vp_index >= 0)
2665 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2666 }
2667 }
2668
2669 key.clamp_fragment_color = true;
2670
2671 for (int i = 0; i < MAX_SAMPLERS; i++) {
2672 if (fp->Base.ShadowSamplers & (1 << i)) {
2673 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2674 key.tex.swizzles[i] =
2675 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2676 } else {
2677 /* Color sampler: assume no swizzling. */
2678 key.tex.swizzles[i] = SWIZZLE_XYZW;
2679 }
2680 }
2681
2682 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2683 key.drawable_height = ctx->DrawBuffer->Height;
2684 }
2685
2686 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2687 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2688 }
2689
2690 key.nr_color_regions = 1;
2691
2692 key.program_string_id = bfp->id;
2693
2694 uint32_t old_prog_offset = brw->wm.prog_offset;
2695 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2696
2697 bool success = do_wm_prog(brw, prog, bfp, &key);
2698
2699 brw->wm.prog_offset = old_prog_offset;
2700 brw->wm.prog_data = old_prog_data;
2701
2702 return success;
2703 }