i965/fs: Set up gen7 UBO loads as sends from GRFs.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 ALU1(NOT)
150 ALU1(MOV)
151 ALU1(FRC)
152 ALU1(RNDD)
153 ALU1(RNDE)
154 ALU1(RNDZ)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(SHL)
162 ALU2(SHR)
163 ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
185 /**
186 * CMP: Sets the low bit of the destination channels with the result
187 * of the comparison, while the upper bits are undefined, and updates
188 * the flag register with the packed 16 bits of the result.
189 */
190 fs_inst *
191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
192 {
193 fs_inst *inst;
194
195 /* Take the instruction:
196 *
197 * CMP null<d> src0<f> src1<f>
198 *
199 * Original gen4 does type conversion to the destination type before
200 * comparison, producing garbage results for floating point comparisons.
201 * gen5 does the comparison on the execution type (resolved source types),
202 * so dst type doesn't matter. gen6 does comparison and then uses the
203 * result as if it was the dst type with no conversion, which happens to
204 * mostly work out for float-interpreted-as-int since our comparisons are
205 * for >0, =0, <0.
206 */
207 if (intel->gen == 4) {
208 dst.type = src0.type;
209 if (dst.file == FIXED_HW_REG)
210 dst.fixed_hw_reg.type = dst.type;
211 }
212
213 resolve_ud_negate(&src0);
214 resolve_ud_negate(&src1);
215
216 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
221
222 exec_list
223 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
224 fs_reg offset)
225 {
226 exec_list instructions;
227 fs_inst *inst;
228
229 if (intel->gen >= 7) {
230 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
231 dst, surf_index, offset);
232 instructions.push_tail(inst);
233 } else {
234 int base_mrf = 13;
235 bool header_present = true;
236
237 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
238 mrf.type = BRW_REGISTER_TYPE_D;
239
240 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
241 * dword-aligned byte offset.
242 */
243 if (intel->gen == 6) {
244 instructions.push_tail(MOV(mrf, offset));
245 } else {
246 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
247 }
249 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
250 dst, surf_index);
251 inst->header_present = header_present;
252 inst->base_mrf = base_mrf;
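      /* Message length: the header, plus one register of offset data per
       * SIMD8 half of the dispatch.
       */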
253 inst->mlen = header_present + dispatch_width / 8;
254
255 instructions.push_tail(inst);
256 }
257
258 return instructions;
259 }
260
261 bool
262 fs_inst::equals(fs_inst *inst)
263 {
264 return (opcode == inst->opcode &&
265 dst.equals(inst->dst) &&
266 src[0].equals(inst->src[0]) &&
267 src[1].equals(inst->src[1]) &&
268 src[2].equals(inst->src[2]) &&
269 saturate == inst->saturate &&
270 predicate == inst->predicate &&
271 conditional_mod == inst->conditional_mod &&
272 mlen == inst->mlen &&
273 base_mrf == inst->base_mrf &&
274 sampler == inst->sampler &&
275 target == inst->target &&
276 eot == inst->eot &&
277 header_present == inst->header_present &&
278 shadow_compare == inst->shadow_compare &&
279 offset == inst->offset);
280 }
281
282 int
283 fs_inst::regs_written()
284 {
285 if (is_tex())
286 return 4;
287
288 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
289 * but we don't currently use them...nor do we have an opcode for them.
290 */
291
292 return 1;
293 }
294
295 bool
296 fs_inst::overwrites_reg(const fs_reg &reg)
297 {
298 return (reg.file == dst.file &&
299 reg.reg == dst.reg &&
300 reg.reg_offset >= dst.reg_offset &&
301 reg.reg_offset < dst.reg_offset + regs_written());
302 }
303
304 bool
305 fs_inst::is_tex()
306 {
307 return (opcode == SHADER_OPCODE_TEX ||
308 opcode == FS_OPCODE_TXB ||
309 opcode == SHADER_OPCODE_TXD ||
310 opcode == SHADER_OPCODE_TXF ||
311 opcode == SHADER_OPCODE_TXL ||
312 opcode == SHADER_OPCODE_TXS);
313 }
314
315 bool
316 fs_inst::is_math()
317 {
318 return (opcode == SHADER_OPCODE_RCP ||
319 opcode == SHADER_OPCODE_RSQ ||
320 opcode == SHADER_OPCODE_SQRT ||
321 opcode == SHADER_OPCODE_EXP2 ||
322 opcode == SHADER_OPCODE_LOG2 ||
323 opcode == SHADER_OPCODE_SIN ||
324 opcode == SHADER_OPCODE_COS ||
325 opcode == SHADER_OPCODE_INT_QUOTIENT ||
326 opcode == SHADER_OPCODE_INT_REMAINDER ||
327 opcode == SHADER_OPCODE_POW);
328 }
329
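/* These opcodes send their payload directly from GRFs rather than staging it
 * through MRFs first; can_do_source_mods() below relies on this to avoid
 * folding source modifiers into the payload registers.
 */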
330 bool
331 fs_inst::is_send_from_grf()
332 {
333 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
334 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
335 src[1].file == GRF));
336 }
337
338 bool
339 fs_visitor::can_do_source_mods(fs_inst *inst)
340 {
341 if (intel->gen == 6 && inst->is_math())
342 return false;
343
344 if (inst->is_send_from_grf())
345 return false;
346
347 return true;
348 }
349
350 void
351 fs_reg::init()
352 {
353 memset(this, 0, sizeof(*this));
354 this->smear = -1;
355 }
356
357 /** Generic unset register constructor. */
358 fs_reg::fs_reg()
359 {
360 init();
361 this->file = BAD_FILE;
362 }
363
364 /** Immediate value constructor. */
365 fs_reg::fs_reg(float f)
366 {
367 init();
368 this->file = IMM;
369 this->type = BRW_REGISTER_TYPE_F;
370 this->imm.f = f;
371 }
372
373 /** Immediate value constructor. */
374 fs_reg::fs_reg(int32_t i)
375 {
376 init();
377 this->file = IMM;
378 this->type = BRW_REGISTER_TYPE_D;
379 this->imm.i = i;
380 }
381
382 /** Immediate value constructor. */
383 fs_reg::fs_reg(uint32_t u)
384 {
385 init();
386 this->file = IMM;
387 this->type = BRW_REGISTER_TYPE_UD;
388 this->imm.u = u;
389 }
390
391 /** Fixed brw_reg Immediate value constructor. */
392 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
393 {
394 init();
395 this->file = FIXED_HW_REG;
396 this->fixed_hw_reg = fixed_hw_reg;
397 this->type = fixed_hw_reg.type;
398 }
399
400 bool
401 fs_reg::equals(const fs_reg &r) const
402 {
403 return (file == r.file &&
404 reg == r.reg &&
405 reg_offset == r.reg_offset &&
406 type == r.type &&
407 negate == r.negate &&
408 abs == r.abs &&
409 !reladdr && !r.reladdr &&
410 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
411 sizeof(fixed_hw_reg)) == 0 &&
412 smear == r.smear &&
413 imm.u == r.imm.u);
414 }
415
416 bool
417 fs_reg::is_zero() const
418 {
419 if (file != IMM)
420 return false;
421
422 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
423 }
424
425 bool
426 fs_reg::is_one() const
427 {
428 if (file != IMM)
429 return false;
430
431 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
432 }
433
434 int
435 fs_visitor::type_size(const struct glsl_type *type)
436 {
437 unsigned int size, i;
438
439 switch (type->base_type) {
440 case GLSL_TYPE_UINT:
441 case GLSL_TYPE_INT:
442 case GLSL_TYPE_FLOAT:
443 case GLSL_TYPE_BOOL:
444 return type->components();
445 case GLSL_TYPE_ARRAY:
446 return type_size(type->fields.array) * type->length;
447 case GLSL_TYPE_STRUCT:
448 size = 0;
449 for (i = 0; i < type->length; i++) {
450 size += type_size(type->fields.structure[i].type);
451 }
452 return size;
453 case GLSL_TYPE_SAMPLER:
454 /* Samplers take up no register space, since they're baked in at
455 * link time.
456 */
457 return 0;
458 default:
459 assert(!"not reached");
460 return 0;
461 }
462 }
463
464 fs_reg
465 fs_visitor::get_timestamp()
466 {
467 assert(intel->gen >= 7);
468
469 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
470 BRW_ARF_TIMESTAMP,
471 0),
472 BRW_REGISTER_TYPE_UD));
473
474 fs_reg dst = fs_reg(this, glsl_type::uint_type);
475
476 fs_inst *mov = emit(MOV(dst, ts));
477 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
478 * even if it's not enabled in the dispatch.
479 */
480 mov->force_writemask_all = true;
481 mov->force_uncompressed = true;
482
483 /* The caller wants the low 32 bits of the timestamp. Since it's running
484     * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
485 * which is plenty of time for our purposes. It is identical across the
486 * EUs, but since it's tracking GPU core speed it will increment at a
487 * varying rate as render P-states change.
488 *
489 * The caller could also check if render P-states have changed (or anything
490 * else that might disrupt timing) by setting smear to 2 and checking if
491 * that field is != 0.
492 */
493 dst.smear = 0;
494
495 return dst;
496 }
497
498 void
499 fs_visitor::emit_shader_time_begin()
500 {
501 current_annotation = "shader time start";
502 shader_start_time = get_timestamp();
503 }
504
505 void
506 fs_visitor::emit_shader_time_end()
507 {
508 current_annotation = "shader time end";
509
510 enum shader_time_shader_type type, written_type, reset_type;
511 if (dispatch_width == 8) {
512 type = ST_FS8;
513 written_type = ST_FS8_WRITTEN;
514 reset_type = ST_FS8_RESET;
515 } else {
516 assert(dispatch_width == 16);
517 type = ST_FS16;
518 written_type = ST_FS16_WRITTEN;
519 reset_type = ST_FS16_RESET;
520 }
521
522 fs_reg shader_end_time = get_timestamp();
523
524 /* Check that there weren't any timestamp reset events (assuming these
525 * were the only two timestamp reads that happened).
526 */
527 fs_reg reset = shader_end_time;
528 reset.smear = 2;
529 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
530 test->conditional_mod = BRW_CONDITIONAL_Z;
531 emit(IF(BRW_PREDICATE_NORMAL));
532
533 push_force_uncompressed();
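   /* Compute (end - start) as end + (-start); the EU has no dedicated
    * subtract instruction, so we negate the source instead.
    */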
534 fs_reg start = shader_start_time;
535 start.negate = true;
536 fs_reg diff = fs_reg(this, glsl_type::uint_type);
537 emit(ADD(diff, start, shader_end_time));
538
539 /* If there were no instructions between the two timestamp gets, the diff
540 * is 2 cycles. Remove that overhead, so I can forget about that when
541 * trying to determine the time taken for single instructions.
542 */
543 emit(ADD(diff, diff, fs_reg(-2u)));
544
545 emit_shader_time_write(type, diff);
546 emit_shader_time_write(written_type, fs_reg(1u));
547 emit(BRW_OPCODE_ELSE);
548 emit_shader_time_write(reset_type, fs_reg(1u));
549 emit(BRW_OPCODE_ENDIF);
550
551 pop_force_uncompressed();
552 }
553
554 void
555 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
556 fs_reg value)
557 {
558 /* Choose an index in the buffer and set up tracking information for our
559 * printouts.
560 */
561 int shader_time_index = brw->shader_time.num_entries++;
562    assert(shader_time_index < brw->shader_time.max_entries);
563 brw->shader_time.types[shader_time_index] = type;
564 if (prog) {
565 _mesa_reference_shader_program(ctx,
566 &brw->shader_time.programs[shader_time_index],
567 prog);
568 }
569
570 int base_mrf = 6;
571
572 fs_reg offset_mrf = fs_reg(MRF, base_mrf);
573 offset_mrf.type = BRW_REGISTER_TYPE_UD;
574 emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));
575
576 fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
577 time_mrf.type = BRW_REGISTER_TYPE_UD;
578 emit(MOV(time_mrf, value));
579
580 fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
581 inst->base_mrf = base_mrf;
582 inst->mlen = 2;
583 }
584
585 void
586 fs_visitor::fail(const char *format, ...)
587 {
588 va_list va;
589 char *msg;
590
591 if (failed)
592 return;
593
594 failed = true;
595
596 va_start(va, format);
597 msg = ralloc_vasprintf(mem_ctx, format, va);
598 va_end(va);
599 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
600
601 this->fail_msg = msg;
602
603 if (INTEL_DEBUG & DEBUG_WM) {
604 fprintf(stderr, "%s", msg);
605 }
606 }
607
608 fs_inst *
609 fs_visitor::emit(enum opcode opcode)
610 {
611 return emit(fs_inst(opcode));
612 }
613
614 fs_inst *
615 fs_visitor::emit(enum opcode opcode, fs_reg dst)
616 {
617 return emit(fs_inst(opcode, dst));
618 }
619
620 fs_inst *
621 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
622 {
623 return emit(fs_inst(opcode, dst, src0));
624 }
625
626 fs_inst *
627 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
628 {
629 return emit(fs_inst(opcode, dst, src0, src1));
630 }
631
632 fs_inst *
633 fs_visitor::emit(enum opcode opcode, fs_reg dst,
634 fs_reg src0, fs_reg src1, fs_reg src2)
635 {
636 return emit(fs_inst(opcode, dst, src0, src1, src2));
637 }
638
639 void
640 fs_visitor::push_force_uncompressed()
641 {
642 force_uncompressed_stack++;
643 }
644
645 void
646 fs_visitor::pop_force_uncompressed()
647 {
648 force_uncompressed_stack--;
649 assert(force_uncompressed_stack >= 0);
650 }
651
652 void
653 fs_visitor::push_force_sechalf()
654 {
655 force_sechalf_stack++;
656 }
657
658 void
659 fs_visitor::pop_force_sechalf()
660 {
661 force_sechalf_stack--;
662 assert(force_sechalf_stack >= 0);
663 }
664
665 /**
666 * Returns how many MRFs an FS opcode will write over.
667 *
668 * Note that this is not the 0 or 1 implied writes in an actual gen
669 * instruction -- the FS opcodes often generate MOVs in addition.
670 */
671 int
672 fs_visitor::implied_mrf_writes(fs_inst *inst)
673 {
674 if (inst->mlen == 0)
675 return 0;
676
677 switch (inst->opcode) {
678 case SHADER_OPCODE_RCP:
679 case SHADER_OPCODE_RSQ:
680 case SHADER_OPCODE_SQRT:
681 case SHADER_OPCODE_EXP2:
682 case SHADER_OPCODE_LOG2:
683 case SHADER_OPCODE_SIN:
684 case SHADER_OPCODE_COS:
685 return 1 * dispatch_width / 8;
686 case SHADER_OPCODE_POW:
687 case SHADER_OPCODE_INT_QUOTIENT:
688 case SHADER_OPCODE_INT_REMAINDER:
689 return 2 * dispatch_width / 8;
690 case SHADER_OPCODE_TEX:
691 case FS_OPCODE_TXB:
692 case SHADER_OPCODE_TXD:
693 case SHADER_OPCODE_TXF:
694 case SHADER_OPCODE_TXL:
695 case SHADER_OPCODE_TXS:
696 return 1;
697 case SHADER_OPCODE_SHADER_TIME_ADD:
698 return 0;
699 case FS_OPCODE_FB_WRITE:
700 return 2;
701 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
702 case FS_OPCODE_UNSPILL:
703 return 1;
704 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
705 return inst->header_present;
706 case FS_OPCODE_SPILL:
707 return 2;
708 default:
709 assert(!"not reached");
710 return inst->mlen;
711 }
712 }
713
714 int
715 fs_visitor::virtual_grf_alloc(int size)
716 {
717 if (virtual_grf_array_size <= virtual_grf_count) {
718 if (virtual_grf_array_size == 0)
719 virtual_grf_array_size = 16;
720 else
721 virtual_grf_array_size *= 2;
722 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
723 virtual_grf_array_size);
724 }
725 virtual_grf_sizes[virtual_grf_count] = size;
726 return virtual_grf_count++;
727 }
728
729 /** Fixed HW reg constructor. */
730 fs_reg::fs_reg(enum register_file file, int reg)
731 {
732 init();
733 this->file = file;
734 this->reg = reg;
735 this->type = BRW_REGISTER_TYPE_F;
736 }
737
738 /** Fixed HW reg constructor. */
739 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
740 {
741 init();
742 this->file = file;
743 this->reg = reg;
744 this->type = type;
745 }
746
747 /** Automatic reg constructor. */
748 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
749 {
750 init();
751
752 this->file = GRF;
753 this->reg = v->virtual_grf_alloc(v->type_size(type));
754 this->reg_offset = 0;
755 this->type = brw_type_for_base_type(type);
756 }
757
758 fs_reg *
759 fs_visitor::variable_storage(ir_variable *var)
760 {
761 return (fs_reg *)hash_table_find(this->variable_ht, var);
762 }
763
764 void
765 import_uniforms_callback(const void *key,
766 void *data,
767 void *closure)
768 {
769 struct hash_table *dst_ht = (struct hash_table *)closure;
770 const fs_reg *reg = (const fs_reg *)data;
771
772 if (reg->file != UNIFORM)
773 return;
774
775 hash_table_insert(dst_ht, data, key);
776 }
777
778 /* For 16-wide, we need to follow the uniform setup done by the 8-wide
779  * dispatch.  This brings in those uniform definitions.
780  */
781 void
782 fs_visitor::import_uniforms(fs_visitor *v)
783 {
784 hash_table_call_foreach(v->variable_ht,
785 import_uniforms_callback,
786 variable_ht);
787 this->params_remap = v->params_remap;
788 }
789
790 /* Our support for uniforms is piggy-backed on the struct
791 * gl_fragment_program, because that's where the values actually
792 * get stored, rather than in some global gl_shader_program uniform
793 * store.
794 */
795 int
796 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
797 {
798 unsigned int offset = 0;
799
800 if (type->is_matrix()) {
801 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
802 type->vector_elements,
803 1);
804
805 for (unsigned int i = 0; i < type->matrix_columns; i++) {
806 offset += setup_uniform_values(loc + offset, column);
807 }
808
809 return offset;
810 }
811
812 switch (type->base_type) {
813 case GLSL_TYPE_FLOAT:
814 case GLSL_TYPE_UINT:
815 case GLSL_TYPE_INT:
816 case GLSL_TYPE_BOOL:
817 for (unsigned int i = 0; i < type->vector_elements; i++) {
818 unsigned int param = c->prog_data.nr_params++;
819
820 this->param_index[param] = loc;
821 this->param_offset[param] = i;
822 }
823 return 1;
824
825 case GLSL_TYPE_STRUCT:
826 for (unsigned int i = 0; i < type->length; i++) {
827 offset += setup_uniform_values(loc + offset,
828 type->fields.structure[i].type);
829 }
830 return offset;
831
832 case GLSL_TYPE_ARRAY:
833 for (unsigned int i = 0; i < type->length; i++) {
834 offset += setup_uniform_values(loc + offset, type->fields.array);
835 }
836 return offset;
837
838 case GLSL_TYPE_SAMPLER:
839 /* The sampler takes up a slot, but we don't use any values from it. */
840 return 1;
841
842 default:
843 assert(!"not reached");
844 return 0;
845 }
846 }
847
848
849 /* Our support for builtin uniforms is even scarier than non-builtin.
850 * It sits on top of the PROG_STATE_VAR parameters that are
851 * automatically updated from GL context state.
852 */
853 void
854 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
855 {
856 const ir_state_slot *const slots = ir->state_slots;
857 assert(ir->state_slots != NULL);
858
859 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
860 /* This state reference has already been setup by ir_to_mesa, but we'll
861 * get the same index back here.
862 */
863 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
864 (gl_state_index *)slots[i].tokens);
865
866 /* Add each of the unique swizzles of the element as a parameter.
867 * This'll end up matching the expected layout of the
868 * array/matrix/structure we're trying to fill in.
869 */
870 int last_swiz = -1;
871 for (unsigned int j = 0; j < 4; j++) {
872 int swiz = GET_SWZ(slots[i].swizzle, j);
873 if (swiz == last_swiz)
874 break;
875 last_swiz = swiz;
876
877 this->param_index[c->prog_data.nr_params] = index;
878 this->param_offset[c->prog_data.nr_params] = swiz;
879 c->prog_data.nr_params++;
880 }
881 }
882 }
883
884 fs_reg *
885 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
886 {
887 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
888 fs_reg wpos = *reg;
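   /* Window-system framebuffers have their origin at the upper left, while
    * FBOs have it at the lower left, so flip Y whenever the shader's
    * requested origin doesn't match how we're actually rendering.
    */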
889 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
890
891 /* gl_FragCoord.x */
892 if (ir->pixel_center_integer) {
893 emit(MOV(wpos, this->pixel_x));
894 } else {
895 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
896 }
897 wpos.reg_offset++;
898
899 /* gl_FragCoord.y */
900 if (!flip && ir->pixel_center_integer) {
901 emit(MOV(wpos, this->pixel_y));
902 } else {
903 fs_reg pixel_y = this->pixel_y;
904 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
905
906 if (flip) {
907 pixel_y.negate = true;
908 offset += c->key.drawable_height - 1.0;
909 }
910
911 emit(ADD(wpos, pixel_y, fs_reg(offset)));
912 }
913 wpos.reg_offset++;
914
915 /* gl_FragCoord.z */
916 if (intel->gen >= 6) {
917 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
918 } else {
919 emit(FS_OPCODE_LINTERP, wpos,
920 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
921 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
922 interp_reg(FRAG_ATTRIB_WPOS, 2));
923 }
924 wpos.reg_offset++;
925
926 /* gl_FragCoord.w: Already set up in emit_interpolation */
927 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
928
929 return reg;
930 }
931
932 fs_inst *
933 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
934 glsl_interp_qualifier interpolation_mode,
935 bool is_centroid)
936 {
937 brw_wm_barycentric_interp_mode barycoord_mode;
938 if (is_centroid) {
939 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
940 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
941 else
942 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
943 } else {
944 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
945 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
946 else
947 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
948 }
949 return emit(FS_OPCODE_LINTERP, attr,
950 this->delta_x[barycoord_mode],
951 this->delta_y[barycoord_mode], interp);
952 }
953
954 fs_reg *
955 fs_visitor::emit_general_interpolation(ir_variable *ir)
956 {
957 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
958 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
959 fs_reg attr = *reg;
960
961 unsigned int array_elements;
962 const glsl_type *type;
963
964 if (ir->type->is_array()) {
965 array_elements = ir->type->length;
966 if (array_elements == 0) {
967 fail("dereferenced array '%s' has length 0\n", ir->name);
968 }
969 type = ir->type->fields.array;
970 } else {
971 array_elements = 1;
972 type = ir->type;
973 }
974
975 glsl_interp_qualifier interpolation_mode =
976 ir->determine_interpolation_mode(c->key.flat_shade);
977
978 int location = ir->location;
979 for (unsigned int i = 0; i < array_elements; i++) {
980 for (unsigned int j = 0; j < type->matrix_columns; j++) {
981 if (urb_setup[location] == -1) {
982 /* If there's no incoming setup data for this slot, don't
983 * emit interpolation for it.
984 */
985 attr.reg_offset += type->vector_elements;
986 location++;
987 continue;
988 }
989
990 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
991 /* Constant interpolation (flat shading) case. The SF has
992 * handed us defined values in only the constant offset
993 * field of the setup reg.
994 */
995 for (unsigned int k = 0; k < type->vector_elements; k++) {
996 struct brw_reg interp = interp_reg(location, k);
997 interp = suboffset(interp, 3);
998 interp.type = reg->type;
999 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1000 attr.reg_offset++;
1001 }
1002 } else {
1003 /* Smooth/noperspective interpolation case. */
1004 for (unsigned int k = 0; k < type->vector_elements; k++) {
1005 /* FINISHME: At some point we probably want to push
1006 * this farther by giving similar treatment to the
1007 * other potentially constant components of the
1008 * attribute, as well as making brw_vs_constval.c
1009 * handle varyings other than gl_TexCoord.
1010 */
1011 if (location >= FRAG_ATTRIB_TEX0 &&
1012 location <= FRAG_ATTRIB_TEX7 &&
1013 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
1014 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
1015 } else {
1016 struct brw_reg interp = interp_reg(location, k);
1017 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1018 ir->centroid);
1019 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1020 /* Get the pixel/sample mask into f0 so that we know
1021 * which pixels are lit. Then, for each channel that is
1022 * unlit, replace the centroid data with non-centroid
1023 * data.
1024 */
1025 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
1026 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1027 interpolation_mode, false);
1028 inst->predicate = BRW_PREDICATE_NORMAL;
1029 inst->predicate_inverse = true;
1030 }
1031 if (intel->gen < 6) {
1032 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1033 }
1034 }
1035 attr.reg_offset++;
1036 }
1037
1038 }
1039 location++;
1040 }
1041 }
1042
1043 return reg;
1044 }
1045
1046 fs_reg *
1047 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1048 {
1049 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1050
1051 /* The frontfacing comes in as a bit in the thread payload. */
1052 if (intel->gen >= 6) {
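      /* Bit 15 of g0.0 carries the back-facing flag here: shift it down to
       * bit 0, invert, and mask so *reg holds 1 for front faces and 0 for
       * back faces.
       */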
1053 emit(BRW_OPCODE_ASR, *reg,
1054 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1055 fs_reg(15));
1056 emit(BRW_OPCODE_NOT, *reg, *reg);
1057 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1058 } else {
1059 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1060 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1061 * us front face
1062 */
1063 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1064 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1065 }
1066
1067 return reg;
1068 }
1069
1070 fs_inst *
1071 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1072 {
1073 switch (opcode) {
1074 case SHADER_OPCODE_RCP:
1075 case SHADER_OPCODE_RSQ:
1076 case SHADER_OPCODE_SQRT:
1077 case SHADER_OPCODE_EXP2:
1078 case SHADER_OPCODE_LOG2:
1079 case SHADER_OPCODE_SIN:
1080 case SHADER_OPCODE_COS:
1081 break;
1082 default:
1083 assert(!"not reached: bad math opcode");
1084 return NULL;
1085 }
1086
1087 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1088 * might be able to do better by doing execsize = 1 math and then
1089 * expanding that result out, but we would need to be careful with
1090 * masking.
1091 *
1092 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1093 * instructions, so we also move to a temp to set those up.
1094 */
1095 if (intel->gen == 6 && (src.file == UNIFORM ||
1096 src.abs ||
1097 src.negate)) {
1098 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1099 emit(BRW_OPCODE_MOV, expanded, src);
1100 src = expanded;
1101 }
1102
1103 fs_inst *inst = emit(opcode, dst, src);
1104
1105 if (intel->gen < 6) {
1106 inst->base_mrf = 2;
1107 inst->mlen = dispatch_width / 8;
1108 }
1109
1110 return inst;
1111 }
1112
1113 fs_inst *
1114 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1115 {
1116 int base_mrf = 2;
1117 fs_inst *inst;
1118
1119 switch (opcode) {
1120 case SHADER_OPCODE_POW:
1121 case SHADER_OPCODE_INT_QUOTIENT:
1122 case SHADER_OPCODE_INT_REMAINDER:
1123 break;
1124 default:
1125 assert(!"not reached: unsupported binary math opcode.");
1126 return NULL;
1127 }
1128
1129 if (intel->gen >= 7) {
1130 inst = emit(opcode, dst, src0, src1);
1131 } else if (intel->gen == 6) {
1132 /* Can't do hstride == 0 args to gen6 math, so expand it out.
1133 *
1134 * The hardware ignores source modifiers (negate and abs) on math
1135 * instructions, so we also move to a temp to set those up.
1136 */
1137 if (src0.file == UNIFORM || src0.abs || src0.negate) {
1138 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1139 expanded.type = src0.type;
1140 emit(BRW_OPCODE_MOV, expanded, src0);
1141 src0 = expanded;
1142 }
1143
1144 if (src1.file == UNIFORM || src1.abs || src1.negate) {
1145 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1146 expanded.type = src1.type;
1147 emit(BRW_OPCODE_MOV, expanded, src1);
1148 src1 = expanded;
1149 }
1150
1151 inst = emit(opcode, dst, src0, src1);
1152 } else {
1153 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1154 * "Message Payload":
1155 *
1156 * "Operand0[7]. For the INT DIV functions, this operand is the
1157 * denominator."
1158 * ...
1159 * "Operand1[7]. For the INT DIV functions, this operand is the
1160 * numerator."
1161 */
1162 bool is_int_div = opcode != SHADER_OPCODE_POW;
1163 fs_reg &op0 = is_int_div ? src1 : src0;
1164 fs_reg &op1 = is_int_div ? src0 : src1;
1165
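      /* The second operand travels in the message payload at m(base_mrf + 1);
       * the first remains a normal instruction source.
       */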
1166 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1167 inst = emit(opcode, dst, op0, reg_null_f);
1168
1169 inst->base_mrf = base_mrf;
1170 inst->mlen = 2 * dispatch_width / 8;
1171 }
1172 return inst;
1173 }
1174
1175 /**
1176 * To be called after the last _mesa_add_state_reference() call, to
1177 * set up prog_data.param[] for assign_curb_setup() and
1178 * setup_pull_constants().
1179 */
1180 void
1181 fs_visitor::setup_paramvalues_refs()
1182 {
1183 if (dispatch_width != 8)
1184 return;
1185
1186 /* Set up the pointers to ParamValues now that that array is finalized. */
1187 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1188 c->prog_data.param[i] =
1189 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
1190 this->param_offset[i];
1191 }
1192 }
1193
1194 void
1195 fs_visitor::assign_curb_setup()
1196 {
1197 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1198 if (dispatch_width == 8) {
1199 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1200 } else {
1201 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1202 }
1203
1204 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1205 foreach_list(node, &this->instructions) {
1206 fs_inst *inst = (fs_inst *)node;
1207
1208 for (unsigned int i = 0; i < 3; i++) {
1209 if (inst->src[i].file == UNIFORM) {
1210 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1211 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1212 constant_nr / 8,
1213 constant_nr % 8);
1214
1215 inst->src[i].file = FIXED_HW_REG;
1216 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1217 }
1218 }
1219 }
1220 }
1221
1222 void
1223 fs_visitor::calculate_urb_setup()
1224 {
1225 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1226 urb_setup[i] = -1;
1227 }
1228
1229 int urb_next = 0;
1230 /* Figure out where each of the incoming setup attributes lands. */
1231 if (intel->gen >= 6) {
1232 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1233 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1234 urb_setup[i] = urb_next++;
1235 }
1236 }
1237 } else {
1238 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1239 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1240 /* Point size is packed into the header, not as a general attribute */
1241 if (i == VERT_RESULT_PSIZ)
1242 continue;
1243
1244 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1245 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1246
1247 /* The back color slot is skipped when the front color is
1248 * also written to. In addition, some slots can be
1249 * written in the vertex shader and not read in the
1250 * fragment shader. So the register number must always be
1251 * incremented, mapped or not.
1252 */
1253 if (fp_index >= 0)
1254 urb_setup[fp_index] = urb_next;
1255 urb_next++;
1256 }
1257 }
1258
1259 /*
1260     * It's an FS-only attribute, and we did the interpolation for it in
1261     * the SF thread.  So count it here, too.
1262 *
1263 * See compile_sf_prog() for more info.
1264 */
1265 if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1266 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1267 }
1268
1269 /* Each attribute is 4 setup channels, each of which is half a reg. */
1270 c->prog_data.urb_read_length = urb_next * 2;
1271 }
1272
1273 void
1274 fs_visitor::assign_urb_setup()
1275 {
1276 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1277
1278 /* Offset all the urb_setup[] index by the actual position of the
1279 * setup regs, now that the location of the constants has been chosen.
1280 */
1281 foreach_list(node, &this->instructions) {
1282 fs_inst *inst = (fs_inst *)node;
1283
1284 if (inst->opcode == FS_OPCODE_LINTERP) {
1285 assert(inst->src[2].file == FIXED_HW_REG);
1286 inst->src[2].fixed_hw_reg.nr += urb_start;
1287 }
1288
1289 if (inst->opcode == FS_OPCODE_CINTERP) {
1290 assert(inst->src[0].file == FIXED_HW_REG);
1291 inst->src[0].fixed_hw_reg.nr += urb_start;
1292 }
1293 }
1294
1295 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1296 }
1297
1298 /**
1299 * Split large virtual GRFs into separate components if we can.
1300 *
1301 * This is mostly duplicated with what brw_fs_vector_splitting does,
1302 * but that's really conservative because it's afraid of doing
1303 * splitting that doesn't result in real progress after the rest of
1304 * the optimization phases, which would cause infinite looping in
1305 * optimization. We can do it once here, safely. This also has the
1306 * opportunity to split interpolated values, or maybe even uniforms,
1307 * which we don't have at the IR level.
1308 *
1309 * We want to split, because virtual GRFs are what we register
1310 * allocate and spill (due to contiguousness requirements for some
1311 * instructions), and they're what we naturally generate in the
1312 * codegen process, but most virtual GRFs don't actually need to be
1313 * contiguous sets of GRFs. If we split, we'll end up with reduced
1314 * live intervals and better dead code elimination and coalescing.
1315 */
1316 void
1317 fs_visitor::split_virtual_grfs()
1318 {
1319 int num_vars = this->virtual_grf_count;
1320 bool split_grf[num_vars];
1321 int new_virtual_grf[num_vars];
1322
1323 /* Try to split anything > 0 sized. */
1324 for (int i = 0; i < num_vars; i++) {
1325 if (this->virtual_grf_sizes[i] != 1)
1326 split_grf[i] = true;
1327 else
1328 split_grf[i] = false;
1329 }
1330
1331 if (brw->has_pln &&
1332 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1333 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1334 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1335 * Gen6, that was the only supported interpolation mode, and since Gen6,
1336 * delta_x and delta_y are in fixed hardware registers.
1337 */
1338 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1339 false;
1340 }
1341
1342 foreach_list(node, &this->instructions) {
1343 fs_inst *inst = (fs_inst *)node;
1344
1345 /* If there's a SEND message that requires contiguous destination
1346 * registers, no splitting is allowed.
1347 */
1348 if (inst->regs_written() > 1) {
1349 split_grf[inst->dst.reg] = false;
1350 }
1351 }
1352
1353 /* Allocate new space for split regs. Note that the virtual
1354 * numbers will be contiguous.
1355 */
1356 for (int i = 0; i < num_vars; i++) {
1357 if (split_grf[i]) {
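         /* The original register keeps reg_offset 0; new_virtual_grf[i]
          * covers reg_offset 1, and the loop below allocates the remaining
          * size - 2 registers so that offset k lands at
          * new_virtual_grf[i] + k - 1.
          */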
1358 new_virtual_grf[i] = virtual_grf_alloc(1);
1359 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1360 int reg = virtual_grf_alloc(1);
1361 assert(reg == new_virtual_grf[i] + j - 1);
1362 (void) reg;
1363 }
1364 this->virtual_grf_sizes[i] = 1;
1365 }
1366 }
1367
1368 foreach_list(node, &this->instructions) {
1369 fs_inst *inst = (fs_inst *)node;
1370
1371 if (inst->dst.file == GRF &&
1372 split_grf[inst->dst.reg] &&
1373 inst->dst.reg_offset != 0) {
1374 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1375 inst->dst.reg_offset - 1);
1376 inst->dst.reg_offset = 0;
1377 }
1378 for (int i = 0; i < 3; i++) {
1379 if (inst->src[i].file == GRF &&
1380 split_grf[inst->src[i].reg] &&
1381 inst->src[i].reg_offset != 0) {
1382 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1383 inst->src[i].reg_offset - 1);
1384 inst->src[i].reg_offset = 0;
1385 }
1386 }
1387 }
1388 this->live_intervals_valid = false;
1389 }
1390
1391 /**
1392 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1393 *
1394 * During code generation, we create tons of temporary variables, many of
1395 * which get immediately killed and are never used again. Yet, in later
1396 * optimization and analysis passes, such as compute_live_intervals, we need
1397 * to loop over all the virtual GRFs. Compacting them can save a lot of
1398 * overhead.
1399 */
1400 void
1401 fs_visitor::compact_virtual_grfs()
1402 {
1403 /* Mark which virtual GRFs are used, and count how many. */
1404 int remap_table[this->virtual_grf_count];
1405 memset(remap_table, -1, sizeof(remap_table));
1406
1407 foreach_list(node, &this->instructions) {
1408 const fs_inst *inst = (const fs_inst *) node;
1409
1410 if (inst->dst.file == GRF)
1411 remap_table[inst->dst.reg] = 0;
1412
1413 for (int i = 0; i < 3; i++) {
1414 if (inst->src[i].file == GRF)
1415 remap_table[inst->src[i].reg] = 0;
1416 }
1417 }
1418
1419 /* In addition to registers used in instructions, fs_visitor keeps
1420 * direct references to certain special values which must be patched:
1421 */
1422 fs_reg *special[] = {
1423 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1424 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1425 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1426 &delta_x[0], &delta_x[1], &delta_x[2],
1427 &delta_x[3], &delta_x[4], &delta_x[5],
1428 &delta_y[0], &delta_y[1], &delta_y[2],
1429 &delta_y[3], &delta_y[4], &delta_y[5],
1430 };
1431 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1432 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1433
1434 /* Treat all special values as used, to be conservative */
1435 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1436 if (special[i]->file == GRF)
1437 remap_table[special[i]->reg] = 0;
1438 }
1439
1440 /* Compact the GRF arrays. */
1441 int new_index = 0;
1442 for (int i = 0; i < this->virtual_grf_count; i++) {
1443 if (remap_table[i] != -1) {
1444 remap_table[i] = new_index;
1445 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1446 if (live_intervals_valid) {
1447 virtual_grf_use[new_index] = virtual_grf_use[i];
1448 virtual_grf_def[new_index] = virtual_grf_def[i];
1449 }
1450 ++new_index;
1451 }
1452 }
1453
1454 this->virtual_grf_count = new_index;
1455
1456 /* Patch all the instructions to use the newly renumbered registers */
1457 foreach_list(node, &this->instructions) {
1458 fs_inst *inst = (fs_inst *) node;
1459
1460 if (inst->dst.file == GRF)
1461 inst->dst.reg = remap_table[inst->dst.reg];
1462
1463 for (int i = 0; i < 3; i++) {
1464 if (inst->src[i].file == GRF)
1465 inst->src[i].reg = remap_table[inst->src[i].reg];
1466 }
1467 }
1468
1469 /* Patch all the references to special values */
1470 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1471 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1472 special[i]->reg = remap_table[special[i]->reg];
1473 }
1474 }
1475
1476 bool
1477 fs_visitor::remove_dead_constants()
1478 {
1479 if (dispatch_width == 8) {
1480 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1481
1482 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1483 this->params_remap[i] = -1;
1484
1485 /* Find which params are still in use. */
1486 foreach_list(node, &this->instructions) {
1487 fs_inst *inst = (fs_inst *)node;
1488
1489 for (int i = 0; i < 3; i++) {
1490 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1491
1492 if (inst->src[i].file != UNIFORM)
1493 continue;
1494
1495 assert(constant_nr < (int)c->prog_data.nr_params);
1496
1497 /* For now, set this to non-negative. We'll give it the
1498 * actual new number in a moment, in order to keep the
1499 * register numbers nicely ordered.
1500 */
1501 this->params_remap[constant_nr] = 0;
1502 }
1503 }
1504
1505 /* Figure out what the new numbers for the params will be. At some
1506 * point when we're doing uniform array access, we're going to want
1507 * to keep the distinction between .reg and .reg_offset, but for
1508 * now we don't care.
1509 */
1510 unsigned int new_nr_params = 0;
1511 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1512 if (this->params_remap[i] != -1) {
1513 this->params_remap[i] = new_nr_params++;
1514 }
1515 }
1516
1517 /* Update the list of params to be uploaded to match our new numbering. */
1518 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1519 int remapped = this->params_remap[i];
1520
1521 if (remapped == -1)
1522 continue;
1523
1524 /* We've already done setup_paramvalues_refs() so no need to worry
1525 * about param_index and param_offset.
1526 */
1527 c->prog_data.param[remapped] = c->prog_data.param[i];
1528 }
1529
1530 c->prog_data.nr_params = new_nr_params;
1531 } else {
1532 /* This should have been generated in the 8-wide pass already. */
1533 assert(this->params_remap);
1534 }
1535
1536 /* Now do the renumbering of the shader to remove unused params. */
1537 foreach_list(node, &this->instructions) {
1538 fs_inst *inst = (fs_inst *)node;
1539
1540 for (int i = 0; i < 3; i++) {
1541 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1542
1543 if (inst->src[i].file != UNIFORM)
1544 continue;
1545
1546 assert(this->params_remap[constant_nr] != -1);
1547 inst->src[i].reg = this->params_remap[constant_nr];
1548 inst->src[i].reg_offset = 0;
1549 }
1550 }
1551
1552 return true;
1553 }
1554
1555 /*
1556 * Implements array access of uniforms by inserting a
1557 * PULL_CONSTANT_LOAD instruction.
1558 *
1559 * Unlike temporary GRF array access (where we don't support it due to
1560 * the difficulty of doing relative addressing on instruction
1561 * destinations), we could potentially do array access of uniforms
1562 * that were loaded in GRF space as push constants. In real-world
1563 * usage we've seen, though, the arrays being used are always larger
1564 * than we could load as push constants, so just always move all
1565 * uniform array access out to a pull constant buffer.
1566 */
1567 void
1568 fs_visitor::move_uniform_array_access_to_pull_constants()
1569 {
1570 int pull_constant_loc[c->prog_data.nr_params];
1571
1572 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1573 pull_constant_loc[i] = -1;
1574 }
1575
1576 /* Walk through and find array access of uniforms. Put a copy of that
1577 * uniform in the pull constant buffer.
1578 *
1579 * Note that we don't move constant-indexed accesses to arrays. No
1580 * testing has been done of the performance impact of this choice.
1581 */
1582 foreach_list_safe(node, &this->instructions) {
1583 fs_inst *inst = (fs_inst *)node;
1584
1585       for (int i = 0; i < 3; i++) {
1586 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1587 continue;
1588
1589 int uniform = inst->src[i].reg;
1590
1591 /* If this array isn't already present in the pull constant buffer,
1592 * add it.
1593 */
1594 if (pull_constant_loc[uniform] == -1) {
1595 const float **values = &c->prog_data.param[uniform];
1596
1597 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1598
1599 assert(param_size[uniform]);
1600
1601 for (int j = 0; j < param_size[uniform]; j++) {
1602 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1603 values[j];
1604 }
1605 }
1606
1607 /* Set up the annotation tracking for new generated instructions. */
1608 base_ir = inst->ir;
1609 current_annotation = inst->annotation;
1610
1611 fs_reg offset = fs_reg(this, glsl_type::int_type);
1612 inst->insert_before(ADD(offset, *inst->src[i].reladdr,
1613 fs_reg(pull_constant_loc[uniform] +
1614 inst->src[i].reg_offset)));
1615
1616 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1617 fs_reg temp = fs_reg(this, glsl_type::float_type);
1618 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1619 surf_index, offset);
1620 inst->insert_before(&list);
1621
1622 inst->src[i].file = temp.file;
1623 inst->src[i].reg = temp.reg;
1624 inst->src[i].reg_offset = temp.reg_offset;
1625 inst->src[i].reladdr = NULL;
1626 }
1627 }
1628 }
1629
1630 /**
1631 * Choose accesses from the UNIFORM file to demote to using the pull
1632 * constant buffer.
1633 *
1634 * We allow a fragment shader to have more than the specified minimum
1635 * maximum number of fragment shader uniform components (64). If
1636 * there are too many of these, they'd fill up all of register space.
1637 * So, this will push some of them out to the pull constant buffer and
1638 * update the program to load them.
1639 */
1640 void
1641 fs_visitor::setup_pull_constants()
1642 {
1643 /* Only allow 16 registers (128 uniform components) as push constants. */
1644 unsigned int max_uniform_components = 16 * 8;
1645 if (c->prog_data.nr_params <= max_uniform_components)
1646 return;
1647
1648 if (dispatch_width == 16) {
1649 fail("Pull constants not supported in 16-wide\n");
1650 return;
1651 }
1652
1653 /* Just demote the end of the list. We could probably do better
1654 * here, demoting things that are rarely used in the program first.
1655 */
1656 unsigned int pull_uniform_base = max_uniform_components;
1657
1658 int pull_constant_loc[c->prog_data.nr_params];
1659 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1660 if (i < pull_uniform_base) {
1661 pull_constant_loc[i] = -1;
1662 } else {
1663 pull_constant_loc[i] = -1;
1664 /* If our constant is already being uploaded for reladdr purposes,
1665 * reuse it.
1666 */
1667 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1668 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1669 pull_constant_loc[i] = j;
1670 break;
1671 }
1672 }
1673 if (pull_constant_loc[i] == -1) {
1674 int pull_index = c->prog_data.nr_pull_params++;
1675 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1676             pull_constant_loc[i] = pull_index;
1677 }
1678 }
1679 }
1680 c->prog_data.nr_params = pull_uniform_base;
1681
1682 foreach_list(node, &this->instructions) {
1683 fs_inst *inst = (fs_inst *)node;
1684
1685 for (int i = 0; i < 3; i++) {
1686 if (inst->src[i].file != UNIFORM)
1687 continue;
1688
1689 int pull_index = pull_constant_loc[inst->src[i].reg +
1690 inst->src[i].reg_offset];
1691 if (pull_index == -1)
1692 continue;
1693
1694 assert(!inst->src[i].reladdr);
1695
1696 fs_reg dst = fs_reg(this, glsl_type::float_type);
1697 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1698 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1699 fs_inst *pull =
1700 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1701 dst, index, offset);
1702 pull->ir = inst->ir;
1703 pull->annotation = inst->annotation;
1704 pull->base_mrf = 14;
1705 pull->mlen = 1;
1706
1707 inst->insert_before(pull);
1708
1709 inst->src[i].file = GRF;
1710 inst->src[i].reg = dst.reg;
1711 inst->src[i].reg_offset = 0;
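      /* The pull load fetches an aligned vec4; smear selects the component
       * we actually wanted within it.
       */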
1712 inst->src[i].smear = pull_index & 3;
1713 }
1714 }
1715 }
1716
1717 bool
1718 fs_visitor::opt_algebraic()
1719 {
1720 bool progress = false;
1721
1722 foreach_list(node, &this->instructions) {
1723 fs_inst *inst = (fs_inst *)node;
1724
1725 switch (inst->opcode) {
1726 case BRW_OPCODE_MUL:
1727 if (inst->src[1].file != IMM)
1728 continue;
1729
1730 /* a * 1.0 = a */
1731 if (inst->src[1].is_one()) {
1732 inst->opcode = BRW_OPCODE_MOV;
1733 inst->src[1] = reg_undef;
1734 progress = true;
1735 break;
1736 }
1737
1738 /* a * 0.0 = 0.0 */
1739 if (inst->src[1].is_zero()) {
1740 inst->opcode = BRW_OPCODE_MOV;
1741 inst->src[0] = inst->src[1];
1742 inst->src[1] = reg_undef;
1743 progress = true;
1744 break;
1745 }
1746
1747 break;
1748 case BRW_OPCODE_ADD:
1749 if (inst->src[1].file != IMM)
1750 continue;
1751
1752 /* a + 0.0 = a */
1753 if (inst->src[1].is_zero()) {
1754 inst->opcode = BRW_OPCODE_MOV;
1755 inst->src[1] = reg_undef;
1756 progress = true;
1757 break;
1758 }
1759 break;
1760 default:
1761 break;
1762 }
1763 }
1764
1765 return progress;
1766 }
1767
1768 /**
1769  * Must be called after calculate_live_intervals() to remove unused
1770  * writes to registers -- register allocation will fail otherwise
1771  * because something defined but never used won't be considered to
1772  * interfere with other regs.
1773 */
1774 bool
1775 fs_visitor::dead_code_eliminate()
1776 {
1777 bool progress = false;
1778 int pc = 0;
1779
1780 calculate_live_intervals();
1781
1782 foreach_list_safe(node, &this->instructions) {
1783 fs_inst *inst = (fs_inst *)node;
1784
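      /* If the last use of this GRF is at or before this instruction, the
       * value written here can never be read.
       */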
1785 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1786 inst->remove();
1787 progress = true;
1788 }
1789
1790 pc++;
1791 }
1792
1793 if (progress)
1794 live_intervals_valid = false;
1795
1796 return progress;
1797 }
1798
1799 /**
1800 * Implements a second type of register coalescing: This one checks if
1801 * the two regs involved in a raw move don't interfere, in which case
1802  * they can both be stored in the same place and the MOV removed.
1803 */
1804 bool
1805 fs_visitor::register_coalesce_2()
1806 {
1807 bool progress = false;
1808
1809 calculate_live_intervals();
1810
1811 foreach_list_safe(node, &this->instructions) {
1812 fs_inst *inst = (fs_inst *)node;
1813
1814 if (inst->opcode != BRW_OPCODE_MOV ||
1815 inst->predicate ||
1816 inst->saturate ||
1817 inst->src[0].file != GRF ||
1818 inst->src[0].negate ||
1819 inst->src[0].abs ||
1820 inst->src[0].smear != -1 ||
1821 inst->dst.file != GRF ||
1822 inst->dst.type != inst->src[0].type ||
1823 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1824 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1825 continue;
1826 }
1827
1828 int reg_from = inst->src[0].reg;
1829 assert(inst->src[0].reg_offset == 0);
1830 int reg_to = inst->dst.reg;
1831 int reg_to_offset = inst->dst.reg_offset;
1832
1833 foreach_list_safe(node, &this->instructions) {
1834 fs_inst *scan_inst = (fs_inst *)node;
1835
1836 if (scan_inst->dst.file == GRF &&
1837 scan_inst->dst.reg == reg_from) {
1838 scan_inst->dst.reg = reg_to;
1839 scan_inst->dst.reg_offset = reg_to_offset;
1840 }
1841 for (int i = 0; i < 3; i++) {
1842 if (scan_inst->src[i].file == GRF &&
1843 scan_inst->src[i].reg == reg_from) {
1844 scan_inst->src[i].reg = reg_to;
1845 scan_inst->src[i].reg_offset = reg_to_offset;
1846 }
1847 }
1848 }
1849
1850 inst->remove();
1851 live_intervals_valid = false;
1852 progress = true;
1853 continue;
1854 }
1855
1856 return progress;
1857 }
1858
1859 bool
1860 fs_visitor::register_coalesce()
1861 {
1862 bool progress = false;
1863 int if_depth = 0;
1864 int loop_depth = 0;
1865
1866 foreach_list_safe(node, &this->instructions) {
1867 fs_inst *inst = (fs_inst *)node;
1868
1869 /* Make sure that we dominate the instructions we're going to
1870 * scan for interfering with our coalescing, or we won't have
1871 * scanned enough to see if anything interferes with our
1872 * coalescing. We don't dominate the following instructions if
1873 * we're in a loop or an if block.
1874 */
1875 switch (inst->opcode) {
1876 case BRW_OPCODE_DO:
1877 loop_depth++;
1878 break;
1879 case BRW_OPCODE_WHILE:
1880 loop_depth--;
1881 break;
1882 case BRW_OPCODE_IF:
1883 if_depth++;
1884 break;
1885 case BRW_OPCODE_ENDIF:
1886 if_depth--;
1887 break;
1888 default:
1889 break;
1890 }
1891 if (loop_depth || if_depth)
1892 continue;
1893
1894 if (inst->opcode != BRW_OPCODE_MOV ||
1895 inst->predicate ||
1896 inst->saturate ||
1897 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1898                                    inst->src[0].file != UNIFORM) ||
1899 inst->dst.type != inst->src[0].type)
1900 continue;
1901
1902 bool has_source_modifiers = (inst->src[0].abs ||
1903 inst->src[0].negate ||
1904 inst->src[0].file == UNIFORM);
1905
1906 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1907 * them: check for no writes to either one until the exit of the
1908 * program.
1909 */
1910 bool interfered = false;
1911
1912 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1913 !scan_inst->is_tail_sentinel();
1914 scan_inst = (fs_inst *)scan_inst->next) {
1915 if (scan_inst->dst.file == GRF) {
1916 if (scan_inst->overwrites_reg(inst->dst) ||
1917 scan_inst->overwrites_reg(inst->src[0])) {
1918 interfered = true;
1919 break;
1920 }
1921 }
1922
1923 /* The gen6 MATH instruction can't handle source modifiers or
1924 * unusual register regions, so avoid coalescing those for
1925 * now. We should do something more specific.
1926 */
1927 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1928 interfered = true;
1929 break;
1930 }
1931
1932 /* The accumulator result appears to get used for the
1933 * conditional modifier generation. When negating a UD
1934 * value, there is a 33rd bit generated for the sign in the
1935 * accumulator value, so now you can't check, for example,
1936 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1937 */
1938 if (scan_inst->conditional_mod &&
1939 inst->src[0].negate &&
1940 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1941 interfered = true;
1942 break;
1943 }
1944 }
1945 if (interfered) {
1946 continue;
1947 }
1948
1949 /* Rewrite the later usage to point at the source of the move to
1950 * be removed.
1951 */
1952 for (fs_inst *scan_inst = inst;
1953 !scan_inst->is_tail_sentinel();
1954 scan_inst = (fs_inst *)scan_inst->next) {
1955 for (int i = 0; i < 3; i++) {
1956 if (scan_inst->src[i].file == GRF &&
1957 scan_inst->src[i].reg == inst->dst.reg &&
1958 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1959 fs_reg new_src = inst->src[0];
1960 if (scan_inst->src[i].abs) {
1961 new_src.negate = 0;
1962 new_src.abs = 1;
1963 }
1964 new_src.negate ^= scan_inst->src[i].negate;
1965 scan_inst->src[i] = new_src;
1966 }
1967 }
1968 }
1969
1970 inst->remove();
1971 progress = true;
1972 }
1973
1974 if (progress)
1975 live_intervals_valid = false;
1976
1977 return progress;
1978 }
1979
1980
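/**
 * Eliminates a MOV from a GRF to an MRF by rewriting the instruction that
 * computed the GRF value to write into the MRF directly.
 *
 * A minimal sketch of the transformation (hypothetical IR):
 *
 *    add vgrf4, vgrf2, vgrf3
 *    mov m2, vgrf4
 *
 * becomes, when vgrf4 has no later readers:
 *
 *    add m2, vgrf2, vgrf3
 */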
1981 bool
1982 fs_visitor::compute_to_mrf()
1983 {
1984 bool progress = false;
1985 int next_ip = 0;
1986
1987 calculate_live_intervals();
1988
1989 foreach_list_safe(node, &this->instructions) {
1990 fs_inst *inst = (fs_inst *)node;
1991
1992 int ip = next_ip;
1993 next_ip++;
1994
1995 if (inst->opcode != BRW_OPCODE_MOV ||
1996 inst->predicate ||
1997 inst->dst.file != MRF || inst->src[0].file != GRF ||
1998 inst->dst.type != inst->src[0].type ||
1999 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2000 continue;
2001
2002 /* Work out which hardware MRF registers are written by this
2003 * instruction.
2004 */
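      /* Note that a COMPR4 write touches MRFs m and m+4, while an ordinary
       * compressed (16-wide) write touches m and m+1, so (mrf_low, mrf_high)
       * is a pair of registers rather than an inclusive range.
       */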
2005 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2006 int mrf_high;
2007 if (inst->dst.reg & BRW_MRF_COMPR4) {
2008 mrf_high = mrf_low + 4;
2009 } else if (dispatch_width == 16 &&
2010 (!inst->force_uncompressed && !inst->force_sechalf)) {
2011 mrf_high = mrf_low + 1;
2012 } else {
2013 mrf_high = mrf_low;
2014 }
2015
2016 /* Can't compute-to-MRF this GRF if someone else was going to
2017 * read it later.
2018 */
2019 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2020 continue;
2021
2022 /* Found a move of a GRF to a MRF. Let's see if we can go
2023 * rewrite the thing that made this GRF to write into the MRF.
2024 */
2025 fs_inst *scan_inst;
2026 for (scan_inst = (fs_inst *)inst->prev;
2027 scan_inst->prev != NULL;
2028 scan_inst = (fs_inst *)scan_inst->prev) {
2029 if (scan_inst->dst.file == GRF &&
2030 scan_inst->dst.reg == inst->src[0].reg) {
2031 /* Found the last thing to write our reg we want to turn
2032 * into a compute-to-MRF.
2033 */
2034
2035 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2036 if (scan_inst->mlen) {
2037 break;
2038 }
2039
2040 /* If it's predicated, it (probably) didn't populate all
2041 * the channels. We might be able to rewrite everything
2042 * that writes that reg, but it would require smarter
2043 * tracking to delay the rewriting until complete success.
2044 */
2045 if (scan_inst->predicate)
2046 break;
2047
2048 /* If it's half of register setup and not the same half as
2049 * our MOV we're trying to remove, bail for now.
2050 */
2051 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2052 scan_inst->force_sechalf != inst->force_sechalf) {
2053 break;
2054 }
2059
2060 if (intel->gen >= 6) {
2061 /* gen6 math instructions must have the destination be
2062 * GRF, so no compute-to-MRF for them.
2063 */
2064 if (scan_inst->is_math()) {
2065 break;
2066 }
2067 }
2068
2069 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2070 /* Found the creator of our MRF's source value. */
2071 scan_inst->dst.file = MRF;
2072 scan_inst->dst.reg = inst->dst.reg;
2073 scan_inst->saturate |= inst->saturate;
2074 inst->remove();
2075 progress = true;
2076 }
2077 break;
2078 }
2079
2080 /* We don't handle flow control here. Most computation of
2081 * values that end up in MRFs are shortly before the MRF
2082 * write anyway.
2083 */
2084 if (scan_inst->opcode == BRW_OPCODE_DO ||
2085 scan_inst->opcode == BRW_OPCODE_WHILE ||
2086 scan_inst->opcode == BRW_OPCODE_ELSE ||
2087 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2088 break;
2089 }
2090
2091 /* You can't read from an MRF, so if someone else reads our
2092 * MRF's source GRF that we wanted to rewrite, that stops us.
2093 */
2094 bool interfered = false;
2095 for (int i = 0; i < 3; i++) {
2096 if (scan_inst->src[i].file == GRF &&
2097 scan_inst->src[i].reg == inst->src[0].reg &&
2098 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2099 interfered = true;
2100 }
2101 }
2102 if (interfered)
2103 break;
2104
2105 if (scan_inst->dst.file == MRF) {
2106 /* If somebody else writes our MRF here, we can't
2107 * compute-to-MRF before that.
2108 */
2109 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2110 int scan_mrf_high;
2111
2112 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2113 scan_mrf_high = scan_mrf_low + 4;
2114 } else if (dispatch_width == 16 &&
2115 (!scan_inst->force_uncompressed &&
2116 !scan_inst->force_sechalf)) {
2117 scan_mrf_high = scan_mrf_low + 1;
2118 } else {
2119 scan_mrf_high = scan_mrf_low;
2120 }
2121
2122 if (mrf_low == scan_mrf_low ||
2123 mrf_low == scan_mrf_high ||
2124 mrf_high == scan_mrf_low ||
2125 mrf_high == scan_mrf_high) {
2126 break;
2127 }
2128 }
2129
2130 if (scan_inst->mlen > 0) {
2131 /* Found a SEND instruction, which means that there are
2132 * live values in MRFs from base_mrf to base_mrf +
2133 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2134 * above it.
2135 */
2136 if (mrf_low >= scan_inst->base_mrf &&
2137 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2138 break;
2139 }
2140 if (mrf_high >= scan_inst->base_mrf &&
2141 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2142 break;
2143 }
2144 }
2145 }
2146 }
2147
2148 if (progress)
2149 live_intervals_valid = false;
2150
2151 return progress;
2152 }
2153
2154 /**
2155 * Walks through basic blocks, looking for repeated MRF writes and
2156 * removing the later ones.
2157 */
2158 bool
2159 fs_visitor::remove_duplicate_mrf_writes()
2160 {
2161 fs_inst *last_mrf_move[16];
2162 bool progress = false;
2163
2164 /* Need to update the MRF tracking for compressed instructions. */
2165 if (dispatch_width == 16)
2166 return false;
2167
2168 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2169
2170 foreach_list_safe(node, &this->instructions) {
2171 fs_inst *inst = (fs_inst *)node;
2172
2173 switch (inst->opcode) {
2174 case BRW_OPCODE_DO:
2175 case BRW_OPCODE_WHILE:
2176 case BRW_OPCODE_IF:
2177 case BRW_OPCODE_ELSE:
2178 case BRW_OPCODE_ENDIF:
2179 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2180 continue;
2181 default:
2182 break;
2183 }
2184
2185 if (inst->opcode == BRW_OPCODE_MOV &&
2186 inst->dst.file == MRF) {
2187 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2188 if (prev_inst && inst->equals(prev_inst)) {
2189 inst->remove();
2190 progress = true;
2191 continue;
2192 }
2193 }
2194
2195 /* Clear out the last-write records for MRFs that were overwritten. */
2196 if (inst->dst.file == MRF) {
2197 last_mrf_move[inst->dst.reg] = NULL;
2198 }
2199
2200 if (inst->mlen > 0) {
2201       /* Found a SEND instruction; clear the records for the MRFs it
2202        * implicitly writes.  We could do better here.
2203        */
2204 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2205 last_mrf_move[inst->base_mrf + i] = NULL;
2206 }
2207 }
2208
2209 /* Clear out any MRF move records whose sources got overwritten. */
2210 if (inst->dst.file == GRF) {
2211 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2212 if (last_mrf_move[i] &&
2213 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2214 last_mrf_move[i] = NULL;
2215 }
2216 }
2217 }
2218
2219 if (inst->opcode == BRW_OPCODE_MOV &&
2220 inst->dst.file == MRF &&
2221 inst->src[0].file == GRF &&
2222 !inst->predicate) {
2223 last_mrf_move[inst->dst.reg] = inst;
2224 }
2225 }
2226
2227 if (progress)
2228 live_intervals_valid = false;
2229
2230 return progress;
2231 }
2232
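/**
 * Prints a single IR instruction in a human-readable form.
 *
 * A two-source instruction comes out roughly like this (hypothetical
 * register numbers):
 *
 *    (+f0.0) add.sat vgrf3+1, vgrf1, u2, (null)
 *
 * Unused sources print as "(null)"; MRF sources and uniform destinations
 * are flagged with *** since they would be invalid.
 */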
2233 void
2234 fs_visitor::dump_instruction(fs_inst *inst)
2235 {
2236 if (inst->predicate) {
2237 printf("(%cf0.%d) ",
2238 inst->predicate_inverse ? '-' : '+',
2239 inst->flag_subreg);
2240 }
2241
2242 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2243 opcode_descs[inst->opcode].name) {
2244 printf("%s", opcode_descs[inst->opcode].name);
2245 } else {
2246 printf("op%d", inst->opcode);
2247 }
2248 if (inst->saturate)
2249 printf(".sat");
2250 if (inst->conditional_mod) {
2251 printf(".cmod");
2252 if (!inst->predicate &&
2253 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2254 inst->opcode != BRW_OPCODE_IF &&
2255 inst->opcode != BRW_OPCODE_WHILE))) {
2256          printf(".f0.%d", inst->flag_subreg);
2257 }
2258 }
2259 printf(" ");
2260
2261
2262 switch (inst->dst.file) {
2263 case GRF:
2264 printf("vgrf%d", inst->dst.reg);
2265 if (inst->dst.reg_offset)
2266 printf("+%d", inst->dst.reg_offset);
2267 break;
2268 case MRF:
2269 printf("m%d", inst->dst.reg);
2270 break;
2271 case BAD_FILE:
2272 printf("(null)");
2273 break;
2274 case UNIFORM:
2275 printf("***u%d***", inst->dst.reg);
2276 break;
2277 default:
2278 printf("???");
2279 break;
2280 }
2281 printf(", ");
2282
2283 for (int i = 0; i < 3; i++) {
2284 if (inst->src[i].negate)
2285 printf("-");
2286 if (inst->src[i].abs)
2287 printf("|");
2288 switch (inst->src[i].file) {
2289 case GRF:
2290 printf("vgrf%d", inst->src[i].reg);
2291 if (inst->src[i].reg_offset)
2292 printf("+%d", inst->src[i].reg_offset);
2293 break;
2294 case MRF:
2295 printf("***m%d***", inst->src[i].reg);
2296 break;
2297 case UNIFORM:
2298 printf("u%d", inst->src[i].reg);
2299 if (inst->src[i].reg_offset)
2300 printf(".%d", inst->src[i].reg_offset);
2301 break;
2302 case BAD_FILE:
2303 printf("(null)");
2304 break;
2305 default:
2306 printf("???");
2307 break;
2308 }
2309 if (inst->src[i].abs)
2310 printf("|");
2311
2312       if (i < 2)
2313 printf(", ");
2314 }
2315
2316 printf(" ");
2317
2318 if (inst->force_uncompressed)
2319 printf("1sthalf ");
2320
2321 if (inst->force_sechalf)
2322 printf("2ndhalf ");
2323
2324 printf("\n");
2325 }
2326
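/**
 * Prints the entire instruction list, prefixing each instruction with its
 * instruction-pointer (ip) index.
 */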
2327 void
2328 fs_visitor::dump_instructions()
2329 {
2330 int ip = 0;
2331 foreach_list(node, &this->instructions) {
2332 fs_inst *inst = (fs_inst *)node;
2333 printf("%d: ", ip++);
2334 dump_instruction(inst);
2335 }
2336 }
2337
2338 /**
2339 * Possibly returns an instruction that set up @param reg.
2340 *
2341 * Sometimes we want to take the result of some expression/variable
2342 * dereference tree and rewrite the instruction generating the result
2343 * of the tree. When processing the tree, we know that the
2344 * instructions generated are all writing temporaries that are dead
2345 * outside of this tree. So, if we have some instructions that write
2346 * a temporary, we're free to point that temp write somewhere else.
2347 *
2348 * Note that this doesn't guarantee that the instruction generated
2349 * only reg -- it might be the size=4 destination of a texture instruction.
2350 */
2351 fs_inst *
2352 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2353 fs_inst *end,
2354 fs_reg reg)
2355 {
2356 if (end == start ||
2357 end->predicate ||
2358 end->force_uncompressed ||
2359 end->force_sechalf ||
2360 reg.reladdr ||
2361 !reg.equals(end->dst)) {
2362 return NULL;
2363 } else {
2364 return end;
2365 }
2366 }
2367
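/**
 * Computes the payload register layout for gen6+ fragment shader threads:
 * the fixed mask/coordinate registers, the enabled barycentric coordinate
 * sets, and the interpolated source depth/W registers.
 */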
2368 void
2369 fs_visitor::setup_payload_gen6()
2370 {
2371 struct intel_context *intel = &brw->intel;
2372 bool uses_depth =
2373 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2374 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2375
2376 assert(intel->gen >= 6);
2377
2378 /* R0-1: masks, pixel X/Y coordinates. */
2379 c->nr_payload_regs = 2;
2380    /* R2: only for 32-pixel dispatch. */
2381
2382 /* R3-26: barycentric interpolation coordinates. These appear in the
2383 * same order that they appear in the brw_wm_barycentric_interp_mode
2384 * enum. Each set of coordinates occupies 2 registers if dispatch width
2385 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2386 * appear if they were enabled using the "Barycentric Interpolation
2387 * Mode" bits in WM_STATE.
2388 */
2389 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2390 if (barycentric_interp_modes & (1 << i)) {
2391 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2392 c->nr_payload_regs += 2;
2393 if (dispatch_width == 16) {
2394 c->nr_payload_regs += 2;
2395 }
2396 }
2397 }
2398
2399 /* R27: interpolated depth if uses source depth */
2400 if (uses_depth) {
2401 c->source_depth_reg = c->nr_payload_regs;
2402 c->nr_payload_regs++;
2403 if (dispatch_width == 16) {
2404 /* R28: interpolated depth if not 8-wide. */
2405 c->nr_payload_regs++;
2406 }
2407 }
2408 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2409 if (uses_depth) {
2410 c->source_w_reg = c->nr_payload_regs;
2411 c->nr_payload_regs++;
2412 if (dispatch_width == 16) {
2413 /* R30: interpolated W if not 8-wide. */
2414 c->nr_payload_regs++;
2415 }
2416 }
2417 /* R31: MSAA position offsets. */
2418 /* R32-: bary for 32-pixel. */
2419 /* R58-59: interp W for 32-pixel. */
2420
2421 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2422 c->source_depth_to_render_target = true;
2423 }
2424 }
2425
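/**
 * Translates the program to FS IR, runs the optimization loop, and
 * performs register allocation.  Returns false if compilation failed,
 * for example because register allocation could not fit the program
 * even after spilling.
 */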
2426 bool
2427 fs_visitor::run()
2428 {
2429 uint32_t orig_nr_params = c->prog_data.nr_params;
2430
2431 if (intel->gen >= 6)
2432 setup_payload_gen6();
2433 else
2434 setup_payload_gen4();
2435
2436 if (0) {
2437 emit_dummy_fs();
2438 } else {
2439 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2440 emit_shader_time_begin();
2441
2442 calculate_urb_setup();
2443 if (intel->gen < 6)
2444 emit_interpolation_setup_gen4();
2445 else
2446 emit_interpolation_setup_gen6();
2447
2448 /* We handle discards by keeping track of the still-live pixels in f0.1.
2449 * Initialize it with the dispatched pixels.
2450 */
2451 if (fp->UsesKill) {
2452 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2453 discard_init->flag_subreg = 1;
2454 }
2455
2456 /* Generate FS IR for main(). (the visitor only descends into
2457 * functions called "main").
2458 */
2459 if (shader) {
2460 foreach_list(node, &*shader->ir) {
2461 ir_instruction *ir = (ir_instruction *)node;
2462 base_ir = ir;
2463 this->result = reg_undef;
2464 ir->accept(this);
2465 }
2466 } else {
2467 emit_fragment_program_code();
2468 }
2469 base_ir = NULL;
2470 if (failed)
2471 return false;
2472
2473 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2474 emit_shader_time_end();
2475
2476 emit_fb_writes();
2477
2478 split_virtual_grfs();
2479
2480 setup_paramvalues_refs();
2481 move_uniform_array_access_to_pull_constants();
2482 setup_pull_constants();
2483
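      /* Run the optimization passes to a fixed point: each pass can expose
       * new opportunities for the others, so keep looping until none of
       * them makes progress.
       */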
2484 bool progress;
2485 do {
2486 progress = false;
2487
2488 compact_virtual_grfs();
2489
2490 progress = remove_duplicate_mrf_writes() || progress;
2491
2492 progress = opt_algebraic() || progress;
2493 progress = opt_cse() || progress;
2494 progress = opt_copy_propagate() || progress;
2495 progress = dead_code_eliminate() || progress;
2496 progress = register_coalesce() || progress;
2497 progress = register_coalesce_2() || progress;
2498 progress = compute_to_mrf() || progress;
2499 } while (progress);
2500
2501 remove_dead_constants();
2502
2503 schedule_instructions(false);
2504
2505 assign_curb_setup();
2506 assign_urb_setup();
2507
2508 if (0) {
2509 /* Debug of register spilling: Go spill everything. */
2510 for (int i = 0; i < virtual_grf_count; i++) {
2511 spill_reg(i);
2512 }
2513 }
2514
2515 if (0)
2516 assign_regs_trivial();
2517 else {
2518 while (!assign_regs()) {
2519 if (failed)
2520 break;
2521 }
2522 }
2523 }
2524 assert(force_uncompressed_stack == 0);
2525 assert(force_sechalf_stack == 0);
2526
2527 if (failed)
2528 return false;
2529
2530 schedule_instructions(true);
2531
2532 if (dispatch_width == 8) {
2533 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2534 } else {
2535 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2536
2537 /* Make sure we didn't try to sneak in an extra uniform */
2538 assert(orig_nr_params == c->prog_data.nr_params);
2539 (void) orig_nr_params;
2540 }
2541
2542 return !failed;
2543 }
2544
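/**
 * Compiles the fragment program to native code: always an 8-wide program,
 * plus a 16-wide variant when the hardware and the shader allow it.
 */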
2545 const unsigned *
2546 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2547 struct gl_fragment_program *fp,
2548 struct gl_shader_program *prog,
2549 unsigned *final_assembly_size)
2550 {
2551 struct intel_context *intel = &brw->intel;
2552 bool start_busy = false;
2553 float start_time = 0;
2554
2555 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2556 start_busy = (intel->batch.last_bo &&
2557 drm_intel_bo_busy(intel->batch.last_bo));
2558 start_time = get_time();
2559 }
2560
2561 struct brw_shader *shader = NULL;
2562 if (prog)
2563 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2564
2565 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2566 if (shader) {
2567 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2568 _mesa_print_ir(shader->ir, NULL);
2569 printf("\n\n");
2570 } else {
2571 printf("ARB_fragment_program %d ir for native fragment shader\n",
2572 fp->Base.Id);
2573 _mesa_print_program(&fp->Base);
2574 }
2575 }
2576
2577 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2578 */
2579 fs_visitor v(brw, c, prog, fp, 8);
2580 if (!v.run()) {
2581 prog->LinkStatus = false;
2582 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2583
2584 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2585 v.fail_msg);
2586
2587 return NULL;
2588 }
2589
2590 exec_list *simd16_instructions = NULL;
2591 fs_visitor v2(brw, c, prog, fp, 16);
2592 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2593 v2.import_uniforms(&v);
2594 if (!v2.run()) {
2595 perf_debug("16-wide shader failed to compile, falling back to "
2596 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2597 } else {
2598 simd16_instructions = &v2.instructions;
2599 }
2600 }
2601
2602 c->prog_data.dispatch_width = 8;
2603
2604 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2605 const unsigned *generated = g.generate_assembly(&v.instructions,
2606 simd16_instructions,
2607 final_assembly_size);
2608
2609 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2610 if (shader->compiled_once)
2611 brw_wm_debug_recompile(brw, prog, &c->key);
2612 shader->compiled_once = true;
2613
2614 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2615 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2616 (get_time() - start_time) * 1000);
2617 }
2618 }
2619
2620 return generated;
2621 }
2622
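/**
 * Precompiles the fragment shader at link time using a guessed program
 * key, so that the likely first-draw state doesn't trigger a recompile.
 * The previously-bound program data is restored afterwards.
 */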
2623 bool
2624 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2625 {
2626 struct brw_context *brw = brw_context(ctx);
2627 struct intel_context *intel = &brw->intel;
2628 struct brw_wm_prog_key key;
2629
2630 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2631 return true;
2632
2633 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2634 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2635 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2636 bool program_uses_dfdy = fp->UsesDFdy;
2637
2638 memset(&key, 0, sizeof(key));
2639
2640 if (intel->gen < 6) {
2641 if (fp->UsesKill)
2642 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2643
2644 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2645 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2646
2647 /* Just assume depth testing. */
2648 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2649 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2650 }
2651
2652 if (prog->Name != 0)
2653 key.proj_attrib_mask = 0xffffffff;
2654
2655 if (intel->gen < 6)
2656 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2657
2658 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2659 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2660 continue;
2661
2662 if (prog->Name == 0)
2663 key.proj_attrib_mask |= 1 << i;
2664
2665 if (intel->gen < 6) {
2666 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2667
2668 if (vp_index >= 0)
2669 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2670 }
2671 }
2672
2673 key.clamp_fragment_color = true;
2674
2675 for (int i = 0; i < MAX_SAMPLERS; i++) {
2676 if (fp->Base.ShadowSamplers & (1 << i)) {
2677 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2678 key.tex.swizzles[i] =
2679 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2680 } else {
2681 /* Color sampler: assume no swizzling. */
2682 key.tex.swizzles[i] = SWIZZLE_XYZW;
2683 }
2684 }
2685
2686 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2687 key.drawable_height = ctx->DrawBuffer->Height;
2688 }
2689
2690 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2691 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2692 }
2693
2694 key.nr_color_regions = 1;
2695
2696 key.program_string_id = bfp->id;
2697
2698 uint32_t old_prog_offset = brw->wm.prog_offset;
2699 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2700
2701 bool success = do_wm_prog(brw, prog, bfp, &key);
2702
2703 brw->wm.prog_offset = old_prog_offset;
2704 brw->wm.prog_data = old_prog_data;
2705
2706 return success;
2707 }