i965/fs: Move the failure for gen7 16-wide intdiv to emit_math().
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

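/* For illustration (not part of the original file): because CMP updates
 * the flag register as well as the destination, a caller can predicate a
 * following instruction on the comparison.  A minimal sketch, assuming
 * fs_reg values "a" and "b" already exist (this mirrors how min/max
 * selection is emitted elsewhere in the driver):
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(BRW_OPCODE_SEL, dst, a, b);
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 */
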
exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

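/* For illustration (not part of the original file): callers get an
 * exec_list back because the pre-gen7 path can emit more than one
 * instruction.  The usual splice pattern, as used later in this file by
 * move_uniform_array_access_to_pull_constants():
 *
 *    exec_list list = VARYING_PULL_CONSTANT_LOAD(temp, surf_index, offset);
 *    inst->insert_before(&list);
 */
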
bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

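/* For illustration (not part of the original file), some sizes this
 * returns, counted in float-sized components: a vec4 -> 4; a mat3 ->
 * 3 columns * 3 elements = 9; float[10] -> 10 * 1 = 10; and
 * struct { vec3 a; float b; } -> 3 + 1 = 4.
 */
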
fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

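/* For illustration (not part of the original file), the rollover math
 * behind the comment above: a 32-bit counter ticking at ~1.2 GHz wraps
 * after 2^32 / 1.2e9 ~= 3.6 seconds, which is the "~3 seconds" cited.
 */
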
void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

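/* For illustration (not part of the original file), two values this
 * computes: an 8-wide SIN takes 1 MRF (1 * 8/8), while a 16-wide POW or
 * INT DIV takes 4 MRFs (2 operands * 16/8 registers each).
 */
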
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


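/* For illustration (not part of the original file): for a mat4 uniform at
 * location loc, the matrix case above recurses into four vec4 columns, so
 * nr_params grows by 16 (one per component) while the function returns 4,
 * the number of parameter locations consumed.
 */
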
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

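/* For illustration (not part of the original file), the Y flip above with
 * made-up numbers: flipping for a 600-pixel-tall drawable with
 * half-integer pixel centers gives offset = 0.5 + 600 - 1.0 = 599.5, and
 * with pixel_y negated the emitted ADD computes
 * gl_FragCoord.y = 599.5 - pixel_y.
 */
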
fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }
         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

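/* For illustration (not part of the original file), the gen6+ sequence
 * above, assuming (as the emitted code implies) that g0.0 carries a
 * back-facing flag in bit 15: ASR by 15 brings that bit down to bit 0,
 * NOT inverts it, and AND with 1 masks off the rest, so the result is 1
 * for front-facing and 0 for back-facing.
 */
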
fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

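/* For illustration (not part of the original file): on the pre-gen6 path
 * above, computing q = a / b as
 *
 *    emit_math(SHADER_OPCODE_INT_QUOTIENT, q, a, b);
 *
 * sends b (the denominator) as the instruction's first operand and moves
 * a (the numerator) into MRF base_mrf + 1 -- the opposite of the POW
 * operand ordering, per the PRM quote.
 */
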
/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

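/* For illustration (not part of the original file), the mapping above
 * with made-up numbers: with nr_payload_regs = 2, uniform slot 11 becomes
 * brw_vec1_grf(2 + 11/8, 11 % 8), i.e. the fourth channel of g3 (g3.3).
 */
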
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

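/* For illustration (not part of the original file), how the renumbering
 * above works: a virtual GRF v of size 3 keeps v for reg_offset 0 (its
 * size now 1) and gets two fresh size-1 GRFs n and n+1; an access at
 * reg_offset 2 becomes register n + 2 - 1 = n + 1 with offset 0.
 */
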
/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         /* We've already done setup_paramvalues_refs() so no need to worry
          * about param_index and param_offset.
          */
         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

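/* For illustration (not part of the original file), the offset/smear
 * arithmetic above with made-up numbers: a demoted uniform with
 * pull_index 5 loads the 16-byte-aligned block at byte offset
 * (5 * 4) & ~15 = 16, then smear = 5 & 3 = 1 selects the second dword of
 * that block.
 */
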
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

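/* For illustration (not part of the original file), the MUL rewrites
 * above expressed as LIR before/after:
 *
 *    MUL dst, a, 1.0F   becomes   MOV dst, a
 *    MUL dst, a, 0.0F   becomes   MOV dst, 0.0F
 */
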
/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something deffed but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list_safe(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}

1866 bool
1867 fs_visitor::register_coalesce()
1868 {
1869 bool progress = false;
1870 int if_depth = 0;
1871 int loop_depth = 0;
1872
1873 foreach_list_safe(node, &this->instructions) {
1874 fs_inst *inst = (fs_inst *)node;
1875
1876 /* Make sure that we dominate the instructions we're going to
1877 * scan for interfering with our coalescing, or we won't have
1878 * scanned enough to see if anything interferes with our
1879 * coalescing. We don't dominate the following instructions if
1880 * we're in a loop or an if block.
1881 */
1882 switch (inst->opcode) {
1883 case BRW_OPCODE_DO:
1884 loop_depth++;
1885 break;
1886 case BRW_OPCODE_WHILE:
1887 loop_depth--;
1888 break;
1889 case BRW_OPCODE_IF:
1890 if_depth++;
1891 break;
1892 case BRW_OPCODE_ENDIF:
1893 if_depth--;
1894 break;
1895 default:
1896 break;
1897 }
1898 if (loop_depth || if_depth)
1899 continue;
1900
1901 if (inst->opcode != BRW_OPCODE_MOV ||
1902 inst->predicate ||
1903 inst->saturate ||
1904 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1905 inst->src[0].file != UNIFORM)||
1906 inst->dst.type != inst->src[0].type)
1907 continue;
1908
1909 bool has_source_modifiers = (inst->src[0].abs ||
1910 inst->src[0].negate ||
1911 inst->src[0].file == UNIFORM);
1912
1913 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1914 * them: check for no writes to either one until the exit of the
1915 * program.
1916 */
1917 bool interfered = false;
1918
1919 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1920 !scan_inst->is_tail_sentinel();
1921 scan_inst = (fs_inst *)scan_inst->next) {
1922 if (scan_inst->dst.file == GRF) {
1923 if (scan_inst->overwrites_reg(inst->dst) ||
1924 scan_inst->overwrites_reg(inst->src[0])) {
1925 interfered = true;
1926 break;
1927 }
1928 }
1929
1930 /* The gen6 MATH instruction can't handle source modifiers or
1931 * unusual register regions, so avoid coalescing those for
1932 * now. We should do something more specific.
1933 */
1934 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1935 interfered = true;
1936 break;
1937 }
1938
1939 /* The accumulator result appears to get used for the
1940 * conditional modifier generation. When negating a UD
1941 * value, there is a 33rd bit generated for the sign in the
1942 * accumulator value, so now you can't check, for example,
1943 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1944 */
1945 if (scan_inst->conditional_mod &&
1946 inst->src[0].negate &&
1947 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1948 interfered = true;
1949 break;
1950 }
1951 }
1952 if (interfered) {
1953 continue;
1954 }
1955
      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

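/**
 * Attempts to rewrite the instruction that computed a GRF value so that it
 * writes directly into the MRF that a following raw MOV would have copied
 * it to, then removes the MOV.
 */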
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }
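      /* (With the COMPR4 bit set, a compressed write to m<n> lands in
       * m<n> and m<n+4> rather than m<n> and m<n+1>, hence the +4.)
       */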

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
       * the instruction that generated this GRF to write into the MRF
       * instead.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* SENDs can only write to GRFs, so no compute-to-MRF. */
            if (scan_inst->mlen) {
               break;
            }

            /* If it's predicated, it (probably) didn't populate all
             * the channels.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->predicate)
               break;

            /* If it's half of register setup and not the same half as
             * our MOV we're trying to remove, bail for now.
             */
            if (scan_inst->force_uncompressed != inst->force_uncompressed ||
                scan_inst->force_sechalf != inst->force_sechalf) {
               break;
            }

            if (intel->gen >= 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle flow control here.  Most computation of
          * values that end up in MRFs happens shortly before the MRF
          * write anyway.
          */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Compressed instructions would need different MRF tracking, which we
    * don't do yet, so skip the pass in 16-wide dispatch.
    */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicate) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

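/**
 * Prints one FS IR instruction in a human-readable form for debugging;
 * the output looks like (illustratively) "add.sat vgrf7, vgrf5, u2, (null)".
 */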
void
fs_visitor::dump_instruction(fs_inst *inst)
{
   if (inst->predicate) {
      printf("(%cf0.%d) ",
             inst->predicate_inverse ? '-' : '+',
             inst->flag_subreg);
   }

   if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
       opcode_descs[inst->opcode].name) {
      printf("%s", opcode_descs[inst->opcode].name);
   } else {
      printf("op%d", inst->opcode);
   }
   if (inst->saturate)
      printf(".sat");
   if (inst->conditional_mod) {
      printf(".cmod");
      if (!inst->predicate &&
          (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                              inst->opcode != BRW_OPCODE_IF &&
                              inst->opcode != BRW_OPCODE_WHILE))) {
         printf(".f0.%d", inst->flag_subreg);
      }
   }
   printf(" ");

   switch (inst->dst.file) {
   case GRF:
      printf("vgrf%d", inst->dst.reg);
      if (inst->dst.reg_offset)
         printf("+%d", inst->dst.reg_offset);
      break;
   case MRF:
      printf("m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   case UNIFORM:
      printf("***u%d***", inst->dst.reg);
      break;
   default:
      printf("???");
      break;
   }
   printf(", ");

   for (int i = 0; i < 3; i++) {
      if (inst->src[i].negate)
         printf("-");
      if (inst->src[i].abs)
         printf("|");
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf("+%d", inst->src[i].reg_offset);
         break;
      case MRF:
         printf("***m%d***", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf(".%d", inst->src[i].reg_offset);
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      default:
         printf("???");
         break;
      }
      if (inst->src[i].abs)
         printf("|");

      /* Don't print a trailing comma after the last source. */
      if (i < 2)
         printf(", ");
   }

   printf(" ");

   if (inst->force_uncompressed)
      printf("1sthalf ");

   if (inst->force_sechalf)
      printf("2ndhalf ");

   printf("\n");
}

void
fs_visitor::dump_instructions()
{
   int ip = 0;
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;
      printf("%d: ", ip++);
      dump_instruction(inst);
   }
}

/**
 * Possibly returns an instruction that set up \c reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the instruction wrote only
 * reg -- it might be the size=4 destination of a texture instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           fs_reg reg)
{
   if (end == start ||
       end->predicate ||
       end->force_uncompressed ||
       end->force_sechalf ||
       reg.reladdr ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

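/**
 * Lays out the gen6+ fragment shader thread payload: the fixed mask and
 * pixel-coordinate registers, then one set of barycentric coordinates per
 * enabled interpolation mode, then optional source depth and source W.
 */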
void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth. */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.  Reading WPOS
    * implies both source depth and source W, so the same flag gates both.
    */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}

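/**
 * Drives the translation of the shader into FS IR, the optimization
 * pipeline, and register allocation; returns false if compilation failed.
 */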
bool
fs_visitor::run()
{
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      move_uniform_array_access_to_pull_constants();
      setup_pull_constants();

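      /* Run the optimization passes to a fixed point: keep going as long
       * as any pass makes progress.
       */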
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions(false);

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0) {
         assign_regs_trivial();
      } else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   schedule_instructions(true);

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform. */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   return !failed;
}

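/**
 * Compiles the fragment program to native code, generating an 8-wide
 * program and, when possible, a 16-wide variant alongside it.
 */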
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

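   /* Try to build a 16-wide program as well; if that compile fails we
    * simply ship only the 8-wide program.
    */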
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}

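/**
 * Precompiles the fragment program at link time with a best-guess state
 * key, so that a real compile at draw time is less likely to be needed.
 */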
bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   if (intel->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (prog->Name != 0)
      key.proj_attrib_mask = 0xffffffff;

   if (intel->gen < 6)
      key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);

   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
         continue;

      if (prog->Name == 0)
         key.proj_attrib_mask |= 1 << i;

      if (intel->gen < 6) {
         int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

         if (vp_index >= 0)
            key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
      }
   }

   key.clamp_fragment_color = true;

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

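   /* do_wm_prog() installs the program it builds, so save the currently
    * bound WM program and put it back afterwards; the precompile is only
    * meant to warm the program cache.
    */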
   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}