i965/fs: Remove the param_index/param_offset indirection.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
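
/* Each ALU1/ALU2 invocation above expands to a small builder; for
 * example, ALU2(ADD) becomes:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * Note that these only construct the instruction out of mem_ctx; the
 * caller still has to emit() it or insert it into an instruction list.
 */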

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
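
/* A typical pattern (a sketch, not taken from this file) pairs CMP with
 * a predicated instruction that consumes the flag register it wrote:
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(BRW_OPCODE_SEL, dst, a, b);
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 */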

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}
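
/* The SHADER_TIME_ADD message payload built above is two MRFs starting
 * at base_mrf: m6 carries the offset into the shader time buffer (one
 * 32-bit slot per entry, hence index * 4) and m7 carries the value to
 * accumulate; mlen == 2 matches that layout.
 */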

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[loc][i].f;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
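
/* For example, a mat3 uniform at location L recurses as three vec3
 * columns above: each column appends three pointers to
 * c->prog_data.param and returns 1, so the matrix occupies locations
 * L..L+2 and nine param slots in all.
 */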

/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}
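
/* The selection above, in table form:
 *
 *    interpolation_mode       centroid  barycoord_mode
 *    INTERP_QUALIFIER_SMOOTH  no        BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC
 *    INTERP_QUALIFIER_SMOOTH  yes       BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC
 *    anything else            no        BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC
 *    anything else            yes       BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC
 */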

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
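
/* On gen6+ the sequence above extracts the payload bit (bit 15 of g0.0,
 * set for back-facing primitives, hence the NOT): the ASR by 15 moves
 * that bit to bit 0, NOT inverts it, and AND 1 throws the rest away,
 * leaving 1 for front faces and 0 for back faces.
 */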

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *    ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}
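
/* Eight floats of push constants pack into each GRF, so with, say,
 * nr_payload_regs == 2, UNIFORM slot 10 maps to g3.2 above
 * (2 + 10 / 8 == 3, subregister 10 % 8 == 2).
 */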

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 1 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }
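
   /* At this point a split virtual GRF v of original size n keeps
    * reg_offset 0 in v itself; offsets 1..n-1 live in the n-1 freshly
    * allocated size-1 GRFs starting at new_virtual_grf[v], which is
    * what the rewrite below relies on.
    */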

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}
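
/* The net effect on, e.g., a "uniform float a[8]" indexed as a[i] is
 * (a sketch):
 *
 *    ADD offset, i, <pull_constant_loc of a>
 *    VARYING_PULL_CONSTANT_LOAD temp, SURF_INDEX_FRAG_CONST_BUFFER, offset
 *
 * with the original source operand rewritten to read temp directly.
 */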

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}
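
/* Each UNIFORM_PULL_CONSTANT_LOAD emitted above fetches a 16-byte
 * aligned vec4 from the constant buffer (note the & ~15 on the byte
 * offset); smear = pull_index & 3 then picks out the one component the
 * instruction actually reads.
 */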

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}
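
/* Note that a * 0.0 -> 0.0 is not IEEE-exact (it drops NaN and Inf
 * propagation and the sign of zero); GLSL's precision rules are loose
 * enough that this is considered acceptable here.
 */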

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}

bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1982
1983
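/* Tries to rewrite the instruction computing a GRF so that it writes
 * directly into the MRF the GRF was only being copied to.
 *
 * A hypothetical example:
 *
 *    LINTERP vgrf7, ...
 *    MOV m4, vgrf7
 *
 * becomes
 *
 *    LINTERP m4, ...
 *
 * when vgrf7 isn't read again afterwards.
 */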
1984 bool
1985 fs_visitor::compute_to_mrf()
1986 {
1987 bool progress = false;
1988 int next_ip = 0;
1989
1990 calculate_live_intervals();
1991
1992 foreach_list_safe(node, &this->instructions) {
1993 fs_inst *inst = (fs_inst *)node;
1994
1995 int ip = next_ip;
1996 next_ip++;
1997
1998 if (inst->opcode != BRW_OPCODE_MOV ||
1999 inst->predicate ||
2000 inst->dst.file != MRF || inst->src[0].file != GRF ||
2001 inst->dst.type != inst->src[0].type ||
2002 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2003 continue;
2004
2005 /* Work out which hardware MRF registers are written by this
2006 * instruction.
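       *
       * For example, a COMPR4 write to m2 lands in m2 for the first half
       * of the pixels and in m6 (m2 + 4) for the second half, so both
       * ends of that range count as written.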
2007 */
2008 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2009 int mrf_high;
2010 if (inst->dst.reg & BRW_MRF_COMPR4) {
2011 mrf_high = mrf_low + 4;
2012 } else if (dispatch_width == 16 &&
2013 (!inst->force_uncompressed && !inst->force_sechalf)) {
2014 mrf_high = mrf_low + 1;
2015 } else {
2016 mrf_high = mrf_low;
2017 }
2018
2019 /* Can't compute-to-MRF this GRF if someone else was going to
2020 * read it later.
2021 */
2022 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2023 continue;
2024
2025 /* Found a move of a GRF to a MRF. Let's see if we can go
2026 * rewrite the thing that made this GRF to write into the MRF.
2027 */
2028 fs_inst *scan_inst;
2029 for (scan_inst = (fs_inst *)inst->prev;
2030 scan_inst->prev != NULL;
2031 scan_inst = (fs_inst *)scan_inst->prev) {
2032 if (scan_inst->dst.file == GRF &&
2033 scan_inst->dst.reg == inst->src[0].reg) {
2034 	    /* Found the last instruction to write the GRF we want to
2035 	     * turn into a compute-to-MRF.
2036 	     */
2037
2038 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2039 if (scan_inst->mlen) {
2040 break;
2041 }
2042
2043 /* If it's predicated, it (probably) didn't populate all
2044 * the channels. We might be able to rewrite everything
2045 * that writes that reg, but it would require smarter
2046 * tracking to delay the rewriting until complete success.
2047 */
2048 if (scan_inst->predicate)
2049 break;
2050
2051 	    /* If it only writes one half of the register, and it's not the
2052 	     * same half as the MOV we're trying to remove, bail for now.
2053 	     */
2054 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2055 scan_inst->force_sechalf != inst->force_sechalf) {
2056 break;
2057 }
2058
2062
2063 if (intel->gen >= 6) {
2064 /* gen6 math instructions must have the destination be
2065 * GRF, so no compute-to-MRF for them.
2066 */
2067 if (scan_inst->is_math()) {
2068 break;
2069 }
2070 }
2071
2072 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2073 /* Found the creator of our MRF's source value. */
2074 scan_inst->dst.file = MRF;
2075 scan_inst->dst.reg = inst->dst.reg;
2076 scan_inst->saturate |= inst->saturate;
2077 inst->remove();
2078 progress = true;
2079 }
2080 break;
2081 }
2082
2083 	 /* We don't handle control flow here.  Most computation of
2084 	  * values that end up in MRFs happens shortly before the MRF
2085 	  * write anyway.
2086 	  */
2087 if (scan_inst->opcode == BRW_OPCODE_DO ||
2088 scan_inst->opcode == BRW_OPCODE_WHILE ||
2089 scan_inst->opcode == BRW_OPCODE_ELSE ||
2090 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2091 break;
2092 }
2093
2094 	 /* You can't read from an MRF, so if anything else reads the GRF
2095 	  * we wanted to turn into an MRF write, we have to give up.
2096 	  */
2097 bool interfered = false;
2098 for (int i = 0; i < 3; i++) {
2099 if (scan_inst->src[i].file == GRF &&
2100 scan_inst->src[i].reg == inst->src[0].reg &&
2101 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2102 interfered = true;
2103 }
2104 }
2105 if (interfered)
2106 break;
2107
2108 if (scan_inst->dst.file == MRF) {
2109 /* If somebody else writes our MRF here, we can't
2110 * compute-to-MRF before that.
2111 */
2112 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2113 int scan_mrf_high;
2114
2115 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2116 scan_mrf_high = scan_mrf_low + 4;
2117 } else if (dispatch_width == 16 &&
2118 (!scan_inst->force_uncompressed &&
2119 !scan_inst->force_sechalf)) {
2120 scan_mrf_high = scan_mrf_low + 1;
2121 } else {
2122 scan_mrf_high = scan_mrf_low;
2123 }
2124
2125 if (mrf_low == scan_mrf_low ||
2126 mrf_low == scan_mrf_high ||
2127 mrf_high == scan_mrf_low ||
2128 mrf_high == scan_mrf_high) {
2129 break;
2130 }
2131 }
2132
2133 if (scan_inst->mlen > 0) {
2134 /* Found a SEND instruction, which means that there are
2135 * live values in MRFs from base_mrf to base_mrf +
2136 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2137 * above it.
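	     *
	     * E.g., with base_mrf == 2 and mlen == 3, m2..m4 are live
	     * across the SEND, so an MRF write landing in that range
	     * can't be hoisted above it.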
2138 */
2139 if (mrf_low >= scan_inst->base_mrf &&
2140 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2141 break;
2142 }
2143 if (mrf_high >= scan_inst->base_mrf &&
2144 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2145 break;
2146 }
2147 }
2148 }
2149 }
2150
2151 if (progress)
2152 live_intervals_valid = false;
2153
2154 return progress;
2155 }
2156
2157 /**
2158 * Walks through basic blocks, looking for repeated MRF writes and
2159 * removing the later ones.
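 *
 * For example, given a (hypothetical) sequence
 *
 *    MOV m3, vgrf5
 *    MOV m4, vgrf6
 *    MOV m3, vgrf5
 *
 * the second MOV to m3 matches the recorded last write to m3 and is
 * removed.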
2160 */
2161 bool
2162 fs_visitor::remove_duplicate_mrf_writes()
2163 {
2164 fs_inst *last_mrf_move[16];
2165 bool progress = false;
2166
2167    /* This pass's MRF tracking would need updating to handle compressed instructions, so skip it in 16-wide for now. */
2168 if (dispatch_width == 16)
2169 return false;
2170
2171 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2172
2173 foreach_list_safe(node, &this->instructions) {
2174 fs_inst *inst = (fs_inst *)node;
2175
2176 switch (inst->opcode) {
2177 case BRW_OPCODE_DO:
2178 case BRW_OPCODE_WHILE:
2179 case BRW_OPCODE_IF:
2180 case BRW_OPCODE_ELSE:
2181 case BRW_OPCODE_ENDIF:
2182 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2183 continue;
2184 default:
2185 break;
2186 }
2187
2188 if (inst->opcode == BRW_OPCODE_MOV &&
2189 inst->dst.file == MRF) {
2190 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2191 if (prev_inst && inst->equals(prev_inst)) {
2192 inst->remove();
2193 progress = true;
2194 continue;
2195 }
2196 }
2197
2198 /* Clear out the last-write records for MRFs that were overwritten. */
2199 if (inst->dst.file == MRF) {
2200 last_mrf_move[inst->dst.reg] = NULL;
2201 }
2202
2203 if (inst->mlen > 0) {
2204 	 /* Found a SEND instruction, which will include some number of
2205 	  * implied MRF writes.  We could do better here.
2206 */
2207 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2208 last_mrf_move[inst->base_mrf + i] = NULL;
2209 }
2210 }
2211
2212 /* Clear out any MRF move records whose sources got overwritten. */
2213 if (inst->dst.file == GRF) {
2214 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2215 if (last_mrf_move[i] &&
2216 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2217 last_mrf_move[i] = NULL;
2218 }
2219 }
2220 }
2221
2222 if (inst->opcode == BRW_OPCODE_MOV &&
2223 inst->dst.file == MRF &&
2224 inst->src[0].file == GRF &&
2225 !inst->predicate) {
2226 last_mrf_move[inst->dst.reg] = inst;
2227 }
2228 }
2229
2230 if (progress)
2231 live_intervals_valid = false;
2232
2233 return progress;
2234 }
2235
2236 void
2237 fs_visitor::dump_instruction(fs_inst *inst)
2238 {
2239 if (inst->predicate) {
2240 printf("(%cf0.%d) ",
2241 inst->predicate_inverse ? '-' : '+',
2242 inst->flag_subreg);
2243 }
2244
2245 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2246 opcode_descs[inst->opcode].name) {
2247 printf("%s", opcode_descs[inst->opcode].name);
2248 } else {
2249 printf("op%d", inst->opcode);
2250 }
2251 if (inst->saturate)
2252 printf(".sat");
2253 if (inst->conditional_mod) {
2254 printf(".cmod");
2255 if (!inst->predicate &&
2256 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2257 inst->opcode != BRW_OPCODE_IF &&
2258 inst->opcode != BRW_OPCODE_WHILE))) {
2259 	 printf(".f0.%d", inst->flag_subreg);
2260 }
2261 }
2262 printf(" ");
2263
2264
2265 switch (inst->dst.file) {
2266 case GRF:
2267 printf("vgrf%d", inst->dst.reg);
2268 if (inst->dst.reg_offset)
2269 printf("+%d", inst->dst.reg_offset);
2270 break;
2271 case MRF:
2272 printf("m%d", inst->dst.reg);
2273 break;
2274 case BAD_FILE:
2275 printf("(null)");
2276 break;
2277 case UNIFORM:
2278 printf("***u%d***", inst->dst.reg);
2279 break;
2280 default:
2281 printf("???");
2282 break;
2283 }
2284 printf(", ");
2285
2286 for (int i = 0; i < 3; i++) {
2287 if (inst->src[i].negate)
2288 printf("-");
2289 if (inst->src[i].abs)
2290 printf("|");
2291 switch (inst->src[i].file) {
2292 case GRF:
2293 printf("vgrf%d", inst->src[i].reg);
2294 if (inst->src[i].reg_offset)
2295 printf("+%d", inst->src[i].reg_offset);
2296 break;
2297 case MRF:
2298 printf("***m%d***", inst->src[i].reg);
2299 break;
2300 case UNIFORM:
2301 printf("u%d", inst->src[i].reg);
2302 if (inst->src[i].reg_offset)
2303 printf(".%d", inst->src[i].reg_offset);
2304 break;
2305 case BAD_FILE:
2306 printf("(null)");
2307 break;
2308 default:
2309 printf("???");
2310 break;
2311 }
2312 if (inst->src[i].abs)
2313 printf("|");
2314
2315       if (i < 2)
2316 printf(", ");
2317 }
2318
2319 printf(" ");
2320
2321 if (inst->force_uncompressed)
2322 printf("1sthalf ");
2323
2324 if (inst->force_sechalf)
2325 printf("2ndhalf ");
2326
2327 printf("\n");
2328 }
2329
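/* Prints the instruction list, one instruction per line, in roughly this
 * (illustrative) form:
 *
 *    0: mov.sat vgrf7, -vgrf3, (null), (null)
 *    1: add vgrf8, vgrf7, u1, (null)
 */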
2330 void
2331 fs_visitor::dump_instructions()
2332 {
2333 int ip = 0;
2334 foreach_list(node, &this->instructions) {
2335 fs_inst *inst = (fs_inst *)node;
2336 printf("%d: ", ip++);
2337 dump_instruction(inst);
2338 }
2339 }
2340
2341 /**
2342 * Possibly returns an instruction that set up @param reg.
2343 *
2344 * Sometimes we want to take the result of some expression/variable
2345 * dereference tree and rewrite the instruction generating the result
2346 * of the tree. When processing the tree, we know that the
2347 * instructions generated are all writing temporaries that are dead
2348 * outside of this tree. So, if we have some instructions that write
2349 * a temporary, we're free to point that temp write somewhere else.
2350 *
2351  * Note that this doesn't guarantee that the returned instruction wrote
2352  * only @reg -- it might be the size-4 destination of a texture instruction.
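 *
 * A sketch of a typical caller (names are illustrative only):
 *
 *    fs_inst *gen = get_instruction_generating_reg(pre_inst, last_inst, src);
 *    if (gen)
 *       gen->dst = lhs;      // point the generator straight at the new dest
 *    else
 *       ...emit a MOV from src to lhs instead...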
2353 */
2354 fs_inst *
2355 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2356 fs_inst *end,
2357 fs_reg reg)
2358 {
2359 if (end == start ||
2360 end->predicate ||
2361 end->force_uncompressed ||
2362 end->force_sechalf ||
2363 reg.reladdr ||
2364 !reg.equals(end->dst)) {
2365 return NULL;
2366 } else {
2367 return end;
2368 }
2369 }
2370
2371 void
2372 fs_visitor::setup_payload_gen6()
2373 {
2374 struct intel_context *intel = &brw->intel;
2375 bool uses_depth =
2376 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2377 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2378
2379 assert(intel->gen >= 6);
2380
2381 /* R0-1: masks, pixel X/Y coordinates. */
2382 c->nr_payload_regs = 2;
2383    /* R2: only for 32-pixel dispatch. */
2384
2385 /* R3-26: barycentric interpolation coordinates. These appear in the
2386 * same order that they appear in the brw_wm_barycentric_interp_mode
2387 * enum. Each set of coordinates occupies 2 registers if dispatch width
2388 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2389 * appear if they were enabled using the "Barycentric Interpolation
2390 * Mode" bits in WM_STATE.
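    *
    * As a worked example (values assumed): with dispatch_width == 16 and
    * two barycentric modes enabled, each mode takes 4 registers, so the
    * loop below advances nr_payload_regs from 2 to 2 + 4 + 4 = 10.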
2391 */
2392 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2393 if (barycentric_interp_modes & (1 << i)) {
2394 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2395 c->nr_payload_regs += 2;
2396 if (dispatch_width == 16) {
2397 c->nr_payload_regs += 2;
2398 }
2399 }
2400 }
2401
2402    /* R27: interpolated depth, if the shader uses source depth. */
2403 if (uses_depth) {
2404 c->source_depth_reg = c->nr_payload_regs;
2405 c->nr_payload_regs++;
2406 if (dispatch_width == 16) {
2407 /* R28: interpolated depth if not 8-wide. */
2408 c->nr_payload_regs++;
2409 }
2410 }
2411    /* R29: interpolated W (GEN6_WM_USES_SOURCE_W).  Set up under the same uses_depth condition, since source W and source depth both come from gl_FragCoord. */
2412 if (uses_depth) {
2413 c->source_w_reg = c->nr_payload_regs;
2414 c->nr_payload_regs++;
2415 if (dispatch_width == 16) {
2416 /* R30: interpolated W if not 8-wide. */
2417 c->nr_payload_regs++;
2418 }
2419 }
2420 /* R31: MSAA position offsets. */
2421 /* R32-: bary for 32-pixel. */
2422 /* R58-59: interp W for 32-pixel. */
2423
2424 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2425 c->source_depth_to_render_target = true;
2426 }
2427 }
2428
2429 bool
2430 fs_visitor::run()
2431 {
2432 sanity_param_count = fp->Base.Parameters->NumParameters;
2433 uint32_t orig_nr_params = c->prog_data.nr_params;
2434
2435 if (intel->gen >= 6)
2436 setup_payload_gen6();
2437 else
2438 setup_payload_gen4();
2439
2440 if (0) {
2441 emit_dummy_fs();
2442 } else {
2443 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2444 emit_shader_time_begin();
2445
2446 calculate_urb_setup();
2447 if (intel->gen < 6)
2448 emit_interpolation_setup_gen4();
2449 else
2450 emit_interpolation_setup_gen6();
2451
2452 /* We handle discards by keeping track of the still-live pixels in f0.1.
2453 * Initialize it with the dispatched pixels.
2454 */
2455 if (fp->UsesKill) {
2456 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2457 discard_init->flag_subreg = 1;
2458 }
2459
2460       /* Generate FS IR for main().  (The visitor only descends into
2461        * functions called "main".)
2462 */
2463 if (shader) {
2464 foreach_list(node, &*shader->ir) {
2465 ir_instruction *ir = (ir_instruction *)node;
2466 base_ir = ir;
2467 this->result = reg_undef;
2468 ir->accept(this);
2469 }
2470 } else {
2471 emit_fragment_program_code();
2472 }
2473 base_ir = NULL;
2474 if (failed)
2475 return false;
2476
2477 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2478 emit_shader_time_end();
2479
2480 emit_fb_writes();
2481
2482 split_virtual_grfs();
2483
2484 move_uniform_array_access_to_pull_constants();
2485 setup_pull_constants();
2486
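      /* Run the optimization passes to a fixed point: one pass often
       * enables another (copy propagation, for example, can leave a MOV
       * dead for dead_code_eliminate to clean up), so loop until no pass
       * reports progress.
       */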
2487 bool progress;
2488 do {
2489 progress = false;
2490
2491 compact_virtual_grfs();
2492
2493 progress = remove_duplicate_mrf_writes() || progress;
2494
2495 progress = opt_algebraic() || progress;
2496 progress = opt_cse() || progress;
2497 progress = opt_copy_propagate() || progress;
2498 progress = dead_code_eliminate() || progress;
2499 progress = register_coalesce() || progress;
2500 progress = register_coalesce_2() || progress;
2501 progress = compute_to_mrf() || progress;
2502 } while (progress);
2503
2504 remove_dead_constants();
2505
2506 schedule_instructions(false);
2507
2508 assign_curb_setup();
2509 assign_urb_setup();
2510
2511 if (0) {
2512 /* Debug of register spilling: Go spill everything. */
2513 for (int i = 0; i < virtual_grf_count; i++) {
2514 spill_reg(i);
2515 }
2516 }
2517
2518 if (0)
2519 assign_regs_trivial();
2520 else {
2521 while (!assign_regs()) {
2522 if (failed)
2523 break;
2524 }
2525 }
2526 }
2527 assert(force_uncompressed_stack == 0);
2528 assert(force_sechalf_stack == 0);
2529
2530 if (failed)
2531 return false;
2532
2533 schedule_instructions(true);
2534
2535 if (dispatch_width == 8) {
2536 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2537 } else {
2538 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2539
2540 /* Make sure we didn't try to sneak in an extra uniform */
2541 assert(orig_nr_params == c->prog_data.nr_params);
2542 (void) orig_nr_params;
2543 }
2544
2545 /* If any state parameters were appended, then ParameterValues could have
2546 * been realloced, in which case the driver uniform storage set up by
2547 * _mesa_associate_uniform_storage() would point to freed memory. Make
2548 * sure that didn't happen.
2549 */
2550 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2551
2552 return !failed;
2553 }
2554
2555 const unsigned *
2556 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2557 struct gl_fragment_program *fp,
2558 struct gl_shader_program *prog,
2559 unsigned *final_assembly_size)
2560 {
2561 struct intel_context *intel = &brw->intel;
2562 bool start_busy = false;
2563 float start_time = 0;
2564
2565 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2566 start_busy = (intel->batch.last_bo &&
2567 drm_intel_bo_busy(intel->batch.last_bo));
2568 start_time = get_time();
2569 }
2570
2571 struct brw_shader *shader = NULL;
2572 if (prog)
2573 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2574
2575 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2576 if (shader) {
2577 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2578 _mesa_print_ir(shader->ir, NULL);
2579 printf("\n\n");
2580 } else {
2581 printf("ARB_fragment_program %d ir for native fragment shader\n",
2582 fp->Base.Id);
2583 _mesa_print_program(&fp->Base);
2584 }
2585 }
2586
2587 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2588 */
2589 fs_visitor v(brw, c, prog, fp, 8);
2590 if (!v.run()) {
2591 prog->LinkStatus = false;
2592 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2593
2594 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2595 v.fail_msg);
2596
2597 return NULL;
2598 }
2599
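   /* Also try to compile a 16-wide (SIMD16) version, which shades twice
    * as many pixels per thread.  The conditions below skip it on gen4
    * hardware and whenever pull constants are in use, cases this path
    * doesn't handle in 16-wide mode.
    */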
2600 exec_list *simd16_instructions = NULL;
2601 fs_visitor v2(brw, c, prog, fp, 16);
2602 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2603 v2.import_uniforms(&v);
2604 if (!v2.run()) {
2605 perf_debug("16-wide shader failed to compile, falling back to "
2606 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2607 } else {
2608 simd16_instructions = &v2.instructions;
2609 }
2610 }
2611
2612 c->prog_data.dispatch_width = 8;
2613
2614 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2615 const unsigned *generated = g.generate_assembly(&v.instructions,
2616 simd16_instructions,
2617 final_assembly_size);
2618
2619 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2620 if (shader->compiled_once)
2621 brw_wm_debug_recompile(brw, prog, &c->key);
2622 shader->compiled_once = true;
2623
2624 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2625 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2626 (get_time() - start_time) * 1000);
2627 }
2628 }
2629
2630 return generated;
2631 }
2632
2633 bool
2634 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2635 {
2636 struct brw_context *brw = brw_context(ctx);
2637 struct intel_context *intel = &brw->intel;
2638 struct brw_wm_prog_key key;
2639
2640 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2641 return true;
2642
2643 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2644 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2645 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2646 bool program_uses_dfdy = fp->UsesDFdy;
2647
2648 memset(&key, 0, sizeof(key));
2649
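   /* The key built here is a guess at the state this program is likely to
    * be used with; if the guess turns out wrong, the driver simply
    * compiles another variant of the program at draw time.
    */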
2650 if (intel->gen < 6) {
2651 if (fp->UsesKill)
2652 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2653
2654 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2655 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2656
2657 /* Just assume depth testing. */
2658 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2659 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2660 }
2661
2662 if (prog->Name != 0)
2663 key.proj_attrib_mask = 0xffffffff;
2664
2665 if (intel->gen < 6)
2666 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2667
2668 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2669 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2670 continue;
2671
2672 if (prog->Name == 0)
2673 key.proj_attrib_mask |= 1 << i;
2674
2675 if (intel->gen < 6) {
2676 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2677
2678 if (vp_index >= 0)
2679 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2680 }
2681 }
2682
2683 key.clamp_fragment_color = true;
2684
2685 for (int i = 0; i < MAX_SAMPLERS; i++) {
2686 if (fp->Base.ShadowSamplers & (1 << i)) {
2687 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2688 key.tex.swizzles[i] =
2689 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2690 } else {
2691 /* Color sampler: assume no swizzling. */
2692 key.tex.swizzles[i] = SWIZZLE_XYZW;
2693 }
2694 }
2695
2696 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2697 key.drawable_height = ctx->DrawBuffer->Height;
2698 }
2699
2700 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2701 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2702 }
2703
2704 key.nr_color_regions = 1;
2705
2706 key.program_string_id = bfp->id;
2707
2708 uint32_t old_prog_offset = brw->wm.prog_offset;
2709 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2710
2711 bool success = do_wm_prog(brw, prog, bfp, &key);
2712
2713 brw->wm.prog_offset = old_prog_offset;
2714 brw->wm.prog_data = old_prog_data;
2715
2716 return success;
2717 }