i965: Add asserts to check that we don't realloc ParameterValues.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
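
/* A sketch of typical CMP usage (illustrative, not taken from this file):
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(BRW_OPCODE_SEL, dst, a, b);
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 *
 * The comparison lands in the flag register, so the following SEL can be
 * predicated on it without ever reading the (mostly undefined) destination
 * channels of the CMP.
 */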

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}
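
/* Worked example for the offset handling above (hypothetical values):
 * loading array element 5 sends a dword offset of 5 on gen6/gen7, while
 * the gen4/5 message expects a dword-aligned byte offset, hence the MUL
 * by 4: 5 * 4 = 20 bytes.
 */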

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2ghz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}
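
/* Rollover arithmetic for the comment above: a 32-bit counter ticking at
 * ~1.2 GHz wraps after 2^32 / 1.2e9 ~= 3.6 seconds, hence the "every ~3
 * seconds" figure (the exact period varies with the current GPU clock).
 */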

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
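
/* Worked example: a SIMD16 SHADER_OPCODE_POW has dispatch_width == 16, so
 * the table above reports 2 * 16 / 8 = 4 MRFs written (two operands, each
 * spanning two registers in 16-wide mode); the SIMD8 version writes 2.
 */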

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of the 8-wide
 * dispatch.  This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
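
/* Slot-accounting sketch (hypothetical uniform): for a mat3, the matrix
 * case above recurses over three vec3 columns.  Each column bumps
 * nr_params by 3 (one per component) but returns 1, so the mat3 spans 3
 * ParameterValues slots while contributing 9 entries to param_index[]
 * and param_offset[].
 */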

/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}
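
/* Swizzle-walk example (illustrative): a state value swizzled SWIZZLE_XYZW
 * yields swiz = 0,1,2,3, so the j loop above adds four parameters; a scalar
 * replicated as SWIZZLE_XXXX repeats swiz = 0 on j == 1, breaks out, and
 * adds just one -- matching the layout of the value being filled in.
 */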

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}
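
/* Expansion example (a sketch): on gen7, emit_math() on SHADER_OPCODE_POW
 * with src1 = fs_reg(2.0f) can't feed the immediate to the math unit, so
 * fix_math_operand() emits MOV tmp, 2.0f and the POW reads tmp instead.
 */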

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
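
/* Operand-order example for the PRM quote above: INT_QUOTIENT computes
 * src0 / src1, so the denominator (src1) becomes operand 0 and the
 * numerator (src0) is the one copied into MRF base_mrf + 1 as operand 1 --
 * the swap performed by op0/op1 above.
 */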

/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's an FS-only attribute, and we did interpolation for this
       * attribute in the SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything larger than one register. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
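
/* Remap arithmetic sketch (hypothetical register numbers): a size-3
 * virtual GRF v split with new_virtual_grf[v] == n keeps reg_offset 0 in
 * v, while reg_offset 1 maps to register n and reg_offset 2 to n + 1 --
 * i.e. new reg = n + reg_offset - 1 with reg_offset reset to 0, exactly
 * what the rewrite loops above do.
 */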

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         /* We've already done setup_paramvalues_refs() so no need to worry
          * about param_index and param_offset.
          */
         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of the register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}
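
/* Threshold arithmetic for the cap above: 16 push registers * 8 floats per
 * register = 128 uniform components.  With, say, 200 params (hypothetical),
 * params 0..127 stay as push constants and params 128..199 are demoted to
 * the pull buffer, each later fetched via FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD.
 */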

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}
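
/* Example rewrites performed above (illustrative IR):
 *
 *    MUL dst, a, 1.0f  ->  MOV dst, a
 *    MUL dst, a, 0.0f  ->  MOV dst, 0.0f
 *    ADD dst, a, 0.0f  ->  MOV dst, a
 *
 * Only src[1] is checked for an immediate, so a constant sitting in
 * src[0] is not simplified here.
 */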

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something defined but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *   ^
       *   |
       *   v
       *         ^
       *         |
       *         v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
1877 */
1878 live_intervals_valid = false;
1879
1880 progress = true;
1881 continue;
1882 }
1883
1884 return progress;
1885 }
1886
1887 bool
1888 fs_visitor::register_coalesce()
1889 {
1890 bool progress = false;
1891 int if_depth = 0;
1892 int loop_depth = 0;
1893
1894 foreach_list_safe(node, &this->instructions) {
1895 fs_inst *inst = (fs_inst *)node;
1896
1897 /* Make sure that we dominate the instructions we're going to
1898 * scan for interfering with our coalescing, or we won't have
1899 * scanned enough to see if anything interferes with our
1900 * coalescing. We don't dominate the following instructions if
1901 * we're in a loop or an if block.
1902 */
1903 switch (inst->opcode) {
1904 case BRW_OPCODE_DO:
1905 loop_depth++;
1906 break;
1907 case BRW_OPCODE_WHILE:
1908 loop_depth--;
1909 break;
1910 case BRW_OPCODE_IF:
1911 if_depth++;
1912 break;
1913 case BRW_OPCODE_ENDIF:
1914 if_depth--;
1915 break;
1916 default:
1917 break;
1918 }
1919 if (loop_depth || if_depth)
1920 continue;
1921
1922 if (inst->opcode != BRW_OPCODE_MOV ||
1923 inst->predicate ||
1924 inst->saturate ||
1925 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1926 inst->src[0].file != UNIFORM)||
1927 inst->dst.type != inst->src[0].type)
1928 continue;
1929
1930 bool has_source_modifiers = (inst->src[0].abs ||
1931 inst->src[0].negate ||
1932 inst->src[0].file == UNIFORM);
1933
1934 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1935 * them: check for no writes to either one until the exit of the
1936 * program.
1937 */
1938 bool interfered = false;
1939
1940 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1941 !scan_inst->is_tail_sentinel();
1942 scan_inst = (fs_inst *)scan_inst->next) {
1943 if (scan_inst->dst.file == GRF) {
1944 if (scan_inst->overwrites_reg(inst->dst) ||
1945 scan_inst->overwrites_reg(inst->src[0])) {
1946 interfered = true;
1947 break;
1948 }
1949 }
1950
1951 /* The gen6 MATH instruction can't handle source modifiers or
1952 * unusual register regions, so avoid coalescing those for
1953 * now. We should do something more specific.
1954 */
1955 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1956 interfered = true;
1957 break;
1958 }
1959
1960 /* The accumulator result appears to get used for the
1961 * conditional modifier generation. When negating a UD
1962 * value, there is a 33rd bit generated for the sign in the
1963 * accumulator value, so now you can't check, for example,
1964 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1965 */
1966 if (scan_inst->conditional_mod &&
1967 inst->src[0].negate &&
1968 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1969 interfered = true;
1970 break;
1971 }
1972 }
1973 if (interfered) {
1974 continue;
1975 }
1976
1977 /* Rewrite the later usage to point at the source of the move to
1978 * be removed.
1979 */
1980 for (fs_inst *scan_inst = inst;
1981 !scan_inst->is_tail_sentinel();
1982 scan_inst = (fs_inst *)scan_inst->next) {
1983 for (int i = 0; i < 3; i++) {
1984 if (scan_inst->src[i].file == GRF &&
1985 scan_inst->src[i].reg == inst->dst.reg &&
1986 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1987 fs_reg new_src = inst->src[0];
1988 if (scan_inst->src[i].abs) {
1989 new_src.negate = 0;
1990 new_src.abs = 1;
1991 }
1992 new_src.negate ^= scan_inst->src[i].negate;
1993 scan_inst->src[i] = new_src;
1994 }
1995 }
1996 }
1997
1998 inst->remove();
1999 progress = true;
2000 }
2001
2002 if (progress)
2003 live_intervals_valid = false;
2004
2005 return progress;
2006 }
2007
2008
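/* A sketch of the transformation compute_to_mrf() performs (register
 * numbers are hypothetical):
 *
 *    mul vgrf4, vgrf2, vgrf3
 *    mov m2, vgrf4
 *
 * becomes, provided nothing reads vgrf4 afterwards:
 *
 *    mul m2, vgrf2, vgrf3
 */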
2009 bool
2010 fs_visitor::compute_to_mrf()
2011 {
2012 bool progress = false;
2013 int next_ip = 0;
2014
2015 calculate_live_intervals();
2016
2017 foreach_list_safe(node, &this->instructions) {
2018 fs_inst *inst = (fs_inst *)node;
2019
2020 int ip = next_ip;
2021 next_ip++;
2022
2023 if (inst->opcode != BRW_OPCODE_MOV ||
2024 inst->predicate ||
2025 inst->dst.file != MRF || inst->src[0].file != GRF ||
2026 inst->dst.type != inst->src[0].type ||
2027 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2028 continue;
2029
2030 /* Work out which hardware MRF registers are written by this
2031 * instruction.
2032 */
2033 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2034 int mrf_high;
2035 if (inst->dst.reg & BRW_MRF_COMPR4) {
2036 mrf_high = mrf_low + 4;
2037 } else if (dispatch_width == 16 &&
2038 (!inst->force_uncompressed && !inst->force_sechalf)) {
2039 mrf_high = mrf_low + 1;
2040 } else {
2041 mrf_high = mrf_low;
2042 }
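      /* With BRW_MRF_COMPR4 set, the second compressed half lands at
       * dst.reg + 4 rather than dst.reg + 1 (e.g. a COMPR4 write to m2
       * also writes m6), which is why mrf_high is mrf_low + 4 above.
       */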
2043
2044 /* Can't compute-to-MRF this GRF if someone else was going to
2045 * read it later.
2046 */
2047 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2048 continue;
2049
2050       /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
2051        * the instruction that produced this GRF to write into the MRF.
2052        */
2053 fs_inst *scan_inst;
2054 for (scan_inst = (fs_inst *)inst->prev;
2055 scan_inst->prev != NULL;
2056 scan_inst = (fs_inst *)scan_inst->prev) {
2057 if (scan_inst->dst.file == GRF &&
2058 scan_inst->dst.reg == inst->src[0].reg) {
2059          /* Found the last instruction to write the register we want
2060           * to turn into a compute-to-MRF.
2061           */
2062
2063 /* SENDs can only write to GRFs, so no compute-to-MRF. */
2064 if (scan_inst->mlen) {
2065 break;
2066 }
2067
2068 /* If it's predicated, it (probably) didn't populate all
2069 * the channels. We might be able to rewrite everything
2070 * that writes that reg, but it would require smarter
2071 * tracking to delay the rewriting until complete success.
2072 */
2073 if (scan_inst->predicate)
2074 break;
2075
2076 /* If it's half of register setup and not the same half as
2077 * our MOV we're trying to remove, bail for now.
2078 */
2079 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2080 scan_inst->force_sechalf != inst->force_sechalf) {
2081 break;
2082 }
2083
2088 if (intel->gen >= 6) {
2089             /* gen6 math instructions must have a GRF destination,
2090              * so no compute-to-MRF for them.
2091              */
2092 if (scan_inst->is_math()) {
2093 break;
2094 }
2095 }
2096
2097 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2098 /* Found the creator of our MRF's source value. */
2099 scan_inst->dst.file = MRF;
2100 scan_inst->dst.reg = inst->dst.reg;
2101 scan_inst->saturate |= inst->saturate;
2102 inst->remove();
2103 progress = true;
2104 }
2105 break;
2106 }
2107
2108          /* We don't handle flow control here.  Most values that end
2109           * up in MRFs are computed shortly before the MRF write
2110           * anyway.
2111           */
2112 if (scan_inst->opcode == BRW_OPCODE_DO ||
2113 scan_inst->opcode == BRW_OPCODE_WHILE ||
2114 scan_inst->opcode == BRW_OPCODE_ELSE ||
2115 scan_inst->opcode == BRW_OPCODE_ENDIF) {
2116 break;
2117 }
2118
2119 /* You can't read from an MRF, so if someone else reads our
2120 * MRF's source GRF that we wanted to rewrite, that stops us.
2121 */
2122 bool interfered = false;
2123 for (int i = 0; i < 3; i++) {
2124 if (scan_inst->src[i].file == GRF &&
2125 scan_inst->src[i].reg == inst->src[0].reg &&
2126 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2127 interfered = true;
2128 }
2129 }
2130 if (interfered)
2131 break;
2132
2133 if (scan_inst->dst.file == MRF) {
2134 /* If somebody else writes our MRF here, we can't
2135 * compute-to-MRF before that.
2136 */
2137 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2138 int scan_mrf_high;
2139
2140 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2141 scan_mrf_high = scan_mrf_low + 4;
2142 } else if (dispatch_width == 16 &&
2143 (!scan_inst->force_uncompressed &&
2144 !scan_inst->force_sechalf)) {
2145 scan_mrf_high = scan_mrf_low + 1;
2146 } else {
2147 scan_mrf_high = scan_mrf_low;
2148 }
2149
2150 if (mrf_low == scan_mrf_low ||
2151 mrf_low == scan_mrf_high ||
2152 mrf_high == scan_mrf_low ||
2153 mrf_high == scan_mrf_high) {
2154 break;
2155 }
2156 }
2157
2158 if (scan_inst->mlen > 0) {
2159 /* Found a SEND instruction, which means that there are
2160 * live values in MRFs from base_mrf to base_mrf +
2161 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2162 * above it.
2163 */
2164 if (mrf_low >= scan_inst->base_mrf &&
2165 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2166 break;
2167 }
2168 if (mrf_high >= scan_inst->base_mrf &&
2169 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2170 break;
2171 }
2172 }
2173 }
2174 }
2175
2176 if (progress)
2177 live_intervals_valid = false;
2178
2179 return progress;
2180 }
2181
2182 /**
2183 * Walks through basic blocks, looking for repeated MRF writes and
2184 * removing the later ones.
2185 */
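/* For example (hypothetical registers), the second MOV below is removed
 * because neither m3 nor vgrf8 is written in between:
 *
 *    mov m3, vgrf8
 *    ...
 *    mov m3, vgrf8
 */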
2186 bool
2187 fs_visitor::remove_duplicate_mrf_writes()
2188 {
2189 fs_inst *last_mrf_move[16];
2190 bool progress = false;
2191
2192    /* We'd need to update the MRF tracking for compressed instructions; bail for now. */
2193 if (dispatch_width == 16)
2194 return false;
2195
2196 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2197
2198 foreach_list_safe(node, &this->instructions) {
2199 fs_inst *inst = (fs_inst *)node;
2200
2201 switch (inst->opcode) {
2202 case BRW_OPCODE_DO:
2203 case BRW_OPCODE_WHILE:
2204 case BRW_OPCODE_IF:
2205 case BRW_OPCODE_ELSE:
2206 case BRW_OPCODE_ENDIF:
2207 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2208 continue;
2209 default:
2210 break;
2211 }
2212
2213 if (inst->opcode == BRW_OPCODE_MOV &&
2214 inst->dst.file == MRF) {
2215 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2216 if (prev_inst && inst->equals(prev_inst)) {
2217 inst->remove();
2218 progress = true;
2219 continue;
2220 }
2221 }
2222
2223 /* Clear out the last-write records for MRFs that were overwritten. */
2224 if (inst->dst.file == MRF) {
2225 last_mrf_move[inst->dst.reg] = NULL;
2226 }
2227
2228 if (inst->mlen > 0) {
2229 /* Found a SEND instruction, which will include two or fewer
2230 * implied MRF writes. We could do better here.
2231 */
2232 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2233 last_mrf_move[inst->base_mrf + i] = NULL;
2234 }
2235 }
2236
2237 /* Clear out any MRF move records whose sources got overwritten. */
2238 if (inst->dst.file == GRF) {
2239 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2240 if (last_mrf_move[i] &&
2241 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2242 last_mrf_move[i] = NULL;
2243 }
2244 }
2245 }
2246
2247 if (inst->opcode == BRW_OPCODE_MOV &&
2248 inst->dst.file == MRF &&
2249 inst->src[0].file == GRF &&
2250 !inst->predicate) {
2251 last_mrf_move[inst->dst.reg] = inst;
2252 }
2253 }
2254
2255 if (progress)
2256 live_intervals_valid = false;
2257
2258 return progress;
2259 }
2260
2261 void
2262 fs_visitor::dump_instruction(fs_inst *inst)
2263 {
2264 if (inst->predicate) {
2265 printf("(%cf0.%d) ",
2266 inst->predicate_inverse ? '-' : '+',
2267 inst->flag_subreg);
2268 }
2269
2270 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2271 opcode_descs[inst->opcode].name) {
2272 printf("%s", opcode_descs[inst->opcode].name);
2273 } else {
2274 printf("op%d", inst->opcode);
2275 }
2276 if (inst->saturate)
2277 printf(".sat");
2278 if (inst->conditional_mod) {
2279 printf(".cmod");
2280 if (!inst->predicate &&
2281 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2282 inst->opcode != BRW_OPCODE_IF &&
2283 inst->opcode != BRW_OPCODE_WHILE))) {
2284          printf(".f0.%d", inst->flag_subreg);
2285 }
2286 }
2287 printf(" ");
2288
2290 switch (inst->dst.file) {
2291 case GRF:
2292 printf("vgrf%d", inst->dst.reg);
2293 if (inst->dst.reg_offset)
2294 printf("+%d", inst->dst.reg_offset);
2295 break;
2296 case MRF:
2297 printf("m%d", inst->dst.reg);
2298 break;
2299 case BAD_FILE:
2300 printf("(null)");
2301 break;
2302 case UNIFORM:
2303 printf("***u%d***", inst->dst.reg);
2304 break;
2305 default:
2306 printf("???");
2307 break;
2308 }
2309 printf(", ");
2310
2311 for (int i = 0; i < 3; i++) {
2312 if (inst->src[i].negate)
2313 printf("-");
2314 if (inst->src[i].abs)
2315 printf("|");
2316 switch (inst->src[i].file) {
2317 case GRF:
2318 printf("vgrf%d", inst->src[i].reg);
2319 if (inst->src[i].reg_offset)
2320 printf("+%d", inst->src[i].reg_offset);
2321 break;
2322 case MRF:
2323 printf("***m%d***", inst->src[i].reg);
2324 break;
2325 case UNIFORM:
2326 printf("u%d", inst->src[i].reg);
2327 if (inst->src[i].reg_offset)
2328 printf(".%d", inst->src[i].reg_offset);
2329 break;
2330 case BAD_FILE:
2331 printf("(null)");
2332 break;
2333 default:
2334 printf("???");
2335 break;
2336 }
2337 if (inst->src[i].abs)
2338 printf("|");
2339
2340       if (i < 2)
2341 printf(", ");
2342 }
2343
2344 printf(" ");
2345
2346 if (inst->force_uncompressed)
2347 printf("1sthalf ");
2348
2349 if (inst->force_sechalf)
2350 printf("2ndhalf ");
2351
2352 printf("\n");
2353 }
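/* A sample line as printed by dump_instruction() above (the register
 * numbers are made up):
 *
 *    (+f0.0) add.sat vgrf7+1, vgrf5, u2.1, (null) 1sthalf
 */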
2354
2355 void
2356 fs_visitor::dump_instructions()
2357 {
2358 int ip = 0;
2359 foreach_list(node, &this->instructions) {
2360 fs_inst *inst = (fs_inst *)node;
2361 printf("%d: ", ip++);
2362 dump_instruction(inst);
2363 }
2364 }
2365
2366 /**
2367  * Possibly returns the instruction that set up \p reg.
2368 *
2369 * Sometimes we want to take the result of some expression/variable
2370 * dereference tree and rewrite the instruction generating the result
2371 * of the tree. When processing the tree, we know that the
2372 * instructions generated are all writing temporaries that are dead
2373 * outside of this tree. So, if we have some instructions that write
2374 * a temporary, we're free to point that temp write somewhere else.
2375 *
2376  * Note that this doesn't guarantee that the returned instruction wrote
2377  * only reg -- it might be the size=4 destination of a texture instruction.
2378 */
2379 fs_inst *
2380 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2381 fs_inst *end,
2382 fs_reg reg)
2383 {
2384 if (end == start ||
2385 end->predicate ||
2386 end->force_uncompressed ||
2387 end->force_sechalf ||
2388 reg.reladdr ||
2389 !reg.equals(end->dst)) {
2390 return NULL;
2391 } else {
2392 return end;
2393 }
2394 }
2395
2396 void
2397 fs_visitor::setup_payload_gen6()
2398 {
2399 struct intel_context *intel = &brw->intel;
2400 bool uses_depth =
2401 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2402 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2403
2404 assert(intel->gen >= 6);
2405
2406 /* R0-1: masks, pixel X/Y coordinates. */
2407 c->nr_payload_regs = 2;
2408    /* R2: only for 32-pixel dispatch. */
2409
2410 /* R3-26: barycentric interpolation coordinates. These appear in the
2411 * same order that they appear in the brw_wm_barycentric_interp_mode
2412 * enum. Each set of coordinates occupies 2 registers if dispatch width
2413 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2414 * appear if they were enabled using the "Barycentric Interpolation
2415 * Mode" bits in WM_STATE.
2416 */
2417 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2418 if (barycentric_interp_modes & (1 << i)) {
2419 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2420 c->nr_payload_regs += 2;
2421 if (dispatch_width == 16) {
2422 c->nr_payload_regs += 2;
2423 }
2424 }
2425 }
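   /* For example: in SIMD16 with exactly two barycentric modes enabled,
    * the loop above assigns payload regs 2-5 to the first set of
    * coordinates and 6-9 to the second, leaving c->nr_payload_regs == 10.
    */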
2426
2427 /* R27: interpolated depth if uses source depth */
2428 if (uses_depth) {
2429 c->source_depth_reg = c->nr_payload_regs;
2430 c->nr_payload_regs++;
2431 if (dispatch_width == 16) {
2432 /* R28: interpolated depth if not 8-wide. */
2433 c->nr_payload_regs++;
2434 }
2435 }
2436 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2437 if (uses_depth) {
2438 c->source_w_reg = c->nr_payload_regs;
2439 c->nr_payload_regs++;
2440 if (dispatch_width == 16) {
2441 /* R30: interpolated W if not 8-wide. */
2442 c->nr_payload_regs++;
2443 }
2444 }
2445 /* R31: MSAA position offsets. */
2446 /* R32-: bary for 32-pixel. */
2447 /* R58-59: interp W for 32-pixel. */
2448
2449 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2450 c->source_depth_to_render_target = true;
2451 }
2452 }
2453
2454 bool
2455 fs_visitor::run()
2456 {
2457 sanity_param_count = fp->Base.Parameters->NumParameters;
2458 uint32_t orig_nr_params = c->prog_data.nr_params;
2459
2460 if (intel->gen >= 6)
2461 setup_payload_gen6();
2462 else
2463 setup_payload_gen4();
2464
2465 if (0) {
2466 emit_dummy_fs();
2467 } else {
2468 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2469 emit_shader_time_begin();
2470
2471 calculate_urb_setup();
2472 if (intel->gen < 6)
2473 emit_interpolation_setup_gen4();
2474 else
2475 emit_interpolation_setup_gen6();
2476
2477 /* We handle discards by keeping track of the still-live pixels in f0.1.
2478 * Initialize it with the dispatched pixels.
2479 */
2480 if (fp->UsesKill) {
2481 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2482 discard_init->flag_subreg = 1;
2483 }
2484
2485       /* Generate FS IR for main().  (The visitor only descends into
2486        * functions called "main".)
2487        */
2488 if (shader) {
2489 foreach_list(node, &*shader->ir) {
2490 ir_instruction *ir = (ir_instruction *)node;
2491 base_ir = ir;
2492 this->result = reg_undef;
2493 ir->accept(this);
2494 }
2495 } else {
2496 emit_fragment_program_code();
2497 }
2498 base_ir = NULL;
2499 if (failed)
2500 return false;
2501
2502 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2503 emit_shader_time_end();
2504
2505 emit_fb_writes();
2506
2507 split_virtual_grfs();
2508
2509 setup_paramvalues_refs();
2510 move_uniform_array_access_to_pull_constants();
2511 setup_pull_constants();
2512
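      /* Run the passes below to a fixed point: each one can expose new
       * opportunities for the others (copy propagation, for instance,
       * tends to leave dead MOVs behind for dead_code_eliminate()).
       */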
2513 bool progress;
2514 do {
2515 progress = false;
2516
2517 compact_virtual_grfs();
2518
2519 progress = remove_duplicate_mrf_writes() || progress;
2520
2521 progress = opt_algebraic() || progress;
2522 progress = opt_cse() || progress;
2523 progress = opt_copy_propagate() || progress;
2524 progress = dead_code_eliminate() || progress;
2525 progress = register_coalesce() || progress;
2526 progress = register_coalesce_2() || progress;
2527 progress = compute_to_mrf() || progress;
2528 } while (progress);
2529
2530 remove_dead_constants();
2531
2532 schedule_instructions(false);
2533
2534 assign_curb_setup();
2535 assign_urb_setup();
2536
2537 if (0) {
2538 /* Debug of register spilling: Go spill everything. */
2539 for (int i = 0; i < virtual_grf_count; i++) {
2540 spill_reg(i);
2541 }
2542 }
2543
2544 if (0)
2545 assign_regs_trivial();
2546 else {
2547 while (!assign_regs()) {
2548 if (failed)
2549 break;
2550 }
2551 }
2552 }
2553 assert(force_uncompressed_stack == 0);
2554 assert(force_sechalf_stack == 0);
2555
2556 if (failed)
2557 return false;
2558
2559 schedule_instructions(true);
2560
2561 if (dispatch_width == 8) {
2562 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2563 } else {
2564 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2565
2566 /* Make sure we didn't try to sneak in an extra uniform */
2567 assert(orig_nr_params == c->prog_data.nr_params);
2568 (void) orig_nr_params;
2569 }
2570
2571 /* If any state parameters were appended, then ParameterValues could have
2572 * been realloced, in which case the driver uniform storage set up by
2573 * _mesa_associate_uniform_storage() would point to freed memory. Make
2574 * sure that didn't happen.
2575 */
2576 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2577
2578 return !failed;
2579 }
2580
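/* Compiles the fragment program to native code: always as an 8-wide
 * program, plus a 16-wide variant on gen5+ when no pull constants are in
 * use, and hands both instruction lists to fs_generator.
 */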
2581 const unsigned *
2582 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2583 struct gl_fragment_program *fp,
2584 struct gl_shader_program *prog,
2585 unsigned *final_assembly_size)
2586 {
2587 struct intel_context *intel = &brw->intel;
2588 bool start_busy = false;
2589 float start_time = 0;
2590
2591 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2592 start_busy = (intel->batch.last_bo &&
2593 drm_intel_bo_busy(intel->batch.last_bo));
2594 start_time = get_time();
2595 }
2596
2597 struct brw_shader *shader = NULL;
2598 if (prog)
2599 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2600
2601 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2602 if (shader) {
2603 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2604 _mesa_print_ir(shader->ir, NULL);
2605 printf("\n\n");
2606 } else {
2607 printf("ARB_fragment_program %d ir for native fragment shader\n",
2608 fp->Base.Id);
2609 _mesa_print_program(&fp->Base);
2610 }
2611 }
2612
2613 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2614 */
2615 fs_visitor v(brw, c, prog, fp, 8);
2616 if (!v.run()) {
2617 prog->LinkStatus = false;
2618 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2619
2620 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2621 v.fail_msg);
2622
2623 return NULL;
2624 }
2625
2626 exec_list *simd16_instructions = NULL;
2627 fs_visitor v2(brw, c, prog, fp, 16);
2628 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2629 v2.import_uniforms(&v);
2630 if (!v2.run()) {
2631 perf_debug("16-wide shader failed to compile, falling back to "
2632 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2633 } else {
2634 simd16_instructions = &v2.instructions;
2635 }
2636 }
2637
2638 c->prog_data.dispatch_width = 8;
2639
2640 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2641 const unsigned *generated = g.generate_assembly(&v.instructions,
2642 simd16_instructions,
2643 final_assembly_size);
2644
2645 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2646 if (shader->compiled_once)
2647 brw_wm_debug_recompile(brw, prog, &c->key);
2648 shader->compiled_once = true;
2649
2650 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2651 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2652 (get_time() - start_time) * 1000);
2653 }
2654 }
2655
2656 return generated;
2657 }
2658
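/* Precompiles the fragment program at link time using a guessed program
 * key (the guesses are noted inline below), so that the draw-time compile
 * in do_wm_prog() is likely to hit the program cache.
 */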
2659 bool
2660 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2661 {
2662 struct brw_context *brw = brw_context(ctx);
2663 struct intel_context *intel = &brw->intel;
2664 struct brw_wm_prog_key key;
2665
2666 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2667 return true;
2668
2669 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2670 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2671 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2672 bool program_uses_dfdy = fp->UsesDFdy;
2673
2674 memset(&key, 0, sizeof(key));
2675
2676 if (intel->gen < 6) {
2677 if (fp->UsesKill)
2678 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2679
2680 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2681 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2682
2683 /* Just assume depth testing. */
2684 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2685 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2686 }
2687
2688 if (prog->Name != 0)
2689 key.proj_attrib_mask = 0xffffffff;
2690
2691 if (intel->gen < 6)
2692 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2693
2694 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2695 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2696 continue;
2697
2698 if (prog->Name == 0)
2699 key.proj_attrib_mask |= 1 << i;
2700
2701 if (intel->gen < 6) {
2702 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2703
2704 if (vp_index >= 0)
2705 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2706 }
2707 }
2708
2709 key.clamp_fragment_color = true;
2710
2711 for (int i = 0; i < MAX_SAMPLERS; i++) {
2712 if (fp->Base.ShadowSamplers & (1 << i)) {
2713 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2714 key.tex.swizzles[i] =
2715 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2716 } else {
2717 /* Color sampler: assume no swizzling. */
2718 key.tex.swizzles[i] = SWIZZLE_XYZW;
2719 }
2720 }
2721
2722 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2723 key.drawable_height = ctx->DrawBuffer->Height;
2724 }
2725
2726 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2727 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2728 }
2729
2730 key.nr_color_regions = 1;
2731
2732 key.program_string_id = bfp->id;
2733
2734 uint32_t old_prog_offset = brw->wm.prog_offset;
2735 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2736
2737 bool success = do_wm_prog(brw, prog, bfp, &key);
2738
2739 brw->wm.prog_offset = old_prog_offset;
2740 brw->wm.prog_data = old_prog_data;
2741
2742 return success;
2743 }