i965/fs: Use the LRP instruction for ir_triop_lrp when possible.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

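/* All fs_inst constructors funnel through init(), which zeroes the
 * instruction and establishes the defaults: a NOP opcode, no conditional
 * mod, and undefined dst/src registers.
 */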
void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

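/* Convenience emitters: ALU1/ALU2/ALU3 each define an fs_visitor method
 * that allocates an fs_inst for the matching BRW opcode with one, two, or
 * three sources.
 */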
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

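/**
 * Returns how many virtual GRF registers this instruction's result
 * occupies: texture sends write four; everything else we currently
 * emit writes one.
 */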
int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_control_flow()
{
   switch (opcode) {
   case BRW_OPCODE_DO:
   case BRW_OPCODE_WHILE:
   case BRW_OPCODE_IF:
   case BRW_OPCODE_ELSE:
   case BRW_OPCODE_ENDIF:
   case BRW_OPCODE_BREAK:
   case BRW_OPCODE_CONTINUE:
      return true;
   default:
      return false;
   }
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

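/**
 * Allocates a new virtual GRF of the given size (in full registers),
 * growing the virtual_grf_sizes array as needed, and returns its index.
 */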
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

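/**
 * Maps registers in the UNIFORM file to the fixed payload registers that
 * hold the push constants (the CURB), and records how much payload space
 * the constants consume.
 */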
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

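/**
 * Decides which URB slot each incoming varying lands in, filling the
 * urb_setup[] table (-1 marks attributes with no incoming setup data).
 */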
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread.  So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything larger than 1 register in size. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

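/**
 * Removes push-constant params that are never referenced and renumbers the
 * survivors.  The remap table is built during the 8-wide compile and reused
 * by the 16-wide compile so both agree on the numbering.
 */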
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0 ; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = pull_index & 3;
      }
   }
}

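/**
 * Applies simple algebraic simplifications (a * 1.0 -> a, a * 0.0 -> 0.0,
 * a + 0.0 -> a) so later passes see fewer instructions.
 */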
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}

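/**
 * The main coalescing pass: removes a raw MOV by rewriting later reads of
 * its destination to read the source instead, provided neither register is
 * overwritten before the end of the program.
 */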
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].smear != -1 ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

2018 bool
2019 fs_visitor::compute_to_mrf()
2020 {
2021 bool progress = false;
2022 int next_ip = 0;
2023
2024 calculate_live_intervals();
2025
2026 foreach_list_safe(node, &this->instructions) {
2027 fs_inst *inst = (fs_inst *)node;
2028
2029 int ip = next_ip;
2030 next_ip++;
2031
2032 if (inst->opcode != BRW_OPCODE_MOV ||
2033 inst->predicate ||
2034 inst->dst.file != MRF || inst->src[0].file != GRF ||
2035 inst->dst.type != inst->src[0].type ||
2036 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2037 continue;
2038
2039 /* Work out which hardware MRF registers are written by this
2040 * instruction.
2041 */
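      /* The destination footprint spans at most two MRF registers, and we
       * track just the endpoints (restating the code below): a COMPR4 write
       * in SIMD16 lands in m and m+4, a plain SIMD16 write with neither
       * half forced lands in m and m+1, and everything else touches only m,
       * in which case mrf_high == mrf_low.
       */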
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* If it's predicated, it (probably) didn't populate all
             * the channels.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->predicate)
               break;

            /* If it's half of register setup and not the same half as
             * our MOV we're trying to remove, bail for now.
             */
            if (scan_inst->force_uncompressed != inst->force_uncompressed ||
                scan_inst->force_sechalf != inst->force_sechalf) {
               break;
            }

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (intel->gen == 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computation of
          * values that end up in MRFs is shortly before the MRF
          * write anyway.
          */
         if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
            break;

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
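/* Implementation sketch (restating the code that follows): last_mrf_move[]
 * remembers the most recent MOV into each MRF.  A later MOV that equals()
 * the remembered one is redundant and gets dropped.  Records are
 * invalidated at control flow, when the MRF is written again (including the
 * implied MRF writes of a SEND), or when the GRF a record sourced is
 * overwritten.
 */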
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* We'd need to update the MRF tracking to account for the double MRF
    * footprint of compressed instructions, so bail on 16-wide for now.
    */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->is_control_flow()) {
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicate) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

static void
clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
                        int first_grf, int grf_len)
{
   bool inst_16wide = (dispatch_width > 8 &&
                       !inst->force_uncompressed &&
                       !inst->force_sechalf);

   /* Clear the flag for registers that actually got read (as expected). */
   for (int i = 0; i < 3; i++) {
      int grf;
      if (inst->src[i].file == GRF) {
         grf = inst->src[i].reg;
      } else if (inst->src[i].file == FIXED_HW_REG &&
                 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
         grf = inst->src[i].fixed_hw_reg.nr;
      } else {
         continue;
      }

      if (grf >= first_grf &&
          grf < first_grf + grf_len) {
         deps[grf - first_grf] = false;
         /* A 16-wide source spans two registers, so clear the next one too. */
         if (inst_16wide)
            deps[grf - first_grf + 1] = false;
      }
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *   "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
 *    check for post destination dependencies on this instruction, software
 *    must ensure that there is no destination hazard for the case of ‘write
 *    followed by a posted write’ shown in the following example.
 *
 *    1. mov r3 0
 *    2. send r3.xy <rest of send instruction>
 *    3. mov r2 r3
 *
 *    Due to no post-destination dependency check on the ‘send’, the above
 *    code sequence could have two instructions (1 and 2) in flight at the
 *    same time that both consider ‘r3’ as the target of their final writes."
 */
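/* The strategy used below: for a SEND that writes GRFs, flag the registers
 * it will write, then walk backwards; any older instruction that wrote one
 * of those registers without a later read gets a dummy dependency-resolving
 * MOV (the DEP_RESOLVE_MOV helper used throughout this pass, defined
 * elsewhere in the driver) inserted before the SEND, forcing the
 * outstanding write to retire first.
 */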
void
fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
{
   int write_len = inst->regs_written() * dispatch_width / 8;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);

   clear_deps_for_inst_src(inst, dispatch_width,
                           needs_dep, first_write_grf, write_len);

   /* Walk backwards looking for writes to registers we're writing which
    * aren't read since being written.  If we hit the start of the program,
    * we assume that there are no outstanding dependencies on entry to the
    * program.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->prev;
        scan_inst != NULL;
        scan_inst = (fs_inst *)scan_inst->prev) {

      /* If we hit control flow, assume that there *are* outstanding
       * dependencies, and force their cleanup before our instruction.
       */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         /* Everything outstanding has now been resolved; scanning past the
          * control flow would only insert redundant resolves.
          */
         return;
      }

      bool scan_inst_16wide = (dispatch_width > 8 &&
                               !scan_inst->force_uncompressed &&
                               !scan_inst->force_sechalf);

      /* We insert our reads as late as possible on the assumption that any
       * instruction but a MOV that might have left us an outstanding
       * dependency has more latency than a MOV.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
         if (scan_inst_16wide)
            needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }
}

/**
 * Implements this workaround for the original 965:
 *
 *   "[DevBW, DevCL] Errata: A destination register from a send can not be
 *    used as a destination register until after it has been sourced by an
 *    instruction with a different destination register."
 */
void
fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
{
   int write_len = inst->regs_written() * dispatch_width / 8;
   int first_write_grf = inst->dst.reg;
   bool needs_dep[BRW_MAX_MRF];
   assert(write_len < (int)sizeof(needs_dep) - 1);

   memset(needs_dep, false, sizeof(needs_dep));
   memset(needs_dep, true, write_len);
   /* Walk forwards looking for writes to registers we're writing which aren't
    * read before being written.
    */
   for (fs_inst *scan_inst = (fs_inst *)inst->next;
        !scan_inst->is_tail_sentinel();
        scan_inst = (fs_inst *)scan_inst->next) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
         }
         /* All dependencies are resolved at this point, so stop scanning. */
         return;
      }

      /* Clear the flag for registers that actually got read (as expected). */
      clear_deps_for_inst_src(scan_inst, dispatch_width,
                              needs_dep, first_write_grf, write_len);

      /* We insert our reads as late as possible since they're reading the
       * result of a SEND, which has massive latency.
       */
      if (scan_inst->dst.file == GRF &&
          scan_inst->dst.reg >= first_write_grf &&
          scan_inst->dst.reg < first_write_grf + write_len &&
          needs_dep[scan_inst->dst.reg - first_write_grf]) {
         scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
         needs_dep[scan_inst->dst.reg - first_write_grf] = false;
      }

      /* Continue the loop only if we haven't resolved all the dependencies */
      int i;
      for (i = 0; i < write_len; i++) {
         if (needs_dep[i])
            break;
      }
      if (i == write_len)
         return;
   }

   /* If we hit the end of the program, resolve all remaining dependencies out
    * of paranoia.
    */
   fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
   assert(last_inst->eot);
   for (int i = 0; i < write_len; i++) {
      if (needs_dep[i])
         last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
   }
}

void
fs_visitor::insert_gen4_send_dependency_workarounds()
{
   if (intel->gen != 4 || intel->is_g4x)
      return;

   /* Note that we're done with register allocation, so GRF fs_regs always
    * have a .reg_offset of 0.
    */

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->mlen != 0 && inst->dst.file == GRF) {
         insert_gen4_pre_send_dependency_workarounds(inst);
         insert_gen4_post_send_dependency_workarounds(inst);
      }
   }
}

/**
 * Turns the generic expression-style uniform pull constant load instruction
 * into a hardware-specific series of instructions for loading a pull
 * constant.
 *
 * The expression style allows the CSE pass before this to optimize out
 * repeated loads from the same offset, and gives the pre-register-allocation
 * scheduling full flexibility, while the conversion to native instructions
 * allows the post-register-allocation scheduler the best information
 * possible.
 */
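/* Shape of the gen7 path below: build a one-register payload (a copy of g0
 * with the constant offset folded in via FS_OPCODE_SET_GLOBAL_OFFSET) and
 * retarget the load to the send-from-GRF opcode; pre-gen7 instead just
 * claims MRF 14 for the message.
 */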
void
fs_visitor::lower_uniform_pull_constant_loads()
{
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
         continue;

      if (intel->gen >= 7) {
         fs_reg const_offset_reg = inst->src[1];
         assert(const_offset_reg.file == IMM &&
                const_offset_reg.type == BRW_REGISTER_TYPE_UD);
         /* The immediate is a byte offset; the message wants it in 16-byte
          * units.
          */
         const_offset_reg.imm.u /= 16;
         fs_reg payload = fs_reg(this, glsl_type::uint_type);
         struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
                                    BRW_REGISTER_TYPE_UD);

         fs_inst *setup1 = MOV(payload, fs_reg(g0));
         setup1->force_writemask_all = true;
         /* We don't need the second half of this vgrf to be filled with g1
          * in the 16-wide case, but if we use force_uncompressed then live
          * variable analysis won't consider this a def!
          */

         fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
                                                payload, payload,
                                                const_offset_reg);

         setup1->ir = inst->ir;
         setup1->annotation = inst->annotation;
         inst->insert_before(setup1);
         setup2->ir = inst->ir;
         setup2->annotation = inst->annotation;
         inst->insert_before(setup2);
         inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
         inst->src[1] = payload;
      } else {
         /* Before register allocation, we didn't tell the scheduler about the
          * MRF we use.  We know it's safe to use this MRF because nothing
          * else does except for register spill/unspill, which generates and
          * uses its MRF within a single IR instruction.
          */
         inst->base_mrf = 14;
         inst->mlen = 1;
      }
   }
}

void
fs_visitor::dump_instruction(fs_inst *inst)
{
   if (inst->predicate) {
      printf("(%cf0.%d) ",
             inst->predicate_inverse ? '-' : '+',
             inst->flag_subreg);
   }

   if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
       opcode_descs[inst->opcode].name) {
      printf("%s", opcode_descs[inst->opcode].name);
   } else {
      switch (inst->opcode) {
      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
         printf("uniform_pull_const");
         break;
      case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
         printf("uniform_pull_const_gen7");
         break;
      case FS_OPCODE_SET_GLOBAL_OFFSET:
         printf("set_global_offset");
         break;
      default:
         printf("op%d", inst->opcode);
         break;
      }
   }
   if (inst->saturate)
      printf(".sat");
   if (inst->conditional_mod) {
      printf(".cmod");
      if (!inst->predicate &&
          (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
                              inst->opcode != BRW_OPCODE_IF &&
                              inst->opcode != BRW_OPCODE_WHILE))) {
         /* No newline here: the rest of the instruction follows on the
          * same line.
          */
         printf(".f0.%d", inst->flag_subreg);
      }
   }
   printf(" ");

   switch (inst->dst.file) {
   case GRF:
      printf("vgrf%d", inst->dst.reg);
      if (inst->dst.reg_offset)
         printf("+%d", inst->dst.reg_offset);
      break;
   case MRF:
      printf("m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   case UNIFORM:
      printf("***u%d***", inst->dst.reg);
      break;
   default:
      printf("???");
      break;
   }
   printf(", ");

   for (int i = 0; i < 3; i++) {
      if (inst->src[i].negate)
         printf("-");
      if (inst->src[i].abs)
         printf("|");
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf("+%d", inst->src[i].reg_offset);
         break;
      case MRF:
         printf("***m%d***", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf(".%d", inst->src[i].reg_offset);
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            printf("%ff", inst->src[i].imm.f);
            break;
         case BRW_REGISTER_TYPE_D:
            printf("%dd", inst->src[i].imm.i);
            break;
         case BRW_REGISTER_TYPE_UD:
            printf("%uu", inst->src[i].imm.u);
            break;
         default:
            printf("???");
            break;
         }
         break;
      default:
         printf("???");
         break;
      }
      if (inst->src[i].abs)
         printf("|");

      /* Separate the sources with ", ", but don't leave a trailing one. */
      if (i < 2)
         printf(", ");
   }

   printf(" ");

   if (inst->force_uncompressed)
      printf("1sthalf ");

   if (inst->force_sechalf)
      printf("2ndhalf ");

   printf("\n");
}

void
fs_visitor::dump_instructions()
{
   int ip = 0;
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;
      printf("%d: ", ip++);
      dump_instruction(inst);
   }
}

/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the returned instruction wrote
 * only reg -- it might be the size=4 destination of a texture
 * instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           fs_reg reg)
{
   if (end == start ||
       end->predicate ||
       end->force_uncompressed ||
       end->force_sechalf ||
       reg.reladdr ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}

bool
fs_visitor::run()
{
   sanity_param_count = fp->Base.Parameters->NumParameters;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else {
      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_begin();

      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* We handle discards by keeping track of the still-live pixels in f0.1.
       * Initialize it with the dispatched pixels.
       */
      if (fp->UsesKill) {
         fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
         discard_init->flag_subreg = 1;
      }

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      if (INTEL_DEBUG & DEBUG_SHADER_TIME)
         emit_shader_time_end();

      emit_fb_writes();

      split_virtual_grfs();

      move_uniform_array_access_to_pull_constants();
      setup_pull_constants();

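      /* The passes below run to a fixed point: each one can expose new
       * opportunities for the others (e.g. copy propagation creating dead
       * code, coalescing enabling further propagation), so we loop until a
       * full round makes no progress.
       */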
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions(false);

      lower_uniform_pull_constant_loads();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   /* This must come after all optimization and register allocation, since
    * it inserts dead code that happens to have side effects, and it does
    * so based on the actual physical registers in use.
    */
   insert_gen4_send_dependency_workarounds();

   if (failed)
      return false;

   schedule_instructions(true);

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == fp->Base.Parameters->NumParameters);

   return !failed;
}

const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   bool no16 = INTEL_DEBUG & DEBUG_NO16;
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}

bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

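   /* Precompile builds a guess at the non-orthogonal state the program is
    * most likely to be used with (see the "assume"s below), compiles with
    * that key, and restores the previous program pointers afterwards, so
    * that with luck a matching program is already cached at draw time.
    */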
   if (intel->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (prog->Name != 0)
      key.proj_attrib_mask = 0xffffffff;

   if (intel->gen < 6)
      key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);

   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
         continue;

      if (prog->Name == 0)
         key.proj_attrib_mask |= 1 << i;

      if (intel->gen < 6) {
         int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

         if (vp_index >= 0)
            key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
      }
   }

   key.clamp_fragment_color = true;

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}