i965: add a new virtual opcode: SHADER_OPCODE_TXF_MS
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

#define ALU3(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2)    \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
   }

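/* Illustrative note (not in the original): each ALUn(op) invocation below
 * expands to a small emit helper.  For example, ALU2(ADD) produces:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */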
ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
ALU3(LRP)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
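
/* Usage sketch (illustrative, not from the original): a common pattern in
 * this file is a CMP into the null register purely for its flag write,
 * followed by a predicated IF, e.g. for some fs_reg x:
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 */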

exec_list
fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
                                       fs_reg offset)
{
   exec_list instructions;
   fs_inst *inst;

   if (intel->gen >= 7) {
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
                                  dst, surf_index, offset);
      instructions.push_tail(inst);
   } else {
      int base_mrf = 13;
      bool header_present = true;

      fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
      mrf.type = BRW_REGISTER_TYPE_D;

      /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
       * dword-aligned byte offset.
       */
      if (intel->gen == 6) {
         instructions.push_tail(MOV(mrf, offset));
      } else {
         instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
      }
      inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
                                  dst, surf_index);
      inst->header_present = header_present;
      inst->base_mrf = base_mrf;
      inst->mlen = header_present + dispatch_width / 8;

      instructions.push_tail(inst);
   }

   return instructions;
}

/**
 * A helper for MOV generation for fixing up broken hardware SEND dependency
 * handling.
 */
fs_inst *
fs_visitor::DEP_RESOLVE_MOV(int grf)
{
   fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));

   inst->ir = NULL;
   inst->annotation = "send dependency resolve";

   /* The caller always wants uncompressed to emit the minimal extra
    * dependencies, and to avoid having to deal with aligning its regs to 2.
    */
   inst->force_uncompressed = true;

   return inst;
}

bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXF_MS ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
fs_inst::is_control_flow()
{
   switch (opcode) {
   case BRW_OPCODE_DO:
   case BRW_OPCODE_WHILE:
   case BRW_OPCODE_IF:
   case BRW_OPCODE_ELSE:
   case BRW_OPCODE_ENDIF:
   case BRW_OPCODE_BREAK:
   case BRW_OPCODE_CONTINUE:
      return true;
   default:
      return false;
   }
}

bool
fs_inst::is_send_from_grf()
{
   return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
           (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
            src[1].file == GRF));
}

bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}
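
/* Note (added for clarity): the three overloads above mean the literal type
 * of an argument picks the immediate's register type -- fs_reg(0.5f) is a
 * float immediate, fs_reg(4) a signed dword, and fs_reg(1u) an unsigned
 * dword -- a distinction relied on throughout this file.
 */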

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           !reladdr && !r.reladdr &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

bool
fs_reg::is_zero() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
}

bool
fs_reg::is_one() const
{
   if (file != IMM)
      return false;

   return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
}

int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   case GLSL_TYPE_VOID:
   case GLSL_TYPE_ERROR:
   case GLSL_TYPE_INTERFACE:
      assert(!"not reached");
      break;
   }

   return 0;
}
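
/* Worked examples (illustrative): by the rules above, a float or bool is 1
 * slot, a vec4 is 4, a mat3 is 9 (three vec3 columns), and "mat3 m[2]" is
 * 2 * 9 = 18 slots.
 */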

fs_reg
fs_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                          BRW_ARF_TIMESTAMP,
                                          0),
                             BRW_REGISTER_TYPE_UD));

   fs_reg dst = fs_reg(this, glsl_type::uint_type);

   fs_inst *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;
   mov->force_uncompressed = true;

   /* The caller wants the low 32 bits of the timestamp.  Since it's running
    * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
    * which is plenty of time for our purposes.  It is identical across the
    * EUs, but since it's tracking GPU core speed it will increment at a
    * varying rate as render P-states change.
    *
    * The caller could also check if render P-states have changed (or anything
    * else that might disrupt timing) by setting smear to 2 and checking if
    * that field is != 0.
    */
   dst.smear = 0;

   return dst;
}

void
fs_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
fs_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";

   enum shader_time_shader_type type, written_type, reset_type;
   if (dispatch_width == 8) {
      type = ST_FS8;
      written_type = ST_FS8_WRITTEN;
      reset_type = ST_FS8_RESET;
   } else {
      assert(dispatch_width == 16);
      type = ST_FS16;
      written_type = ST_FS16_WRITTEN;
      reset_type = ST_FS16_RESET;
   }

   fs_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   fs_reg reset = shader_end_time;
   reset.smear = 2;
   fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
   emit(IF(BRW_PREDICATE_NORMAL));

   push_force_uncompressed();
   fs_reg start = shader_start_time;
   start.negate = true;
   fs_reg diff = fs_reg(this, glsl_type::uint_type);
   emit(ADD(diff, start, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, diff, fs_reg(-2u)));

   emit_shader_time_write(type, diff);
   emit_shader_time_write(written_type, fs_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(reset_type, fs_reg(1u));
   emit(BRW_OPCODE_ENDIF);

   pop_force_uncompressed();
}

void
fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                   fs_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index <= brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

   fs_reg offset_mrf = fs_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, fs_reg(shader_time_index * 4)));

   fs_reg time_mrf = fs_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, value));

   fs_inst *inst = emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD));
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}
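
/* Message layout note (descriptive, added for clarity): the SHADER_TIME_ADD
 * send above consumes two MRFs -- m6 carries the buffer offset
 * (shader_time_index * 4) and m7 the value to accumulate -- matching the
 * mlen = 2 set on the instruction.
 */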

void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXF_MS:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
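
/* For example (illustrative): by the table above, a 16-wide
 * SHADER_OPCODE_POW counts 2 * 16 / 8 = 4 MRFs, while the sampler opcodes
 * count only 1 since their coordinate payloads are loaded with explicit
 * MOVs beforehand.
 */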

int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
void
fs_visitor::setup_uniform_values(ir_variable *ir)
{
   int namelen = strlen(ir->name);

   /* The data for our (non-builtin) uniforms is stored in a series of
    * gl_uniform_driver_storage structs for each subcomponent that
    * glGetUniformLocation() could name.  We know it's been set up in the same
    * order we'd walk the type, so walk the list of storage and find anything
    * with our name, or the prefix of a component that starts with our name.
    */
   unsigned params_before = c->prog_data.nr_params;
   for (unsigned u = 0; u < prog->NumUserUniformStorage; u++) {
      struct gl_uniform_storage *storage = &prog->UniformStorage[u];

      if (strncmp(ir->name, storage->name, namelen) != 0 ||
          (storage->name[namelen] != 0 &&
           storage->name[namelen] != '.' &&
           storage->name[namelen] != '[')) {
         continue;
      }

      unsigned slots = storage->type->component_slots();
      if (storage->array_elements)
         slots *= storage->array_elements;

      for (unsigned i = 0; i < slots; i++) {
         c->prog_data.param[c->prog_data.nr_params++] =
            &storage->storage[i].f;
      }
   }

   /* Make sure we actually initialized the right amount of stuff here. */
   assert(params_before + ir->type->component_slots() ==
          c->prog_data.nr_params);
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
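      /* Example (added, illustrative): a state var swizzled .xyzw adds four
       * parameters here, while one swizzled .xxxx adds just one, since the
       * loop below stops as soon as a swizzle component repeats.
       */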
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param[c->prog_data.nr_params++] =
            &fp->Base.Parameters->ParameterValues[index][swiz].f;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
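      /* Added note (assumed from the shifts below): the payload dword g0.0:D
       * carries a back-facing flag in bit 15.  The ASR by 15 brings that bit
       * down to bit 0, NOT inverts it, and AND with 1 discards the rest,
       * leaving 1 for front-facing and 0 for back-facing.
       */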
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

fs_reg
fs_visitor::fix_math_operand(fs_reg src)
{
   /* Can't do hstride == 0 args on gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && src.file != UNIFORM && src.file != IMM &&
       !src.abs && !src.negate)
      return src;

   /* Gen7 relaxes most of the above restrictions, but still can't use IMM
    * operands to math
    */
   if (intel->gen >= 7 && src.file != IMM)
      return src;

   fs_reg expanded = fs_reg(this, glsl_type::float_type);
   expanded.type = src.type;
   emit(BRW_OPCODE_MOV, expanded, src);
   return expanded;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6)
      src = fix_math_operand(src);

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      if (intel->gen >= 7 && dispatch_width == 16)
         fail("16-wide INTDIV unsupported\n");
      break;
   case SHADER_OPCODE_POW:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 6) {
      src0 = fix_math_operand(src0);
      src1 = fix_math_operand(src1);

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
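
/* Sketch of the effect (illustrative): a size-4 virtual GRF keeps its
 * original number for accesses at reg_offset 0, while accesses at
 * reg_offsets 1..3 are remapped to three freshly allocated size-1 GRFs, so
 * the register allocator no longer has to find four contiguous hardware
 * registers for it.
 */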

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/*
 * Implements array access of uniforms by inserting a
 * PULL_CONSTANT_LOAD instruction.
 *
 * Unlike temporary GRF array access (where we don't support it due to
 * the difficulty of doing relative addressing on instruction
 * destinations), we could potentially do array access of uniforms
 * that were loaded in GRF space as push constants.  In real-world
 * usage we've seen, though, the arrays being used are always larger
 * than we could load as push constants, so just always move all
 * uniform array access out to a pull constant buffer.
 */
void
fs_visitor::move_uniform_array_access_to_pull_constants()
{
   int pull_constant_loc[c->prog_data.nr_params];

   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      pull_constant_loc[i] = -1;
   }

   /* Walk through and find array access of uniforms.  Put a copy of that
    * uniform in the pull constant buffer.
    *
    * Note that we don't move constant-indexed accesses to arrays.  No
    * testing has been done of the performance impact of this choice.
    */
   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
            continue;

         int uniform = inst->src[i].reg;

         /* If this array isn't already present in the pull constant buffer,
          * add it.
          */
         if (pull_constant_loc[uniform] == -1) {
            const float **values = &c->prog_data.param[uniform];

            pull_constant_loc[uniform] = c->prog_data.nr_pull_params;

            assert(param_size[uniform]);

            for (int j = 0; j < param_size[uniform]; j++) {
               c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
                  values[j];
            }
         }

         /* Set up the annotation tracking for new generated instructions. */
         base_ir = inst->ir;
         current_annotation = inst->annotation;

         fs_reg offset = fs_reg(this, glsl_type::int_type);
         inst->insert_before(ADD(offset, *inst->src[i].reladdr,
                                 fs_reg(pull_constant_loc[uniform] +
                                        inst->src[i].reg_offset)));

         fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg temp = fs_reg(this, glsl_type::float_type);
         exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
                                                     surf_index, offset);
         inst->insert_before(&list);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   unsigned int pull_uniform_base = max_uniform_components;

   int pull_constant_loc[c->prog_data.nr_params];
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      if (i < pull_uniform_base) {
         pull_constant_loc[i] = -1;
      } else {
         pull_constant_loc[i] = -1;
         /* If our constant is already being uploaded for reladdr purposes,
          * reuse it.
          */
         for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
            if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
               pull_constant_loc[i] = j;
               break;
            }
         }
         if (pull_constant_loc[i] == -1) {
            int pull_index = c->prog_data.nr_pull_params++;
            c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
            pull_constant_loc[i] = pull_index;
         }
      }
   }
   c->prog_data.nr_params = pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int pull_index = pull_constant_loc[inst->src[i].reg +
                                            inst->src[i].reg_offset];
         if (pull_index == -1)
            continue;

         assert(!inst->src[i].reladdr);

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
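         /* Added note: the load above fetched an aligned vec4 (its offset
          * was rounded down with & ~15), so smear selects which of the four
          * components this use actually reads.
          */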
         inst->src[i].smear = pull_index & 3;
      }
   }
}

bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = inst->src[1];
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something deffed but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();

      /* We don't need to recalculate live intervals inside the loop despite
       * flagging live_intervals_valid because we only use live intervals for
       * the interferes test, and we must have had a situation where the
       * intervals were:
       *
       *  from  to
       *  ^
       *  |
       *  v
       *        ^
       *        |
       *        v
       *
       * Some register R that might get coalesced with one of these two could
       * only be referencing "to", otherwise "from"'s range would have been
       * longer.  R's range could also only start at the end of "to" or later,
       * otherwise it will conflict with "to" when we try to coalesce "to"
       * into R anyway.
       */
      live_intervals_valid = false;

      progress = true;
      continue;
   }

   return progress;
}
1896
1897 bool
1898 fs_visitor::register_coalesce()
1899 {
1900 bool progress = false;
1901 int if_depth = 0;
1902 int loop_depth = 0;
1903
1904 foreach_list_safe(node, &this->instructions) {
1905 fs_inst *inst = (fs_inst *)node;
1906
1907 /* Make sure that we dominate the instructions we're going to
1908 * scan for interfering with our coalescing, or we won't have
1909 * scanned enough to see if anything interferes with our
1910 * coalescing. We don't dominate the following instructions if
1911 * we're in a loop or an if block.
1912 */
1913 switch (inst->opcode) {
1914 case BRW_OPCODE_DO:
1915 loop_depth++;
1916 break;
1917 case BRW_OPCODE_WHILE:
1918 loop_depth--;
1919 break;
1920 case BRW_OPCODE_IF:
1921 if_depth++;
1922 break;
1923 case BRW_OPCODE_ENDIF:
1924 if_depth--;
1925 break;
1926 default:
1927 break;
1928 }
1929 if (loop_depth || if_depth)
1930 continue;
1931
1932 if (inst->opcode != BRW_OPCODE_MOV ||
1933 inst->predicate ||
1934 inst->saturate ||
1935 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1936 inst->src[0].file != UNIFORM)||
1937 inst->dst.type != inst->src[0].type)
1938 continue;
1939
1940 bool has_source_modifiers = (inst->src[0].abs ||
1941 inst->src[0].negate ||
1942 inst->src[0].smear != -1 ||
1943 inst->src[0].file == UNIFORM);
1944
1945 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1946 * them: check for no writes to either one until the exit of the
1947 * program.
1948 */
1949 bool interfered = false;
1950
1951 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1952 !scan_inst->is_tail_sentinel();
1953 scan_inst = (fs_inst *)scan_inst->next) {
1954 if (scan_inst->dst.file == GRF) {
1955 if (scan_inst->overwrites_reg(inst->dst) ||
1956 scan_inst->overwrites_reg(inst->src[0])) {
1957 interfered = true;
1958 break;
1959 }
1960 }
1961
1962 /* The gen6 MATH instruction can't handle source modifiers or
1963 * unusual register regions, so avoid coalescing those for
1964 * now. We should do something more specific.
1965 */
1966 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1967 interfered = true;
1968 break;
1969 }
1970
1971 /* The accumulator result appears to get used for the
1972 * conditional modifier generation. When negating a UD
1973 * value, there is a 33rd bit generated for the sign in the
1974 * accumulator value, so now you can't check, for example,
1975 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1976 */
1977 if (scan_inst->conditional_mod &&
1978 inst->src[0].negate &&
1979 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1980 interfered = true;
1981 break;
1982 }
1983 }
1984 if (interfered) {
1985 continue;
1986 }
1987
1988 /* Rewrite the later usage to point at the source of the move to
1989 * be removed.
1990 */
1991 for (fs_inst *scan_inst = inst;
1992 !scan_inst->is_tail_sentinel();
1993 scan_inst = (fs_inst *)scan_inst->next) {
1994 for (int i = 0; i < 3; i++) {
1995 if (scan_inst->src[i].file == GRF &&
1996 scan_inst->src[i].reg == inst->dst.reg &&
1997 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1998 fs_reg new_src = inst->src[0];
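               /* Compose the source modifiers: taking the absolute value
                * discards any negation already on the coalesced source,
                * and negations otherwise accumulate (hence the XOR below).
                */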
1999 if (scan_inst->src[i].abs) {
2000 new_src.negate = 0;
2001 new_src.abs = 1;
2002 }
2003 new_src.negate ^= scan_inst->src[i].negate;
2004 scan_inst->src[i] = new_src;
2005 }
2006 }
2007 }
2008
2009 inst->remove();
2010 progress = true;
2011 }
2012
2013 if (progress)
2014 live_intervals_valid = false;
2015
2016 return progress;
2017 }
2018
2019
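/**
 * Tries to fold a GRF-to-MRF MOV back into the instruction that computed
 * the GRF value, making that instruction write the MRF directly.  An
 * illustrative sequence (with made-up register numbers):
 *
 *    add vgrf4, vgrf2, vgrf3
 *    mov m1, vgrf4             -> add m1, vgrf2, vgrf3
 */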
2020 bool
2021 fs_visitor::compute_to_mrf()
2022 {
2023 bool progress = false;
2024 int next_ip = 0;
2025
2026 calculate_live_intervals();
2027
2028 foreach_list_safe(node, &this->instructions) {
2029 fs_inst *inst = (fs_inst *)node;
2030
2031 int ip = next_ip;
2032 next_ip++;
2033
2034 if (inst->opcode != BRW_OPCODE_MOV ||
2035 inst->predicate ||
2036 inst->dst.file != MRF || inst->src[0].file != GRF ||
2037 inst->dst.type != inst->src[0].type ||
2038 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2039 continue;
2040
2041 /* Work out which hardware MRF registers are written by this
2042 * instruction.
2043 */
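      /* A COMPR4 destination writes both reg and reg + 4, a plain 16-wide
       * write covers two consecutive MRFs, and everything else touches a
       * single MRF.
       */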
2044 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2045 int mrf_high;
2046 if (inst->dst.reg & BRW_MRF_COMPR4) {
2047 mrf_high = mrf_low + 4;
2048 } else if (dispatch_width == 16 &&
2049 (!inst->force_uncompressed && !inst->force_sechalf)) {
2050 mrf_high = mrf_low + 1;
2051 } else {
2052 mrf_high = mrf_low;
2053 }
2054
2055 /* Can't compute-to-MRF this GRF if someone else was going to
2056 * read it later.
2057 */
2058 if (this->virtual_grf_use[inst->src[0].reg] > ip)
2059 continue;
2060
      /* Found a move of a GRF to an MRF.  Let's see if we can rewrite
       * the instruction that computed the GRF value to write into the
       * MRF directly.
       */
2064 fs_inst *scan_inst;
2065 for (scan_inst = (fs_inst *)inst->prev;
2066 scan_inst->prev != NULL;
2067 scan_inst = (fs_inst *)scan_inst->prev) {
2068 if (scan_inst->dst.file == GRF &&
2069 scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last instruction to write the register we want
             * to turn into a compute-to-MRF.
             */
2073
2074 /* If it's predicated, it (probably) didn't populate all
2075 * the channels. We might be able to rewrite everything
2076 * that writes that reg, but it would require smarter
2077 * tracking to delay the rewriting until complete success.
2078 */
2079 if (scan_inst->predicate)
2080 break;
2081
            /* If it writes only half of the register, and it's not the
             * same half as the MOV we're trying to remove, bail for now.
             */
2085 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
2086 scan_inst->force_sechalf != inst->force_sechalf) {
2087 break;
2088 }
2089
2090 /* SEND instructions can't have MRF as a destination. */
2091 if (scan_inst->mlen)
2092 break;
2093
2094 if (intel->gen == 6) {
2095 /* gen6 math instructions must have the destination be
2096 * GRF, so no compute-to-MRF for them.
2097 */
2098 if (scan_inst->is_math()) {
2099 break;
2100 }
2101 }
2102
2103 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2104 /* Found the creator of our MRF's source value. */
2105 scan_inst->dst.file = MRF;
2106 scan_inst->dst.reg = inst->dst.reg;
2107 scan_inst->saturate |= inst->saturate;
2108 inst->remove();
2109 progress = true;
2110 }
2111 break;
2112 }
2113
         /* We don't handle control flow here.  Most computation of values
          * that end up in MRFs happens shortly before the MRF write anyway.
2117 */
2118 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2119 break;
2120
2121 /* You can't read from an MRF, so if someone else reads our
2122 * MRF's source GRF that we wanted to rewrite, that stops us.
2123 */
2124 bool interfered = false;
2125 for (int i = 0; i < 3; i++) {
2126 if (scan_inst->src[i].file == GRF &&
2127 scan_inst->src[i].reg == inst->src[0].reg &&
2128 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2129 interfered = true;
2130 }
2131 }
2132 if (interfered)
2133 break;
2134
2135 if (scan_inst->dst.file == MRF) {
2136 /* If somebody else writes our MRF here, we can't
2137 * compute-to-MRF before that.
2138 */
2139 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2140 int scan_mrf_high;
2141
2142 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2143 scan_mrf_high = scan_mrf_low + 4;
2144 } else if (dispatch_width == 16 &&
2145 (!scan_inst->force_uncompressed &&
2146 !scan_inst->force_sechalf)) {
2147 scan_mrf_high = scan_mrf_low + 1;
2148 } else {
2149 scan_mrf_high = scan_mrf_low;
2150 }
2151
2152 if (mrf_low == scan_mrf_low ||
2153 mrf_low == scan_mrf_high ||
2154 mrf_high == scan_mrf_low ||
2155 mrf_high == scan_mrf_high) {
2156 break;
2157 }
2158 }
2159
2160 if (scan_inst->mlen > 0) {
2161 /* Found a SEND instruction, which means that there are
2162 * live values in MRFs from base_mrf to base_mrf +
2163 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2164 * above it.
2165 */
2166 if (mrf_low >= scan_inst->base_mrf &&
2167 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2168 break;
2169 }
2170 if (mrf_high >= scan_inst->base_mrf &&
2171 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2172 break;
2173 }
2174 }
2175 }
2176 }
2177
2178 if (progress)
2179 live_intervals_valid = false;
2180
2181 return progress;
2182 }
2183
2184 /**
2185 * Walks through basic blocks, looking for repeated MRF writes and
2186 * removing the later ones.
2187 */
2188 bool
2189 fs_visitor::remove_duplicate_mrf_writes()
2190 {
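   /* For each MRF, the last unpredicated GRF-to-MRF MOV seen in the current
    * basic block, or NULL when that MRF's contents are unknown.
    */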
2191 fs_inst *last_mrf_move[16];
2192 bool progress = false;
2193
   /* The MRF write tracking below doesn't handle compressed instructions
    * yet, so bail in the 16-wide case.
    */
2195 if (dispatch_width == 16)
2196 return false;
2197
2198 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2199
2200 foreach_list_safe(node, &this->instructions) {
2201 fs_inst *inst = (fs_inst *)node;
2202
2203 if (inst->is_control_flow()) {
2204 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2205 }
2206
2207 if (inst->opcode == BRW_OPCODE_MOV &&
2208 inst->dst.file == MRF) {
2209 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2210 if (prev_inst && inst->equals(prev_inst)) {
2211 inst->remove();
2212 progress = true;
2213 continue;
2214 }
2215 }
2216
2217 /* Clear out the last-write records for MRFs that were overwritten. */
2218 if (inst->dst.file == MRF) {
2219 last_mrf_move[inst->dst.reg] = NULL;
2220 }
2221
2222 if (inst->mlen > 0) {
         /* Found a SEND instruction, whose generated code can implicitly
          * write message MRFs that never appear as IR destinations, so
          * conservatively drop those records.  We could do better here.
          */
2226 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2227 last_mrf_move[inst->base_mrf + i] = NULL;
2228 }
2229 }
2230
2231 /* Clear out any MRF move records whose sources got overwritten. */
2232 if (inst->dst.file == GRF) {
2233 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2234 if (last_mrf_move[i] &&
2235 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2236 last_mrf_move[i] = NULL;
2237 }
2238 }
2239 }
2240
2241 if (inst->opcode == BRW_OPCODE_MOV &&
2242 inst->dst.file == MRF &&
2243 inst->src[0].file == GRF &&
2244 !inst->predicate) {
2245 last_mrf_move[inst->dst.reg] = inst;
2246 }
2247 }
2248
2249 if (progress)
2250 live_intervals_valid = false;
2251
2252 return progress;
2253 }
2254
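/**
 * For each GRF read by the instruction that falls within [first_grf,
 * first_grf + grf_len), clears the corresponding entry in deps: a read
 * means any outstanding write to that register has been consumed.
 */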
2255 static void
2256 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2257 int first_grf, int grf_len)
2258 {
2259 bool inst_16wide = (dispatch_width > 8 &&
2260 !inst->force_uncompressed &&
2261 !inst->force_sechalf);
2262
2263 /* Clear the flag for registers that actually got read (as expected). */
2264 for (int i = 0; i < 3; i++) {
2265 int grf;
2266 if (inst->src[i].file == GRF) {
2267 grf = inst->src[i].reg;
2268 } else if (inst->src[i].file == FIXED_HW_REG &&
2269 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2270 grf = inst->src[i].fixed_hw_reg.nr;
2271 } else {
2272 continue;
2273 }
2274
2275 if (grf >= first_grf &&
2276 grf < first_grf + grf_len) {
2277 deps[grf - first_grf] = false;
2278 if (inst_16wide)
2279 deps[grf - first_grf + 1] = false;
2280 }
2281 }
2282 }
2283
2284 /**
2285 * Implements this workaround for the original 965:
2286 *
2287 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2288 * check for post destination dependencies on this instruction, software
2289 * must ensure that there is no destination hazard for the case of ‘write
2290 * followed by a posted write’ shown in the following example.
2291 *
2292 * 1. mov r3 0
2293 * 2. send r3.xy <rest of send instruction>
2294 * 3. mov r2 r3
2295 *
2296 * Due to no post-destination dependency check on the ‘send’, the above
2297 * code sequence could have two instructions (1 and 2) in flight at the
 *    same time that both consider ‘r3’ as the target of their final writes."
2299 */
2300 void
2301 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2302 {
2303 int write_len = inst->regs_written() * dispatch_width / 8;
2304 int first_write_grf = inst->dst.reg;
2305 bool needs_dep[BRW_MAX_MRF];
2306 assert(write_len < (int)sizeof(needs_dep) - 1);
2307
2308 memset(needs_dep, false, sizeof(needs_dep));
2309 memset(needs_dep, true, write_len);
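   /* needs_dep[i] means we still have to prove (or force with a resolve
    * MOV) that no earlier write to first_write_grf + i is left outstanding
    * at the point our SEND posts its write.
    */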
2310
2311 clear_deps_for_inst_src(inst, dispatch_width,
2312 needs_dep, first_write_grf, write_len);
2313
2314 /* Walk backwards looking for writes to registers we're writing which
2315 * aren't read since being written. If we hit the start of the program,
2316 * we assume that there are no outstanding dependencies on entry to the
2317 * program.
2318 */
2319 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2320 scan_inst != NULL;
2321 scan_inst = (fs_inst *)scan_inst->prev) {
2322
2323 /* If we hit control flow, assume that there *are* outstanding
2324 * dependencies, and force their cleanup before our instruction.
2325 */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i]) {
               inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
            }
         }
         /* Every dependency we still needed has just been resolved, so
          * there is nothing left to scan for.
          */
         return;
      }
2333
2334 bool scan_inst_16wide = (dispatch_width > 8 &&
2335 !scan_inst->force_uncompressed &&
2336 !scan_inst->force_sechalf);
2337
2338 /* We insert our reads as late as possible on the assumption that any
2339 * instruction but a MOV that might have left us an outstanding
2340 * dependency has more latency than a MOV.
2341 */
2342 if (scan_inst->dst.file == GRF &&
2343 scan_inst->dst.reg >= first_write_grf &&
2344 scan_inst->dst.reg < first_write_grf + write_len &&
2345 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2346 inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2347 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2348 if (scan_inst_16wide)
2349 needs_dep[scan_inst->dst.reg - first_write_grf + 1] = false;
2350 }
2351
2352 /* Clear the flag for registers that actually got read (as expected). */
2353 clear_deps_for_inst_src(scan_inst, dispatch_width,
2354 needs_dep, first_write_grf, write_len);
2355
2356 /* Continue the loop only if we haven't resolved all the dependencies */
2357 int i;
2358 for (i = 0; i < write_len; i++) {
2359 if (needs_dep[i])
2360 break;
2361 }
2362 if (i == write_len)
2363 return;
2364 }
2365 }
2366
2367 /**
2368 * Implements this workaround for the original 965:
2369 *
2370 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2371 * used as a destination register until after it has been sourced by an
 *    instruction with a different destination register."
2373 */
2374 void
2375 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2376 {
2377 int write_len = inst->regs_written() * dispatch_width / 8;
2378 int first_write_grf = inst->dst.reg;
2379 bool needs_dep[BRW_MAX_MRF];
2380 assert(write_len < (int)sizeof(needs_dep) - 1);
2381
2382 memset(needs_dep, false, sizeof(needs_dep));
2383 memset(needs_dep, true, write_len);
2384 /* Walk forwards looking for writes to registers we're writing which aren't
2385 * read before being written.
2386 */
2387 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2388 !scan_inst->is_tail_sentinel();
2389 scan_inst = (fs_inst *)scan_inst->next) {
      /* If we hit control flow, force resolve all remaining dependencies. */
      if (scan_inst->is_control_flow()) {
         for (int i = 0; i < write_len; i++) {
            if (needs_dep[i])
               scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
         }
         return;
      }
2397
2398 /* Clear the flag for registers that actually got read (as expected). */
2399 clear_deps_for_inst_src(scan_inst, dispatch_width,
2400 needs_dep, first_write_grf, write_len);
2401
2402 /* We insert our reads as late as possible since they're reading the
2403 * result of a SEND, which has massive latency.
2404 */
2405 if (scan_inst->dst.file == GRF &&
2406 scan_inst->dst.reg >= first_write_grf &&
2407 scan_inst->dst.reg < first_write_grf + write_len &&
2408 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2409 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2410 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2411 }
2412
2413 /* Continue the loop only if we haven't resolved all the dependencies */
2414 int i;
2415 for (i = 0; i < write_len; i++) {
2416 if (needs_dep[i])
2417 break;
2418 }
2419 if (i == write_len)
2420 return;
2421 }
2422
2423 /* If we hit the end of the program, resolve all remaining dependencies out
2424 * of paranoia.
2425 */
2426 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2427 assert(last_inst->eot);
2428 for (int i = 0; i < write_len; i++) {
2429 if (needs_dep[i])
2430 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2431 }
2432 }
2433
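/**
 * Applies both of the SEND dependency workarounds above to every SEND that
 * writes a GRF.  Only the original gen4 hardware (not G4X) needs them.
 */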
2434 void
2435 fs_visitor::insert_gen4_send_dependency_workarounds()
2436 {
2437 if (intel->gen != 4 || intel->is_g4x)
2438 return;
2439
2440 /* Note that we're done with register allocation, so GRF fs_regs always
2441 * have a .reg_offset of 0.
2442 */
2443
2444 foreach_list_safe(node, &this->instructions) {
2445 fs_inst *inst = (fs_inst *)node;
2446
2447 if (inst->mlen != 0 && inst->dst.file == GRF) {
2448 insert_gen4_pre_send_dependency_workarounds(inst);
2449 insert_gen4_post_send_dependency_workarounds(inst);
2450 }
2451 }
2452 }
2453
2454 /**
2455 * Turns the generic expression-style uniform pull constant load instruction
2456 * into a hardware-specific series of instructions for loading a pull
2457 * constant.
2458 *
2459 * The expression style allows the CSE pass before this to optimize out
2460 * repeated loads from the same offset, and gives the pre-register-allocation
2461 * scheduling full flexibility, while the conversion to native instructions
2462 * allows the post-register-allocation scheduler the best information
2463 * possible.
2464 */
2465 void
2466 fs_visitor::lower_uniform_pull_constant_loads()
2467 {
2468 foreach_list(node, &this->instructions) {
2469 fs_inst *inst = (fs_inst *)node;
2470
2471 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2472 continue;
2473
2474 if (intel->gen >= 7) {
2475 fs_reg const_offset_reg = inst->src[1];
2476 assert(const_offset_reg.file == IMM &&
2477 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2478 const_offset_reg.imm.u /= 16;
2479 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2480 struct brw_reg g0 = retype(brw_vec8_grf(0, 0),
2481 BRW_REGISTER_TYPE_UD);
2482
2483 fs_inst *setup1 = MOV(payload, fs_reg(g0));
2484 setup1->force_writemask_all = true;
2485 /* We don't need the second half of this vgrf to be filled with g1
2486 * in the 16-wide case, but if we use force_uncompressed then live
2487 * variable analysis won't consider this a def!
2488 */
2489
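         /* SET_GLOBAL_OFFSET patches the header copied from g0 with the
          * oword offset of the constant (hence the divide by 16 above),
          * forming the payload for the gen7 pull constant load.
          */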
2490 fs_inst *setup2 = new(mem_ctx) fs_inst(FS_OPCODE_SET_GLOBAL_OFFSET,
2491 payload, payload,
2492 const_offset_reg);
2493
2494 setup1->ir = inst->ir;
2495 setup1->annotation = inst->annotation;
2496 inst->insert_before(setup1);
2497 setup2->ir = inst->ir;
2498 setup2->annotation = inst->annotation;
2499 inst->insert_before(setup2);
2500 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2501 inst->src[1] = payload;
2502 } else {
2503 /* Before register allocation, we didn't tell the scheduler about the
2504 * MRF we use. We know it's safe to use this MRF because nothing
2505 * else does except for register spill/unspill, which generates and
2506 * uses its MRF within a single IR instruction.
2507 */
2508 inst->base_mrf = 14;
2509 inst->mlen = 1;
2510 }
2511 }
2512 }
2513
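/**
 * Prints one IR instruction on a single line: optional predicate, opcode
 * (with .sat/.cmod decorations), destination, then up to three sources.
 * Files that are illegal in a given position, like an MRF source or a
 * uniform destination, are wrapped in "***" so they stand out.
 */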
2514 void
2515 fs_visitor::dump_instruction(fs_inst *inst)
2516 {
2517 if (inst->predicate) {
2518 printf("(%cf0.%d) ",
2519 inst->predicate_inverse ? '-' : '+',
2520 inst->flag_subreg);
2521 }
2522
2523 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2524 opcode_descs[inst->opcode].name) {
2525 printf("%s", opcode_descs[inst->opcode].name);
2526 } else {
2527 switch (inst->opcode) {
2528 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
2529 printf("uniform_pull_const");
2530 break;
2531 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7:
2532 printf("uniform_pull_const_gen7");
2533 break;
2534 case FS_OPCODE_SET_GLOBAL_OFFSET:
2535 printf("set_global_offset");
2536 break;
2537 default:
2538 printf("op%d", inst->opcode);
2539 break;
2540 }
2541 }
2542 if (inst->saturate)
2543 printf(".sat");
2544 if (inst->conditional_mod) {
2545 printf(".cmod");
2546 if (!inst->predicate &&
2547 (intel->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2548 inst->opcode != BRW_OPCODE_IF &&
2549 inst->opcode != BRW_OPCODE_WHILE))) {
         printf(".f0.%d", inst->flag_subreg);
2551 }
2552 }
2553 printf(" ");
2554
2555
2556 switch (inst->dst.file) {
2557 case GRF:
2558 printf("vgrf%d", inst->dst.reg);
2559 if (inst->dst.reg_offset)
2560 printf("+%d", inst->dst.reg_offset);
2561 break;
2562 case MRF:
2563 printf("m%d", inst->dst.reg);
2564 break;
2565 case BAD_FILE:
2566 printf("(null)");
2567 break;
2568 case UNIFORM:
2569 printf("***u%d***", inst->dst.reg);
2570 break;
2571 default:
2572 printf("???");
2573 break;
2574 }
2575 printf(", ");
2576
2577 for (int i = 0; i < 3; i++) {
2578 if (inst->src[i].negate)
2579 printf("-");
2580 if (inst->src[i].abs)
2581 printf("|");
2582 switch (inst->src[i].file) {
2583 case GRF:
2584 printf("vgrf%d", inst->src[i].reg);
2585 if (inst->src[i].reg_offset)
2586 printf("+%d", inst->src[i].reg_offset);
2587 break;
2588 case MRF:
2589 printf("***m%d***", inst->src[i].reg);
2590 break;
2591 case UNIFORM:
2592 printf("u%d", inst->src[i].reg);
2593 if (inst->src[i].reg_offset)
2594 printf(".%d", inst->src[i].reg_offset);
2595 break;
2596 case BAD_FILE:
2597 printf("(null)");
2598 break;
2599 case IMM:
2600 switch (inst->src[i].type) {
2601 case BRW_REGISTER_TYPE_F:
2602 printf("%ff", inst->src[i].imm.f);
2603 break;
2604 case BRW_REGISTER_TYPE_D:
2605 printf("%dd", inst->src[i].imm.i);
2606 break;
2607 case BRW_REGISTER_TYPE_UD:
2608 printf("%uu", inst->src[i].imm.u);
2609 break;
2610 default:
2611 printf("???");
2612 break;
2613 }
2614 break;
2615 default:
2616 printf("???");
2617 break;
2618 }
2619 if (inst->src[i].abs)
2620 printf("|");
2621
      if (i < 2)
2623 printf(", ");
2624 }
2625
2626 printf(" ");
2627
2628 if (inst->force_uncompressed)
2629 printf("1sthalf ");
2630
2631 if (inst->force_sechalf)
2632 printf("2ndhalf ");
2633
2634 printf("\n");
2635 }
2636
2637 void
2638 fs_visitor::dump_instructions()
2639 {
2640 int ip = 0;
2641 foreach_list(node, &this->instructions) {
2642 fs_inst *inst = (fs_inst *)node;
2643 printf("%d: ", ip++);
2644 dump_instruction(inst);
2645 }
2646 }
2647
2648 /**
2649 * Possibly returns an instruction that set up @param reg.
2650 *
2651 * Sometimes we want to take the result of some expression/variable
2652 * dereference tree and rewrite the instruction generating the result
2653 * of the tree. When processing the tree, we know that the
2654 * instructions generated are all writing temporaries that are dead
2655 * outside of this tree. So, if we have some instructions that write
2656 * a temporary, we're free to point that temp write somewhere else.
2657 *
 * Note that this doesn't guarantee that the returned instruction wrote
 * only reg -- it might be the size=4 destination of a texture instruction.
2660 */
2661 fs_inst *
2662 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2663 fs_inst *end,
2664 fs_reg reg)
2665 {
2666 if (end == start ||
2667 end->predicate ||
2668 end->force_uncompressed ||
2669 end->force_sechalf ||
2670 reg.reladdr ||
2671 !reg.equals(end->dst)) {
2672 return NULL;
2673 } else {
2674 return end;
2675 }
2676 }
2677
2678 void
2679 fs_visitor::setup_payload_gen6()
2680 {
2681 struct intel_context *intel = &brw->intel;
2682 bool uses_depth =
2683 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2684 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2685
2686 assert(intel->gen >= 6);
2687
2688 /* R0-1: masks, pixel X/Y coordinates. */
2689 c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch. */
2691
2692 /* R3-26: barycentric interpolation coordinates. These appear in the
2693 * same order that they appear in the brw_wm_barycentric_interp_mode
2694 * enum. Each set of coordinates occupies 2 registers if dispatch width
2695 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2696 * appear if they were enabled using the "Barycentric Interpolation
2697 * Mode" bits in WM_STATE.
2698 */
2699 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2700 if (barycentric_interp_modes & (1 << i)) {
2701 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2702 c->nr_payload_regs += 2;
2703 if (dispatch_width == 16) {
2704 c->nr_payload_regs += 2;
2705 }
2706 }
2707 }
2708
2709 /* R27: interpolated depth if uses source depth */
2710 if (uses_depth) {
2711 c->source_depth_reg = c->nr_payload_regs;
2712 c->nr_payload_regs++;
2713 if (dispatch_width == 16) {
2714 /* R28: interpolated depth if not 8-wide. */
2715 c->nr_payload_regs++;
2716 }
2717 }
2718 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2719 if (uses_depth) {
2720 c->source_w_reg = c->nr_payload_regs;
2721 c->nr_payload_regs++;
2722 if (dispatch_width == 16) {
2723 /* R30: interpolated W if not 8-wide. */
2724 c->nr_payload_regs++;
2725 }
2726 }
2727 /* R31: MSAA position offsets. */
2728 /* R32-: bary for 32-pixel. */
2729 /* R58-59: interp W for 32-pixel. */
2730
2731 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2732 c->source_depth_to_render_target = true;
2733 }
2734 }
2735
2736 bool
2737 fs_visitor::run()
2738 {
2739 sanity_param_count = fp->Base.Parameters->NumParameters;
2740 uint32_t orig_nr_params = c->prog_data.nr_params;
2741
2742 if (intel->gen >= 6)
2743 setup_payload_gen6();
2744 else
2745 setup_payload_gen4();
2746
2747 if (0) {
2748 emit_dummy_fs();
2749 } else {
2750 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2751 emit_shader_time_begin();
2752
2753 calculate_urb_setup();
2754 if (intel->gen < 6)
2755 emit_interpolation_setup_gen4();
2756 else
2757 emit_interpolation_setup_gen6();
2758
2759 /* We handle discards by keeping track of the still-live pixels in f0.1.
2760 * Initialize it with the dispatched pixels.
2761 */
2762 if (fp->UsesKill) {
2763 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2764 discard_init->flag_subreg = 1;
2765 }
2766
2767 /* Generate FS IR for main(). (the visitor only descends into
2768 * functions called "main").
2769 */
2770 if (shader) {
2771 foreach_list(node, &*shader->ir) {
2772 ir_instruction *ir = (ir_instruction *)node;
2773 base_ir = ir;
2774 this->result = reg_undef;
2775 ir->accept(this);
2776 }
2777 } else {
2778 emit_fragment_program_code();
2779 }
2780 base_ir = NULL;
2781 if (failed)
2782 return false;
2783
2784 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2785 emit_shader_time_end();
2786
2787 emit_fb_writes();
2788
2789 split_virtual_grfs();
2790
2791 move_uniform_array_access_to_pull_constants();
2792 setup_pull_constants();
2793
2794 bool progress;
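      /* Run the optimization passes to a fixed point, since each pass can
       * expose new opportunities for the others.
       */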
2795 do {
2796 progress = false;
2797
2798 compact_virtual_grfs();
2799
2800 progress = remove_duplicate_mrf_writes() || progress;
2801
2802 progress = opt_algebraic() || progress;
2803 progress = opt_cse() || progress;
2804 progress = opt_copy_propagate() || progress;
2805 progress = dead_code_eliminate() || progress;
2806 progress = register_coalesce() || progress;
2807 progress = register_coalesce_2() || progress;
2808 progress = compute_to_mrf() || progress;
2809 } while (progress);
2810
2811 remove_dead_constants();
2812
2813 schedule_instructions(false);
2814
2815 lower_uniform_pull_constant_loads();
2816
2817 assign_curb_setup();
2818 assign_urb_setup();
2819
2820 if (0) {
2821 /* Debug of register spilling: Go spill everything. */
2822 for (int i = 0; i < virtual_grf_count; i++) {
2823 spill_reg(i);
2824 }
2825 }
2826
2827 if (0)
2828 assign_regs_trivial();
2829 else {
2830 while (!assign_regs()) {
2831 if (failed)
2832 break;
2833 }
2834 }
2835 }
2836 assert(force_uncompressed_stack == 0);
2837 assert(force_sechalf_stack == 0);
2838
2839 /* This must come after all optimization and register allocation, since
2840 * it inserts dead code that happens to have side effects, and it does
2841 * so based on the actual physical registers in use.
2842 */
2843 insert_gen4_send_dependency_workarounds();
2844
2845 if (failed)
2846 return false;
2847
2848 schedule_instructions(true);
2849
2850 if (dispatch_width == 8) {
2851 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2852 } else {
2853 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2854
2855 /* Make sure we didn't try to sneak in an extra uniform */
2856 assert(orig_nr_params == c->prog_data.nr_params);
2857 (void) orig_nr_params;
2858 }
2859
2860 /* If any state parameters were appended, then ParameterValues could have
2861 * been realloced, in which case the driver uniform storage set up by
2862 * _mesa_associate_uniform_storage() would point to freed memory. Make
2863 * sure that didn't happen.
2864 */
2865 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
2866
2867 return !failed;
2868 }
2869
2870 const unsigned *
2871 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2872 struct gl_fragment_program *fp,
2873 struct gl_shader_program *prog,
2874 unsigned *final_assembly_size)
2875 {
2876 struct intel_context *intel = &brw->intel;
2877 bool start_busy = false;
2878 float start_time = 0;
2879
2880 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2881 start_busy = (intel->batch.last_bo &&
2882 drm_intel_bo_busy(intel->batch.last_bo));
2883 start_time = get_time();
2884 }
2885
2886 struct brw_shader *shader = NULL;
2887 if (prog)
2888 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2889
2890 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2891 if (shader) {
2892 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2893 _mesa_print_ir(shader->ir, NULL);
2894 printf("\n\n");
2895 } else {
2896 printf("ARB_fragment_program %d ir for native fragment shader\n",
2897 fp->Base.Id);
2898 _mesa_print_program(&fp->Base);
2899 }
2900 }
2901
2902 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2903 */
2904 fs_visitor v(brw, c, prog, fp, 8);
2905 if (!v.run()) {
2906 prog->LinkStatus = false;
2907 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2908
2909 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2910 v.fail_msg);
2911
2912 return NULL;
2913 }
2914
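   /* Also try a 16-wide compile, which processes twice as many pixels per
    * thread; if it fails, we simply ship only the 8-wide program.
    */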
2915 exec_list *simd16_instructions = NULL;
2916 fs_visitor v2(brw, c, prog, fp, 16);
2917 bool no16 = INTEL_DEBUG & DEBUG_NO16;
2918 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0 && likely(!no16)) {
2919 v2.import_uniforms(&v);
2920 if (!v2.run()) {
2921 perf_debug("16-wide shader failed to compile, falling back to "
2922 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2923 } else {
2924 simd16_instructions = &v2.instructions;
2925 }
2926 }
2927
2928 c->prog_data.dispatch_width = 8;
2929
2930 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2931 const unsigned *generated = g.generate_assembly(&v.instructions,
2932 simd16_instructions,
2933 final_assembly_size);
2934
2935 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2936 if (shader->compiled_once)
2937 brw_wm_debug_recompile(brw, prog, &c->key);
2938 shader->compiled_once = true;
2939
2940 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2941 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2942 (get_time() - start_time) * 1000);
2943 }
2944 }
2945
2946 return generated;
2947 }
2948
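/**
 * Precompiles the fragment shader at link time using a guessed program key,
 * so that a matching draw-time compile can hit the program cache.  The
 * current prog_offset/prog_data are saved and restored around the call,
 * since do_wm_prog() overwrites them.
 */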
2949 bool
2950 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2951 {
2952 struct brw_context *brw = brw_context(ctx);
2953 struct intel_context *intel = &brw->intel;
2954 struct brw_wm_prog_key key;
2955
2956 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2957 return true;
2958
2959 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2960 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2961 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2962 bool program_uses_dfdy = fp->UsesDFdy;
2963
2964 memset(&key, 0, sizeof(key));
2965
2966 if (intel->gen < 6) {
2967 if (fp->UsesKill)
2968 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2969
2970 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2971 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2972
2973 /* Just assume depth testing. */
2974 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2975 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2976 }
2977
2978 if (prog->Name != 0)
2979 key.proj_attrib_mask = 0xffffffff;
2980
2981 if (intel->gen < 6)
2982 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2983
2984 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2985 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2986 continue;
2987
2988 if (prog->Name == 0)
2989 key.proj_attrib_mask |= 1 << i;
2990
2991 if (intel->gen < 6) {
2992 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2993
2994 if (vp_index >= 0)
2995 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2996 }
2997 }
2998
2999 key.clamp_fragment_color = true;
3000
3001 for (int i = 0; i < MAX_SAMPLERS; i++) {
3002 if (fp->Base.ShadowSamplers & (1 << i)) {
3003 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3004 key.tex.swizzles[i] =
3005 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3006 } else {
3007 /* Color sampler: assume no swizzling. */
3008 key.tex.swizzles[i] = SWIZZLE_XYZW;
3009 }
3010 }
3011
3012 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
3013 key.drawable_height = ctx->DrawBuffer->Height;
3014 }
3015
3016 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
3017 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3018 }
3019
3020 key.nr_color_regions = 1;
3021
3022 key.program_string_id = bfp->id;
3023
3024 uint32_t old_prog_offset = brw->wm.prog_offset;
3025 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3026
3027 bool success = do_wm_prog(brw, prog, bfp, &key);
3028
3029 brw->wm.prog_offset = old_prog_offset;
3030 brw->wm.prog_data = old_prog_data;
3031
3032 return success;
3033 }