1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->conditional_mod = BRW_CONDITIONAL_NONE;
59
60 this->dst = reg_undef;
61 this->src[0] = reg_undef;
62 this->src[1] = reg_undef;
63 this->src[2] = reg_undef;
64
65 /* This will be the case for almost all instructions. */
66 this->regs_written = 1;
67
68 this->writes_accumulator = false;
69 }
70
71 fs_inst::fs_inst()
72 {
73 init();
74 this->opcode = BRW_OPCODE_NOP;
75 }
76
77 fs_inst::fs_inst(enum opcode opcode)
78 {
79 init();
80 this->opcode = opcode;
81 }
82
83 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
84 {
85 init();
86 this->opcode = opcode;
87 this->dst = dst;
88
89 if (dst.file == GRF)
90 assert(dst.reg_offset >= 0);
91 }
92
93 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
94 {
95 init();
96 this->opcode = opcode;
97 this->dst = dst;
98 this->src[0] = src0;
99
100 if (dst.file == GRF)
101 assert(dst.reg_offset >= 0);
102 if (src[0].file == GRF)
103 assert(src[0].reg_offset >= 0);
104 }
105
106 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
107 {
108 init();
109 this->opcode = opcode;
110 this->dst = dst;
111 this->src[0] = src0;
112 this->src[1] = src1;
113
114 if (dst.file == GRF)
115 assert(dst.reg_offset >= 0);
116 if (src[0].file == GRF)
117 assert(src[0].reg_offset >= 0);
118 if (src[1].file == GRF)
119 assert(src[1].reg_offset >= 0);
120 }
121
122 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
123 fs_reg src0, fs_reg src1, fs_reg src2)
124 {
125 init();
126 this->opcode = opcode;
127 this->dst = dst;
128 this->src[0] = src0;
129 this->src[1] = src1;
130 this->src[2] = src2;
131
132 if (dst.file == GRF)
133 assert(dst.reg_offset >= 0);
134 if (src[0].file == GRF)
135 assert(src[0].reg_offset >= 0);
136 if (src[1].file == GRF)
137 assert(src[1].reg_offset >= 0);
138 if (src[2].file == GRF)
139 assert(src[2].reg_offset >= 0);
140 }
141
142 #define ALU1(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
147 }
148
149 #define ALU2(op) \
150 fs_inst * \
151 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
152 { \
153 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
154 }
155
156 #define ALU2_ACC(op) \
157 fs_inst * \
158 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
159 { \
160 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
161 inst->writes_accumulator = true; \
162 return inst; \
163 }
164
165 #define ALU3(op) \
166 fs_inst * \
167 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
168 { \
169 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
170 }
171
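/* Each of the expansions below defines an fs_visitor builder helper named
 * after its opcode (fs_visitor::NOT(), fs_visitor::MOV(), ...).  The helper
 * only constructs an fs_inst out of mem_ctx; the caller still has to emit() it.
 */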
172 ALU1(NOT)
173 ALU1(MOV)
174 ALU1(FRC)
175 ALU1(RNDD)
176 ALU1(RNDE)
177 ALU1(RNDZ)
178 ALU2(ADD)
179 ALU2(MUL)
180 ALU2_ACC(MACH)
181 ALU2(AND)
182 ALU2(OR)
183 ALU2(XOR)
184 ALU2(SHL)
185 ALU2(SHR)
186 ALU2(ASR)
187 ALU3(LRP)
188 ALU1(BFREV)
189 ALU3(BFE)
190 ALU2(BFI1)
191 ALU3(BFI2)
192 ALU1(FBH)
193 ALU1(FBL)
194 ALU1(CBIT)
195 ALU3(MAD)
196 ALU2_ACC(ADDC)
197 ALU2_ACC(SUBB)
198 ALU2(SEL)
199 ALU2(MAC)
200
201 /** Gen4 predicated IF. */
202 fs_inst *
203 fs_visitor::IF(uint32_t predicate)
204 {
205 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
206 inst->predicate = predicate;
207 return inst;
208 }
209
210 /** Gen6 IF with embedded comparison. */
211 fs_inst *
212 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
213 {
214 assert(brw->gen == 6);
215 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
216 reg_null_d, src0, src1);
217 inst->conditional_mod = condition;
218 return inst;
219 }
220
221 /**
222 * CMP: Sets the low bit of the destination channels with the result
223 * of the comparison, while the upper bits are undefined, and updates
224 * the flag register with the packed 16 bits of the result.
225 */
226 fs_inst *
227 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
228 {
229 fs_inst *inst;
230
231 /* Take the instruction:
232 *
233 * CMP null<d> src0<f> src1<f>
234 *
235 * Original gen4 does type conversion to the destination type before
236 * comparison, producing garbage results for floating point comparisons.
237 * gen5 does the comparison on the execution type (resolved source types),
238 * so dst type doesn't matter. gen6 does comparison and then uses the
239 * result as if it was the dst type with no conversion, which happens to
240 * mostly work out for float-interpreted-as-int since our comparisons are
241 * for >0, =0, <0.
242 */
243 if (brw->gen == 4) {
244 dst.type = src0.type;
245 if (dst.file == HW_REG)
246 dst.fixed_hw_reg.type = dst.type;
247 }
248
249 resolve_ud_negate(&src0);
250 resolve_ud_negate(&src1);
251
252 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
253 inst->conditional_mod = condition;
254
255 return inst;
256 }
257
258 exec_list
259 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
260 const fs_reg &surf_index,
261 const fs_reg &varying_offset,
262 uint32_t const_offset)
263 {
264 exec_list instructions;
265 fs_inst *inst;
266
267 /* We have our constant surface use a pitch of 4 bytes, so our index can
268 * be any component of a vector, and then we load 4 contiguous
269 * components starting from that.
270 *
271 * We break down the const_offset to a portion added to the variable
272 * offset and a portion done using reg_offset, which means that if you
273 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
274 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
275 * CSE can later notice that those loads are all the same and eliminate
276 * the redundant ones.
277 */
278 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
279 instructions.push_tail(ADD(vec4_offset,
280 varying_offset, const_offset & ~3));
281
282 int scale = 1;
283 if (brw->gen == 4 && dispatch_width == 8) {
284 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
285 * u, v, r) as parameters, or we can just use the SIMD16 message
286 * consisting of (header, u). We choose the second, at the cost of a
287 * longer return length.
288 */
289 scale = 2;
290 }
291
292 enum opcode op;
293 if (brw->gen >= 7)
294 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
295 else
296 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
297 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
298 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
299 inst->regs_written = 4 * scale;
300 instructions.push_tail(inst);
301
302 if (brw->gen < 7) {
303 inst->base_mrf = 13;
304 inst->header_present = true;
305 if (brw->gen == 4)
306 inst->mlen = 3;
307 else
308 inst->mlen = 1 + dispatch_width / 8;
309 }
310
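/* Pick the requested component out of the vec4 we just loaded by bumping
 * reg_offset before the final MOV into dst.
 */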
311 vec4_result.reg_offset += (const_offset & 3) * scale;
312 instructions.push_tail(MOV(dst, vec4_result));
313
314 return instructions;
315 }
316
317 /**
318 * A helper for MOV generation for fixing up broken hardware SEND dependency
319 * handling.
320 */
321 fs_inst *
322 fs_visitor::DEP_RESOLVE_MOV(int grf)
323 {
324 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
325
326 inst->ir = NULL;
327 inst->annotation = "send dependency resolve";
328
329 /* The caller always wants this MOV uncompressed, to emit the minimal extra
330 * dependencies and to avoid having to align its registers to 2.
331 */
332 inst->force_uncompressed = true;
333
334 return inst;
335 }
336
337 bool
338 fs_inst::equals(fs_inst *inst) const
339 {
340 return (opcode == inst->opcode &&
341 dst.equals(inst->dst) &&
342 src[0].equals(inst->src[0]) &&
343 src[1].equals(inst->src[1]) &&
344 src[2].equals(inst->src[2]) &&
345 saturate == inst->saturate &&
346 predicate == inst->predicate &&
347 conditional_mod == inst->conditional_mod &&
348 mlen == inst->mlen &&
349 base_mrf == inst->base_mrf &&
350 sampler == inst->sampler &&
351 target == inst->target &&
352 eot == inst->eot &&
353 header_present == inst->header_present &&
354 shadow_compare == inst->shadow_compare &&
355 offset == inst->offset);
356 }
357
358 bool
359 fs_inst::overwrites_reg(const fs_reg &reg) const
360 {
361 return (reg.file == dst.file &&
362 reg.reg == dst.reg &&
363 reg.reg_offset >= dst.reg_offset &&
364 reg.reg_offset < dst.reg_offset + regs_written);
365 }
366
367 bool
368 fs_inst::is_send_from_grf() const
369 {
370 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
371 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
372 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
373 src[1].file == GRF) ||
374 (is_tex() && src[0].file == GRF));
375 }
376
377 bool
378 fs_visitor::can_do_source_mods(fs_inst *inst)
379 {
380 if (brw->gen == 6 && inst->is_math())
381 return false;
382
383 if (inst->is_send_from_grf())
384 return false;
385
386 if (!inst->can_do_source_mods())
387 return false;
388
389 return true;
390 }
391
392 void
393 fs_reg::init()
394 {
395 memset(this, 0, sizeof(*this));
396 stride = 1;
397 }
398
399 /** Generic unset register constructor. */
400 fs_reg::fs_reg()
401 {
402 init();
403 this->file = BAD_FILE;
404 }
405
406 /** Immediate value constructor. */
407 fs_reg::fs_reg(float f)
408 {
409 init();
410 this->file = IMM;
411 this->type = BRW_REGISTER_TYPE_F;
412 this->imm.f = f;
413 }
414
415 /** Immediate value constructor. */
416 fs_reg::fs_reg(int32_t i)
417 {
418 init();
419 this->file = IMM;
420 this->type = BRW_REGISTER_TYPE_D;
421 this->imm.i = i;
422 }
423
424 /** Immediate value constructor. */
425 fs_reg::fs_reg(uint32_t u)
426 {
427 init();
428 this->file = IMM;
429 this->type = BRW_REGISTER_TYPE_UD;
430 this->imm.u = u;
431 }
432
433 /** Fixed brw_reg. */
434 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
435 {
436 init();
437 this->file = HW_REG;
438 this->fixed_hw_reg = fixed_hw_reg;
439 this->type = fixed_hw_reg.type;
440 }
441
442 bool
443 fs_reg::equals(const fs_reg &r) const
444 {
445 return (file == r.file &&
446 reg == r.reg &&
447 reg_offset == r.reg_offset &&
448 subreg_offset == r.subreg_offset &&
449 type == r.type &&
450 negate == r.negate &&
451 abs == r.abs &&
452 !reladdr && !r.reladdr &&
453 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
454 sizeof(fixed_hw_reg)) == 0 &&
455 stride == r.stride &&
456 imm.u == r.imm.u);
457 }
458
459 fs_reg &
460 fs_reg::apply_stride(unsigned stride)
461 {
462 assert((this->stride * stride) <= 4 &&
463 (is_power_of_two(stride) || stride == 0) &&
464 file != HW_REG && file != IMM);
465 this->stride *= stride;
466 return *this;
467 }
468
469 fs_reg &
470 fs_reg::set_smear(unsigned subreg)
471 {
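/* A smeared register reads the same sub-register in every channel: point
 * subreg_offset at the requested component and use stride 0 so that value
 * is broadcast across the execution size.
 */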
472 assert(file != HW_REG && file != IMM);
473 subreg_offset = subreg * type_sz(type);
474 stride = 0;
475 return *this;
476 }
477
478 bool
479 fs_reg::is_contiguous() const
480 {
481 return stride == 1;
482 }
483
484 bool
485 fs_reg::is_zero() const
486 {
487 if (file != IMM)
488 return false;
489
490 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
491 }
492
493 bool
494 fs_reg::is_one() const
495 {
496 if (file != IMM)
497 return false;
498
499 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
500 }
501
502 bool
503 fs_reg::is_null() const
504 {
505 return file == HW_REG &&
506 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
507 fixed_hw_reg.nr == BRW_ARF_NULL;
508 }
509
510 bool
511 fs_reg::is_valid_3src() const
512 {
513 return file == GRF || file == UNIFORM;
514 }
515
516 bool
517 fs_reg::is_accumulator() const
518 {
519 return file == HW_REG &&
520 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
521 fixed_hw_reg.nr == BRW_ARF_ACCUMULATOR;
522 }
523
524 int
525 fs_visitor::type_size(const struct glsl_type *type)
526 {
527 unsigned int size, i;
528
529 switch (type->base_type) {
530 case GLSL_TYPE_UINT:
531 case GLSL_TYPE_INT:
532 case GLSL_TYPE_FLOAT:
533 case GLSL_TYPE_BOOL:
534 return type->components();
535 case GLSL_TYPE_ARRAY:
536 return type_size(type->fields.array) * type->length;
537 case GLSL_TYPE_STRUCT:
538 size = 0;
539 for (i = 0; i < type->length; i++) {
540 size += type_size(type->fields.structure[i].type);
541 }
542 return size;
543 case GLSL_TYPE_SAMPLER:
544 /* Samplers take up no register space, since they're baked in at
545 * link time.
546 */
547 return 0;
548 case GLSL_TYPE_ATOMIC_UINT:
549 return 0;
550 case GLSL_TYPE_IMAGE:
551 case GLSL_TYPE_VOID:
552 case GLSL_TYPE_ERROR:
553 case GLSL_TYPE_INTERFACE:
554 assert(!"not reached");
555 break;
556 }
557
558 return 0;
559 }
560
561 fs_reg
562 fs_visitor::get_timestamp()
563 {
564 assert(brw->gen >= 7);
565
566 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
567 BRW_ARF_TIMESTAMP,
568 0),
569 BRW_REGISTER_TYPE_UD));
570
571 fs_reg dst = fs_reg(this, glsl_type::uint_type);
572
573 fs_inst *mov = emit(MOV(dst, ts));
574 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
575 * even if it's not enabled in the dispatch.
576 */
577 mov->force_writemask_all = true;
578 mov->force_uncompressed = true;
579
580 /* The caller wants the low 32 bits of the timestamp. Since it's running
581 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
582 * which is plenty of time for our purposes. It is identical across the
583 * EUs, but since it's tracking GPU core speed it will increment at a
584 * varying rate as render P-states change.
585 *
586 * The caller could also check if render P-states have changed (or anything
587 * else that might disrupt timing) by setting smear to 2 and checking if
588 * that field is != 0.
589 */
590 dst.set_smear(0);
591
592 return dst;
593 }
594
595 void
596 fs_visitor::emit_shader_time_begin()
597 {
598 current_annotation = "shader time start";
599 shader_start_time = get_timestamp();
600 }
601
602 void
603 fs_visitor::emit_shader_time_end()
604 {
605 current_annotation = "shader time end";
606
607 enum shader_time_shader_type type, written_type, reset_type;
608 if (dispatch_width == 8) {
609 type = ST_FS8;
610 written_type = ST_FS8_WRITTEN;
611 reset_type = ST_FS8_RESET;
612 } else {
613 assert(dispatch_width == 16);
614 type = ST_FS16;
615 written_type = ST_FS16_WRITTEN;
616 reset_type = ST_FS16_RESET;
617 }
618
619 fs_reg shader_end_time = get_timestamp();
620
621 /* Check that there weren't any timestamp reset events (assuming these
622 * were the only two timestamp reads that happened).
623 */
624 fs_reg reset = shader_end_time;
625 reset.set_smear(2);
626 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
627 test->conditional_mod = BRW_CONDITIONAL_Z;
628 emit(IF(BRW_PREDICATE_NORMAL));
629
630 push_force_uncompressed();
631 fs_reg start = shader_start_time;
632 start.negate = true;
633 fs_reg diff = fs_reg(this, glsl_type::uint_type);
634 emit(ADD(diff, start, shader_end_time));
635
636 /* If there were no instructions between the two timestamp gets, the diff
637 * is 2 cycles. Remove that overhead, so I can forget about that when
638 * trying to determine the time taken for single instructions.
639 */
640 emit(ADD(diff, diff, fs_reg(-2u)));
641
642 emit_shader_time_write(type, diff);
643 emit_shader_time_write(written_type, fs_reg(1u));
644 emit(BRW_OPCODE_ELSE);
645 emit_shader_time_write(reset_type, fs_reg(1u));
646 emit(BRW_OPCODE_ENDIF);
647
648 pop_force_uncompressed();
649 }
650
651 void
652 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
653 fs_reg value)
654 {
655 int shader_time_index =
656 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
657 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
658
659 fs_reg payload;
660 if (dispatch_width == 8)
661 payload = fs_reg(this, glsl_type::uvec2_type);
662 else
663 payload = fs_reg(this, glsl_type::uint_type);
664
665 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
666 fs_reg(), payload, offset, value));
667 }
668
669 void
670 fs_visitor::vfail(const char *format, va_list va)
671 {
672 char *msg;
673
674 if (failed)
675 return;
676
677 failed = true;
678
679 msg = ralloc_vasprintf(mem_ctx, format, va);
680 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
681
682 this->fail_msg = msg;
683
684 if (INTEL_DEBUG & DEBUG_WM) {
685 fprintf(stderr, "%s", msg);
686 }
687 }
688
689 void
690 fs_visitor::fail(const char *format, ...)
691 {
692 va_list va;
693
694 va_start(va, format);
695 vfail(format, va);
696 va_end(va);
697 }
698
699 /**
700 * Mark this program as impossible to compile in SIMD16 mode.
701 *
702 * During the SIMD8 compile (which happens first), we can detect and flag
703 * things that are unsupported in SIMD16 mode, so the compiler can skip
704 * the SIMD16 compile altogether.
705 *
706 * During a SIMD16 compile (if one happens anyway), this just calls fail().
707 */
708 void
709 fs_visitor::no16(const char *format, ...)
710 {
711 va_list va;
712
713 va_start(va, format);
714
715 if (dispatch_width == 16) {
716 vfail(format, va);
717 } else {
718 simd16_unsupported = true;
719
720 if (brw->perf_debug) {
721 if (no16_msg)
722 ralloc_vasprintf_append(&no16_msg, format, va);
723 else
724 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
725 }
726 }
727
728 va_end(va);
729 }
730
731 fs_inst *
732 fs_visitor::emit(enum opcode opcode)
733 {
734 return emit(new(mem_ctx) fs_inst(opcode));
735 }
736
737 fs_inst *
738 fs_visitor::emit(enum opcode opcode, fs_reg dst)
739 {
740 return emit(new(mem_ctx) fs_inst(opcode, dst));
741 }
742
743 fs_inst *
744 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
745 {
746 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
747 }
748
749 fs_inst *
750 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
751 {
752 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
753 }
754
755 fs_inst *
756 fs_visitor::emit(enum opcode opcode, fs_reg dst,
757 fs_reg src0, fs_reg src1, fs_reg src2)
758 {
759 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
760 }
761
762 void
763 fs_visitor::push_force_uncompressed()
764 {
765 force_uncompressed_stack++;
766 }
767
768 void
769 fs_visitor::pop_force_uncompressed()
770 {
771 force_uncompressed_stack--;
772 assert(force_uncompressed_stack >= 0);
773 }
774
775 /**
776 * Returns true if the instruction has a flag that means it won't
777 * update an entire destination register.
778 *
779 * For example, dead code elimination and live variable analysis want to know
780 * when a write to a variable screens off any preceding values that were in
781 * it.
782 */
783 bool
784 fs_inst::is_partial_write() const
785 {
786 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
787 this->force_uncompressed ||
788 this->force_sechalf || !this->dst.is_contiguous());
789 }
790
791 int
792 fs_inst::regs_read(fs_visitor *v, int arg) const
793 {
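/* Texture messages sent from a GRF read their whole payload.  mlen is in
 * physical registers; in SIMD16 each virtual GRF spans two of those, so
 * convert the count, rounding up.
 */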
794 if (is_tex() && arg == 0 && src[0].file == GRF) {
795 if (v->dispatch_width == 16)
796 return (mlen + 1) / 2;
797 else
798 return mlen;
799 }
800 return 1;
801 }
802
803 bool
804 fs_inst::reads_flag() const
805 {
806 return predicate;
807 }
808
809 bool
810 fs_inst::writes_flag() const
811 {
812 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
813 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
814 }
815
816 /**
817 * Returns how many MRFs an FS opcode will write over.
818 *
819 * Note that this is not the 0 or 1 implied writes in an actual gen
820 * instruction -- the FS opcodes often generate MOVs in addition.
821 */
822 int
823 fs_visitor::implied_mrf_writes(fs_inst *inst)
824 {
825 if (inst->mlen == 0)
826 return 0;
827
828 if (inst->base_mrf == -1)
829 return 0;
830
831 switch (inst->opcode) {
832 case SHADER_OPCODE_RCP:
833 case SHADER_OPCODE_RSQ:
834 case SHADER_OPCODE_SQRT:
835 case SHADER_OPCODE_EXP2:
836 case SHADER_OPCODE_LOG2:
837 case SHADER_OPCODE_SIN:
838 case SHADER_OPCODE_COS:
839 return 1 * dispatch_width / 8;
840 case SHADER_OPCODE_POW:
841 case SHADER_OPCODE_INT_QUOTIENT:
842 case SHADER_OPCODE_INT_REMAINDER:
843 return 2 * dispatch_width / 8;
844 case SHADER_OPCODE_TEX:
845 case FS_OPCODE_TXB:
846 case SHADER_OPCODE_TXD:
847 case SHADER_OPCODE_TXF:
848 case SHADER_OPCODE_TXF_CMS:
849 case SHADER_OPCODE_TXF_MCS:
850 case SHADER_OPCODE_TG4:
851 case SHADER_OPCODE_TG4_OFFSET:
852 case SHADER_OPCODE_TXL:
853 case SHADER_OPCODE_TXS:
854 case SHADER_OPCODE_LOD:
855 return 1;
856 case FS_OPCODE_FB_WRITE:
857 return 2;
858 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
859 case SHADER_OPCODE_GEN4_SCRATCH_READ:
860 return 1;
861 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
862 return inst->mlen;
863 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
864 return 2;
865 case SHADER_OPCODE_UNTYPED_ATOMIC:
866 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
867 return 0;
868 default:
869 assert(!"not reached");
870 return inst->mlen;
871 }
872 }
873
874 int
875 fs_visitor::virtual_grf_alloc(int size)
876 {
877 if (virtual_grf_array_size <= virtual_grf_count) {
878 if (virtual_grf_array_size == 0)
879 virtual_grf_array_size = 16;
880 else
881 virtual_grf_array_size *= 2;
882 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
883 virtual_grf_array_size);
884 }
885 virtual_grf_sizes[virtual_grf_count] = size;
886 return virtual_grf_count++;
887 }
888
889 /** Fixed HW reg constructor. */
890 fs_reg::fs_reg(enum register_file file, int reg)
891 {
892 init();
893 this->file = file;
894 this->reg = reg;
895 this->type = BRW_REGISTER_TYPE_F;
896 }
897
898 /** Fixed HW reg constructor. */
899 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
900 {
901 init();
902 this->file = file;
903 this->reg = reg;
904 this->type = type;
905 }
906
907 /** Automatic reg constructor. */
908 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
909 {
910 init();
911
912 this->file = GRF;
913 this->reg = v->virtual_grf_alloc(v->type_size(type));
914 this->reg_offset = 0;
915 this->type = brw_type_for_base_type(type);
916 }
917
918 fs_reg *
919 fs_visitor::variable_storage(ir_variable *var)
920 {
921 return (fs_reg *)hash_table_find(this->variable_ht, var);
922 }
923
924 void
925 import_uniforms_callback(const void *key,
926 void *data,
927 void *closure)
928 {
929 struct hash_table *dst_ht = (struct hash_table *)closure;
930 const fs_reg *reg = (const fs_reg *)data;
931
932 if (reg->file != UNIFORM)
933 return;
934
935 hash_table_insert(dst_ht, data, key);
936 }
937
938 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
939 * This brings in those uniform definitions.
940 */
941 void
942 fs_visitor::import_uniforms(fs_visitor *v)
943 {
944 hash_table_call_foreach(v->variable_ht,
945 import_uniforms_callback,
946 variable_ht);
947 this->push_constant_loc = v->push_constant_loc;
948 this->pull_constant_loc = v->pull_constant_loc;
949 this->uniforms = v->uniforms;
950 this->param_size = v->param_size;
951 }
952
953 /* Our support for uniforms is piggy-backed on the struct
954 * gl_fragment_program, because that's where the values actually
955 * get stored, rather than in some global gl_shader_program uniform
956 * store.
957 */
958 void
959 fs_visitor::setup_uniform_values(ir_variable *ir)
960 {
961 int namelen = strlen(ir->name);
962
963 /* The data for our (non-builtin) uniforms is stored in a series of
964 * gl_uniform_driver_storage structs for each subcomponent that
965 * glGetUniformLocation() could name. We know it's been set up in the same
966 * order we'd walk the type, so walk the list of storage and find anything
967 * with our name, or the prefix of a component that starts with our name.
968 */
969 unsigned params_before = uniforms;
970 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
971 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
972
973 if (strncmp(ir->name, storage->name, namelen) != 0 ||
974 (storage->name[namelen] != 0 &&
975 storage->name[namelen] != '.' &&
976 storage->name[namelen] != '[')) {
977 continue;
978 }
979
980 unsigned slots = storage->type->component_slots();
981 if (storage->array_elements)
982 slots *= storage->array_elements;
983
984 for (unsigned i = 0; i < slots; i++) {
985 stage_prog_data->param[uniforms++] = &storage->storage[i].f;
986 }
987 }
988
989 /* Make sure we actually initialized the right amount of stuff here. */
990 assert(params_before + ir->type->component_slots() == uniforms);
991 (void)params_before;
992 }
993
994
995 /* Our support for builtin uniforms is even scarier than non-builtin.
996 * It sits on top of the PROG_STATE_VAR parameters that are
997 * automatically updated from GL context state.
998 */
999 void
1000 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1001 {
1002 const ir_state_slot *const slots = ir->state_slots;
1003 assert(ir->state_slots != NULL);
1004
1005 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
1006 /* This state reference has already been setup by ir_to_mesa, but we'll
1007 * get the same index back here.
1008 */
1009 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
1010 (gl_state_index *)slots[i].tokens);
1011
1012 /* Add each of the unique swizzles of the element as a parameter.
1013 * This'll end up matching the expected layout of the
1014 * array/matrix/structure we're trying to fill in.
1015 */
1016 int last_swiz = -1;
1017 for (unsigned int j = 0; j < 4; j++) {
1018 int swiz = GET_SWZ(slots[i].swizzle, j);
1019 if (swiz == last_swiz)
1020 break;
1021 last_swiz = swiz;
1022
1023 stage_prog_data->param[uniforms++] =
1024 &fp->Base.Parameters->ParameterValues[index][swiz].f;
1025 }
1026 }
1027 }
1028
1029 fs_reg *
1030 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1031 {
1032 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1033 fs_reg wpos = *reg;
1034 bool flip = !ir->data.origin_upper_left ^ c->key.render_to_fbo;
1035
1036 /* gl_FragCoord.x */
1037 if (ir->data.pixel_center_integer) {
1038 emit(MOV(wpos, this->pixel_x));
1039 } else {
1040 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1041 }
1042 wpos.reg_offset++;
1043
1044 /* gl_FragCoord.y */
1045 if (!flip && ir->data.pixel_center_integer) {
1046 emit(MOV(wpos, this->pixel_y));
1047 } else {
1048 fs_reg pixel_y = this->pixel_y;
1049 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1050
1051 if (flip) {
1052 pixel_y.negate = true;
1053 offset += c->key.drawable_height - 1.0;
1054 }
1055
1056 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1057 }
1058 wpos.reg_offset++;
1059
1060 /* gl_FragCoord.z */
1061 if (brw->gen >= 6) {
1062 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
1063 } else {
1064 emit(FS_OPCODE_LINTERP, wpos,
1065 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1066 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1067 interp_reg(VARYING_SLOT_POS, 2));
1068 }
1069 wpos.reg_offset++;
1070
1071 /* gl_FragCoord.w: Already set up in emit_interpolation */
1072 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1073
1074 return reg;
1075 }
1076
1077 fs_inst *
1078 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1079 glsl_interp_qualifier interpolation_mode,
1080 bool is_centroid, bool is_sample)
1081 {
1082 brw_wm_barycentric_interp_mode barycoord_mode;
1083 if (brw->gen >= 6) {
1084 if (is_centroid) {
1085 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1086 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1087 else
1088 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1089 } else if (is_sample) {
1090 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1091 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1092 else
1093 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1094 } else {
1095 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1096 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1097 else
1098 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1099 }
1100 } else {
1101 /* On Ironlake and below, there is only one interpolation mode.
1102 * Centroid interpolation doesn't mean anything on this hardware --
1103 * there is no multisampling.
1104 */
1105 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1106 }
1107 return emit(FS_OPCODE_LINTERP, attr,
1108 this->delta_x[barycoord_mode],
1109 this->delta_y[barycoord_mode], interp);
1110 }
1111
1112 fs_reg *
1113 fs_visitor::emit_general_interpolation(ir_variable *ir)
1114 {
1115 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1116 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1117 fs_reg attr = *reg;
1118
1119 unsigned int array_elements;
1120 const glsl_type *type;
1121
1122 if (ir->type->is_array()) {
1123 array_elements = ir->type->length;
1124 if (array_elements == 0) {
1125 fail("dereferenced array '%s' has length 0\n", ir->name);
1126 }
1127 type = ir->type->fields.array;
1128 } else {
1129 array_elements = 1;
1130 type = ir->type;
1131 }
1132
1133 glsl_interp_qualifier interpolation_mode =
1134 ir->determine_interpolation_mode(c->key.flat_shade);
1135
1136 int location = ir->data.location;
1137 for (unsigned int i = 0; i < array_elements; i++) {
1138 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1139 if (c->prog_data.urb_setup[location] == -1) {
1140 /* If there's no incoming setup data for this slot, don't
1141 * emit interpolation for it.
1142 */
1143 attr.reg_offset += type->vector_elements;
1144 location++;
1145 continue;
1146 }
1147
1148 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1149 /* Constant interpolation (flat shading) case. The SF has
1150 * handed us defined values in only the constant offset
1151 * field of the setup reg.
1152 */
1153 for (unsigned int k = 0; k < type->vector_elements; k++) {
1154 struct brw_reg interp = interp_reg(location, k);
1155 interp = suboffset(interp, 3);
1156 interp.type = reg->type;
1157 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1158 attr.reg_offset++;
1159 }
1160 } else {
1161 /* Smooth/noperspective interpolation case. */
1162 for (unsigned int k = 0; k < type->vector_elements; k++) {
1163 struct brw_reg interp = interp_reg(location, k);
1164 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1165 ir->data.centroid && !c->key.persample_shading,
1166 ir->data.sample || c->key.persample_shading);
1167 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1168 /* Get the pixel/sample mask into f0 so that we know
1169 * which pixels are lit. Then, for each channel that is
1170 * unlit, replace the centroid data with non-centroid
1171 * data.
1172 */
1173 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1174 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1175 interpolation_mode,
1176 false, false);
1177 inst->predicate = BRW_PREDICATE_NORMAL;
1178 inst->predicate_inverse = true;
1179 }
1180 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1181 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1182 }
1183 attr.reg_offset++;
1184 }
1185
1186 }
1187 location++;
1188 }
1189 }
1190
1191 return reg;
1192 }
1193
1194 fs_reg *
1195 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1196 {
1197 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1198
1199 /* The frontfacing comes in as a bit in the thread payload. */
1200 if (brw->gen >= 6) {
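/* On Gen6+, bit 15 of R0.0 is set for back-facing polygons.  Shift it
 * down to bit 0, invert it, and mask with 1 so *reg holds 1 for front
 * faces and 0 for back faces.
 */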
1201 emit(BRW_OPCODE_ASR, *reg,
1202 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1203 fs_reg(15));
1204 emit(BRW_OPCODE_NOT, *reg, *reg);
1205 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1206 } else {
1207 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1208 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1209 * us front face
1210 */
1211 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1212 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1213 }
1214
1215 return reg;
1216 }
1217
1218 void
1219 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1220 {
1221 assert(dst.type == BRW_REGISTER_TYPE_F);
1222
1223 if (c->key.compute_pos_offset) {
1224 /* Convert int_sample_pos to floating point */
1225 emit(MOV(dst, int_sample_pos));
1226 /* Scale to the range [0, 1] */
1227 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1228 }
1229 else {
1230 /* From ARB_sample_shading specification:
1231 * "When rendering to a non-multisample buffer, or if multisample
1232 * rasterization is disabled, gl_SamplePosition will always be
1233 * (0.5, 0.5)."
1234 */
1235 emit(MOV(dst, fs_reg(0.5f)));
1236 }
1237 }
1238
1239 fs_reg *
1240 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1241 {
1242 assert(brw->gen >= 6);
1243 assert(ir->type == glsl_type::vec2_type);
1244
1245 this->current_annotation = "compute sample position";
1246 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1247 fs_reg pos = *reg;
1248 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1249 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1250
1251 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1252 * mode will be enabled.
1253 *
1254 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1255 * R31.1:0 Position Offset X/Y for Slot[3:0]
1256 * R31.3:2 Position Offset X/Y for Slot[7:4]
1257 * .....
1258 *
1259 * The X, Y sample positions come in as bytes in thread payload. So, read
1260 * the positions using vstride=16, width=8, hstride=2.
1261 */
1262 struct brw_reg sample_pos_reg =
1263 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1264 BRW_REGISTER_TYPE_B), 16, 8, 2);
1265
1266 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
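/* In SIMD16 the MOV above only covers channels 0-7; emit a second-half MOV
 * (force_sechalf) that reads the X positions for channels 8-15.
 */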
1267 if (dispatch_width == 16) {
1268 fs_inst *inst = emit(MOV(half(int_sample_x, 1),
1269 fs_reg(suboffset(sample_pos_reg, 16))));
1270 inst->force_sechalf = true;
1271 }
1272 /* Compute gl_SamplePosition.x */
1273 compute_sample_position(pos, int_sample_x);
1274 pos.reg_offset++;
1275 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1276 if (dispatch_width == 16) {
1277 fs_inst *inst = emit(MOV(half(int_sample_y, 1),
1278 fs_reg(suboffset(sample_pos_reg, 17))));
1279 inst->force_sechalf = true;
1280 }
1281 /* Compute gl_SamplePosition.y */
1282 compute_sample_position(pos, int_sample_y);
1283 return reg;
1284 }
1285
1286 fs_reg *
1287 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1288 {
1289 assert(brw->gen >= 6);
1290
1291 this->current_annotation = "compute sample id";
1292 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1293
1294 if (c->key.compute_sample_id) {
1295 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1296 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1297 t2.type = BRW_REGISTER_TYPE_UW;
1298
1299 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1300 * 8x multisampling, subspan 0 will represent sample N (where N
1301 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1302 * 7. We can find the value of N by looking at R0.0 bits 7:6
1303 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1304 * (since samples are always delivered in pairs). That is, we
1305 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1306 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1307 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1308 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1309 * populating a temporary variable with the sequence (0, 1, 2, 3),
1310 * and then reading from it using vstride=1, width=4, hstride=0.
1311 * These computations hold good for 4x multisampling as well.
1312 */
1313 emit(BRW_OPCODE_AND, t1,
1314 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1315 fs_reg(brw_imm_d(0xc0)));
1316 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1317 /* This works for both SIMD8 and SIMD16 */
1318 emit(MOV(t2, brw_imm_v(0x3210)));
1319 /* This special instruction takes care of setting vstride=1,
1320 * width=4, hstride=0 of t2 during an ADD instruction.
1321 */
1322 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1323 } else {
1324 /* As per GL_ARB_sample_shading specification:
1325 * "When rendering to a non-multisample buffer, or if multisample
1326 * rasterization is disabled, gl_SampleID will always be zero."
1327 */
1328 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1329 }
1330
1331 return reg;
1332 }
1333
1334 fs_reg *
1335 fs_visitor::emit_samplemaskin_setup(ir_variable *ir)
1336 {
1337 assert(brw->gen >= 7);
1338 this->current_annotation = "compute gl_SampleMaskIn";
1339 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1340 emit(MOV(*reg, fs_reg(retype(brw_vec8_grf(c->sample_mask_reg, 0), BRW_REGISTER_TYPE_D))));
1341 return reg;
1342 }
1343
1344 fs_reg
1345 fs_visitor::fix_math_operand(fs_reg src)
1346 {
1347 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1348 * might be able to do better by doing execsize = 1 math and then
1349 * expanding that result out, but we would need to be careful with
1350 * masking.
1351 *
1352 * The hardware ignores source modifiers (negate and abs) on math
1353 * instructions, so we also move to a temp to set those up.
1354 */
1355 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1356 !src.abs && !src.negate)
1357 return src;
1358
1359 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1360 * operands with math instructions.
1361 */
1362 if (brw->gen >= 7 && src.file != IMM)
1363 return src;
1364
1365 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1366 expanded.type = src.type;
1367 emit(BRW_OPCODE_MOV, expanded, src);
1368 return expanded;
1369 }
1370
1371 fs_inst *
1372 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1373 {
1374 switch (opcode) {
1375 case SHADER_OPCODE_RCP:
1376 case SHADER_OPCODE_RSQ:
1377 case SHADER_OPCODE_SQRT:
1378 case SHADER_OPCODE_EXP2:
1379 case SHADER_OPCODE_LOG2:
1380 case SHADER_OPCODE_SIN:
1381 case SHADER_OPCODE_COS:
1382 break;
1383 default:
1384 assert(!"not reached: bad math opcode");
1385 return NULL;
1386 }
1387
1388 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1389 * might be able to do better by doing execsize = 1 math and then
1390 * expanding that result out, but we would need to be careful with
1391 * masking.
1392 *
1393 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1394 * instructions, so we also move to a temp to set those up.
1395 */
1396 if (brw->gen >= 6)
1397 src = fix_math_operand(src);
1398
1399 fs_inst *inst = emit(opcode, dst, src);
1400
1401 if (brw->gen < 6) {
1402 inst->base_mrf = 2;
1403 inst->mlen = dispatch_width / 8;
1404 }
1405
1406 return inst;
1407 }
1408
1409 fs_inst *
1410 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1411 {
1412 int base_mrf = 2;
1413 fs_inst *inst;
1414
1415 switch (opcode) {
1416 case SHADER_OPCODE_INT_QUOTIENT:
1417 case SHADER_OPCODE_INT_REMAINDER:
1418 if (brw->gen >= 7)
1419 no16("SIMD16 INTDIV unsupported\n");
1420 break;
1421 case SHADER_OPCODE_POW:
1422 break;
1423 default:
1424 assert(!"not reached: unsupported binary math opcode.");
1425 return NULL;
1426 }
1427
1428 if (brw->gen >= 6) {
1429 src0 = fix_math_operand(src0);
1430 src1 = fix_math_operand(src1);
1431
1432 inst = emit(opcode, dst, src0, src1);
1433 } else {
1434 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1435 * "Message Payload":
1436 *
1437 * "Operand0[7]. For the INT DIV functions, this operand is the
1438 * denominator."
1439 * ...
1440 * "Operand1[7]. For the INT DIV functions, this operand is the
1441 * numerator."
1442 */
1443 bool is_int_div = opcode != SHADER_OPCODE_POW;
1444 fs_reg &op0 = is_int_div ? src1 : src0;
1445 fs_reg &op1 = is_int_div ? src0 : src1;
1446
1447 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1448 inst = emit(opcode, dst, op0, reg_null_f);
1449
1450 inst->base_mrf = base_mrf;
1451 inst->mlen = 2 * dispatch_width / 8;
1452 }
1453 return inst;
1454 }
1455
1456 void
1457 fs_visitor::assign_curb_setup()
1458 {
1459 if (dispatch_width == 8) {
1460 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1461 } else {
1462 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1463 }
1464
1465 c->prog_data.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1466
1467 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1468 foreach_list(node, &this->instructions) {
1469 fs_inst *inst = (fs_inst *)node;
1470
1471 for (unsigned int i = 0; i < 3; i++) {
1472 if (inst->src[i].file == UNIFORM) {
1473 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1474 int constant_nr;
1475 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1476 constant_nr = push_constant_loc[uniform_nr];
1477 } else {
1478 /* Section 5.11 of the OpenGL 4.1 spec says:
1479 * "Out-of-bounds reads return undefined values, which include
1480 * values from other variables of the active program or zero."
1481 * Just return the first push constant.
1482 */
1483 constant_nr = 0;
1484 }
1485
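/* Push constants are packed eight floats to a GRF, so constant_nr / 8
 * selects the register after the payload and constant_nr % 8 selects the
 * channel within it.
 */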
1486 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1487 constant_nr / 8,
1488 constant_nr % 8);
1489
1490 inst->src[i].file = HW_REG;
1491 inst->src[i].fixed_hw_reg = byte_offset(
1492 retype(brw_reg, inst->src[i].type),
1493 inst->src[i].subreg_offset);
1494 }
1495 }
1496 }
1497 }
1498
1499 void
1500 fs_visitor::calculate_urb_setup()
1501 {
1502 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1503 c->prog_data.urb_setup[i] = -1;
1504 }
1505
1506 int urb_next = 0;
1507 /* Figure out where each of the incoming setup attributes lands. */
1508 if (brw->gen >= 6) {
1509 if (_mesa_bitcount_64(fp->Base.InputsRead &
1510 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1511 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1512 * first 16 varying inputs, so we can put them wherever we want.
1513 * Just put them in order.
1514 *
1515 * This is useful because it means that (a) inputs not used by the
1516 * fragment shader won't take up valuable register space, and (b) we
1517 * won't have to recompile the fragment shader if it gets paired with
1518 * a different vertex (or geometry) shader.
1519 */
1520 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1521 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1522 BITFIELD64_BIT(i)) {
1523 c->prog_data.urb_setup[i] = urb_next++;
1524 }
1525 }
1526 } else {
1527 /* We have enough input varyings that the SF/SBE pipeline stage can't
1528 * arbitrarily rearrange them to suit our whim; we have to put them
1529 * in an order that matches the output of the previous pipeline stage
1530 * (geometry or vertex shader).
1531 */
1532 struct brw_vue_map prev_stage_vue_map;
1533 brw_compute_vue_map(brw, &prev_stage_vue_map,
1534 c->key.input_slots_valid);
1535 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1536 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1537 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1538 slot++) {
1539 int varying = prev_stage_vue_map.slot_to_varying[slot];
1540 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1541 * unused.
1542 */
1543 if (varying != BRW_VARYING_SLOT_COUNT &&
1544 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1545 BITFIELD64_BIT(varying))) {
1546 c->prog_data.urb_setup[varying] = slot - first_slot;
1547 }
1548 }
1549 urb_next = prev_stage_vue_map.num_slots - first_slot;
1550 }
1551 } else {
1552 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1553 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1554 /* Point size is packed into the header, not as a general attribute */
1555 if (i == VARYING_SLOT_PSIZ)
1556 continue;
1557
1558 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1559 /* The back color slot is skipped when the front color is
1560 * also written to. In addition, some slots can be
1561 * written in the vertex shader and not read in the
1562 * fragment shader. So the register number must always be
1563 * incremented, mapped or not.
1564 */
1565 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1566 c->prog_data.urb_setup[i] = urb_next;
1567 urb_next++;
1568 }
1569 }
1570
1571 /*
1572 * It's an FS-only attribute, and we did interpolation for this attribute
1573 * in the SF thread. So, count it here, too.
1574 *
1575 * See compile_sf_prog() for more info.
1576 */
1577 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1578 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1579 }
1580
1581 c->prog_data.num_varying_inputs = urb_next;
1582 }
1583
1584 void
1585 fs_visitor::assign_urb_setup()
1586 {
1587 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1588
1589 /* Offset all the urb_setup[] index by the actual position of the
1590 * setup regs, now that the location of the constants has been chosen.
1591 */
1592 foreach_list(node, &this->instructions) {
1593 fs_inst *inst = (fs_inst *)node;
1594
1595 if (inst->opcode == FS_OPCODE_LINTERP) {
1596 assert(inst->src[2].file == HW_REG);
1597 inst->src[2].fixed_hw_reg.nr += urb_start;
1598 }
1599
1600 if (inst->opcode == FS_OPCODE_CINTERP) {
1601 assert(inst->src[0].file == HW_REG);
1602 inst->src[0].fixed_hw_reg.nr += urb_start;
1603 }
1604 }
1605
1606 /* Each attribute is 4 setup channels, each of which is half a reg. */
1607 this->first_non_payload_grf =
1608 urb_start + c->prog_data.num_varying_inputs * 2;
1609 }
1610
1611 /**
1612 * Split large virtual GRFs into separate components if we can.
1613 *
1614 * This is mostly duplicated with what brw_fs_vector_splitting does,
1615 * but that's really conservative because it's afraid of doing
1616 * splitting that doesn't result in real progress after the rest of
1617 * the optimization phases, which would cause infinite looping in
1618 * optimization. We can do it once here, safely. This also has the
1619 * opportunity to split interpolated values, or maybe even uniforms,
1620 * which we don't have at the IR level.
1621 *
1622 * We want to split, because virtual GRFs are what we register
1623 * allocate and spill (due to contiguousness requirements for some
1624 * instructions), and they're what we naturally generate in the
1625 * codegen process, but most virtual GRFs don't actually need to be
1626 * contiguous sets of GRFs. If we split, we'll end up with reduced
1627 * live intervals and better dead code elimination and coalescing.
1628 */
1629 void
1630 fs_visitor::split_virtual_grfs()
1631 {
1632 int num_vars = this->virtual_grf_count;
1633 bool split_grf[num_vars];
1634 int new_virtual_grf[num_vars];
1635
1636 /* Try to split anything > 0 sized. */
1637 for (int i = 0; i < num_vars; i++) {
1638 if (this->virtual_grf_sizes[i] != 1)
1639 split_grf[i] = true;
1640 else
1641 split_grf[i] = false;
1642 }
1643
1644 if (brw->has_pln &&
1645 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1646 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1647 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1648 * Gen6, that was the only supported interpolation mode, and since Gen6,
1649 * delta_x and delta_y are in fixed hardware registers.
1650 */
1651 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1652 false;
1653 }
1654
1655 foreach_list(node, &this->instructions) {
1656 fs_inst *inst = (fs_inst *)node;
1657
1658 /* If there's a SEND message that requires contiguous destination
1659 * registers, no splitting is allowed.
1660 */
1661 if (inst->regs_written > 1) {
1662 split_grf[inst->dst.reg] = false;
1663 }
1664
1665 /* If we're sending from a GRF, don't split it, on the assumption that
1666 * the send is reading the whole thing.
1667 */
1668 if (inst->is_send_from_grf()) {
1669 for (int i = 0; i < 3; i++) {
1670 if (inst->src[i].file == GRF) {
1671 split_grf[inst->src[i].reg] = false;
1672 }
1673 }
1674 }
1675 }
1676
1677 /* Allocate new space for split regs. Note that the virtual
1678 * numbers will be contiguous.
1679 */
1680 for (int i = 0; i < num_vars; i++) {
1681 if (split_grf[i]) {
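/* Component 0 stays in register i (which is resized to 1 below); components
 * 1..size-1 get fresh, consecutively numbered registers starting at
 * new_virtual_grf[i].
 */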
1682 new_virtual_grf[i] = virtual_grf_alloc(1);
1683 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1684 int reg = virtual_grf_alloc(1);
1685 assert(reg == new_virtual_grf[i] + j - 1);
1686 (void) reg;
1687 }
1688 this->virtual_grf_sizes[i] = 1;
1689 }
1690 }
1691
1692 foreach_list(node, &this->instructions) {
1693 fs_inst *inst = (fs_inst *)node;
1694
1695 if (inst->dst.file == GRF &&
1696 split_grf[inst->dst.reg] &&
1697 inst->dst.reg_offset != 0) {
1698 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1699 inst->dst.reg_offset - 1);
1700 inst->dst.reg_offset = 0;
1701 }
1702 for (int i = 0; i < 3; i++) {
1703 if (inst->src[i].file == GRF &&
1704 split_grf[inst->src[i].reg] &&
1705 inst->src[i].reg_offset != 0) {
1706 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1707 inst->src[i].reg_offset - 1);
1708 inst->src[i].reg_offset = 0;
1709 }
1710 }
1711 }
1712 invalidate_live_intervals();
1713 }
1714
1715 /**
1716 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1717 *
1718 * During code generation, we create tons of temporary variables, many of
1719 * which get immediately killed and are never used again. Yet, in later
1720 * optimization and analysis passes, such as compute_live_intervals, we need
1721 * to loop over all the virtual GRFs. Compacting them can save a lot of
1722 * overhead.
1723 */
1724 void
1725 fs_visitor::compact_virtual_grfs()
1726 {
1727 /* Mark which virtual GRFs are used, and count how many. */
1728 int remap_table[this->virtual_grf_count];
1729 memset(remap_table, -1, sizeof(remap_table));
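/* remap_table[i] stays -1 for virtual GRFs that are never referenced; used
 * GRFs are first marked with 0 and then overwritten with their new index in
 * the compaction pass below.
 */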
1730
1731 foreach_list(node, &this->instructions) {
1732 const fs_inst *inst = (const fs_inst *) node;
1733
1734 if (inst->dst.file == GRF)
1735 remap_table[inst->dst.reg] = 0;
1736
1737 for (int i = 0; i < 3; i++) {
1738 if (inst->src[i].file == GRF)
1739 remap_table[inst->src[i].reg] = 0;
1740 }
1741 }
1742
1743 /* Compact the GRF arrays. */
1744 int new_index = 0;
1745 for (int i = 0; i < this->virtual_grf_count; i++) {
1746 if (remap_table[i] != -1) {
1747 remap_table[i] = new_index;
1748 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1749 invalidate_live_intervals();
1750 ++new_index;
1751 }
1752 }
1753
1754 this->virtual_grf_count = new_index;
1755
1756 /* Patch all the instructions to use the newly renumbered registers */
1757 foreach_list(node, &this->instructions) {
1758 fs_inst *inst = (fs_inst *) node;
1759
1760 if (inst->dst.file == GRF)
1761 inst->dst.reg = remap_table[inst->dst.reg];
1762
1763 for (int i = 0; i < 3; i++) {
1764 if (inst->src[i].file == GRF)
1765 inst->src[i].reg = remap_table[inst->src[i].reg];
1766 }
1767 }
1768 }
1769
1770 /*
1771 * Implements array access of uniforms by inserting a
1772 * PULL_CONSTANT_LOAD instruction.
1773 *
1774 * Unlike temporary GRF array access (where we don't support it due to
1775 * the difficulty of doing relative addressing on instruction
1776 * destinations), we could potentially do array access of uniforms
1777 * that were loaded in GRF space as push constants. In real-world
1778 * usage we've seen, though, the arrays being used are always larger
1779 * than we could load as push constants, so just always move all
1780 * uniform array access out to a pull constant buffer.
1781 */
1782 void
1783 fs_visitor::move_uniform_array_access_to_pull_constants()
1784 {
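/* Pull constant locations are decided during the SIMD8 compile and then
 * imported by the SIMD16 compile (see import_uniforms()), so only do this
 * work once.
 */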
1785 if (dispatch_width != 8)
1786 return;
1787
1788 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1789
1790 for (unsigned int i = 0; i < uniforms; i++) {
1791 pull_constant_loc[i] = -1;
1792 }
1793
1794 /* Walk through and find array access of uniforms. Put a copy of that
1795 * uniform in the pull constant buffer.
1796 *
1797 * Note that we don't move constant-indexed accesses to arrays. No
1798 * testing has been done of the performance impact of this choice.
1799 */
1800 foreach_list_safe(node, &this->instructions) {
1801 fs_inst *inst = (fs_inst *)node;
1802
1803 for (int i = 0 ; i < 3; i++) {
1804 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1805 continue;
1806
1807 int uniform = inst->src[i].reg;
1808
1809 /* If this array isn't already present in the pull constant buffer,
1810 * add it.
1811 */
1812 if (pull_constant_loc[uniform] == -1) {
1813 const float **values = &stage_prog_data->param[uniform];
1814
1815 assert(param_size[uniform]);
1816
1817 for (int j = 0; j < param_size[uniform]; j++) {
1818 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1819
1820 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1821 values[j];
1822 }
1823 }
1824 }
1825 }
1826 }
1827
1828 /**
1829 * Assign UNIFORM file registers to either push constants or pull constants.
1830 *
1831 * We allow a fragment shader to have more than the specified minimum
1832 * maximum number of fragment shader uniform components (64). If
1833 * there are too many of these, they'd fill up all of the register space.
1834 * So, this will push some of them out to the pull constant buffer and
1835 * update the program to load them.
1836 */
1837 void
1838 fs_visitor::assign_constant_locations()
1839 {
1840 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1841 if (dispatch_width != 8)
1842 return;
1843
1844 /* Find which UNIFORM registers are still in use. */
1845 bool is_live[uniforms];
1846 for (unsigned int i = 0; i < uniforms; i++) {
1847 is_live[i] = false;
1848 }
1849
1850 foreach_list(node, &this->instructions) {
1851 fs_inst *inst = (fs_inst *) node;
1852
1853 for (int i = 0; i < 3; i++) {
1854 if (inst->src[i].file != UNIFORM)
1855 continue;
1856
1857 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1858 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1859 is_live[constant_nr] = true;
1860 }
1861 }
1862
1863 /* Only allow 16 registers (128 uniform components) as push constants.
1864 *
1865 * Just demote the end of the list. We could probably do better
1866 * here, demoting things that are rarely used in the program first.
1867 */
1868 unsigned int max_push_components = 16 * 8;
1869 unsigned int num_push_constants = 0;
1870
1871 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1872
1873 for (unsigned int i = 0; i < uniforms; i++) {
1874 if (!is_live[i] || pull_constant_loc[i] != -1) {
1875 /* This UNIFORM register is either dead, or has already been demoted
1876 * to a pull const. Mark it as no longer living in the param[] array.
1877 */
1878 push_constant_loc[i] = -1;
1879 continue;
1880 }
1881
1882 if (num_push_constants < max_push_components) {
1883 /* Retain as a push constant. Record the location in the params[]
1884 * array.
1885 */
1886 push_constant_loc[i] = num_push_constants++;
1887 } else {
1888 /* Demote to a pull constant. */
1889 push_constant_loc[i] = -1;
1890
1891 int pull_index = stage_prog_data->nr_pull_params++;
1892 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1893 pull_constant_loc[i] = pull_index;
1894 }
1895 }
1896
1897 stage_prog_data->nr_params = num_push_constants;
1898
1899 /* Up until now, the param[] array has been indexed by reg + reg_offset
1900 * of UNIFORM registers. Condense it to only contain the uniforms we
1901 * chose to upload as push constants.
1902 */
1903 for (unsigned int i = 0; i < uniforms; i++) {
1904 int remapped = push_constant_loc[i];
1905
1906 if (remapped == -1)
1907 continue;
1908
1909 assert(remapped <= (int)i);
1910 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1911 }
1912 }
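/* A minimal standalone sketch of the push-vs-pull partitioning above: live
 * uniform slots receive push locations until the 128-component budget
 * (16 registers * 8 floats) is used up, and the rest are demoted to pull
 * constants.  The demo_* names are illustrative only, not driver API; the
 * real pass records its decisions in push_constant_loc[], pull_constant_loc[]
 * and stage_prog_data, and also skips slots already demoted by the reladdr
 * pass above.
 */
static void
demo_assign_constant_locations(const bool *is_live, unsigned uniforms,
                               int *push_loc, int *pull_loc)
{
   const unsigned max_push_components = 16 * 8;
   unsigned num_push = 0, num_pull = 0;

   for (unsigned i = 0; i < uniforms; i++) {
      push_loc[i] = -1;
      pull_loc[i] = -1;

      if (!is_live[i])
         continue;                     /* dead uniform: upload nothing */

      if (num_push < max_push_components)
         push_loc[i] = num_push++;     /* retain as a push constant */
      else
         pull_loc[i] = num_pull++;     /* demote to a pull constant */
   }
}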
1913
1914 /**
1915 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1916 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1917 */
1918 void
1919 fs_visitor::demote_pull_constants()
1920 {
1921 foreach_list(node, &this->instructions) {
1922 fs_inst *inst = (fs_inst *)node;
1923
1924 for (int i = 0; i < 3; i++) {
1925 if (inst->src[i].file != UNIFORM)
1926 continue;
1927
1928 int pull_index = pull_constant_loc[inst->src[i].reg +
1929 inst->src[i].reg_offset];
1930 if (pull_index == -1)
1931 continue;
1932
1933 /* Set up the annotation tracking for new generated instructions. */
1934 base_ir = inst->ir;
1935 current_annotation = inst->annotation;
1936
1937 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1938 fs_reg dst = fs_reg(this, glsl_type::float_type);
1939
1940 /* Generate a pull load into dst. */
1941 if (inst->src[i].reladdr) {
1942 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1943 surf_index,
1944 *inst->src[i].reladdr,
1945 pull_index);
1946 inst->insert_before(&list);
1947 inst->src[i].reladdr = NULL;
1948 } else {
1949 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1950 fs_inst *pull =
1951 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1952 dst, surf_index, offset);
1953 inst->insert_before(pull);
1954 inst->src[i].set_smear(pull_index & 3);
1955 }
1956
1957 /* Rewrite the instruction to use the temporary VGRF. */
1958 inst->src[i].file = GRF;
1959 inst->src[i].reg = dst.reg;
1960 inst->src[i].reg_offset = 0;
1961 }
1962 }
1963 invalidate_live_intervals();
1964 }
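/* A small standalone illustration of the offset arithmetic used above for
 * the non-indirect case: each FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD fetches a
 * vec4-aligned 16-byte block, so the byte offset is the pull index rounded
 * down to a multiple of four floats, and the wanted component is then picked
 * out with set_smear(pull_index & 3).  demo_pull_addr() is illustrative, not
 * driver API.
 */
struct demo_pull_address {
   unsigned byte_offset;   /* offset handed to the pull-constant load */
   unsigned channel;       /* component selected via set_smear() */
};

static struct demo_pull_address
demo_pull_addr(unsigned pull_index)
{
   struct demo_pull_address a;
   a.byte_offset = (pull_index * 4) & ~15u;   /* vec4-aligned block start */
   a.channel = pull_index & 3;                /* float within that vec4 */
   return a;
}

/* For example, pull_index 6 gives byte_offset 16 and channel 2, while
 * pull_index 3 gives byte_offset 0 and channel 3.
 */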
1965
1966 bool
1967 fs_visitor::opt_algebraic()
1968 {
1969 bool progress = false;
1970
1971 foreach_list(node, &this->instructions) {
1972 fs_inst *inst = (fs_inst *)node;
1973
1974 switch (inst->opcode) {
1975 case BRW_OPCODE_MUL:
1976 if (inst->src[1].file != IMM)
1977 continue;
1978
1979 /* a * 1.0 = a */
1980 if (inst->src[1].is_one()) {
1981 inst->opcode = BRW_OPCODE_MOV;
1982 inst->src[1] = reg_undef;
1983 progress = true;
1984 break;
1985 }
1986
1987 /* a * 0.0 = 0.0 */
1988 if (inst->src[1].is_zero()) {
1989 inst->opcode = BRW_OPCODE_MOV;
1990 inst->src[0] = inst->src[1];
1991 inst->src[1] = reg_undef;
1992 progress = true;
1993 break;
1994 }
1995
1996 break;
1997 case BRW_OPCODE_ADD:
1998 if (inst->src[1].file != IMM)
1999 continue;
2000
2001 /* a + 0.0 = a */
2002 if (inst->src[1].is_zero()) {
2003 inst->opcode = BRW_OPCODE_MOV;
2004 inst->src[1] = reg_undef;
2005 progress = true;
2006 break;
2007 }
2008 break;
2009 case BRW_OPCODE_OR:
2010 if (inst->src[0].equals(inst->src[1])) {
2011 inst->opcode = BRW_OPCODE_MOV;
2012 inst->src[1] = reg_undef;
2013 progress = true;
2014 break;
2015 }
2016 break;
2017 case BRW_OPCODE_LRP:
2018 if (inst->src[1].equals(inst->src[2])) {
2019 inst->opcode = BRW_OPCODE_MOV;
2020 inst->src[0] = inst->src[1];
2021 inst->src[1] = reg_undef;
2022 inst->src[2] = reg_undef;
2023 progress = true;
2024 break;
2025 }
2026 break;
2027 case BRW_OPCODE_SEL:
2028 if (inst->saturate && inst->src[1].file == IMM) {
2029 switch (inst->conditional_mod) {
2030 case BRW_CONDITIONAL_LE:
2031 case BRW_CONDITIONAL_L:
2032 switch (inst->src[1].type) {
2033 case BRW_REGISTER_TYPE_F:
2034 if (inst->src[1].imm.f >= 1.0f) {
2035 inst->opcode = BRW_OPCODE_MOV;
2036 inst->src[1] = reg_undef;
2037 progress = true;
2038 }
2039 break;
2040 default:
2041 break;
2042 }
2043 break;
2044 case BRW_CONDITIONAL_GE:
2045 case BRW_CONDITIONAL_G:
2046 switch (inst->src[1].type) {
2047 case BRW_REGISTER_TYPE_F:
2048 if (inst->src[1].imm.f <= 0.0f) {
2049 inst->opcode = BRW_OPCODE_MOV;
2050 inst->src[1] = reg_undef;
2051 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2052 progress = true;
2053 }
2054 break;
2055 default:
2056 break;
2057 }
2058 default:
2059 break;
2060 }
2061 }
2062 break;
2063 default:
2064 break;
2065 }
2066 }
2067
2068 return progress;
2069 }
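/* A toy standalone model of the rewrites above, operating on a simplified
 * instruction record rather than the driver's fs_inst: MUL by an immediate
 * 1.0 and ADD of an immediate 0.0 both collapse to a MOV of the other
 * operand, and MUL by an immediate 0.0 collapses to a MOV of 0.0.  All of
 * the demo_* names are illustrative only.
 */
enum demo_op { DEMO_MUL, DEMO_ADD, DEMO_MOV };

struct demo_inst {
   enum demo_op op;
   float src0;          /* stands in for a register operand */
   float src1_imm;      /* the immediate second source, when present */
   bool src1_is_imm;
};

static bool
demo_opt_algebraic(struct demo_inst *inst)
{
   if (!inst->src1_is_imm)
      return false;

   if (inst->op == DEMO_MUL && inst->src1_imm == 1.0f) {
      inst->op = DEMO_MOV;              /* a * 1.0 = a */
      return true;
   }
   if (inst->op == DEMO_MUL && inst->src1_imm == 0.0f) {
      inst->op = DEMO_MOV;              /* a * 0.0 = 0.0 */
      inst->src0 = 0.0f;
      return true;
   }
   if (inst->op == DEMO_ADD && inst->src1_imm == 0.0f) {
      inst->op = DEMO_MOV;              /* a + 0.0 = a */
      return true;
   }
   return false;
}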
2070
2071 bool
2072 fs_visitor::compute_to_mrf()
2073 {
2074 bool progress = false;
2075 int next_ip = 0;
2076
2077 calculate_live_intervals();
2078
2079 foreach_list_safe(node, &this->instructions) {
2080 fs_inst *inst = (fs_inst *)node;
2081
2082 int ip = next_ip;
2083 next_ip++;
2084
2085 if (inst->opcode != BRW_OPCODE_MOV ||
2086 inst->is_partial_write() ||
2087 inst->dst.file != MRF || inst->src[0].file != GRF ||
2088 inst->dst.type != inst->src[0].type ||
2089 inst->src[0].abs || inst->src[0].negate ||
2090 !inst->src[0].is_contiguous() ||
2091 inst->src[0].subreg_offset)
2092 continue;
2093
2094 /* Work out which hardware MRF registers are written by this
2095 * instruction.
2096 */
2097 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2098 int mrf_high;
2099 if (inst->dst.reg & BRW_MRF_COMPR4) {
2100 mrf_high = mrf_low + 4;
2101 } else if (dispatch_width == 16 &&
2102 (!inst->force_uncompressed && !inst->force_sechalf)) {
2103 mrf_high = mrf_low + 1;
2104 } else {
2105 mrf_high = mrf_low;
2106 }
2107
2108 /* Can't compute-to-MRF this GRF if someone else was going to
2109 * read it later.
2110 */
2111 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2112 continue;
2113
2114 /* Found a move of a GRF to a MRF. Let's see if we can go
2115 * rewrite the thing that made this GRF to write into the MRF.
2116 */
2117 fs_inst *scan_inst;
2118 for (scan_inst = (fs_inst *)inst->prev;
2119 scan_inst->prev != NULL;
2120 scan_inst = (fs_inst *)scan_inst->prev) {
2121 if (scan_inst->dst.file == GRF &&
2122 scan_inst->dst.reg == inst->src[0].reg) {
2123 /* Found the last instruction to write the register we want to
2124 * turn into a compute-to-MRF.
2125 */
2126
2127 /* If this one instruction didn't populate all the
2128 * channels, bail. We might be able to rewrite everything
2129 * that writes that reg, but it would require smarter
2130 * tracking to delay the rewriting until complete success.
2131 */
2132 if (scan_inst->is_partial_write())
2133 break;
2134
2135 /* Things returning more than one register would need us to
2136 * understand coalescing out more than one MOV at a time.
2137 */
2138 if (scan_inst->regs_written > 1)
2139 break;
2140
2141 /* SEND instructions can't have MRF as a destination. */
2142 if (scan_inst->mlen)
2143 break;
2144
2145 if (brw->gen == 6) {
2146 /* gen6 math instructions must have the destination be
2147 * GRF, so no compute-to-MRF for them.
2148 */
2149 if (scan_inst->is_math()) {
2150 break;
2151 }
2152 }
2153
2154 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2155 /* Found the creator of our MRF's source value. */
2156 scan_inst->dst.file = MRF;
2157 scan_inst->dst.reg = inst->dst.reg;
2158 scan_inst->saturate |= inst->saturate;
2159 inst->remove();
2160 progress = true;
2161 }
2162 break;
2163 }
2164
2165 /* We don't handle control flow here. Most computation of
2166 * values that end up in MRFs happens shortly before the MRF
2167 * write anyway.
2168 */
2169 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2170 break;
2171
2172 /* An MRF can't be read, so if anything else reads the source GRF
2173 * we wanted to rewrite into the MRF, that stops us.
2174 */
2175 bool interfered = false;
2176 for (int i = 0; i < 3; i++) {
2177 if (scan_inst->src[i].file == GRF &&
2178 scan_inst->src[i].reg == inst->src[0].reg &&
2179 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2180 interfered = true;
2181 }
2182 }
2183 if (interfered)
2184 break;
2185
2186 if (scan_inst->dst.file == MRF) {
2187 /* If somebody else writes our MRF here, we can't
2188 * compute-to-MRF before that.
2189 */
2190 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2191 int scan_mrf_high;
2192
2193 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2194 scan_mrf_high = scan_mrf_low + 4;
2195 } else if (dispatch_width == 16 &&
2196 (!scan_inst->force_uncompressed &&
2197 !scan_inst->force_sechalf)) {
2198 scan_mrf_high = scan_mrf_low + 1;
2199 } else {
2200 scan_mrf_high = scan_mrf_low;
2201 }
2202
2203 if (mrf_low == scan_mrf_low ||
2204 mrf_low == scan_mrf_high ||
2205 mrf_high == scan_mrf_low ||
2206 mrf_high == scan_mrf_high) {
2207 break;
2208 }
2209 }
2210
2211 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2212 /* Found a SEND instruction, which means that there are
2213 * live values in MRFs from base_mrf to base_mrf +
2214 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2215 * above it.
2216 */
2217 if (mrf_low >= scan_inst->base_mrf &&
2218 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2219 break;
2220 }
2221 if (mrf_high >= scan_inst->base_mrf &&
2222 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2223 break;
2224 }
2225 }
2226 }
2227 }
2228
2229 if (progress)
2230 invalidate_live_intervals();
2231
2232 return progress;
2233 }
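/* A standalone sketch of the MRF range computation used twice above.  A
 * candidate MOV and any intervening MRF write each cover a single register,
 * a SIMD16 pair, or a COMPR4 pair four registers apart, and the rewrite is
 * abandoned when the two ranges touch.  The flag bit value below stands in
 * for BRW_MRF_COMPR4 and is assumed for illustration; the demo_* names are
 * not driver API.
 */
#define DEMO_MRF_COMPR4 (1 << 7)   /* assumed stand-in for BRW_MRF_COMPR4 */

struct demo_mrf_range {
   int low, high;
};

static struct demo_mrf_range
demo_mrf_range_written(int dst_reg, int dispatch_width, bool both_halves)
{
   struct demo_mrf_range r;

   r.low = dst_reg & ~DEMO_MRF_COMPR4;
   if (dst_reg & DEMO_MRF_COMPR4)
      r.high = r.low + 4;                   /* COMPR4: m(n) and m(n+4) */
   else if (dispatch_width == 16 && both_halves)
      r.high = r.low + 1;                   /* SIMD16: m(n) and m(n+1) */
   else
      r.high = r.low;                       /* a single MRF register */
   return r;
}

static bool
demo_mrf_ranges_conflict(struct demo_mrf_range a, struct demo_mrf_range b)
{
   return a.low == b.low || a.low == b.high ||
          a.high == b.low || a.high == b.high;
}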
2234
2235 /**
2236 * Walks through basic blocks, looking for repeated MRF writes and
2237 * removing the later ones.
2238 */
2239 bool
2240 fs_visitor::remove_duplicate_mrf_writes()
2241 {
2242 fs_inst *last_mrf_move[16];
2243 bool progress = false;
2244
2245 /* We'd need to update the MRF tracking to handle compressed instructions. */
2246 if (dispatch_width == 16)
2247 return false;
2248
2249 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2250
2251 foreach_list_safe(node, &this->instructions) {
2252 fs_inst *inst = (fs_inst *)node;
2253
2254 if (inst->is_control_flow()) {
2255 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2256 }
2257
2258 if (inst->opcode == BRW_OPCODE_MOV &&
2259 inst->dst.file == MRF) {
2260 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2261 if (prev_inst && inst->equals(prev_inst)) {
2262 inst->remove();
2263 progress = true;
2264 continue;
2265 }
2266 }
2267
2268 /* Clear out the last-write records for MRFs that were overwritten. */
2269 if (inst->dst.file == MRF) {
2270 last_mrf_move[inst->dst.reg] = NULL;
2271 }
2272
2273 if (inst->mlen > 0 && inst->base_mrf != -1) {
2274 /* Found a SEND instruction, which will include two or fewer
2275 * implied MRF writes. We could do better here.
2276 */
2277 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2278 last_mrf_move[inst->base_mrf + i] = NULL;
2279 }
2280 }
2281
2282 /* Clear out any MRF move records whose sources got overwritten. */
2283 if (inst->dst.file == GRF) {
2284 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2285 if (last_mrf_move[i] &&
2286 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2287 last_mrf_move[i] = NULL;
2288 }
2289 }
2290 }
2291
2292 if (inst->opcode == BRW_OPCODE_MOV &&
2293 inst->dst.file == MRF &&
2294 inst->src[0].file == GRF &&
2295 !inst->is_partial_write()) {
2296 last_mrf_move[inst->dst.reg] = inst;
2297 }
2298 }
2299
2300 if (progress)
2301 invalidate_live_intervals();
2302
2303 return progress;
2304 }
2305
2306 static void
2307 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2308 int first_grf, int grf_len)
2309 {
2310 bool inst_simd16 = (dispatch_width > 8 &&
2311 !inst->force_uncompressed &&
2312 !inst->force_sechalf);
2313
2314 /* Clear the flag for registers that actually got read (as expected). */
2315 for (int i = 0; i < 3; i++) {
2316 int grf;
2317 if (inst->src[i].file == GRF) {
2318 grf = inst->src[i].reg;
2319 } else if (inst->src[i].file == HW_REG &&
2320 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2321 grf = inst->src[i].fixed_hw_reg.nr;
2322 } else {
2323 continue;
2324 }
2325
2326 if (grf >= first_grf &&
2327 grf < first_grf + grf_len) {
2328 deps[grf - first_grf] = false;
2329 if (inst_simd16)
2330 deps[grf - first_grf + 1] = false;
2331 }
2332 }
2333 }
2334
2335 /**
2336 * Implements this workaround for the original 965:
2337 *
2338 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2339 * check for post destination dependencies on this instruction, software
2340 * must ensure that there is no destination hazard for the case of ‘write
2341 * followed by a posted write’ shown in the following example.
2342 *
2343 * 1. mov r3 0
2344 * 2. send r3.xy <rest of send instruction>
2345 * 3. mov r2 r3
2346 *
2347 * Due to no post-destination dependency check on the ‘send’, the above
2348 * code sequence could have two instructions (1 and 2) in flight at the
2349 * same time that both consider ‘r3’ as the target of their final writes.
2350 */
2351 void
2352 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2353 {
2354 int reg_size = dispatch_width / 8;
2355 int write_len = inst->regs_written * reg_size;
2356 int first_write_grf = inst->dst.reg;
2357 bool needs_dep[BRW_MAX_MRF];
2358 assert(write_len < (int)sizeof(needs_dep) - 1);
2359
2360 memset(needs_dep, false, sizeof(needs_dep));
2361 memset(needs_dep, true, write_len);
2362
2363 clear_deps_for_inst_src(inst, dispatch_width,
2364 needs_dep, first_write_grf, write_len);
2365
2366 /* Walk backwards looking for writes to registers we're writing which
2367 * aren't read since being written. If we hit the start of the program,
2368 * we assume that there are no outstanding dependencies on entry to the
2369 * program.
2370 */
2371 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2372 scan_inst != NULL;
2373 scan_inst = (fs_inst *)scan_inst->prev) {
2374
2375 /* If we hit control flow, assume that there *are* outstanding
2376 * dependencies, and force their cleanup before our instruction.
2377 */
2378 if (scan_inst->is_control_flow()) {
2379 for (int i = 0; i < write_len; i++) {
2380 if (needs_dep[i]) {
2381 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2382 }
2383 }
2384 return;
2385 }
2386
2387 bool scan_inst_simd16 = (dispatch_width > 8 &&
2388 !scan_inst->force_uncompressed &&
2389 !scan_inst->force_sechalf);
2390
2391 /* We insert our reads as late as possible, on the assumption that any
2392 * non-MOV instruction that might have left us an outstanding
2393 * dependency has more latency than a MOV.
2394 */
2395 if (scan_inst->dst.file == GRF) {
2396 for (int i = 0; i < scan_inst->regs_written; i++) {
2397 int reg = scan_inst->dst.reg + i * reg_size;
2398
2399 if (reg >= first_write_grf &&
2400 reg < first_write_grf + write_len &&
2401 needs_dep[reg - first_write_grf]) {
2402 inst->insert_before(DEP_RESOLVE_MOV(reg));
2403 needs_dep[reg - first_write_grf] = false;
2404 if (scan_inst_simd16)
2405 needs_dep[reg - first_write_grf + 1] = false;
2406 }
2407 }
2408 }
2409
2410 /* Clear the flag for registers that actually got read (as expected). */
2411 clear_deps_for_inst_src(scan_inst, dispatch_width,
2412 needs_dep, first_write_grf, write_len);
2413
2414 /* Continue the loop only if we haven't resolved all the dependencies */
2415 int i;
2416 for (i = 0; i < write_len; i++) {
2417 if (needs_dep[i])
2418 break;
2419 }
2420 if (i == write_len)
2421 return;
2422 }
2423 }
2424
2425 /**
2426 * Implements this workaround for the original 965:
2427 *
2428 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2429 * used as a destination register until after it has been sourced by an
2430 * instruction with a different destination register.
2431 */
2432 void
2433 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2434 {
2435 int write_len = inst->regs_written * dispatch_width / 8;
2436 int first_write_grf = inst->dst.reg;
2437 bool needs_dep[BRW_MAX_MRF];
2438 assert(write_len < (int)sizeof(needs_dep) - 1);
2439
2440 memset(needs_dep, false, sizeof(needs_dep));
2441 memset(needs_dep, true, write_len);
2442 /* Walk forwards looking for writes to registers we're writing which aren't
2443 * read before being written.
2444 */
2445 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2446 !scan_inst->is_tail_sentinel();
2447 scan_inst = (fs_inst *)scan_inst->next) {
2448 /* If we hit control flow, force resolve all remaining dependencies. */
2449 if (scan_inst->is_control_flow()) {
2450 for (int i = 0; i < write_len; i++) {
2451 if (needs_dep[i])
2452 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2453 }
2454 return;
2455 }
2456
2457 /* Clear the flag for registers that actually got read (as expected). */
2458 clear_deps_for_inst_src(scan_inst, dispatch_width,
2459 needs_dep, first_write_grf, write_len);
2460
2461 /* We insert our reads as late as possible since they're reading the
2462 * result of a SEND, which has massive latency.
2463 */
2464 if (scan_inst->dst.file == GRF &&
2465 scan_inst->dst.reg >= first_write_grf &&
2466 scan_inst->dst.reg < first_write_grf + write_len &&
2467 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2468 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2469 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2470 }
2471
2472 /* Continue the loop only if we haven't resolved all the dependencies */
2473 int i;
2474 for (i = 0; i < write_len; i++) {
2475 if (needs_dep[i])
2476 break;
2477 }
2478 if (i == write_len)
2479 return;
2480 }
2481
2482 /* If we hit the end of the program, resolve all remaining dependencies out
2483 * of paranoia.
2484 */
2485 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2486 assert(last_inst->eot);
2487 for (int i = 0; i < write_len; i++) {
2488 if (needs_dep[i])
2489 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2490 }
2491 }
2492
2493 void
2494 fs_visitor::insert_gen4_send_dependency_workarounds()
2495 {
2496 if (brw->gen != 4 || brw->is_g4x)
2497 return;
2498
2499 /* Note that we're done with register allocation, so GRF fs_regs always
2500 * have a .reg_offset of 0.
2501 */
2502
2503 foreach_list_safe(node, &this->instructions) {
2504 fs_inst *inst = (fs_inst *)node;
2505
2506 if (inst->mlen != 0 && inst->dst.file == GRF) {
2507 insert_gen4_pre_send_dependency_workarounds(inst);
2508 insert_gen4_post_send_dependency_workarounds(inst);
2509 }
2510 }
2511 }
2512
2513 /**
2514 * Turns the generic expression-style uniform pull constant load instruction
2515 * into a hardware-specific series of instructions for loading a pull
2516 * constant.
2517 *
2518 * The expression style allows the CSE pass before this to optimize out
2519 * repeated loads from the same offset, and gives the pre-register-allocation
2520 * scheduling full flexibility, while the conversion to native instructions
2521 * allows the post-register-allocation scheduler the best information
2522 * possible.
2523 *
2524 * Note that execution masking for setting up pull constant loads is special:
2525 * the channels that need to be written are unrelated to the current execution
2526 * mask, since a later instruction will use one of the result channels as a
2527 * source operand for all 8 or 16 of its channels.
2528 */
2529 void
2530 fs_visitor::lower_uniform_pull_constant_loads()
2531 {
2532 foreach_list(node, &this->instructions) {
2533 fs_inst *inst = (fs_inst *)node;
2534
2535 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2536 continue;
2537
2538 if (brw->gen >= 7) {
2539 /* The offset arg before was a vec4-aligned byte offset. We need to
2540 * turn it into a dword offset.
2541 */
2542 fs_reg const_offset_reg = inst->src[1];
2543 assert(const_offset_reg.file == IMM &&
2544 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2545 const_offset_reg.imm.u /= 4;
2546 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2547
2548 /* This is actually going to be a MOV, but since only the first dword
2549 * is accessed, we have a special opcode to do just that one. Note
2550 * that this needs to be an operation that will be considered a def
2551 * by live variable analysis, or register allocation will explode.
2552 */
2553 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2554 payload, const_offset_reg);
2555 setup->force_writemask_all = true;
2556
2557 setup->ir = inst->ir;
2558 setup->annotation = inst->annotation;
2559 inst->insert_before(setup);
2560
2561 /* Similarly, this will only populate the first 4 channels of the
2562 * result register (since we only use smear values from 0-3), but we
2563 * don't tell the optimizer.
2564 */
2565 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2566 inst->src[1] = payload;
2567
2568 invalidate_live_intervals();
2569 } else {
2570 /* Before register allocation, we didn't tell the scheduler about the
2571 * MRF we use. We know it's safe to use this MRF because nothing
2572 * else does except for register spill/unspill, which generates and
2573 * uses its MRF within a single IR instruction.
2574 */
2575 inst->base_mrf = 14;
2576 inst->mlen = 1;
2577 }
2578 }
2579 }
2580
2581 void
2582 fs_visitor::dump_instructions()
2583 {
2584 calculate_register_pressure();
2585
2586 int ip = 0, max_pressure = 0;
2587 foreach_list(node, &this->instructions) {
2588 backend_instruction *inst = (backend_instruction *)node;
2589 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2590 fprintf(stderr, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2591 dump_instruction(inst);
2592 ++ip;
2593 }
2594 fprintf(stderr, "Maximum %3d registers live at once.\n", max_pressure);
2595 }
2596
2597 void
2598 fs_visitor::dump_instruction(backend_instruction *be_inst)
2599 {
2600 fs_inst *inst = (fs_inst *)be_inst;
2601
2602 if (inst->predicate) {
2603 fprintf(stderr, "(%cf0.%d) ",
2604 inst->predicate_inverse ? '-' : '+',
2605 inst->flag_subreg);
2606 }
2607
2608 fprintf(stderr, "%s", brw_instruction_name(inst->opcode));
2609 if (inst->saturate)
2610 fprintf(stderr, ".sat");
2611 if (inst->conditional_mod) {
2612 fprintf(stderr, "%s", conditional_modifier[inst->conditional_mod]);
2613 if (!inst->predicate &&
2614 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2615 inst->opcode != BRW_OPCODE_IF &&
2616 inst->opcode != BRW_OPCODE_WHILE))) {
2617 fprintf(stderr, ".f0.%d", inst->flag_subreg);
2618 }
2619 }
2620 fprintf(stderr, " ");
2621
2622
2623 switch (inst->dst.file) {
2624 case GRF:
2625 fprintf(stderr, "vgrf%d", inst->dst.reg);
2626 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2627 inst->dst.subreg_offset)
2628 fprintf(stderr, "+%d.%d",
2629 inst->dst.reg_offset, inst->dst.subreg_offset);
2630 break;
2631 case MRF:
2632 fprintf(stderr, "m%d", inst->dst.reg);
2633 break;
2634 case BAD_FILE:
2635 fprintf(stderr, "(null)");
2636 break;
2637 case UNIFORM:
2638 fprintf(stderr, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2639 break;
2640 case HW_REG:
2641 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2642 switch (inst->dst.fixed_hw_reg.nr) {
2643 case BRW_ARF_NULL:
2644 fprintf(stderr, "null");
2645 break;
2646 case BRW_ARF_ADDRESS:
2647 fprintf(stderr, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2648 break;
2649 case BRW_ARF_ACCUMULATOR:
2650 fprintf(stderr, "acc%d", inst->dst.fixed_hw_reg.subnr);
2651 break;
2652 case BRW_ARF_FLAG:
2653 fprintf(stderr, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2654 inst->dst.fixed_hw_reg.subnr);
2655 break;
2656 default:
2657 fprintf(stderr, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2658 inst->dst.fixed_hw_reg.subnr);
2659 break;
2660 }
2661 } else {
2662 fprintf(stderr, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2663 }
2664 if (inst->dst.fixed_hw_reg.subnr)
2665 fprintf(stderr, "+%d", inst->dst.fixed_hw_reg.subnr);
2666 break;
2667 default:
2668 fprintf(stderr, "???");
2669 break;
2670 }
2671 fprintf(stderr, ":%s, ", brw_reg_type_letters(inst->dst.type));
2672
2673 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2674 if (inst->src[i].negate)
2675 fprintf(stderr, "-");
2676 if (inst->src[i].abs)
2677 fprintf(stderr, "|");
2678 switch (inst->src[i].file) {
2679 case GRF:
2680 fprintf(stderr, "vgrf%d", inst->src[i].reg);
2681 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2682 inst->src[i].subreg_offset)
2683 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
2684 inst->src[i].subreg_offset);
2685 break;
2686 case MRF:
2687 fprintf(stderr, "***m%d***", inst->src[i].reg);
2688 break;
2689 case UNIFORM:
2690 fprintf(stderr, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2691 if (inst->src[i].reladdr) {
2692 fprintf(stderr, "+reladdr");
2693 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2694 inst->src[i].subreg_offset) {
2695 fprintf(stderr, "+%d.%d", inst->src[i].reg_offset,
2696 inst->src[i].subreg_offset);
2697 }
2698 break;
2699 case BAD_FILE:
2700 fprintf(stderr, "(null)");
2701 break;
2702 case IMM:
2703 switch (inst->src[i].type) {
2704 case BRW_REGISTER_TYPE_F:
2705 fprintf(stderr, "%ff", inst->src[i].imm.f);
2706 break;
2707 case BRW_REGISTER_TYPE_D:
2708 fprintf(stderr, "%dd", inst->src[i].imm.i);
2709 break;
2710 case BRW_REGISTER_TYPE_UD:
2711 fprintf(stderr, "%uu", inst->src[i].imm.u);
2712 break;
2713 default:
2714 fprintf(stderr, "???");
2715 break;
2716 }
2717 break;
2718 case HW_REG:
2719 if (inst->src[i].fixed_hw_reg.negate)
2720 fprintf(stderr, "-");
2721 if (inst->src[i].fixed_hw_reg.abs)
2722 fprintf(stderr, "|");
2723 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2724 switch (inst->src[i].fixed_hw_reg.nr) {
2725 case BRW_ARF_NULL:
2726 fprintf(stderr, "null");
2727 break;
2728 case BRW_ARF_ADDRESS:
2729 fprintf(stderr, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2730 break;
2731 case BRW_ARF_ACCUMULATOR:
2732 fprintf(stderr, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2733 break;
2734 case BRW_ARF_FLAG:
2735 fprintf(stderr, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2736 inst->src[i].fixed_hw_reg.subnr);
2737 break;
2738 default:
2739 fprintf(stderr, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2740 inst->src[i].fixed_hw_reg.subnr);
2741 break;
2742 }
2743 } else {
2744 fprintf(stderr, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2745 }
2746 if (inst->src[i].fixed_hw_reg.subnr)
2747 fprintf(stderr, "+%d", inst->src[i].fixed_hw_reg.subnr);
2748 if (inst->src[i].fixed_hw_reg.abs)
2749 fprintf(stderr, "|");
2750 break;
2751 default:
2752 fprintf(stderr, "???");
2753 break;
2754 }
2755 if (inst->src[i].abs)
2756 fprintf(stderr, "|");
2757
2758 if (inst->src[i].file != IMM) {
2759 fprintf(stderr, ":%s", brw_reg_type_letters(inst->src[i].type));
2760 }
2761
2762 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2763 fprintf(stderr, ", ");
2764 }
2765
2766 fprintf(stderr, " ");
2767
2768 if (inst->force_uncompressed)
2769 fprintf(stderr, "1sthalf ");
2770
2771 if (inst->force_sechalf)
2772 fprintf(stderr, "2ndhalf ");
2773
2774 fprintf(stderr, "\n");
2775 }
2776
2777 /**
2778 * Possibly returns an instruction that set up @param reg.
2779 *
2780 * Sometimes we want to take the result of some expression/variable
2781 * dereference tree and rewrite the instruction generating the result
2782 * of the tree. When processing the tree, we know that the
2783 * instructions generated are all writing temporaries that are dead
2784 * outside of this tree. So, if we have some instructions that write
2785 * a temporary, we're free to point that temp write somewhere else.
2786 *
2787 * Note that the returned instruction isn't guaranteed to write only
2788 * reg -- it might be the size=4 destination of a texture instruction.
2789 */
2790 fs_inst *
2791 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2792 fs_inst *end,
2793 const fs_reg &reg)
2794 {
2795 if (end == start ||
2796 end->is_partial_write() ||
2797 reg.reladdr ||
2798 !reg.equals(end->dst)) {
2799 return NULL;
2800 } else {
2801 return end;
2802 }
2803 }
2804
2805 void
2806 fs_visitor::setup_payload_gen6()
2807 {
2808 bool uses_depth =
2809 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2810 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2811
2812 assert(brw->gen >= 6);
2813
2814 /* R0-1: masks, pixel X/Y coordinates. */
2815 c->nr_payload_regs = 2;
2816 /* R2: only for 32-pixel dispatch. */
2817
2818 /* R3-26: barycentric interpolation coordinates. These appear in the
2819 * same order that they appear in the brw_wm_barycentric_interp_mode
2820 * enum. Each set of coordinates occupies 2 registers if dispatch width
2821 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2822 * appear if they were enabled using the "Barycentric Interpolation
2823 * Mode" bits in WM_STATE.
2824 */
2825 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2826 if (barycentric_interp_modes & (1 << i)) {
2827 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2828 c->nr_payload_regs += 2;
2829 if (dispatch_width == 16) {
2830 c->nr_payload_regs += 2;
2831 }
2832 }
2833 }
2834
2835 /* R27: interpolated depth if uses source depth */
2836 if (uses_depth) {
2837 c->source_depth_reg = c->nr_payload_regs;
2838 c->nr_payload_regs++;
2839 if (dispatch_width == 16) {
2840 /* R28: interpolated depth if not SIMD8. */
2841 c->nr_payload_regs++;
2842 }
2843 }
2844 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2845 if (uses_depth) {
2846 c->source_w_reg = c->nr_payload_regs;
2847 c->nr_payload_regs++;
2848 if (dispatch_width == 16) {
2849 /* R30: interpolated W if not SIMD8. */
2850 c->nr_payload_regs++;
2851 }
2852 }
2853
2854 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
2855 /* R31: MSAA position offsets. */
2856 if (c->prog_data.uses_pos_offset) {
2857 c->sample_pos_reg = c->nr_payload_regs;
2858 c->nr_payload_regs++;
2859 }
2860
2861 /* R32: MSAA input coverage mask */
2862 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2863 assert(brw->gen >= 7);
2864 c->sample_mask_reg = c->nr_payload_regs;
2865 c->nr_payload_regs++;
2866 if (dispatch_width == 16) {
2867 /* R33: input coverage mask if not SIMD8. */
2868 c->nr_payload_regs++;
2869 }
2870 }
2871
2872 /* R34-: bary for 32-pixel. */
2873 /* R58-59: interp W for 32-pixel. */
2874
2875 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2876 c->source_depth_to_render_target = true;
2877 }
2878 }
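/* A standalone sketch of the gen6+ payload sizing above.  Given the enabled
 * barycentric interpolation modes and the depth/W/pos-offset/sample-mask
 * flags, it returns the number of payload registers, mirroring the
 * bookkeeping on c->nr_payload_regs.  The demo_* names are illustrative, and
 * the mode count of 6 is assumed to match
 * BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT.
 */
static int
demo_gen6_payload_regs(unsigned barycentric_interp_modes,
                       bool uses_depth, bool uses_pos_offset,
                       bool uses_sample_mask, int dispatch_width)
{
   const int mode_count = 6;     /* assumed BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT */
   int regs = 2;                 /* R0-1: masks, pixel X/Y coordinates */

   for (int i = 0; i < mode_count; i++) {
      if (barycentric_interp_modes & (1 << i))
         regs += (dispatch_width == 16) ? 4 : 2;   /* barycentric coords */
   }
   if (uses_depth)
      regs += (dispatch_width == 16) ? 2 : 1;      /* interpolated depth */
   if (uses_depth)
      regs += (dispatch_width == 16) ? 2 : 1;      /* interpolated W */
   if (uses_pos_offset)
      regs += 1;                                   /* MSAA position offsets */
   if (uses_sample_mask)
      regs += (dispatch_width == 16) ? 2 : 1;      /* input coverage mask */

   return regs;
}

/* For example, SIMD16 with one barycentric mode and source depth enabled
 * needs 2 + 4 + 2 + 2 = 10 payload registers.
 */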
2879
2880 void
2881 fs_visitor::assign_binding_table_offsets()
2882 {
2883 uint32_t next_binding_table_offset = 0;
2884
2885 /* If there are no color regions, we still perform an FB write to a null
2886 * renderbuffer, which we place at surface index 0.
2887 */
2888 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
2889 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
2890
2891 assign_common_binding_table_offsets(next_binding_table_offset);
2892 }
2893
2894 void
2895 fs_visitor::calculate_register_pressure()
2896 {
2897 invalidate_live_intervals();
2898 calculate_live_intervals();
2899
2900 int num_instructions = 0;
2901 foreach_list(node, &this->instructions) {
2902 ++num_instructions;
2903 }
2904
2905 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
2906
2907 for (int reg = 0; reg < virtual_grf_count; reg++) {
2908 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
2909 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
2910 }
2911 }
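/* A standalone sketch of the pressure computation above: each virtual GRF
 * adds its size to every instruction pointer inside its [start, end] live
 * range, and the peak of those per-ip sums is the "Maximum registers live
 * at once" figure printed by dump_instructions().  The demo_* names are
 * illustrative only.
 */
static int
demo_max_register_pressure(int num_instructions, int num_grfs,
                           const int *grf_start, const int *grf_end,
                           const int *grf_size, int *regs_live_at_ip)
{
   int max_pressure = 0;

   for (int ip = 0; ip < num_instructions; ip++)
      regs_live_at_ip[ip] = 0;

   for (int reg = 0; reg < num_grfs; reg++) {
      for (int ip = grf_start[reg]; ip <= grf_end[reg]; ip++)
         regs_live_at_ip[ip] += grf_size[reg];
   }

   for (int ip = 0; ip < num_instructions; ip++) {
      if (regs_live_at_ip[ip] > max_pressure)
         max_pressure = regs_live_at_ip[ip];
   }

   return max_pressure;
}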
2912
2913 /**
2914 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
2915 *
2916 * The needs_unlit_centroid_workaround ends up producing one of these per
2917 * channel of centroid input, so it's good to clean them up.
2918 *
2919 * An assumption here is that nothing ever modifies the dispatched-pixels
2920 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, which the
2921 * hardware guarantees anyway.
2922 */
2923 void
2924 fs_visitor::opt_drop_redundant_mov_to_flags()
2925 {
2926 bool flag_mov_found[2] = {false};
2927
2928 foreach_list_safe(node, &this->instructions) {
2929 fs_inst *inst = (fs_inst *)node;
2930
2931 if (inst->is_control_flow()) {
2932 memset(flag_mov_found, 0, sizeof(flag_mov_found));
2933 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
2934 if (!flag_mov_found[inst->flag_subreg])
2935 flag_mov_found[inst->flag_subreg] = true;
2936 else
2937 inst->remove();
2938 } else if (inst->writes_flag()) {
2939 flag_mov_found[inst->flag_subreg] = false;
2940 }
2941 }
2942 }
2943
2944 bool
2945 fs_visitor::run()
2946 {
2947 sanity_param_count = fp->Base.Parameters->NumParameters;
2948 bool allocated_without_spills;
2949
2950 assign_binding_table_offsets();
2951
2952 if (brw->gen >= 6)
2953 setup_payload_gen6();
2954 else
2955 setup_payload_gen4();
2956
2957 if (0) {
2958 emit_dummy_fs();
2959 } else {
2960 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2961 emit_shader_time_begin();
2962
2963 calculate_urb_setup();
2964 if (fp->Base.InputsRead > 0) {
2965 if (brw->gen < 6)
2966 emit_interpolation_setup_gen4();
2967 else
2968 emit_interpolation_setup_gen6();
2969 }
2970
2971 /* We handle discards by keeping track of the still-live pixels in f0.1.
2972 * Initialize it with the dispatched pixels.
2973 */
2974 if (fp->UsesKill || c->key.alpha_test_func) {
2975 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2976 discard_init->flag_subreg = 1;
2977 }
2978
2979 /* Generate FS IR for main(). (the visitor only descends into
2980 * functions called "main").
2981 */
2982 if (shader) {
2983 foreach_list(node, &*shader->base.ir) {
2984 ir_instruction *ir = (ir_instruction *)node;
2985 base_ir = ir;
2986 this->result = reg_undef;
2987 ir->accept(this);
2988 }
2989 } else {
2990 emit_fragment_program_code();
2991 }
2992 base_ir = NULL;
2993 if (failed)
2994 return false;
2995
2996 emit(FS_OPCODE_PLACEHOLDER_HALT);
2997
2998 if (c->key.alpha_test_func)
2999 emit_alpha_test();
3000
3001 emit_fb_writes();
3002
3003 split_virtual_grfs();
3004
3005 move_uniform_array_access_to_pull_constants();
3006 assign_constant_locations();
3007 demote_pull_constants();
3008
3009 opt_drop_redundant_mov_to_flags();
3010
3011 bool progress;
3012 do {
3013 progress = false;
3014
3015 compact_virtual_grfs();
3016
3017 progress = remove_duplicate_mrf_writes() || progress;
3018
3019 progress = opt_algebraic() || progress;
3020 progress = opt_cse() || progress;
3021 progress = opt_copy_propagate() || progress;
3022 progress = opt_peephole_predicated_break() || progress;
3023 progress = dead_code_eliminate() || progress;
3024 progress = opt_peephole_sel() || progress;
3025 progress = dead_control_flow_eliminate(this) || progress;
3026 progress = opt_saturate_propagation() || progress;
3027 progress = register_coalesce() || progress;
3028 progress = compute_to_mrf() || progress;
3029 } while (progress);
3030
3031 lower_uniform_pull_constant_loads();
3032
3033 assign_curb_setup();
3034 assign_urb_setup();
3035
3036 static enum instruction_scheduler_mode pre_modes[] = {
3037 SCHEDULE_PRE,
3038 SCHEDULE_PRE_NON_LIFO,
3039 SCHEDULE_PRE_LIFO,
3040 };
3041
3042 /* Try each scheduling heuristic to see if it can successfully register
3043 * allocate without spilling. They should be ordered by decreasing
3044 * performance but increasing likelihood of allocating.
3045 */
3046 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3047 schedule_instructions(pre_modes[i]);
3048
3049 if (0) {
3050 assign_regs_trivial();
3051 allocated_without_spills = true;
3052 } else {
3053 allocated_without_spills = assign_regs(false);
3054 }
3055 if (allocated_without_spills)
3056 break;
3057 }
3058
3059 if (!allocated_without_spills) {
3060 /* We assume that any spilling is worse than just dropping back to
3061 * SIMD8. There's probably actually some intermediate point where
3062 * SIMD16 with a couple of spills is still better.
3063 */
3064 if (dispatch_width == 16) {
3065 fail("Failure to register allocate. Reduce number of "
3066 "live scalar values to avoid this.");
3067 } else {
3068 perf_debug("Fragment shader triggered register spilling. "
3069 "Try reducing the number of live scalar values to "
3070 "improve performance.\n");
3071 }
3072
3073 /* Since we're out of heuristics, just go spill registers until we
3074 * get an allocation.
3075 */
3076 while (!assign_regs(true)) {
3077 if (failed)
3078 break;
3079 }
3080 }
3081 }
3082 assert(force_uncompressed_stack == 0);
3083
3084 /* This must come after all optimization and register allocation, since
3085 * it inserts dead code that happens to have side effects, and it does
3086 * so based on the actual physical registers in use.
3087 */
3088 insert_gen4_send_dependency_workarounds();
3089
3090 if (failed)
3091 return false;
3092
3093 if (!allocated_without_spills)
3094 schedule_instructions(SCHEDULE_POST);
3095
3096 if (dispatch_width == 8)
3097 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3098 else
3099 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3100
3101 /* If any state parameters were appended, then ParameterValues could have
3102 * been realloced, in which case the driver uniform storage set up by
3103 * _mesa_associate_uniform_storage() would point to freed memory. Make
3104 * sure that didn't happen.
3105 */
3106 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3107
3108 return !failed;
3109 }
3110
3111 const unsigned *
3112 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3113 struct gl_fragment_program *fp,
3114 struct gl_shader_program *prog,
3115 unsigned *final_assembly_size)
3116 {
3117 bool start_busy = false;
3118 double start_time = 0;
3119
3120 if (unlikely(brw->perf_debug)) {
3121 start_busy = (brw->batch.last_bo &&
3122 drm_intel_bo_busy(brw->batch.last_bo));
3123 start_time = get_time();
3124 }
3125
3126 struct brw_shader *shader = NULL;
3127 if (prog)
3128 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3129
3130 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3131 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3132
3133 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3134 */
3135 fs_visitor v(brw, c, prog, fp, 8);
3136 if (!v.run()) {
3137 if (prog) {
3138 prog->LinkStatus = false;
3139 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3140 }
3141
3142 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3143 v.fail_msg);
3144
3145 return NULL;
3146 }
3147
3148 exec_list *simd16_instructions = NULL;
3149 fs_visitor v2(brw, c, prog, fp, 16);
3150 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3151 if (!v.simd16_unsupported) {
3152 /* Try a SIMD16 compile */
3153 v2.import_uniforms(&v);
3154 if (!v2.run()) {
3155 perf_debug("SIMD16 shader failed to compile, falling back to "
3156 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3157 } else {
3158 simd16_instructions = &v2.instructions;
3159 }
3160 } else {
3161 perf_debug("SIMD16 shader unsupported, falling back to "
3162 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3163 }
3164 }
3165
3166 const unsigned *assembly = NULL;
3167 if (brw->gen >= 8) {
3168 gen8_fs_generator g(brw, c, prog, fp, v.do_dual_src);
3169 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3170 final_assembly_size);
3171 } else {
3172 fs_generator g(brw, c, prog, fp, v.do_dual_src);
3173 assembly = g.generate_assembly(&v.instructions, simd16_instructions,
3174 final_assembly_size);
3175 }
3176
3177 if (unlikely(brw->perf_debug) && shader) {
3178 if (shader->compiled_once)
3179 brw_wm_debug_recompile(brw, prog, &c->key);
3180 shader->compiled_once = true;
3181
3182 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3183 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3184 (get_time() - start_time) * 1000);
3185 }
3186 }
3187
3188 return assembly;
3189 }
3190
3191 bool
3192 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3193 {
3194 struct brw_context *brw = brw_context(ctx);
3195 struct brw_wm_prog_key key;
3196
3197 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3198 return true;
3199
3200 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3201 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3202 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3203 bool program_uses_dfdy = fp->UsesDFdy;
3204
3205 memset(&key, 0, sizeof(key));
3206
3207 if (brw->gen < 6) {
3208 if (fp->UsesKill)
3209 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3210
3211 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3212 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3213
3214 /* Just assume depth testing. */
3215 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3216 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3217 }
3218
3219 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3220 BRW_FS_VARYING_INPUT_MASK) > 16)
3221 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3222
3223 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3224
3225 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3226 for (unsigned i = 0; i < sampler_count; i++) {
3227 if (fp->Base.ShadowSamplers & (1 << i)) {
3228 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3229 key.tex.swizzles[i] =
3230 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3231 } else {
3232 /* Color sampler: assume no swizzling. */
3233 key.tex.swizzles[i] = SWIZZLE_XYZW;
3234 }
3235 }
3236
3237 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3238 key.drawable_height = ctx->DrawBuffer->Height;
3239 }
3240
3241 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3242 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3243 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3244
3245 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3246 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3247 key.nr_color_regions > 1;
3248 }
3249
3250 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3251 * quality of the derivatives is likely to be determined by the driconf
3252 * option.
3253 */
3254 key.high_quality_derivatives = brw->disable_derivative_optimization;
3255
3256 key.program_string_id = bfp->id;
3257
3258 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3259 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3260
3261 bool success = do_wm_prog(brw, prog, bfp, &key);
3262
3263 brw->wm.base.prog_offset = old_prog_offset;
3264 brw->wm.prog_data = old_prog_data;
3265
3266 return success;
3267 }