i965: Move curb_read_length/total_scratch to brw_stage_prog_data.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_cfg.h"
50 #include "brw_dead_control_flow.h"
51 #include "main/uniforms.h"
52 #include "brw_fs_live_variables.h"
53 #include "glsl/glsl_types.h"
54
55 void
56 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
57 {
58 memset(this, 0, sizeof(*this));
59
60 this->opcode = opcode;
61 this->dst = dst;
62 this->src = src;
63 this->sources = sources;
64
65 this->conditional_mod = BRW_CONDITIONAL_NONE;
66
67 /* This will be the case for almost all instructions. */
68 this->regs_written = 1;
69
70 this->writes_accumulator = false;
71 }
72
73 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
74 {
75 fs_reg *src = ralloc_array(this, fs_reg, 3);
76 init(opcode, dst, src, 0);
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
80 {
81 fs_reg *src = ralloc_array(this, fs_reg, 3);
82 src[0] = src0;
83 init(opcode, dst, src, 1);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
87 const fs_reg &src1)
88 {
89 fs_reg *src = ralloc_array(this, fs_reg, 3);
90 src[0] = src0;
91 src[1] = src1;
92 init(opcode, dst, src, 2);
93 }
94
95 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
96 const fs_reg &src1, const fs_reg &src2)
97 {
98 fs_reg *src = ralloc_array(this, fs_reg, 3);
99 src[0] = src0;
100 src[1] = src1;
101 src[2] = src2;
102 init(opcode, dst, src, 3);
103 }
104
105 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
106 {
107 init(opcode, dst, src, sources);
108 }
109
110 fs_inst::fs_inst(const fs_inst &that)
111 {
112 memcpy(this, &that, sizeof(that));
113
114 this->src = ralloc_array(this, fs_reg, that.sources);
115
116 for (int i = 0; i < that.sources; i++)
117 this->src[i] = that.src[i];
118 }
119
120 void
121 fs_inst::resize_sources(uint8_t num_sources)
122 {
123 if (this->sources != num_sources) {
124 this->src = reralloc(this, this->src, fs_reg, num_sources);
125 this->sources = num_sources;
126 }
127 }
128
129 #define ALU1(op) \
130 fs_inst * \
131 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
132 { \
133 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
134 }
135
136 #define ALU2(op) \
137 fs_inst * \
138 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
139 const fs_reg &src1) \
140 { \
141 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
142 }
143
144 #define ALU2_ACC(op) \
145 fs_inst * \
146 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
147 const fs_reg &src1) \
148 { \
149 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
150 inst->writes_accumulator = true; \
151 return inst; \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
157 const fs_reg &src1, const fs_reg &src2) \
158 { \
159 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
160 }
161
162 ALU1(NOT)
163 ALU1(MOV)
164 ALU1(FRC)
165 ALU1(RNDD)
166 ALU1(RNDE)
167 ALU1(RNDZ)
168 ALU2(ADD)
169 ALU2(MUL)
170 ALU2_ACC(MACH)
171 ALU2(AND)
172 ALU2(OR)
173 ALU2(XOR)
174 ALU2(SHL)
175 ALU2(SHR)
176 ALU2(ASR)
177 ALU3(LRP)
178 ALU1(BFREV)
179 ALU3(BFE)
180 ALU2(BFI1)
181 ALU3(BFI2)
182 ALU1(FBH)
183 ALU1(FBL)
184 ALU1(CBIT)
185 ALU3(MAD)
186 ALU2_ACC(ADDC)
187 ALU2_ACC(SUBB)
188 ALU2(SEL)
189 ALU2(MAC)
190
191 /** Gen4 predicated IF. */
192 fs_inst *
193 fs_visitor::IF(enum brw_predicate predicate)
194 {
195 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
196 inst->predicate = predicate;
197 return inst;
198 }
199
200 /** Gen6 IF with embedded comparison. */
201 fs_inst *
202 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
203 enum brw_conditional_mod condition)
204 {
205 assert(brw->gen == 6);
206 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
207 reg_null_d, src0, src1);
208 inst->conditional_mod = condition;
209 return inst;
210 }
211
212 /**
213 * CMP: Sets the low bit of the destination channels with the result
214 * of the comparison, while the upper bits are undefined, and updates
215 * the flag register with the packed 16 bits of the result.
216 */
217 fs_inst *
218 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
219 enum brw_conditional_mod condition)
220 {
221 fs_inst *inst;
222
223 /* Take the instruction:
224 *
225 * CMP null<d> src0<f> src1<f>
226 *
227 * Original gen4 does type conversion to the destination type before
228 * comparison, producing garbage results for floating point comparisons.
229 * gen5 does the comparison on the execution type (resolved source types),
230 * so dst type doesn't matter. gen6 does comparison and then uses the
231 * result as if it was the dst type with no conversion, which happens to
232 * mostly work out for float-interpreted-as-int since our comparisons are
233 * for >0, =0, <0.
234 */
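   /* For illustration (assumed values, not taken from the PRM): with a
    * D-typed null destination on gen4, comparing 0.5f against 0.3f would
    * convert both sources to 0 before the compare and report "not greater".
    * Retyping the destination to the source type below keeps the comparison
    * in floating point.
    */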
235 if (brw->gen == 4) {
236 dst.type = src0.type;
237 if (dst.file == HW_REG)
238 dst.fixed_hw_reg.type = dst.type;
239 }
240
241 resolve_ud_negate(&src0);
242 resolve_ud_negate(&src1);
243
244 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
245 inst->conditional_mod = condition;
246
247 return inst;
248 }
249
250 fs_inst *
251 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
252 {
253 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
254 sources);
255 inst->regs_written = sources;
256
257 return inst;
258 }
259
260 exec_list
261 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
262 const fs_reg &surf_index,
263 const fs_reg &varying_offset,
264 uint32_t const_offset)
265 {
266 exec_list instructions;
267 fs_inst *inst;
268
269 /* We have our constant surface use a pitch of 4 bytes, so our index can
270 * be any component of a vector, and then we load 4 contiguous
271 * components starting from that.
272 *
273 * We break down the const_offset to a portion added to the variable
274 * offset and a portion done using reg_offset, which means that if you
275 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
276 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
277 * CSE can later notice that those loads are all the same and eliminate
278 * the redundant ones.
279 */
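   /* A worked example with illustrative numbers: for const_offset = 7 we add
    * (7 & ~3) = 4 to the variable offset below, load a whole vec4 from that
    * address, and then select component (7 & 3) = 3 of the result via
    * reg_offset at the end of this function.
    */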
280 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
281 instructions.push_tail(ADD(vec4_offset,
282 varying_offset, const_offset & ~3));
283
284 int scale = 1;
285 if (brw->gen == 4 && dispatch_width == 8) {
286 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
287 * u, v, r) as parameters, or we can just use the SIMD16 message
288 * consisting of (header, u). We choose the second, at the cost of a
289 * longer return length.
290 */
291 scale = 2;
292 }
293
294 enum opcode op;
295 if (brw->gen >= 7)
296 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
297 else
298 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
299 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
300 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
301 inst->regs_written = 4 * scale;
302 instructions.push_tail(inst);
303
304 if (brw->gen < 7) {
305 inst->base_mrf = 13;
306 inst->header_present = true;
307 if (brw->gen == 4)
308 inst->mlen = 3;
309 else
310 inst->mlen = 1 + dispatch_width / 8;
311 }
312
313 vec4_result.reg_offset += (const_offset & 3) * scale;
314 instructions.push_tail(MOV(dst, vec4_result));
315
316 return instructions;
317 }
318
319 /**
320 * A helper for MOV generation for fixing up broken hardware SEND dependency
321 * handling.
322 */
323 fs_inst *
324 fs_visitor::DEP_RESOLVE_MOV(int grf)
325 {
326 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
327
328 inst->ir = NULL;
329 inst->annotation = "send dependency resolve";
330
331    /* The caller always wants this MOV uncompressed, to add the minimal extra
332     * dependencies and to avoid having to deal with aligning its regs to 2.
333 */
334 inst->force_uncompressed = true;
335
336 return inst;
337 }
338
339 bool
340 fs_inst::equals(fs_inst *inst) const
341 {
342 return (opcode == inst->opcode &&
343 dst.equals(inst->dst) &&
344 src[0].equals(inst->src[0]) &&
345 src[1].equals(inst->src[1]) &&
346 src[2].equals(inst->src[2]) &&
347 saturate == inst->saturate &&
348 predicate == inst->predicate &&
349 conditional_mod == inst->conditional_mod &&
350 mlen == inst->mlen &&
351 base_mrf == inst->base_mrf &&
352 target == inst->target &&
353 eot == inst->eot &&
354 header_present == inst->header_present &&
355 shadow_compare == inst->shadow_compare &&
356 offset == inst->offset);
357 }
358
359 bool
360 fs_inst::overwrites_reg(const fs_reg &reg) const
361 {
362 return (reg.file == dst.file &&
363 reg.reg == dst.reg &&
364 reg.reg_offset >= dst.reg_offset &&
365 reg.reg_offset < dst.reg_offset + regs_written);
366 }
367
368 bool
369 fs_inst::is_send_from_grf() const
370 {
371 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
372 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
373 opcode == FS_OPCODE_INTERPOLATE_AT_CENTROID ||
374 opcode == FS_OPCODE_INTERPOLATE_AT_SAMPLE ||
375 opcode == FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET ||
376 opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET ||
377 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
378 src[1].file == GRF) ||
379 (is_tex() && src[0].file == GRF));
380 }
381
382 bool
383 fs_inst::can_do_source_mods(struct brw_context *brw)
384 {
385 if (brw->gen == 6 && is_math())
386 return false;
387
388 if (is_send_from_grf())
389 return false;
390
391 if (!backend_instruction::can_do_source_mods())
392 return false;
393
394 return true;
395 }
396
397 void
398 fs_reg::init()
399 {
400 memset(this, 0, sizeof(*this));
401 stride = 1;
402 }
403
404 /** Generic unset register constructor. */
405 fs_reg::fs_reg()
406 {
407 init();
408 this->file = BAD_FILE;
409 }
410
411 /** Immediate value constructor. */
412 fs_reg::fs_reg(float f)
413 {
414 init();
415 this->file = IMM;
416 this->type = BRW_REGISTER_TYPE_F;
417 this->fixed_hw_reg.dw1.f = f;
418 }
419
420 /** Immediate value constructor. */
421 fs_reg::fs_reg(int32_t i)
422 {
423 init();
424 this->file = IMM;
425 this->type = BRW_REGISTER_TYPE_D;
426 this->fixed_hw_reg.dw1.d = i;
427 }
428
429 /** Immediate value constructor. */
430 fs_reg::fs_reg(uint32_t u)
431 {
432 init();
433 this->file = IMM;
434 this->type = BRW_REGISTER_TYPE_UD;
435 this->fixed_hw_reg.dw1.ud = u;
436 }
437
438 /** Fixed brw_reg. */
439 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
440 {
441 init();
442 this->file = HW_REG;
443 this->fixed_hw_reg = fixed_hw_reg;
444 this->type = fixed_hw_reg.type;
445 }
446
447 bool
448 fs_reg::equals(const fs_reg &r) const
449 {
450 return (file == r.file &&
451 reg == r.reg &&
452 reg_offset == r.reg_offset &&
453 subreg_offset == r.subreg_offset &&
454 type == r.type &&
455 negate == r.negate &&
456 abs == r.abs &&
457 !reladdr && !r.reladdr &&
458 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
459 sizeof(fixed_hw_reg)) == 0 &&
460 stride == r.stride);
461 }
462
463 fs_reg &
464 fs_reg::apply_stride(unsigned stride)
465 {
466 assert((this->stride * stride) <= 4 &&
467 (is_power_of_two(stride) || stride == 0) &&
468 file != HW_REG && file != IMM);
469 this->stride *= stride;
470 return *this;
471 }
472
473 fs_reg &
474 fs_reg::set_smear(unsigned subreg)
475 {
476 assert(file != HW_REG && file != IMM);
477 subreg_offset = subreg * type_sz(type);
478 stride = 0;
479 return *this;
480 }
481
482 bool
483 fs_reg::is_contiguous() const
484 {
485 return stride == 1;
486 }
487
488 bool
489 fs_reg::is_valid_3src() const
490 {
491 return file == GRF || file == UNIFORM;
492 }
493
494 int
495 fs_visitor::type_size(const struct glsl_type *type)
496 {
497 unsigned int size, i;
498
499 switch (type->base_type) {
500 case GLSL_TYPE_UINT:
501 case GLSL_TYPE_INT:
502 case GLSL_TYPE_FLOAT:
503 case GLSL_TYPE_BOOL:
504 return type->components();
505 case GLSL_TYPE_ARRAY:
506 return type_size(type->fields.array) * type->length;
507 case GLSL_TYPE_STRUCT:
508 size = 0;
509 for (i = 0; i < type->length; i++) {
510 size += type_size(type->fields.structure[i].type);
511 }
512 return size;
513 case GLSL_TYPE_SAMPLER:
514 /* Samplers take up no register space, since they're baked in at
515 * link time.
516 */
517 return 0;
518 case GLSL_TYPE_ATOMIC_UINT:
519 return 0;
520 case GLSL_TYPE_IMAGE:
521 case GLSL_TYPE_VOID:
522 case GLSL_TYPE_ERROR:
523 case GLSL_TYPE_INTERFACE:
524 unreachable("not reached");
525 }
526
527 return 0;
528 }
529
530 fs_reg
531 fs_visitor::get_timestamp()
532 {
533 assert(brw->gen >= 7);
534
535 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
536 BRW_ARF_TIMESTAMP,
537 0),
538 BRW_REGISTER_TYPE_UD));
539
540 fs_reg dst = fs_reg(this, glsl_type::uint_type);
541
542 fs_inst *mov = emit(MOV(dst, ts));
543 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
544 * even if it's not enabled in the dispatch.
545 */
546 mov->force_writemask_all = true;
547 mov->force_uncompressed = true;
548
549 /* The caller wants the low 32 bits of the timestamp. Since it's running
550     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
551 * which is plenty of time for our purposes. It is identical across the
552 * EUs, but since it's tracking GPU core speed it will increment at a
553 * varying rate as render P-states change.
554 *
555 * The caller could also check if render P-states have changed (or anything
556 * else that might disrupt timing) by setting smear to 2 and checking if
557 * that field is != 0.
558 */
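   /* Back-of-the-envelope check, assuming the ~1.2 GHz figure above:
    * 2^32 cycles / 1.2e9 Hz ~= 3.6 seconds between 32-bit rollovers.
    */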
559 dst.set_smear(0);
560
561 return dst;
562 }
563
564 void
565 fs_visitor::emit_shader_time_begin()
566 {
567 current_annotation = "shader time start";
568 shader_start_time = get_timestamp();
569 }
570
571 void
572 fs_visitor::emit_shader_time_end()
573 {
574 current_annotation = "shader time end";
575
576 enum shader_time_shader_type type, written_type, reset_type;
577 if (dispatch_width == 8) {
578 type = ST_FS8;
579 written_type = ST_FS8_WRITTEN;
580 reset_type = ST_FS8_RESET;
581 } else {
582 assert(dispatch_width == 16);
583 type = ST_FS16;
584 written_type = ST_FS16_WRITTEN;
585 reset_type = ST_FS16_RESET;
586 }
587
588 fs_reg shader_end_time = get_timestamp();
589
590 /* Check that there weren't any timestamp reset events (assuming these
591 * were the only two timestamp reads that happened).
592 */
593 fs_reg reset = shader_end_time;
594 reset.set_smear(2);
595 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
596 test->conditional_mod = BRW_CONDITIONAL_Z;
597 emit(IF(BRW_PREDICATE_NORMAL));
598
599 push_force_uncompressed();
600 fs_reg start = shader_start_time;
601 start.negate = true;
602 fs_reg diff = fs_reg(this, glsl_type::uint_type);
603 emit(ADD(diff, start, shader_end_time));
604
605 /* If there were no instructions between the two timestamp gets, the diff
606 * is 2 cycles. Remove that overhead, so I can forget about that when
607 * trying to determine the time taken for single instructions.
608 */
609 emit(ADD(diff, diff, fs_reg(-2u)));
610
611 emit_shader_time_write(type, diff);
612 emit_shader_time_write(written_type, fs_reg(1u));
613 emit(BRW_OPCODE_ELSE);
614 emit_shader_time_write(reset_type, fs_reg(1u));
615 emit(BRW_OPCODE_ENDIF);
616
617 pop_force_uncompressed();
618 }
619
620 void
621 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
622 fs_reg value)
623 {
624 int shader_time_index =
625 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
626 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
627
628 fs_reg payload;
629 if (dispatch_width == 8)
630 payload = fs_reg(this, glsl_type::uvec2_type);
631 else
632 payload = fs_reg(this, glsl_type::uint_type);
633
634 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
635 fs_reg(), payload, offset, value));
636 }
637
638 void
639 fs_visitor::vfail(const char *format, va_list va)
640 {
641 char *msg;
642
643 if (failed)
644 return;
645
646 failed = true;
647
648 msg = ralloc_vasprintf(mem_ctx, format, va);
649 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
650
651 this->fail_msg = msg;
652
653 if (INTEL_DEBUG & DEBUG_WM) {
654 fprintf(stderr, "%s", msg);
655 }
656 }
657
658 void
659 fs_visitor::fail(const char *format, ...)
660 {
661 va_list va;
662
663 va_start(va, format);
664 vfail(format, va);
665 va_end(va);
666 }
667
668 /**
669 * Mark this program as impossible to compile in SIMD16 mode.
670 *
671 * During the SIMD8 compile (which happens first), we can detect and flag
672 * things that are unsupported in SIMD16 mode, so the compiler can skip
673 * the SIMD16 compile altogether.
674 *
675 * During a SIMD16 compile (if one happens anyway), this just calls fail().
676 */
677 void
678 fs_visitor::no16(const char *format, ...)
679 {
680 va_list va;
681
682 va_start(va, format);
683
684 if (dispatch_width == 16) {
685 vfail(format, va);
686 } else {
687 simd16_unsupported = true;
688
689 if (brw->perf_debug) {
690 if (no16_msg)
691 ralloc_vasprintf_append(&no16_msg, format, va);
692 else
693 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
694 }
695 }
696
697 va_end(va);
698 }
699
700 fs_inst *
701 fs_visitor::emit(enum opcode opcode)
702 {
703 return emit(new(mem_ctx) fs_inst(opcode));
704 }
705
706 fs_inst *
707 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
708 {
709 return emit(new(mem_ctx) fs_inst(opcode, dst));
710 }
711
712 fs_inst *
713 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
714 {
715 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
716 }
717
718 fs_inst *
719 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
720 const fs_reg &src1)
721 {
722 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
723 }
724
725 fs_inst *
726 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
727 const fs_reg &src1, const fs_reg &src2)
728 {
729 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
730 }
731
732 fs_inst *
733 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
734 fs_reg src[], int sources)
735 {
736 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
737 }
738
739 void
740 fs_visitor::push_force_uncompressed()
741 {
742 force_uncompressed_stack++;
743 }
744
745 void
746 fs_visitor::pop_force_uncompressed()
747 {
748 force_uncompressed_stack--;
749 assert(force_uncompressed_stack >= 0);
750 }
751
752 /**
753 * Returns true if the instruction has a flag that means it won't
754 * update an entire destination register.
755 *
756 * For example, dead code elimination and live variable analysis want to know
757 * when a write to a variable screens off any preceding values that were in
758 * it.
759 */
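/* For example, a predicated MOV only writes the channels where the predicate
 * is set, so the remaining channels keep whatever the destination held before;
 * SEL is excluded because its predicate picks between the two sources rather
 * than masking the write.  Likewise, a force_uncompressed or force_sechalf
 * write only touches half of a SIMD16 register.
 */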
760 bool
761 fs_inst::is_partial_write() const
762 {
763 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
764 this->force_uncompressed ||
765 this->force_sechalf || !this->dst.is_contiguous());
766 }
767
768 int
769 fs_inst::regs_read(fs_visitor *v, int arg) const
770 {
771 if (is_tex() && arg == 0 && src[0].file == GRF) {
772 if (v->dispatch_width == 16)
773 return (mlen + 1) / 2;
774 else
775 return mlen;
776 }
777 return 1;
778 }
779
780 bool
781 fs_inst::reads_flag() const
782 {
783 return predicate;
784 }
785
786 bool
787 fs_inst::writes_flag() const
788 {
789 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
790 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
791 }
792
793 /**
794 * Returns how many MRFs an FS opcode will write over.
795 *
796 * Note that this is not the 0 or 1 implied writes in an actual gen
797 * instruction -- the FS opcodes often generate MOVs in addition.
798 */
799 int
800 fs_visitor::implied_mrf_writes(fs_inst *inst)
801 {
802 if (inst->mlen == 0)
803 return 0;
804
805 if (inst->base_mrf == -1)
806 return 0;
807
808 switch (inst->opcode) {
809 case SHADER_OPCODE_RCP:
810 case SHADER_OPCODE_RSQ:
811 case SHADER_OPCODE_SQRT:
812 case SHADER_OPCODE_EXP2:
813 case SHADER_OPCODE_LOG2:
814 case SHADER_OPCODE_SIN:
815 case SHADER_OPCODE_COS:
816 return 1 * dispatch_width / 8;
817 case SHADER_OPCODE_POW:
818 case SHADER_OPCODE_INT_QUOTIENT:
819 case SHADER_OPCODE_INT_REMAINDER:
820 return 2 * dispatch_width / 8;
821 case SHADER_OPCODE_TEX:
822 case FS_OPCODE_TXB:
823 case SHADER_OPCODE_TXD:
824 case SHADER_OPCODE_TXF:
825 case SHADER_OPCODE_TXF_CMS:
826 case SHADER_OPCODE_TXF_MCS:
827 case SHADER_OPCODE_TG4:
828 case SHADER_OPCODE_TG4_OFFSET:
829 case SHADER_OPCODE_TXL:
830 case SHADER_OPCODE_TXS:
831 case SHADER_OPCODE_LOD:
832 return 1;
833 case FS_OPCODE_FB_WRITE:
834 return 2;
835 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
836 case SHADER_OPCODE_GEN4_SCRATCH_READ:
837 return 1;
838 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
839 return inst->mlen;
840 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
841 return 2;
842 case SHADER_OPCODE_UNTYPED_ATOMIC:
843 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
844 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
845 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
846 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
847 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
848 return 0;
849 default:
850 unreachable("not reached");
851 }
852 }
853
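/** Allocate a new virtual GRF of the given size (in registers) and return its
 * index, growing the virtual_grf_sizes array as needed.
 */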
854 int
855 fs_visitor::virtual_grf_alloc(int size)
856 {
857 if (virtual_grf_array_size <= virtual_grf_count) {
858 if (virtual_grf_array_size == 0)
859 virtual_grf_array_size = 16;
860 else
861 virtual_grf_array_size *= 2;
862 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
863 virtual_grf_array_size);
864 }
865 virtual_grf_sizes[virtual_grf_count] = size;
866 return virtual_grf_count++;
867 }
868
869 /** Fixed HW reg constructor. */
870 fs_reg::fs_reg(enum register_file file, int reg)
871 {
872 init();
873 this->file = file;
874 this->reg = reg;
875 this->type = BRW_REGISTER_TYPE_F;
876 }
877
878 /** Fixed HW reg constructor. */
879 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
880 {
881 init();
882 this->file = file;
883 this->reg = reg;
884 this->type = type;
885 }
886
887 /** Automatic reg constructor. */
888 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
889 {
890 init();
891
892 this->file = GRF;
893 this->reg = v->virtual_grf_alloc(v->type_size(type));
894 this->reg_offset = 0;
895 this->type = brw_type_for_base_type(type);
896 }
897
898 fs_reg *
899 fs_visitor::variable_storage(ir_variable *var)
900 {
901 return (fs_reg *)hash_table_find(this->variable_ht, var);
902 }
903
904 void
905 import_uniforms_callback(const void *key,
906 void *data,
907 void *closure)
908 {
909 struct hash_table *dst_ht = (struct hash_table *)closure;
910 const fs_reg *reg = (const fs_reg *)data;
911
912 if (reg->file != UNIFORM)
913 return;
914
915 hash_table_insert(dst_ht, data, key);
916 }
917
918 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
919  * This brings in those uniform definitions.
920 */
921 void
922 fs_visitor::import_uniforms(fs_visitor *v)
923 {
924 hash_table_call_foreach(v->variable_ht,
925 import_uniforms_callback,
926 variable_ht);
927 this->push_constant_loc = v->push_constant_loc;
928 this->pull_constant_loc = v->pull_constant_loc;
929 this->uniforms = v->uniforms;
930 this->param_size = v->param_size;
931 }
932
933 /* Our support for uniforms is piggy-backed on the struct
934 * gl_fragment_program, because that's where the values actually
935 * get stored, rather than in some global gl_shader_program uniform
936 * store.
937 */
938 void
939 fs_visitor::setup_uniform_values(ir_variable *ir)
940 {
941 int namelen = strlen(ir->name);
942
943 /* The data for our (non-builtin) uniforms is stored in a series of
944 * gl_uniform_driver_storage structs for each subcomponent that
945 * glGetUniformLocation() could name. We know it's been set up in the same
946 * order we'd walk the type, so walk the list of storage and find anything
947 * with our name, or the prefix of a component that starts with our name.
948 */
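   /* Illustrative examples (names assumed): a "uniform vec4 color[2]" appears
    * as one storage entry named "color" with array_elements = 2, contributing
    * 2 * 4 = 8 param slots below, while a "uniform struct { vec3 a; float b; } s"
    * appears as entries "s.a" and "s.b", both of which pass the prefix test
    * against "s".
    */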
949 unsigned params_before = uniforms;
950 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
951 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
952
953 if (strncmp(ir->name, storage->name, namelen) != 0 ||
954 (storage->name[namelen] != 0 &&
955 storage->name[namelen] != '.' &&
956 storage->name[namelen] != '[')) {
957 continue;
958 }
959
960 unsigned slots = storage->type->component_slots();
961 if (storage->array_elements)
962 slots *= storage->array_elements;
963
964 for (unsigned i = 0; i < slots; i++) {
965 stage_prog_data->param[uniforms++] = &storage->storage[i];
966 }
967 }
968
969 /* Make sure we actually initialized the right amount of stuff here. */
970 assert(params_before + ir->type->component_slots() == uniforms);
971 (void)params_before;
972 }
973
974
975 /* Our support for builtin uniforms is even scarier than non-builtin.
976 * It sits on top of the PROG_STATE_VAR parameters that are
977 * automatically updated from GL context state.
978 */
979 void
980 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
981 {
982 const ir_state_slot *const slots = ir->state_slots;
983 assert(ir->state_slots != NULL);
984
985 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
986 /* This state reference has already been setup by ir_to_mesa, but we'll
987 * get the same index back here.
988 */
989 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
990 (gl_state_index *)slots[i].tokens);
991
992 /* Add each of the unique swizzles of the element as a parameter.
993 * This'll end up matching the expected layout of the
994 * array/matrix/structure we're trying to fill in.
995 */
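      /* For instance (assumed swizzles): a vec4 state value exposed as XYZW
       * contributes four params here, while a scalar exposed as XXXX stops
       * after the first iteration because the swizzle repeats.
       */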
996 int last_swiz = -1;
997 for (unsigned int j = 0; j < 4; j++) {
998 int swiz = GET_SWZ(slots[i].swizzle, j);
999 if (swiz == last_swiz)
1000 break;
1001 last_swiz = swiz;
1002
1003 stage_prog_data->param[uniforms++] =
1004 &fp->Base.Parameters->ParameterValues[index][swiz];
1005 }
1006 }
1007 }
1008
1009 fs_reg *
1010 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1011 {
1012 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1013 fs_reg wpos = *reg;
1014 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1015
1016 /* gl_FragCoord.x */
1017 if (ir->data.pixel_center_integer) {
1018 emit(MOV(wpos, this->pixel_x));
1019 } else {
1020 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1021 }
1022 wpos.reg_offset++;
1023
1024 /* gl_FragCoord.y */
1025 if (!flip && ir->data.pixel_center_integer) {
1026 emit(MOV(wpos, this->pixel_y));
1027 } else {
1028 fs_reg pixel_y = this->pixel_y;
1029 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1030
1031 if (flip) {
1032 pixel_y.negate = true;
1033 offset += key->drawable_height - 1.0;
1034 }
1035
1036 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1037 }
1038 wpos.reg_offset++;
1039
1040 /* gl_FragCoord.z */
1041 if (brw->gen >= 6) {
1042 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1043 } else {
1044 emit(FS_OPCODE_LINTERP, wpos,
1045 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1046 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1047 interp_reg(VARYING_SLOT_POS, 2));
1048 }
1049 wpos.reg_offset++;
1050
1051 /* gl_FragCoord.w: Already set up in emit_interpolation */
1052 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1053
1054 return reg;
1055 }
1056
1057 fs_inst *
1058 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1059 glsl_interp_qualifier interpolation_mode,
1060 bool is_centroid, bool is_sample)
1061 {
1062 brw_wm_barycentric_interp_mode barycoord_mode;
1063 if (brw->gen >= 6) {
1064 if (is_centroid) {
1065 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1066 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1067 else
1068 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1069 } else if (is_sample) {
1070 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1071 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1072 else
1073 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1074 } else {
1075 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1076 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1077 else
1078 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1079 }
1080 } else {
1081 /* On Ironlake and below, there is only one interpolation mode.
1082 * Centroid interpolation doesn't mean anything on this hardware --
1083 * there is no multisampling.
1084 */
1085 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1086 }
1087 return emit(FS_OPCODE_LINTERP, attr,
1088 this->delta_x[barycoord_mode],
1089 this->delta_y[barycoord_mode], interp);
1090 }
1091
1092 fs_reg *
1093 fs_visitor::emit_general_interpolation(ir_variable *ir)
1094 {
1095 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1096 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1097 fs_reg attr = *reg;
1098
1099 unsigned int array_elements;
1100 const glsl_type *type;
1101
1102 if (ir->type->is_array()) {
1103 array_elements = ir->type->length;
1104 if (array_elements == 0) {
1105 fail("dereferenced array '%s' has length 0\n", ir->name);
1106 }
1107 type = ir->type->fields.array;
1108 } else {
1109 array_elements = 1;
1110 type = ir->type;
1111 }
1112
1113 glsl_interp_qualifier interpolation_mode =
1114 ir->determine_interpolation_mode(key->flat_shade);
1115
1116 int location = ir->data.location;
1117 for (unsigned int i = 0; i < array_elements; i++) {
1118 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1119 if (prog_data->urb_setup[location] == -1) {
1120 /* If there's no incoming setup data for this slot, don't
1121 * emit interpolation for it.
1122 */
1123 attr.reg_offset += type->vector_elements;
1124 location++;
1125 continue;
1126 }
1127
1128 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1129 /* Constant interpolation (flat shading) case. The SF has
1130 * handed us defined values in only the constant offset
1131 * field of the setup reg.
1132 */
1133 for (unsigned int k = 0; k < type->vector_elements; k++) {
1134 struct brw_reg interp = interp_reg(location, k);
1135 interp = suboffset(interp, 3);
1136 interp.type = reg->type;
1137 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1138 attr.reg_offset++;
1139 }
1140 } else {
1141 /* Smooth/noperspective interpolation case. */
1142 for (unsigned int k = 0; k < type->vector_elements; k++) {
1143 struct brw_reg interp = interp_reg(location, k);
1144 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1145 /* Get the pixel/sample mask into f0 so that we know
1146 * which pixels are lit. Then, for each channel that is
1147 * unlit, replace the centroid data with non-centroid
1148 * data.
1149 */
1150 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1151
1152 fs_inst *inst;
1153 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1154 false, false);
1155 inst->predicate = BRW_PREDICATE_NORMAL;
1156 inst->predicate_inverse = true;
1157 if (brw->has_pln)
1158 inst->no_dd_clear = true;
1159
1160 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1161 ir->data.centroid && !key->persample_shading,
1162 ir->data.sample || key->persample_shading);
1163 inst->predicate = BRW_PREDICATE_NORMAL;
1164 inst->predicate_inverse = false;
1165 if (brw->has_pln)
1166 inst->no_dd_check = true;
1167
1168 } else {
1169 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1170 ir->data.centroid && !key->persample_shading,
1171 ir->data.sample || key->persample_shading);
1172 }
1173 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1174 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1175 }
1176 attr.reg_offset++;
1177 }
1178
1179 }
1180 location++;
1181 }
1182 }
1183
1184 return reg;
1185 }
1186
1187 fs_reg *
1188 fs_visitor::emit_frontfacing_interpolation()
1189 {
1190 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1191
1192 if (brw->gen >= 6) {
1193 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1194 * a boolean result from this (~0/true or 0/false).
1195 *
1196 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1197 * this task in only one instruction:
1198 * - a negation source modifier will flip the bit; and
1199 * - a W -> D type conversion will sign extend the bit into the high
1200 * word of the destination.
1201 *
1202 * An ASR 15 fills the low word of the destination.
1203 */
1204 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1205 g0.negate = true;
1206
1207 emit(ASR(*reg, g0, fs_reg(15)));
1208 } else {
1209 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1210 * a boolean result from this (1/true or 0/false).
1211 *
1212 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1213 * the negation source modifier to flip it. Unfortunately the SHR
1214 * instruction only operates on UD (or D with an abs source modifier)
1215 * sources without negation.
1216 *
1217 * Instead, use ASR (which will give ~0/true or 0/false) followed by an
1218 * AND 1.
1219 */
1220 fs_reg asr = fs_reg(this, glsl_type::bool_type);
1221 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1222 g1_6.negate = true;
1223
1224 emit(ASR(asr, g1_6, fs_reg(31)));
1225 emit(AND(*reg, asr, fs_reg(1)));
1226 }
1227
1228 return reg;
1229 }
1230
1231 void
1232 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1233 {
1234 assert(dst.type == BRW_REGISTER_TYPE_F);
1235
1236 if (key->compute_pos_offset) {
1237 /* Convert int_sample_pos to floating point */
1238 emit(MOV(dst, int_sample_pos));
1239 /* Scale to the range [0, 1] */
1240 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1241 }
1242 else {
1243 /* From ARB_sample_shading specification:
1244 * "When rendering to a non-multisample buffer, or if multisample
1245 * rasterization is disabled, gl_SamplePosition will always be
1246 * (0.5, 0.5).
1247     * (0.5, 0.5)."
1248 emit(MOV(dst, fs_reg(0.5f)));
1249 }
1250 }
1251
1252 fs_reg *
1253 fs_visitor::emit_samplepos_setup()
1254 {
1255 assert(brw->gen >= 6);
1256
1257 this->current_annotation = "compute sample position";
1258 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1259 fs_reg pos = *reg;
1260 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1261 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1262
1263 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1264 * mode will be enabled.
1265 *
1266 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1267 * R31.1:0 Position Offset X/Y for Slot[3:0]
1268 * R31.3:2 Position Offset X/Y for Slot[7:4]
1269 * .....
1270 *
1271 * The X, Y sample positions come in as bytes in thread payload. So, read
1272 * the positions using vstride=16, width=8, hstride=2.
1273 */
1274 struct brw_reg sample_pos_reg =
1275 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1276 BRW_REGISTER_TYPE_B), 16, 8, 2);
1277
1278 fs_inst *inst = emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1279 if (dispatch_width == 16) {
1280 inst->force_uncompressed = true;
1281 inst = emit(MOV(half(int_sample_x, 1),
1282 fs_reg(suboffset(sample_pos_reg, 16))));
1283 inst->force_sechalf = true;
1284 }
1285 /* Compute gl_SamplePosition.x */
1286 compute_sample_position(pos, int_sample_x);
1287 pos.reg_offset++;
1288 inst = emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1289 if (dispatch_width == 16) {
1290 inst->force_uncompressed = true;
1291 inst = emit(MOV(half(int_sample_y, 1),
1292 fs_reg(suboffset(sample_pos_reg, 17))));
1293 inst->force_sechalf = true;
1294 }
1295 /* Compute gl_SamplePosition.y */
1296 compute_sample_position(pos, int_sample_y);
1297 return reg;
1298 }
1299
1300 fs_reg *
1301 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1302 {
1303 assert(brw->gen >= 6);
1304
1305 this->current_annotation = "compute sample id";
1306 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1307
1308 if (key->compute_sample_id) {
1309 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1310 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1311 t2.type = BRW_REGISTER_TYPE_UW;
1312
1313 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1314 * 8x multisampling, subspan 0 will represent sample N (where N
1315 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1316 * 7. We can find the value of N by looking at R0.0 bits 7:6
1317 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1318 * (since samples are always delivered in pairs). That is, we
1319 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1320 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1321 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1322 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1323 * populating a temporary variable with the sequence (0, 1, 2, 3),
1324 * and then reading from it using vstride=1, width=4, hstride=0.
1325 * These computations hold good for 4x multisampling as well.
1326 *
1327 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1328 * the first four slots are sample 0 of subspan 0; the next four
1329 * are sample 1 of subspan 0; the third group is sample 0 of
1330 * subspan 1, and finally sample 1 of subspan 1.
1331 */
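      /* A worked example with an assumed payload: if R0.0 bits 7:6 read 2,
       * then (R0.0 & 0xc0) >> 5 = 4, and adding the SIMD8 sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) yields sample IDs 4,4,4,4,5,5,5,5 across
       * the eight channels.
       */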
1332 fs_inst *inst;
1333 inst = emit(BRW_OPCODE_AND, t1,
1334 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1335 fs_reg(0xc0));
1336 inst->force_writemask_all = true;
1337 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1338 inst->force_writemask_all = true;
1339 /* This works for both SIMD8 and SIMD16 */
1340 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1341 inst->force_writemask_all = true;
1342 /* This special instruction takes care of setting vstride=1,
1343 * width=4, hstride=0 of t2 during an ADD instruction.
1344 */
1345 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1346 } else {
1347 /* As per GL_ARB_sample_shading specification:
1348 * "When rendering to a non-multisample buffer, or if multisample
1349 * rasterization is disabled, gl_SampleID will always be zero."
1350 */
1351 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1352 }
1353
1354 return reg;
1355 }
1356
1357 fs_reg
1358 fs_visitor::fix_math_operand(fs_reg src)
1359 {
1360 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1361 * might be able to do better by doing execsize = 1 math and then
1362 * expanding that result out, but we would need to be careful with
1363 * masking.
1364 *
1365 * The hardware ignores source modifiers (negate and abs) on math
1366 * instructions, so we also move to a temp to set those up.
1367 */
1368 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1369 !src.abs && !src.negate)
1370 return src;
1371
1372 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1373 * operands to math
1374 */
1375 if (brw->gen >= 7 && src.file != IMM)
1376 return src;
1377
1378 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1379 expanded.type = src.type;
1380 emit(BRW_OPCODE_MOV, expanded, src);
1381 return expanded;
1382 }
1383
1384 fs_inst *
1385 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1386 {
1387 switch (opcode) {
1388 case SHADER_OPCODE_RCP:
1389 case SHADER_OPCODE_RSQ:
1390 case SHADER_OPCODE_SQRT:
1391 case SHADER_OPCODE_EXP2:
1392 case SHADER_OPCODE_LOG2:
1393 case SHADER_OPCODE_SIN:
1394 case SHADER_OPCODE_COS:
1395 break;
1396 default:
1397 unreachable("not reached: bad math opcode");
1398 }
1399
1400 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1401 * might be able to do better by doing execsize = 1 math and then
1402 * expanding that result out, but we would need to be careful with
1403 * masking.
1404 *
1405 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1406 * instructions, so we also move to a temp to set those up.
1407 */
1408 if (brw->gen == 6 || brw->gen == 7)
1409 src = fix_math_operand(src);
1410
1411 fs_inst *inst = emit(opcode, dst, src);
1412
1413 if (brw->gen < 6) {
1414 inst->base_mrf = 2;
1415 inst->mlen = dispatch_width / 8;
1416 }
1417
1418 return inst;
1419 }
1420
1421 fs_inst *
1422 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1423 {
1424 int base_mrf = 2;
1425 fs_inst *inst;
1426
1427 if (brw->gen >= 8) {
1428 inst = emit(opcode, dst, src0, src1);
1429 } else if (brw->gen >= 6) {
1430 src0 = fix_math_operand(src0);
1431 src1 = fix_math_operand(src1);
1432
1433 inst = emit(opcode, dst, src0, src1);
1434 } else {
1435 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1436 * "Message Payload":
1437 *
1438 * "Operand0[7]. For the INT DIV functions, this operand is the
1439 * denominator."
1440 * ...
1441 * "Operand1[7]. For the INT DIV functions, this operand is the
1442 * numerator."
1443 */
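      /* So for INT_QUOTIENT/INT_REMAINDER computing src0 / src1, the
       * denominator (src1) must be sent as operand 0 and the numerator (src0)
       * as operand 1, which is what the swap below arranges.
       */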
1444 bool is_int_div = opcode != SHADER_OPCODE_POW;
1445 fs_reg &op0 = is_int_div ? src1 : src0;
1446 fs_reg &op1 = is_int_div ? src0 : src1;
1447
1448 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1449 inst = emit(opcode, dst, op0, reg_null_f);
1450
1451 inst->base_mrf = base_mrf;
1452 inst->mlen = 2 * dispatch_width / 8;
1453 }
1454 return inst;
1455 }
1456
1457 void
1458 fs_visitor::assign_curb_setup()
1459 {
1460 if (dispatch_width == 8) {
1461 prog_data->base.dispatch_grf_start_reg = payload.num_regs;
1462 } else {
1463 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1464 }
1465
1466 prog_data->base.curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
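   /* For example (illustrative count): 20 push-constant components round up
    * to ALIGN(20, 8) / 8 = 3 registers of CURBE read length, since each
    * register holds 8 dwords.
    */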
1467
1468 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1469 foreach_in_list(fs_inst, inst, &instructions) {
1470 for (unsigned int i = 0; i < inst->sources; i++) {
1471 if (inst->src[i].file == UNIFORM) {
1472 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1473 int constant_nr;
1474 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1475 constant_nr = push_constant_loc[uniform_nr];
1476 } else {
1477 /* Section 5.11 of the OpenGL 4.1 spec says:
1478 * "Out-of-bounds reads return undefined values, which include
1479 * values from other variables of the active program or zero."
1480 * Just return the first push constant.
1481 */
1482 constant_nr = 0;
1483 }
1484
1485 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1486 constant_nr / 8,
1487 constant_nr % 8);
1488
1489 inst->src[i].file = HW_REG;
1490 inst->src[i].fixed_hw_reg = byte_offset(
1491 retype(brw_reg, inst->src[i].type),
1492 inst->src[i].subreg_offset);
1493 }
1494 }
1495 }
1496 }
1497
1498 void
1499 fs_visitor::calculate_urb_setup()
1500 {
1501 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1502 prog_data->urb_setup[i] = -1;
1503 }
1504
1505 int urb_next = 0;
1506 /* Figure out where each of the incoming setup attributes lands. */
1507 if (brw->gen >= 6) {
1508 if (_mesa_bitcount_64(fp->Base.InputsRead &
1509 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1510 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1511 * first 16 varying inputs, so we can put them wherever we want.
1512 * Just put them in order.
1513 *
1514 * This is useful because it means that (a) inputs not used by the
1515 * fragment shader won't take up valuable register space, and (b) we
1516 * won't have to recompile the fragment shader if it gets paired with
1517 * a different vertex (or geometry) shader.
1518 */
1519 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1520 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1521 BITFIELD64_BIT(i)) {
1522 prog_data->urb_setup[i] = urb_next++;
1523 }
1524 }
1525 } else {
1526 /* We have enough input varyings that the SF/SBE pipeline stage can't
1527 * arbitrarily rearrange them to suit our whim; we have to put them
1528 * in an order that matches the output of the previous pipeline stage
1529 * (geometry or vertex shader).
1530 */
1531 struct brw_vue_map prev_stage_vue_map;
1532 brw_compute_vue_map(brw, &prev_stage_vue_map,
1533 key->input_slots_valid);
1534 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1535 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1536 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1537 slot++) {
1538 int varying = prev_stage_vue_map.slot_to_varying[slot];
1539 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1540 * unused.
1541 */
1542 if (varying != BRW_VARYING_SLOT_COUNT &&
1543 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1544 BITFIELD64_BIT(varying))) {
1545 prog_data->urb_setup[varying] = slot - first_slot;
1546 }
1547 }
1548 urb_next = prev_stage_vue_map.num_slots - first_slot;
1549 }
1550 } else {
1551 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1552 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1553 /* Point size is packed into the header, not as a general attribute */
1554 if (i == VARYING_SLOT_PSIZ)
1555 continue;
1556
1557 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1558 /* The back color slot is skipped when the front color is
1559 * also written to. In addition, some slots can be
1560 * written in the vertex shader and not read in the
1561 * fragment shader. So the register number must always be
1562 * incremented, mapped or not.
1563 */
1564 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1565 prog_data->urb_setup[i] = urb_next;
1566 urb_next++;
1567 }
1568 }
1569
1570 /*
1571     * It's an FS-only attribute, and we did interpolation for this attribute
1572     * in the SF thread. So, count it here, too.
1573 *
1574 * See compile_sf_prog() for more info.
1575 */
1576 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1577 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1578 }
1579
1580 prog_data->num_varying_inputs = urb_next;
1581 }
1582
1583 void
1584 fs_visitor::assign_urb_setup()
1585 {
1586 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1587
1588 /* Offset all the urb_setup[] index by the actual position of the
1589 * setup regs, now that the location of the constants has been chosen.
1590 */
1591 foreach_in_list(fs_inst, inst, &instructions) {
1592 if (inst->opcode == FS_OPCODE_LINTERP) {
1593 assert(inst->src[2].file == HW_REG);
1594 inst->src[2].fixed_hw_reg.nr += urb_start;
1595 }
1596
1597 if (inst->opcode == FS_OPCODE_CINTERP) {
1598 assert(inst->src[0].file == HW_REG);
1599 inst->src[0].fixed_hw_reg.nr += urb_start;
1600 }
1601 }
1602
1603 /* Each attribute is 4 setup channels, each of which is half a reg. */
1604 this->first_non_payload_grf =
1605 urb_start + prog_data->num_varying_inputs * 2;
1606 }
1607
1608 /**
1609 * Split large virtual GRFs into separate components if we can.
1610 *
1611  * This mostly duplicates what brw_fs_vector_splitting does,
1612 * but that's really conservative because it's afraid of doing
1613 * splitting that doesn't result in real progress after the rest of
1614 * the optimization phases, which would cause infinite looping in
1615 * optimization. We can do it once here, safely. This also has the
1616 * opportunity to split interpolated values, or maybe even uniforms,
1617 * which we don't have at the IR level.
1618 *
1619 * We want to split, because virtual GRFs are what we register
1620 * allocate and spill (due to contiguousness requirements for some
1621 * instructions), and they're what we naturally generate in the
1622 * codegen process, but most virtual GRFs don't actually need to be
1623 * contiguous sets of GRFs. If we split, we'll end up with reduced
1624 * live intervals and better dead code elimination and coalescing.
1625 */
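/* As an illustration: a virtual GRF holding a vec4 temporary occupies four
 * contiguous registers; splitting it into four size-1 virtual GRFs lets each
 * component be allocated, coalesced, and dead-code-eliminated independently.
 */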
1626 void
1627 fs_visitor::split_virtual_grfs()
1628 {
1629 int num_vars = this->virtual_grf_count;
1630 bool split_grf[num_vars];
1631 int new_virtual_grf[num_vars];
1632
1633    /* Try to split anything larger than one register. */
1634 for (int i = 0; i < num_vars; i++) {
1635 if (this->virtual_grf_sizes[i] != 1)
1636 split_grf[i] = true;
1637 else
1638 split_grf[i] = false;
1639 }
1640
1641 if (brw->has_pln &&
1642 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1643 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1644 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1645 * Gen6, that was the only supported interpolation mode, and since Gen6,
1646 * delta_x and delta_y are in fixed hardware registers.
1647 */
1648 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1649 false;
1650 }
1651
1652 foreach_in_list(fs_inst, inst, &instructions) {
1653 /* If there's a SEND message that requires contiguous destination
1654 * registers, no splitting is allowed.
1655 */
1656 if (inst->regs_written > 1) {
1657 split_grf[inst->dst.reg] = false;
1658 }
1659
1660 /* If we're sending from a GRF, don't split it, on the assumption that
1661 * the send is reading the whole thing.
1662 */
1663 if (inst->is_send_from_grf()) {
1664 for (int i = 0; i < inst->sources; i++) {
1665 if (inst->src[i].file == GRF) {
1666 split_grf[inst->src[i].reg] = false;
1667 }
1668 }
1669 }
1670 }
1671
1672 /* Allocate new space for split regs. Note that the virtual
1673 * numbers will be contiguous.
1674 */
1675 for (int i = 0; i < num_vars; i++) {
1676 if (split_grf[i]) {
1677 new_virtual_grf[i] = virtual_grf_alloc(1);
1678 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1679 int reg = virtual_grf_alloc(1);
1680 assert(reg == new_virtual_grf[i] + j - 1);
1681 (void) reg;
1682 }
1683 this->virtual_grf_sizes[i] = 1;
1684 }
1685 }
1686
1687 foreach_in_list(fs_inst, inst, &instructions) {
1688 if (inst->dst.file == GRF &&
1689 split_grf[inst->dst.reg] &&
1690 inst->dst.reg_offset != 0) {
1691 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1692 inst->dst.reg_offset - 1);
1693 inst->dst.reg_offset = 0;
1694 }
1695 for (int i = 0; i < inst->sources; i++) {
1696 if (inst->src[i].file == GRF &&
1697 split_grf[inst->src[i].reg] &&
1698 inst->src[i].reg_offset != 0) {
1699 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1700 inst->src[i].reg_offset - 1);
1701 inst->src[i].reg_offset = 0;
1702 }
1703 }
1704 }
1705 invalidate_live_intervals(false);
1706 }
1707
1708 /**
1709 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1710 *
1711 * During code generation, we create tons of temporary variables, many of
1712 * which get immediately killed and are never used again. Yet, in later
1713 * optimization and analysis passes, such as compute_live_intervals, we need
1714 * to loop over all the virtual GRFs. Compacting them can save a lot of
1715 * overhead.
1716 */
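/* For example (illustrative numbering): if only virtual GRFs 0, 2 and 5 are
 * referenced out of 6, remap_table ends up as {0, -1, 1, -1, -1, 2} and
 * virtual_grf_count drops to 3.
 */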
1717 void
1718 fs_visitor::compact_virtual_grfs()
1719 {
1720 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1721 return;
1722
1723 /* Mark which virtual GRFs are used, and count how many. */
1724 int remap_table[this->virtual_grf_count];
1725 memset(remap_table, -1, sizeof(remap_table));
1726
1727 foreach_in_list(const fs_inst, inst, &instructions) {
1728 if (inst->dst.file == GRF)
1729 remap_table[inst->dst.reg] = 0;
1730
1731 for (int i = 0; i < inst->sources; i++) {
1732 if (inst->src[i].file == GRF)
1733 remap_table[inst->src[i].reg] = 0;
1734 }
1735 }
1736
1737 /* Compact the GRF arrays. */
1738 int new_index = 0;
1739 for (int i = 0; i < this->virtual_grf_count; i++) {
1740 if (remap_table[i] != -1) {
1741 remap_table[i] = new_index;
1742 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1743 invalidate_live_intervals(false);
1744 ++new_index;
1745 }
1746 }
1747
1748 this->virtual_grf_count = new_index;
1749
1750 /* Patch all the instructions to use the newly renumbered registers */
1751 foreach_in_list(fs_inst, inst, &instructions) {
1752 if (inst->dst.file == GRF)
1753 inst->dst.reg = remap_table[inst->dst.reg];
1754
1755 for (int i = 0; i < inst->sources; i++) {
1756 if (inst->src[i].file == GRF)
1757 inst->src[i].reg = remap_table[inst->src[i].reg];
1758 }
1759 }
1760
1761 /* Patch all the references to delta_x/delta_y, since they're used in
1762 * register allocation.
1763 */
1764 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1765 if (delta_x[i].file == GRF && remap_table[delta_x[i].reg] != -1) {
1766 delta_x[i].reg = remap_table[delta_x[i].reg];
1767 }
1768 }
1769 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
1770 if (delta_y[i].file == GRF && remap_table[delta_y[i].reg] != -1) {
1771 delta_y[i].reg = remap_table[delta_y[i].reg];
1772 }
1773 }
1774 }
1775
1776 /*
1777 * Implements array access of uniforms by inserting a
1778 * PULL_CONSTANT_LOAD instruction.
1779 *
1780 * Unlike temporary GRF array access (where we don't support it due to
1781 * the difficulty of doing relative addressing on instruction
1782 * destinations), we could potentially do array access of uniforms
1783 * that were loaded in GRF space as push constants. In real-world
1784 * usage we've seen, though, the arrays being used are always larger
1785 * than we could load as push constants, so just always move all
1786 * uniform array access out to a pull constant buffer.
1787 */
1788 void
1789 fs_visitor::move_uniform_array_access_to_pull_constants()
1790 {
1791 if (dispatch_width != 8)
1792 return;
1793
1794 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1795
1796 for (unsigned int i = 0; i < uniforms; i++) {
1797 pull_constant_loc[i] = -1;
1798 }
1799
1800 /* Walk through and find array access of uniforms. Put a copy of that
1801 * uniform in the pull constant buffer.
1802 *
1803 * Note that we don't move constant-indexed accesses to arrays. No
1804 * testing has been done of the performance impact of this choice.
1805 */
1806 foreach_in_list_safe(fs_inst, inst, &instructions) {
1807 for (int i = 0 ; i < inst->sources; i++) {
1808 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1809 continue;
1810
1811 int uniform = inst->src[i].reg;
1812
1813 /* If this array isn't already present in the pull constant buffer,
1814 * add it.
1815 */
1816 if (pull_constant_loc[uniform] == -1) {
1817 const gl_constant_value **values = &stage_prog_data->param[uniform];
1818
1819 assert(param_size[uniform]);
1820
1821 for (int j = 0; j < param_size[uniform]; j++) {
1822 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1823
1824 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1825 values[j];
1826 }
1827 }
1828 }
1829 }
1830 }
1831
1832 /**
1833 * Assign UNIFORM file registers to either push constants or pull constants.
1834 *
1835 * We allow a fragment shader to have more than the specified minimum
1836 * maximum number of fragment shader uniform components (64). If
1837 * there are too many of these, they'd fill up all of register space.
1838 * So, this will push some of them out to the pull constant buffer and
1839 * update the program to load them.
1840 */
1841 void
1842 fs_visitor::assign_constant_locations()
1843 {
1844 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1845 if (dispatch_width != 8)
1846 return;
1847
1848 /* Find which UNIFORM registers are still in use. */
1849 bool is_live[uniforms];
1850 for (unsigned int i = 0; i < uniforms; i++) {
1851 is_live[i] = false;
1852 }
1853
1854 foreach_in_list(fs_inst, inst, &instructions) {
1855 for (int i = 0; i < inst->sources; i++) {
1856 if (inst->src[i].file != UNIFORM)
1857 continue;
1858
1859 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1860 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1861 is_live[constant_nr] = true;
1862 }
1863 }
1864
1865 /* Only allow 16 registers (128 uniform components) as push constants.
1866 *
1867 * Just demote the end of the list. We could probably do better
1868 * here, demoting things that are rarely used in the program first.
1869 *
1870 * If changing this value, note the limitation about total_regs in
1871 * brw_curbe.c.
1872 */
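/* The arithmetic behind the limit below: 16 registers * 8 float components
 * per register = 128 components. A shader with, say, 200 live uniform
 * components would keep the first 128 as push constants and demote the
 * remaining 72 to the pull buffer.
 */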
1873 unsigned int max_push_components = 16 * 8;
1874 unsigned int num_push_constants = 0;
1875
1876 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1877
1878 for (unsigned int i = 0; i < uniforms; i++) {
1879 if (!is_live[i] || pull_constant_loc[i] != -1) {
1880 /* This UNIFORM register is either dead, or has already been demoted
1881 * to a pull const. Mark it as no longer living in the param[] array.
1882 */
1883 push_constant_loc[i] = -1;
1884 continue;
1885 }
1886
1887 if (num_push_constants < max_push_components) {
1888 /* Retain as a push constant. Record the location in the param[]
1889 * array.
1890 */
1891 push_constant_loc[i] = num_push_constants++;
1892 } else {
1893 /* Demote to a pull constant. */
1894 push_constant_loc[i] = -1;
1895
1896 int pull_index = stage_prog_data->nr_pull_params++;
1897 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1898 pull_constant_loc[i] = pull_index;
1899 }
1900 }
1901
1902 stage_prog_data->nr_params = num_push_constants;
1903
1904 /* Up until now, the param[] array has been indexed by reg + reg_offset
1905 * of UNIFORM registers. Condense it to only contain the uniforms we
1906 * chose to upload as push constants.
1907 */
1908 for (unsigned int i = 0; i < uniforms; i++) {
1909 int remapped = push_constant_loc[i];
1910
1911 if (remapped == -1)
1912 continue;
1913
1914 assert(remapped <= (int)i);
1915 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1916 }
1917 }
1918
1919 /**
1920 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1921 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1922 */
1923 void
1924 fs_visitor::demote_pull_constants()
1925 {
1926 calculate_cfg();
1927
1928 foreach_block_and_inst (block, fs_inst, inst, cfg) {
1929 for (int i = 0; i < inst->sources; i++) {
1930 if (inst->src[i].file != UNIFORM)
1931 continue;
1932
1933 int pull_index = pull_constant_loc[inst->src[i].reg +
1934 inst->src[i].reg_offset];
1935 if (pull_index == -1)
1936 continue;
1937
1938 /* Set up the annotation tracking for new generated instructions. */
1939 base_ir = inst->ir;
1940 current_annotation = inst->annotation;
1941
1942 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1943 fs_reg dst = fs_reg(this, glsl_type::float_type);
1944
1945 /* Generate a pull load into dst. */
1946 if (inst->src[i].reladdr) {
1947 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1948 surf_index,
1949 *inst->src[i].reladdr,
1950 pull_index);
1951 inst->insert_before(block, &list);
1952 inst->src[i].reladdr = NULL;
1953 } else {
1954 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1955 fs_inst *pull =
1956 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1957 dst, surf_index, offset);
1958 inst->insert_before(block, pull);
1959 inst->src[i].set_smear(pull_index & 3);
1960 }
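/* Worked example of the constant-offset path above: for pull_index == 5
 * the byte offset is (5 * 4) & ~15 == 16, i.e. the second vec4 of the
 * pull buffer, and set_smear(5 & 3) == 1 picks the second dword out of
 * the four that were loaded.
 */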
1961
1962 /* Rewrite the instruction to use the temporary VGRF. */
1963 inst->src[i].file = GRF;
1964 inst->src[i].reg = dst.reg;
1965 inst->src[i].reg_offset = 0;
1966 }
1967 }
1968 invalidate_live_intervals(false);
1969 }
1970
1971 bool
1972 fs_visitor::opt_algebraic()
1973 {
1974 bool progress = false;
1975
1976 foreach_in_list(fs_inst, inst, &instructions) {
1977 switch (inst->opcode) {
1978 case BRW_OPCODE_MUL:
1979 if (inst->src[1].file != IMM)
1980 continue;
1981
1982 /* a * 1.0 = a */
1983 if (inst->src[1].is_one()) {
1984 inst->opcode = BRW_OPCODE_MOV;
1985 inst->src[1] = reg_undef;
1986 progress = true;
1987 break;
1988 }
1989
1990 /* a * 0.0 = 0.0 */
1991 if (inst->src[1].is_zero()) {
1992 inst->opcode = BRW_OPCODE_MOV;
1993 inst->src[0] = inst->src[1];
1994 inst->src[1] = reg_undef;
1995 progress = true;
1996 break;
1997 }
1998
1999 break;
2000 case BRW_OPCODE_ADD:
2001 if (inst->src[1].file != IMM)
2002 continue;
2003
2004 /* a + 0.0 = a */
2005 if (inst->src[1].is_zero()) {
2006 inst->opcode = BRW_OPCODE_MOV;
2007 inst->src[1] = reg_undef;
2008 progress = true;
2009 break;
2010 }
2011 break;
2012 case BRW_OPCODE_OR:
2013 if (inst->src[0].equals(inst->src[1])) {
2014 inst->opcode = BRW_OPCODE_MOV;
2015 inst->src[1] = reg_undef;
2016 progress = true;
2017 break;
2018 }
2019 break;
2020 case BRW_OPCODE_LRP:
2021 if (inst->src[1].equals(inst->src[2])) {
2022 inst->opcode = BRW_OPCODE_MOV;
2023 inst->src[0] = inst->src[1];
2024 inst->src[1] = reg_undef;
2025 inst->src[2] = reg_undef;
2026 progress = true;
2027 break;
2028 }
2029 break;
2030 case BRW_OPCODE_SEL:
2031 if (inst->src[0].equals(inst->src[1])) {
2032 inst->opcode = BRW_OPCODE_MOV;
2033 inst->src[1] = reg_undef;
2034 inst->predicate = BRW_PREDICATE_NONE;
2035 inst->predicate_inverse = false;
2036 progress = true;
2037 } else if (inst->saturate && inst->src[1].file == IMM) {
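/* A saturating SEL against an immediate bound that the saturate already
 * enforces is redundant. For example, "sel.sat.l dst, x, 1.0F" computes
 * min(x, 1.0) and then clamps to [0, 1], which is the same as
 * "mov.sat dst, x"; the GE/G case below is the mirror image with a bound
 * of 0.0F or less.
 */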
2038 switch (inst->conditional_mod) {
2039 case BRW_CONDITIONAL_LE:
2040 case BRW_CONDITIONAL_L:
2041 switch (inst->src[1].type) {
2042 case BRW_REGISTER_TYPE_F:
2043 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2044 inst->opcode = BRW_OPCODE_MOV;
2045 inst->src[1] = reg_undef;
2046 progress = true;
2047 }
2048 break;
2049 default:
2050 break;
2051 }
2052 break;
2053 case BRW_CONDITIONAL_GE:
2054 case BRW_CONDITIONAL_G:
2055 switch (inst->src[1].type) {
2056 case BRW_REGISTER_TYPE_F:
2057 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2058 inst->opcode = BRW_OPCODE_MOV;
2059 inst->src[1] = reg_undef;
2060 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2061 progress = true;
2062 }
2063 break;
2064 default:
2065 break;
2066 }
2067 default:
2068 break;
2069 }
2070 }
2071 break;
2072 default:
2073 break;
2074 }
2075 }
2076
2077 return progress;
2078 }
2079
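/* Roughly: when a single-register VGRF is completely rewritten outside of
 * control flow, give the new value a fresh VGRF and patch later uses, so
 * that unrelated values stop sharing one register and their live ranges
 * shrink.
 */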
2080 bool
2081 fs_visitor::opt_register_renaming()
2082 {
2083 bool progress = false;
2084 int depth = 0;
2085
2086 int remap[virtual_grf_count];
2087 memset(remap, -1, sizeof(int) * virtual_grf_count);
2088
2089 foreach_in_list(fs_inst, inst, &this->instructions) {
2090 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2091 depth++;
2092 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2093 inst->opcode == BRW_OPCODE_WHILE) {
2094 depth--;
2095 }
2096
2097 /* Rewrite instruction sources. */
2098 for (int i = 0; i < inst->sources; i++) {
2099 if (inst->src[i].file == GRF &&
2100 remap[inst->src[i].reg] != -1 &&
2101 remap[inst->src[i].reg] != inst->src[i].reg) {
2102 inst->src[i].reg = remap[inst->src[i].reg];
2103 progress = true;
2104 }
2105 }
2106
2107 const int dst = inst->dst.reg;
2108
2109 if (depth == 0 &&
2110 inst->dst.file == GRF &&
2111 virtual_grf_sizes[inst->dst.reg] == 1 &&
2112 !inst->is_partial_write()) {
2113 if (remap[dst] == -1) {
2114 remap[dst] = dst;
2115 } else {
2116 remap[dst] = virtual_grf_alloc(1);
2117 inst->dst.reg = remap[dst];
2118 progress = true;
2119 }
2120 } else if (inst->dst.file == GRF &&
2121 remap[dst] != -1 &&
2122 remap[dst] != dst) {
2123 inst->dst.reg = remap[dst];
2124 progress = true;
2125 }
2126 }
2127
2128 if (progress) {
2129 invalidate_live_intervals();
2130
2131 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2132 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2133 delta_x[i].reg = remap[delta_x[i].reg];
2134 }
2135 }
2136 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2137 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2138 delta_y[i].reg = remap[delta_y[i].reg];
2139 }
2140 }
2141 }
2142
2143 return progress;
2144 }
2145
2146 bool
2147 fs_visitor::compute_to_mrf()
2148 {
2149 bool progress = false;
2150 int next_ip = 0;
2151
2152 calculate_live_intervals();
2153
2154 foreach_in_list_safe(fs_inst, inst, &instructions) {
2155 int ip = next_ip;
2156 next_ip++;
2157
2158 if (inst->opcode != BRW_OPCODE_MOV ||
2159 inst->is_partial_write() ||
2160 inst->dst.file != MRF || inst->src[0].file != GRF ||
2161 inst->dst.type != inst->src[0].type ||
2162 inst->src[0].abs || inst->src[0].negate ||
2163 !inst->src[0].is_contiguous() ||
2164 inst->src[0].subreg_offset)
2165 continue;
2166
2167 /* Work out which hardware MRF registers are written by this
2168 * instruction.
2169 */
2170 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2171 int mrf_high;
2172 if (inst->dst.reg & BRW_MRF_COMPR4) {
2173 mrf_high = mrf_low + 4;
2174 } else if (dispatch_width == 16 &&
2175 (!inst->force_uncompressed && !inst->force_sechalf)) {
2176 mrf_high = mrf_low + 1;
2177 } else {
2178 mrf_high = mrf_low;
2179 }
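/* For example, a plain SIMD16 write to m4 spans m4..m5, hence
 * mrf_high = mrf_low + 1; a COMPR4 write presumably places its second
 * half four registers up (m4 and m8), hence mrf_low + 4 in that case.
 */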
2180
2181 /* Can't compute-to-MRF this GRF if someone else was going to
2182 * read it later.
2183 */
2184 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2185 continue;
2186
2187 /* Found a move of a GRF to a MRF. Let's see if we can go
2188 * rewrite the thing that made this GRF to write into the MRF.
2189 */
2190 fs_inst *scan_inst;
2191 for (scan_inst = (fs_inst *)inst->prev;
2192 !scan_inst->is_head_sentinel();
2193 scan_inst = (fs_inst *)scan_inst->prev) {
2194 if (scan_inst->dst.file == GRF &&
2195 scan_inst->dst.reg == inst->src[0].reg) {
2196 /* Found the last thing to write our reg we want to turn
2197 * into a compute-to-MRF.
2198 */
2199
2200 /* If this one instruction didn't populate all the
2201 * channels, bail. We might be able to rewrite everything
2202 * that writes that reg, but it would require smarter
2203 * tracking to delay the rewriting until complete success.
2204 */
2205 if (scan_inst->is_partial_write())
2206 break;
2207
2208 /* Things returning more than one register would need us to
2209 * understand coalescing out more than one MOV at a time.
2210 */
2211 if (scan_inst->regs_written > 1)
2212 break;
2213
2214 /* SEND instructions can't have MRF as a destination. */
2215 if (scan_inst->mlen)
2216 break;
2217
2218 if (brw->gen == 6) {
2219 /* gen6 math instructions must have the destination be
2220 * GRF, so no compute-to-MRF for them.
2221 */
2222 if (scan_inst->is_math()) {
2223 break;
2224 }
2225 }
2226
2227 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2228 /* Found the creator of our MRF's source value. */
2229 scan_inst->dst.file = MRF;
2230 scan_inst->dst.reg = inst->dst.reg;
2231 scan_inst->saturate |= inst->saturate;
2232 inst->remove();
2233 progress = true;
2234 }
2235 break;
2236 }
2237
2238 /* We don't handle control flow here. Most computation of
2239 * values that end up in MRFs are shortly before the MRF
2240 * write anyway.
2241 */
2242 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2243 break;
2244
2245 /* You can't read from an MRF, so if someone else reads our
2246 * MRF's source GRF that we wanted to rewrite, that stops us.
2247 */
2248 bool interfered = false;
2249 for (int i = 0; i < scan_inst->sources; i++) {
2250 if (scan_inst->src[i].file == GRF &&
2251 scan_inst->src[i].reg == inst->src[0].reg &&
2252 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2253 interfered = true;
2254 }
2255 }
2256 if (interfered)
2257 break;
2258
2259 if (scan_inst->dst.file == MRF) {
2260 /* If somebody else writes our MRF here, we can't
2261 * compute-to-MRF before that.
2262 */
2263 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2264 int scan_mrf_high;
2265
2266 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2267 scan_mrf_high = scan_mrf_low + 4;
2268 } else if (dispatch_width == 16 &&
2269 (!scan_inst->force_uncompressed &&
2270 !scan_inst->force_sechalf)) {
2271 scan_mrf_high = scan_mrf_low + 1;
2272 } else {
2273 scan_mrf_high = scan_mrf_low;
2274 }
2275
2276 if (mrf_low == scan_mrf_low ||
2277 mrf_low == scan_mrf_high ||
2278 mrf_high == scan_mrf_low ||
2279 mrf_high == scan_mrf_high) {
2280 break;
2281 }
2282 }
2283
2284 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2285 /* Found a SEND instruction, which means that there are
2286 * live values in MRFs from base_mrf to base_mrf +
2287 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2288 * above it.
2289 */
2290 if (mrf_low >= scan_inst->base_mrf &&
2291 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2292 break;
2293 }
2294 if (mrf_high >= scan_inst->base_mrf &&
2295 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2296 break;
2297 }
2298 }
2299 }
2300 }
2301
2302 if (progress)
2303 invalidate_live_intervals(false);
2304
2305 return progress;
2306 }
2307
2308 /**
2309 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2310 * instructions to FS_OPCODE_REP_FB_WRITE.
2311 */
2312 void
2313 fs_visitor::try_rep_send()
2314 {
2315 int i, count;
2316 fs_inst *start = NULL;
2317
2318 /* From the Ivybridge PRM, Volume 4 Part 1, section 3.9.11.2
2319 * ("Message Descriptor - Render Target Write"):
2320 *
2321 * "SIMD16_REPDATA message must not be used in SIMD8 pixel-shaders."
2322 */
2323 if (dispatch_width != 16)
2324 return;
2325
2326 /* The constant color write message can't handle anything but the 4 color
2327 * values. We could do MRT, but the loops below would need to understand
2328 * handling the header being enabled or disabled on different messages. It
2329 * also requires that the render target be tiled, which might not be the
2330 * case for some EGLImage paths or if we some day do rendering to PBOs.
2331 */
2332 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH) ||
2333 payload.aa_dest_stencil_reg ||
2334 payload.dest_depth_reg ||
2335 dual_src_output.file != BAD_FILE)
2336 return;
2337
2338 /* The optimization is implemented as one pass through the instruction
2339 * list. We keep track of the most recent block of MOVs into sequential
2340 * MRFs from single, sequential float registers (i.e. uniforms). Then when
2341 * we find an FB_WRITE opcode, we see if the payload registers match the
2342 * destination registers in our block of MOVs.
2343 */
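/* A hypothetical instance of the pattern: four MOVs of sequential
 * constant components into m2, m4, m6 and m8 (two MRFs apart, since each
 * SIMD16 color channel occupies two registers), followed by an FB_WRITE
 * whose payload starts at m2, collapse into one vec4 MOV plus a
 * replicated-data write.
 */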
2344 count = 0;
2345 foreach_in_list_safe(fs_inst, inst, &this->instructions) {
2346 if (count == 0)
2347 start = inst;
2348 if (inst->opcode == BRW_OPCODE_MOV &&
2349 inst->dst.file == MRF &&
2350 inst->dst.reg == start->dst.reg + 2 * count &&
2351 inst->src[0].file == HW_REG &&
2352 inst->src[0].reg_offset == start->src[0].reg_offset + count) {
2353 if (count == 0)
2354 start = inst;
2355 count++;
2356 }
2357
2358 if (inst->opcode == FS_OPCODE_FB_WRITE &&
2359 count == 4 &&
2360 (inst->base_mrf == start->dst.reg ||
2361 (inst->base_mrf + 2 == start->dst.reg && inst->header_present))) {
2362 fs_inst *mov = MOV(start->dst, start->src[0]);
2363
2364 /* Make a MOV that moves the four floats into the replicated write
2365 * payload. Since we're running at the very end of code generation
2366 * we can use hw registers and generate the stride and offsets we
2367 * need for this MOV. We use the first of the eight registers
2368 * allocated for the SIMD16 payload for the four floats.
2369 */
2370 mov->dst.fixed_hw_reg =
2371 brw_vec4_reg(BRW_MESSAGE_REGISTER_FILE,
2372 start->dst.reg, 0);
2373 mov->dst.file = HW_REG;
2374 mov->dst.type = mov->dst.fixed_hw_reg.type;
2375
2376 mov->src[0].fixed_hw_reg =
2377 brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2378 mov->src[0].file = HW_REG;
2379 mov->src[0].type = mov->src[0].fixed_hw_reg.type;
2380 mov->force_writemask_all = true;
2381 mov->dst.type = BRW_REGISTER_TYPE_F;
2382
2383 /* Replace the four MOVs with the new vec4 MOV. */
2384 start->insert_before(mov);
2385 for (i = 0; i < 4; i++)
2386 mov->next->remove();
2387
2388 /* Finally, adjust the message length and set the opcode to
2389 * REP_FB_WRITE for the send, so that the generator will use the
2390 * replicated data message type. Then reset count so we'll start
2391 * looking for a new block in case we're in a MRT shader.
2392 */
2393 inst->opcode = FS_OPCODE_REP_FB_WRITE;
2394 inst->mlen -= 7;
2395 count = 0;
2396 }
2397 }
2398
2399 return;
2400 }
2401
2402 /**
2403 * Walks through basic blocks, looking for repeated MRF writes and
2404 * removing the later ones.
2405 */
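/* For example, two identical "mov m3, vgrf5" instructions with no
 * intervening write to m3 or vgrf5 (and no SEND implicitly writing m3)
 * leave the second MOV redundant, so it gets removed.
 */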
2406 bool
2407 fs_visitor::remove_duplicate_mrf_writes()
2408 {
2409 fs_inst *last_mrf_move[16];
2410 bool progress = false;
2411
2412 /* We would need to update the MRF tracking for compressed instructions, so skip SIMD16 for now. */
2413 if (dispatch_width == 16)
2414 return false;
2415
2416 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2417
2418 calculate_cfg();
2419
2420 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2421 if (inst->is_control_flow()) {
2422 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2423 }
2424
2425 if (inst->opcode == BRW_OPCODE_MOV &&
2426 inst->dst.file == MRF) {
2427 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2428 if (prev_inst && inst->equals(prev_inst)) {
2429 inst->remove(block);
2430 progress = true;
2431 continue;
2432 }
2433 }
2434
2435 /* Clear out the last-write records for MRFs that were overwritten. */
2436 if (inst->dst.file == MRF) {
2437 last_mrf_move[inst->dst.reg] = NULL;
2438 }
2439
2440 if (inst->mlen > 0 && inst->base_mrf != -1) {
2441 /* Found a SEND instruction, which will include two or fewer
2442 * implied MRF writes. We could do better here.
2443 */
2444 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2445 last_mrf_move[inst->base_mrf + i] = NULL;
2446 }
2447 }
2448
2449 /* Clear out any MRF move records whose sources got overwritten. */
2450 if (inst->dst.file == GRF) {
2451 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2452 if (last_mrf_move[i] &&
2453 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2454 last_mrf_move[i] = NULL;
2455 }
2456 }
2457 }
2458
2459 if (inst->opcode == BRW_OPCODE_MOV &&
2460 inst->dst.file == MRF &&
2461 inst->src[0].file == GRF &&
2462 !inst->is_partial_write()) {
2463 last_mrf_move[inst->dst.reg] = inst;
2464 }
2465 }
2466
2467 if (progress)
2468 invalidate_live_intervals();
2469
2470 return progress;
2471 }
2472
2473 static void
2474 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2475 int first_grf, int grf_len)
2476 {
2477 bool inst_simd16 = (dispatch_width > 8 &&
2478 !inst->force_uncompressed &&
2479 !inst->force_sechalf);
2480
2481 /* Clear the flag for registers that actually got read (as expected). */
2482 for (int i = 0; i < inst->sources; i++) {
2483 int grf;
2484 if (inst->src[i].file == GRF) {
2485 grf = inst->src[i].reg;
2486 } else if (inst->src[i].file == HW_REG &&
2487 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2488 grf = inst->src[i].fixed_hw_reg.nr;
2489 } else {
2490 continue;
2491 }
2492
2493 if (grf >= first_grf &&
2494 grf < first_grf + grf_len) {
2495 deps[grf - first_grf] = false;
2496 if (inst_simd16)
2497 deps[grf - first_grf + 1] = false;
2498 }
2499 }
2500 }
2501
2502 /**
2503 * Implements this workaround for the original 965:
2504 *
2505 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2506 * check for post destination dependencies on this instruction, software
2507 * must ensure that there is no destination hazard for the case of ‘write
2508 * followed by a posted write’ shown in the following example.
2509 *
2510 * 1. mov r3 0
2511 * 2. send r3.xy <rest of send instruction>
2512 * 3. mov r2 r3
2513 *
2514 * Due to no post-destination dependency check on the ‘send’, the above
2515 * code sequence could have two instructions (1 and 2) in flight at the
2516 * same time that both consider ‘r3’ as the target of their final writes.
2517 */
2518 void
2519 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2520 {
2521 int reg_size = dispatch_width / 8;
2522 int write_len = inst->regs_written * reg_size;
2523 int first_write_grf = inst->dst.reg;
2524 bool needs_dep[BRW_MAX_MRF];
2525 assert(write_len < (int)sizeof(needs_dep) - 1);
2526
2527 memset(needs_dep, false, sizeof(needs_dep));
2528 memset(needs_dep, true, write_len);
2529
2530 clear_deps_for_inst_src(inst, dispatch_width,
2531 needs_dep, first_write_grf, write_len);
2532
2533 /* Walk backwards looking for writes to registers we're writing which
2534 * aren't read since being written. If we hit the start of the program,
2535 * we assume that there are no outstanding dependencies on entry to the
2536 * program.
2537 */
2538 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2539 !scan_inst->is_head_sentinel();
2540 scan_inst = (fs_inst *)scan_inst->prev) {
2541
2542 /* If we hit control flow, assume that there *are* outstanding
2543 * dependencies, and force their cleanup before our instruction.
2544 */
2545 if (scan_inst->is_control_flow()) {
2546 for (int i = 0; i < write_len; i++) {
2547 if (needs_dep[i]) {
2548 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2549 }
2550 }
2551 return;
2552 }
2553
2554 bool scan_inst_simd16 = (dispatch_width > 8 &&
2555 !scan_inst->force_uncompressed &&
2556 !scan_inst->force_sechalf);
2557
2558 /* We insert our reads as late as possible on the assumption that any
2559 * instruction but a MOV that might have left us an outstanding
2560 * dependency has more latency than a MOV.
2561 */
2562 if (scan_inst->dst.file == GRF) {
2563 for (int i = 0; i < scan_inst->regs_written; i++) {
2564 int reg = scan_inst->dst.reg + i * reg_size;
2565
2566 if (reg >= first_write_grf &&
2567 reg < first_write_grf + write_len &&
2568 needs_dep[reg - first_write_grf]) {
2569 inst->insert_before(DEP_RESOLVE_MOV(reg));
2570 needs_dep[reg - first_write_grf] = false;
2571 if (scan_inst_simd16)
2572 needs_dep[reg - first_write_grf + 1] = false;
2573 }
2574 }
2575 }
2576
2577 /* Clear the flag for registers that actually got read (as expected). */
2578 clear_deps_for_inst_src(scan_inst, dispatch_width,
2579 needs_dep, first_write_grf, write_len);
2580
2581 /* Continue the loop only if we haven't resolved all the dependencies */
2582 int i;
2583 for (i = 0; i < write_len; i++) {
2584 if (needs_dep[i])
2585 break;
2586 }
2587 if (i == write_len)
2588 return;
2589 }
2590 }
2591
2592 /**
2593 * Implements this workaround for the original 965:
2594 *
2595 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2596 * used as a destination register until after it has been sourced by an
2597 * instruction with a different destination register.
2598 */
2599 void
2600 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2601 {
2602 int write_len = inst->regs_written * dispatch_width / 8;
2603 int first_write_grf = inst->dst.reg;
2604 bool needs_dep[BRW_MAX_MRF];
2605 assert(write_len < (int)sizeof(needs_dep) - 1);
2606
2607 memset(needs_dep, false, sizeof(needs_dep));
2608 memset(needs_dep, true, write_len);
2609 /* Walk forwards looking for writes to registers we're writing which aren't
2610 * read before being written.
2611 */
2612 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2613 !scan_inst->is_tail_sentinel();
2614 scan_inst = (fs_inst *)scan_inst->next) {
2615 /* If we hit control flow, force resolve all remaining dependencies. */
2616 if (scan_inst->is_control_flow()) {
2617 for (int i = 0; i < write_len; i++) {
2618 if (needs_dep[i])
2619 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2620 }
2621 return;
2622 }
2623
2624 /* Clear the flag for registers that actually got read (as expected). */
2625 clear_deps_for_inst_src(scan_inst, dispatch_width,
2626 needs_dep, first_write_grf, write_len);
2627
2628 /* We insert our reads as late as possible since they're reading the
2629 * result of a SEND, which has massive latency.
2630 */
2631 if (scan_inst->dst.file == GRF &&
2632 scan_inst->dst.reg >= first_write_grf &&
2633 scan_inst->dst.reg < first_write_grf + write_len &&
2634 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2635 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2636 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2637 }
2638
2639 /* Continue the loop only if we haven't resolved all the dependencies */
2640 int i;
2641 for (i = 0; i < write_len; i++) {
2642 if (needs_dep[i])
2643 break;
2644 }
2645 if (i == write_len)
2646 return;
2647 }
2648
2649 /* If we hit the end of the program, resolve all remaining dependencies out
2650 * of paranoia.
2651 */
2652 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2653 assert(last_inst->eot);
2654 for (int i = 0; i < write_len; i++) {
2655 if (needs_dep[i])
2656 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2657 }
2658 }
2659
2660 void
2661 fs_visitor::insert_gen4_send_dependency_workarounds()
2662 {
2663 if (brw->gen != 4 || brw->is_g4x)
2664 return;
2665
2666 bool progress = false;
2667
2668 /* Note that we're done with register allocation, so GRF fs_regs always
2669 * have a .reg_offset of 0.
2670 */
2671
2672 foreach_in_list_safe(fs_inst, inst, &instructions) {
2673 if (inst->mlen != 0 && inst->dst.file == GRF) {
2674 insert_gen4_pre_send_dependency_workarounds(inst);
2675 insert_gen4_post_send_dependency_workarounds(inst);
2676 progress = true;
2677 }
2678 }
2679
2680 if (progress)
2681 invalidate_live_intervals();
2682 }
2683
2684 /**
2685 * Turns the generic expression-style uniform pull constant load instruction
2686 * into a hardware-specific series of instructions for loading a pull
2687 * constant.
2688 *
2689 * The expression style allows the CSE pass before this to optimize out
2690 * repeated loads from the same offset, and gives the pre-register-allocation
2691 * scheduling full flexibility, while the conversion to native instructions
2692 * allows the post-register-allocation scheduler the best information
2693 * possible.
2694 *
2695 * Note that execution masking for setting up pull constant loads is special:
2696 * the channels that need to be written are unrelated to the current execution
2697 * mask, since a later instruction will use one of the result channels as a
2698 * source operand for all 8 or 16 of its channels.
2699 */
2700 void
2701 fs_visitor::lower_uniform_pull_constant_loads()
2702 {
2703 calculate_cfg();
2704
2705 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2706 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2707 continue;
2708
2709 if (brw->gen >= 7) {
2710 /* The offset arg before was a vec4-aligned byte offset. We need to
2711 * turn it into a dword offset.
2712 */
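/* For example, a vec4-aligned byte offset of 32 (the third vec4 in the
 * constant buffer) becomes dword offset 8 after the division below.
 */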
2713 fs_reg const_offset_reg = inst->src[1];
2714 assert(const_offset_reg.file == IMM &&
2715 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2716 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2717 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2718
2719 /* This is actually going to be a MOV, but since only the first dword
2720 * is accessed, we have a special opcode to do just that one. Note
2721 * that this needs to be an operation that will be considered a def
2722 * by live variable analysis, or register allocation will explode.
2723 */
2724 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2725 payload, const_offset_reg);
2726 setup->force_writemask_all = true;
2727
2728 setup->ir = inst->ir;
2729 setup->annotation = inst->annotation;
2730 inst->insert_before(block, setup);
2731
2732 /* Similarly, this will only populate the first 4 channels of the
2733 * result register (since we only use smear values from 0-3), but we
2734 * don't tell the optimizer.
2735 */
2736 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2737 inst->src[1] = payload;
2738
2739 invalidate_live_intervals(false);
2740 } else {
2741 /* Before register allocation, we didn't tell the scheduler about the
2742 * MRF we use. We know it's safe to use this MRF because nothing
2743 * else does except for register spill/unspill, which generates and
2744 * uses its MRF within a single IR instruction.
2745 */
2746 inst->base_mrf = 14;
2747 inst->mlen = 1;
2748 }
2749 }
2750 }
2751
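/* Roughly: expand each SHADER_OPCODE_LOAD_PAYLOAD into one MOV per source,
 * writing consecutive reg_offsets of the destination, with src[0] treated
 * as the optional message header.
 */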
2752 bool
2753 fs_visitor::lower_load_payload()
2754 {
2755 bool progress = false;
2756
2757 calculate_cfg();
2758
2759 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2760 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2761 fs_reg dst = inst->dst;
2762
2763 /* src[0] represents the (optional) message header. */
2764 if (inst->src[0].file != BAD_FILE) {
2765 inst->insert_before(block, MOV(dst, inst->src[0]));
2766 }
2767 dst.reg_offset++;
2768
2769 for (int i = 1; i < inst->sources; i++) {
2770 inst->insert_before(block, MOV(dst, inst->src[i]));
2771 dst.reg_offset++;
2772 }
2773
2774 inst->remove(block);
2775 progress = true;
2776 }
2777 }
2778
2779 if (progress)
2780 invalidate_live_intervals(false);
2781
2782 return progress;
2783 }
2784
2785 void
2786 fs_visitor::dump_instructions()
2787 {
2788 dump_instructions(NULL);
2789 }
2790
2791 void
2792 fs_visitor::dump_instructions(const char *name)
2793 {
2794 calculate_register_pressure();
2795 FILE *file = stderr;
2796 if (name && geteuid() != 0) {
2797 file = fopen(name, "w");
2798 if (!file)
2799 file = stderr;
2800 }
2801
2802 int ip = 0, max_pressure = 0;
2803 foreach_in_list(backend_instruction, inst, &instructions) {
2804 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2805 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2806 dump_instruction(inst, file);
2807 ++ip;
2808 }
2809 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2810
2811 if (file != stderr) {
2812 fclose(file);
2813 }
2814 }
2815
2816 void
2817 fs_visitor::dump_instruction(backend_instruction *be_inst)
2818 {
2819 dump_instruction(be_inst, stderr);
2820 }
2821
2822 void
2823 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2824 {
2825 fs_inst *inst = (fs_inst *)be_inst;
2826
2827 if (inst->predicate) {
2828 fprintf(file, "(%cf0.%d) ",
2829 inst->predicate_inverse ? '-' : '+',
2830 inst->flag_subreg);
2831 }
2832
2833 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2834 if (inst->saturate)
2835 fprintf(file, ".sat");
2836 if (inst->conditional_mod) {
2837 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2838 if (!inst->predicate &&
2839 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2840 inst->opcode != BRW_OPCODE_IF &&
2841 inst->opcode != BRW_OPCODE_WHILE))) {
2842 fprintf(file, ".f0.%d", inst->flag_subreg);
2843 }
2844 }
2845 fprintf(file, " ");
2846
2847
2848 switch (inst->dst.file) {
2849 case GRF:
2850 fprintf(file, "vgrf%d", inst->dst.reg);
2851 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2852 inst->dst.subreg_offset)
2853 fprintf(file, "+%d.%d",
2854 inst->dst.reg_offset, inst->dst.subreg_offset);
2855 break;
2856 case MRF:
2857 fprintf(file, "m%d", inst->dst.reg);
2858 break;
2859 case BAD_FILE:
2860 fprintf(file, "(null)");
2861 break;
2862 case UNIFORM:
2863 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2864 break;
2865 case HW_REG:
2866 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2867 switch (inst->dst.fixed_hw_reg.nr) {
2868 case BRW_ARF_NULL:
2869 fprintf(file, "null");
2870 break;
2871 case BRW_ARF_ADDRESS:
2872 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2873 break;
2874 case BRW_ARF_ACCUMULATOR:
2875 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2876 break;
2877 case BRW_ARF_FLAG:
2878 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2879 inst->dst.fixed_hw_reg.subnr);
2880 break;
2881 default:
2882 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2883 inst->dst.fixed_hw_reg.subnr);
2884 break;
2885 }
2886 } else {
2887 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2888 }
2889 if (inst->dst.fixed_hw_reg.subnr)
2890 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2891 break;
2892 default:
2893 fprintf(file, "???");
2894 break;
2895 }
2896 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2897
2898 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2899 if (inst->src[i].negate)
2900 fprintf(file, "-");
2901 if (inst->src[i].abs)
2902 fprintf(file, "|");
2903 switch (inst->src[i].file) {
2904 case GRF:
2905 fprintf(file, "vgrf%d", inst->src[i].reg);
2906 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2907 inst->src[i].subreg_offset)
2908 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2909 inst->src[i].subreg_offset);
2910 break;
2911 case MRF:
2912 fprintf(file, "***m%d***", inst->src[i].reg);
2913 break;
2914 case UNIFORM:
2915 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2916 if (inst->src[i].reladdr) {
2917 fprintf(file, "+reladdr");
2918 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2919 inst->src[i].subreg_offset) {
2920 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2921 inst->src[i].subreg_offset);
2922 }
2923 break;
2924 case BAD_FILE:
2925 fprintf(file, "(null)");
2926 break;
2927 case IMM:
2928 switch (inst->src[i].type) {
2929 case BRW_REGISTER_TYPE_F:
2930 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
2931 break;
2932 case BRW_REGISTER_TYPE_D:
2933 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
2934 break;
2935 case BRW_REGISTER_TYPE_UD:
2936 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
2937 break;
2938 default:
2939 fprintf(file, "???");
2940 break;
2941 }
2942 break;
2943 case HW_REG:
2944 if (inst->src[i].fixed_hw_reg.negate)
2945 fprintf(file, "-");
2946 if (inst->src[i].fixed_hw_reg.abs)
2947 fprintf(file, "|");
2948 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2949 switch (inst->src[i].fixed_hw_reg.nr) {
2950 case BRW_ARF_NULL:
2951 fprintf(file, "null");
2952 break;
2953 case BRW_ARF_ADDRESS:
2954 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2955 break;
2956 case BRW_ARF_ACCUMULATOR:
2957 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2958 break;
2959 case BRW_ARF_FLAG:
2960 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2961 inst->src[i].fixed_hw_reg.subnr);
2962 break;
2963 default:
2964 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2965 inst->src[i].fixed_hw_reg.subnr);
2966 break;
2967 }
2968 } else {
2969 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2970 }
2971 if (inst->src[i].fixed_hw_reg.subnr)
2972 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2973 if (inst->src[i].fixed_hw_reg.abs)
2974 fprintf(file, "|");
2975 break;
2976 default:
2977 fprintf(file, "???");
2978 break;
2979 }
2980 if (inst->src[i].abs)
2981 fprintf(file, "|");
2982
2983 if (inst->src[i].file != IMM) {
2984 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2985 }
2986
2987 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2988 fprintf(file, ", ");
2989 }
2990
2991 fprintf(file, " ");
2992
2993 if (inst->force_uncompressed)
2994 fprintf(file, "1sthalf ");
2995
2996 if (inst->force_sechalf)
2997 fprintf(file, "2ndhalf ");
2998
2999 fprintf(file, "\n");
3000 }
3001
3002 /**
3003 * Possibly returns an instruction that set up @param reg.
3004 *
3005 * Sometimes we want to take the result of some expression/variable
3006 * dereference tree and rewrite the instruction generating the result
3007 * of the tree. When processing the tree, we know that the
3008 * instructions generated are all writing temporaries that are dead
3009 * outside of this tree. So, if we have some instructions that write
3010 * a temporary, we're free to point that temp write somewhere else.
3011 *
3012 * Note that this doesn't guarantee that the returned instruction wrote
3013 * only reg -- it might be the size=4 destination of a texture instruction.
3014 */
3015 fs_inst *
3016 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3017 fs_inst *end,
3018 const fs_reg &reg)
3019 {
3020 if (end == start ||
3021 end->is_partial_write() ||
3022 reg.reladdr ||
3023 !reg.equals(end->dst)) {
3024 return NULL;
3025 } else {
3026 return end;
3027 }
3028 }
3029
3030 void
3031 fs_visitor::setup_payload_gen6()
3032 {
3033 bool uses_depth =
3034 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3035 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
3036
3037 assert(brw->gen >= 6);
3038
3039 /* R0-1: masks, pixel X/Y coordinates. */
3040 payload.num_regs = 2;
3041 /* R2: only for 32-pixel dispatch. */
3042
3043 /* R3-26: barycentric interpolation coordinates. These appear in the
3044 * same order that they appear in the brw_wm_barycentric_interp_mode
3045 * enum. Each set of coordinates occupies 2 registers if dispatch width
3046 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3047 * appear if they were enabled using the "Barycentric Interpolation
3048 * Mode" bits in WM_STATE.
3049 */
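/* For example, with two barycentric modes enabled (say perspective pixel
 * and perspective centroid) in SIMD16, the loop below reserves 4 registers
 * for each, so payload.num_regs grows from 2 to 10 before the depth and W
 * registers are considered.
 */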
3050 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3051 if (barycentric_interp_modes & (1 << i)) {
3052 payload.barycentric_coord_reg[i] = payload.num_regs;
3053 payload.num_regs += 2;
3054 if (dispatch_width == 16) {
3055 payload.num_regs += 2;
3056 }
3057 }
3058 }
3059
3060 /* R27: interpolated depth if uses source depth */
3061 if (uses_depth) {
3062 payload.source_depth_reg = payload.num_regs;
3063 payload.num_regs++;
3064 if (dispatch_width == 16) {
3065 /* R28: interpolated depth if not SIMD8. */
3066 payload.num_regs++;
3067 }
3068 }
3069 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3070 if (uses_depth) {
3071 payload.source_w_reg = payload.num_regs;
3072 payload.num_regs++;
3073 if (dispatch_width == 16) {
3074 /* R30: interpolated W if not SIMD8. */
3075 payload.num_regs++;
3076 }
3077 }
3078
3079 prog_data->uses_pos_offset = key->compute_pos_offset;
3080 /* R31: MSAA position offsets. */
3081 if (prog_data->uses_pos_offset) {
3082 payload.sample_pos_reg = payload.num_regs;
3083 payload.num_regs++;
3084 }
3085
3086 /* R32: MSAA input coverage mask */
3087 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3088 assert(brw->gen >= 7);
3089 payload.sample_mask_in_reg = payload.num_regs;
3090 payload.num_regs++;
3091 if (dispatch_width == 16) {
3092 /* R33: input coverage mask if not SIMD8. */
3093 payload.num_regs++;
3094 }
3095 }
3096
3097 /* R34-: bary for 32-pixel. */
3098 /* R58-59: interp W for 32-pixel. */
3099
3100 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3101 source_depth_to_render_target = true;
3102 }
3103 }
3104
3105 void
3106 fs_visitor::assign_binding_table_offsets()
3107 {
3108 uint32_t next_binding_table_offset = 0;
3109
3110 /* If there are no color regions, we still perform an FB write to a null
3111 * renderbuffer, which we place at surface index 0.
3112 */
3113 prog_data->binding_table.render_target_start = next_binding_table_offset;
3114 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3115
3116 assign_common_binding_table_offsets(next_binding_table_offset);
3117 }
3118
3119 void
3120 fs_visitor::calculate_register_pressure()
3121 {
3122 invalidate_live_intervals(false);
3123 calculate_live_intervals();
3124
3125 unsigned num_instructions = instructions.length();
3126
3127 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3128
3129 for (int reg = 0; reg < virtual_grf_count; reg++) {
3130 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3131 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3132 }
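/* For example, a size-2 VGRF live from ip 10 through ip 20 adds 2 to
 * regs_live_at_ip[] at each of those instruction points, which is what
 * dump_instructions() prints as per-instruction pressure.
 */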
3133 }
3134
3135 /**
3136 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3137 *
3138 * The needs_unlit_centroid_workaround ends up producing one of these per
3139 * channel of centroid input, so it's good to clean them up.
3140 *
3141 * An assumption here is that nothing ever modifies the dispatched pixels
3142 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3143 * dictates that anyway.
3144 */
3145 void
3146 fs_visitor::opt_drop_redundant_mov_to_flags()
3147 {
3148 bool flag_mov_found[2] = {false};
3149
3150 foreach_in_list_safe(fs_inst, inst, &instructions) {
3151 if (inst->is_control_flow()) {
3152 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3153 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3154 if (!flag_mov_found[inst->flag_subreg])
3155 flag_mov_found[inst->flag_subreg] = true;
3156 else
3157 inst->remove();
3158 } else if (inst->writes_flag()) {
3159 flag_mov_found[inst->flag_subreg] = false;
3160 }
3161 }
3162 }
3163
3164 bool
3165 fs_visitor::run()
3166 {
3167 sanity_param_count = fp->Base.Parameters->NumParameters;
3168 bool allocated_without_spills;
3169
3170 assign_binding_table_offsets();
3171
3172 if (brw->gen >= 6)
3173 setup_payload_gen6();
3174 else
3175 setup_payload_gen4();
3176
3177 if (0) {
3178 emit_dummy_fs();
3179 } else {
3180 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3181 emit_shader_time_begin();
3182
3183 calculate_urb_setup();
3184 if (fp->Base.InputsRead > 0) {
3185 if (brw->gen < 6)
3186 emit_interpolation_setup_gen4();
3187 else
3188 emit_interpolation_setup_gen6();
3189 }
3190
3191 /* We handle discards by keeping track of the still-live pixels in f0.1.
3192 * Initialize it with the dispatched pixels.
3193 */
3194 if (fp->UsesKill || key->alpha_test_func) {
3195 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3196 discard_init->flag_subreg = 1;
3197 }
3198
3199 /* Generate FS IR for main(). (the visitor only descends into
3200 * functions called "main").
3201 */
3202 if (shader) {
3203 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3204 base_ir = ir;
3205 this->result = reg_undef;
3206 ir->accept(this);
3207 }
3208 } else {
3209 emit_fragment_program_code();
3210 }
3211 base_ir = NULL;
3212 if (failed)
3213 return false;
3214
3215 emit(FS_OPCODE_PLACEHOLDER_HALT);
3216
3217 if (key->alpha_test_func)
3218 emit_alpha_test();
3219
3220 emit_fb_writes();
3221
3222 split_virtual_grfs();
3223
3224 move_uniform_array_access_to_pull_constants();
3225 assign_constant_locations();
3226 demote_pull_constants();
3227
3228 opt_drop_redundant_mov_to_flags();
3229
3230 #define OPT(pass, args...) do { \
3231 pass_num++; \
3232 bool this_progress = pass(args); \
3233 \
3234 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3235 char filename[64]; \
3236 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3237 dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3238 \
3239 backend_visitor::dump_instructions(filename); \
3240 } \
3241 \
3242 progress = progress || this_progress; \
3243 } while (false)
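/* When INTEL_DEBUG & DEBUG_OPTIMIZER is set, each pass that makes progress
 * dumps the IR to a file named from the format above, e.g.
 * "fs8-0003-01-02-opt_cse" for a SIMD8 compile of shader program 3,
 * iteration 1, pass 2.
 */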
3244
3245 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3246 char filename[64];
3247 snprintf(filename, 64, "fs%d-%04d-00-start",
3248 dispatch_width, shader_prog ? shader_prog->Name : 0);
3249
3250 backend_visitor::dump_instructions(filename);
3251 }
3252
3253 bool progress;
3254 int iteration = 0;
3255 do {
3256 progress = false;
3257 iteration++;
3258 int pass_num = 0;
3259
3260 compact_virtual_grfs();
3261
3262 OPT(remove_duplicate_mrf_writes);
3263
3264 OPT(opt_algebraic);
3265 OPT(opt_cse);
3266 OPT(opt_copy_propagate);
3267 OPT(opt_peephole_predicated_break);
3268 OPT(dead_code_eliminate);
3269 OPT(opt_peephole_sel);
3270 OPT(dead_control_flow_eliminate, this);
3271 OPT(opt_register_renaming);
3272 OPT(opt_saturate_propagation);
3273 OPT(register_coalesce);
3274 OPT(compute_to_mrf);
3275 } while (progress);
3276
3277 if (lower_load_payload()) {
3278 register_coalesce();
3279 dead_code_eliminate();
3280 }
3281
3282 lower_uniform_pull_constant_loads();
3283
3284 assign_curb_setup();
3285 assign_urb_setup();
3286
3287 static enum instruction_scheduler_mode pre_modes[] = {
3288 SCHEDULE_PRE,
3289 SCHEDULE_PRE_NON_LIFO,
3290 SCHEDULE_PRE_LIFO,
3291 };
3292
3293 /* Try each scheduling heuristic to see if it can successfully register
3294 * allocate without spilling. They should be ordered by decreasing
3295 * performance but increasing likelihood of allocating.
3296 */
3297 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3298 schedule_instructions(pre_modes[i]);
3299
3300 if (0) {
3301 assign_regs_trivial();
3302 allocated_without_spills = true;
3303 } else {
3304 allocated_without_spills = assign_regs(false);
3305 }
3306 if (allocated_without_spills)
3307 break;
3308 }
3309
3310 if (!allocated_without_spills) {
3311 /* We assume that any spilling is worse than just dropping back to
3312 * SIMD8. There's probably actually some intermediate point where
3313 * SIMD16 with a couple of spills is still better.
3314 */
3315 if (dispatch_width == 16) {
3316 fail("Failure to register allocate. Reduce number of "
3317 "live scalar values to avoid this.");
3318 } else {
3319 perf_debug("Fragment shader triggered register spilling. "
3320 "Try reducing the number of live scalar values to "
3321 "improve performance.\n");
3322 }
3323
3324 /* Since we're out of heuristics, just go spill registers until we
3325 * get an allocation.
3326 */
3327 while (!assign_regs(true)) {
3328 if (failed)
3329 break;
3330 }
3331 }
3332 }
3333 assert(force_uncompressed_stack == 0);
3334
3335 /* This must come after all optimization and register allocation, since
3336 * it inserts dead code that happens to have side effects, and it does
3337 * so based on the actual physical registers in use.
3338 */
3339 insert_gen4_send_dependency_workarounds();
3340
3341 if (failed)
3342 return false;
3343
3344 if (!allocated_without_spills)
3345 schedule_instructions(SCHEDULE_POST);
3346
3347 if (last_scratch > 0) {
3348 prog_data->base.total_scratch = brw_get_scratch_size(last_scratch);
3349 }
3350
3351 if (brw->use_rep_send)
3352 try_rep_send();
3353
3354 if (dispatch_width == 8)
3355 prog_data->reg_blocks = brw_register_blocks(grf_used);
3356 else
3357 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3358
3359 /* If any state parameters were appended, then ParameterValues could have
3360 * been realloced, in which case the driver uniform storage set up by
3361 * _mesa_associate_uniform_storage() would point to freed memory. Make
3362 * sure that didn't happen.
3363 */
3364 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3365
3366 calculate_cfg();
3367
3368 return !failed;
3369 }
3370
3371 const unsigned *
3372 brw_wm_fs_emit(struct brw_context *brw,
3373 void *mem_ctx,
3374 const struct brw_wm_prog_key *key,
3375 struct brw_wm_prog_data *prog_data,
3376 struct gl_fragment_program *fp,
3377 struct gl_shader_program *prog,
3378 unsigned *final_assembly_size)
3379 {
3380 bool start_busy = false;
3381 double start_time = 0;
3382
3383 if (unlikely(brw->perf_debug)) {
3384 start_busy = (brw->batch.last_bo &&
3385 drm_intel_bo_busy(brw->batch.last_bo));
3386 start_time = get_time();
3387 }
3388
3389 struct brw_shader *shader = NULL;
3390 if (prog)
3391 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3392
3393 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3394 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3395
3396 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3397 */
3398 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3399 if (!v.run()) {
3400 if (prog) {
3401 prog->LinkStatus = false;
3402 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3403 }
3404
3405 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3406 v.fail_msg);
3407
3408 return NULL;
3409 }
3410
3411 cfg_t *simd16_cfg = NULL;
3412 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3413 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3414 brw->use_rep_send)) {
3415 if (!v.simd16_unsupported) {
3416 /* Try a SIMD16 compile */
3417 v2.import_uniforms(&v);
3418 if (!v2.run()) {
3419 perf_debug("SIMD16 shader failed to compile, falling back to "
3420 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3421 } else {
3422 simd16_cfg = v2.cfg;
3423 }
3424 } else {
3425 perf_debug("SIMD16 shader unsupported, falling back to "
3426 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3427 }
3428 }
3429
3430 cfg_t *simd8_cfg;
3431 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3432 if (no_simd8 && simd16_cfg) {
3433 simd8_cfg = NULL;
3434 prog_data->no_8 = true;
3435 } else {
3436 simd8_cfg = v.cfg;
3437 prog_data->no_8 = false;
3438 }
3439
3440 const unsigned *assembly = NULL;
3441 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3442 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3443 assembly = g.generate_assembly(simd8_cfg, simd16_cfg,
3444 final_assembly_size);
3445
3446 if (unlikely(brw->perf_debug) && shader) {
3447 if (shader->compiled_once)
3448 brw_wm_debug_recompile(brw, prog, key);
3449 shader->compiled_once = true;
3450
3451 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3452 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3453 (get_time() - start_time) * 1000);
3454 }
3455 }
3456
3457 return assembly;
3458 }
3459
3460 bool
3461 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3462 {
3463 struct brw_context *brw = brw_context(ctx);
3464 struct brw_wm_prog_key key;
3465
3466 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3467 return true;
3468
3469 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3470 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3471 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3472 bool program_uses_dfdy = fp->UsesDFdy;
3473
3474 memset(&key, 0, sizeof(key));
3475
3476 if (brw->gen < 6) {
3477 if (fp->UsesKill)
3478 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3479
3480 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3481 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3482
3483 /* Just assume depth testing. */
3484 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3485 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3486 }
3487
3488 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3489 BRW_FS_VARYING_INPUT_MASK) > 16)
3490 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3491
3492 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3493 for (unsigned i = 0; i < sampler_count; i++) {
3494 if (fp->Base.ShadowSamplers & (1 << i)) {
3495 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3496 key.tex.swizzles[i] =
3497 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3498 } else {
3499 /* Color sampler: assume no swizzling. */
3500 key.tex.swizzles[i] = SWIZZLE_XYZW;
3501 }
3502 }
3503
3504 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3505 key.drawable_height = ctx->DrawBuffer->Height;
3506 }
3507
3508 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3509 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3510 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3511
3512 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3513 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3514 key.nr_color_regions > 1;
3515 }
3516
3517 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3518 * quality of the derivatives is likely to be determined by the driconf
3519 * option.
3520 */
3521 key.high_quality_derivatives = brw->disable_derivative_optimization;
3522
3523 key.program_string_id = bfp->id;
3524
3525 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3526 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3527
3528 bool success = do_wm_prog(brw, prog, bfp, &key);
3529
3530 brw->wm.base.prog_offset = old_prog_offset;
3531 brw->wm.prog_data = old_prog_data;
3532
3533 return success;
3534 }