i965: Move pre-draw resolve buffers to dd::UpdateState
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init(enum opcode opcode, const fs_reg &dst, fs_reg *src, int sources)
56 {
57 memset(this, 0, sizeof(*this));
58
59 this->opcode = opcode;
60 this->dst = dst;
61 this->src = src;
62 this->sources = sources;
63
64 this->conditional_mod = BRW_CONDITIONAL_NONE;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68
69 this->writes_accumulator = false;
70 }
71
72 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
73 {
74 fs_reg *src = ralloc_array(this, fs_reg, 3);
75 init(opcode, dst, src, 0);
76 }
77
78 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
79 {
80 fs_reg *src = ralloc_array(this, fs_reg, 3);
81 src[0] = src0;
82 init(opcode, dst, src, 1);
83 }
84
85 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
86 const fs_reg &src1)
87 {
88 fs_reg *src = ralloc_array(this, fs_reg, 3);
89 src[0] = src0;
90 src[1] = src1;
91 init(opcode, dst, src, 2);
92 }
93
94 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
95 const fs_reg &src1, const fs_reg &src2)
96 {
97 fs_reg *src = ralloc_array(this, fs_reg, 3);
98 src[0] = src0;
99 src[1] = src1;
100 src[2] = src2;
101 init(opcode, dst, src, 3);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
105 {
106 init(opcode, dst, src, sources);
107 }
108
109 fs_inst::fs_inst(const fs_inst &that)
110 {
111 memcpy(this, &that, sizeof(that));
112
113 this->src = ralloc_array(this, fs_reg, that.sources);
114
115 for (int i = 0; i < that.sources; i++)
116 this->src[i] = that.src[i];
117 }
118
119 void
120 fs_inst::resize_sources(uint8_t num_sources)
121 {
122 if (this->sources != num_sources) {
123 this->src = reralloc(this, this->src, fs_reg, num_sources);
124 this->sources = num_sources;
125 }
126 }
127
128 #define ALU1(op) \
129 fs_inst * \
130 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
131 { \
132 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
133 }
134
135 #define ALU2(op) \
136 fs_inst * \
137 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
138 const fs_reg &src1) \
139 { \
140 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
141 }
142
143 #define ALU2_ACC(op) \
144 fs_inst * \
145 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
146 const fs_reg &src1) \
147 { \
148 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
149 inst->writes_accumulator = true; \
150 return inst; \
151 }
152
153 #define ALU3(op) \
154 fs_inst * \
155 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
156 const fs_reg &src1, const fs_reg &src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2_ACC(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2_ACC(ADDC)
186 ALU2_ACC(SUBB)
187 ALU2(SEL)
188 ALU2(MAC)
189
190 /** Gen4 predicated IF. */
191 fs_inst *
192 fs_visitor::IF(enum brw_predicate predicate)
193 {
194 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
195 inst->predicate = predicate;
196 return inst;
197 }
198
199 /** Gen6 IF with embedded comparison. */
200 fs_inst *
201 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
202 enum brw_conditional_mod condition)
203 {
204 assert(brw->gen == 6);
205 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
206 reg_null_d, src0, src1);
207 inst->conditional_mod = condition;
208 return inst;
209 }
210
211 /**
212 * CMP: Sets the low bit of the destination channels with the result
213 * of the comparison, while the upper bits are undefined, and updates
214 * the flag register with the packed 16 bits of the result.
215 */
216 fs_inst *
217 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
218 enum brw_conditional_mod condition)
219 {
220 fs_inst *inst;
221
222 /* Take the instruction:
223 *
224 * CMP null<d> src0<f> src1<f>
225 *
226 * Original gen4 does type conversion to the destination type before
227 * comparison, producing garbage results for floating point comparisons.
228 * gen5 does the comparison on the execution type (resolved source types),
229 * so dst type doesn't matter. gen6 does comparison and then uses the
230 * result as if it was the dst type with no conversion, which happens to
231 * mostly work out for float-interpreted-as-int since our comparisons are
232 * for >0, =0, <0.
233 */
234 if (brw->gen == 4) {
235 dst.type = src0.type;
236 if (dst.file == HW_REG)
237 dst.fixed_hw_reg.type = dst.type;
238 }
239
240 resolve_ud_negate(&src0);
241 resolve_ud_negate(&src1);
242
243 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
244 inst->conditional_mod = condition;
245
246 return inst;
247 }
248
249 fs_inst *
250 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
251 {
252 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, dst, src,
253 sources);
254 inst->regs_written = sources;
255
256 return inst;
257 }
258
259 exec_list
260 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
261 const fs_reg &surf_index,
262 const fs_reg &varying_offset,
263 uint32_t const_offset)
264 {
265 exec_list instructions;
266 fs_inst *inst;
267
268 /* We have our constant surface use a pitch of 4 bytes, so our index can
269 * be any component of a vector, and then we load 4 contiguous
270 * components starting from that.
271 *
272 * We break down the const_offset into a portion added to the variable
273 * offset and a portion done using reg_offset, which means that if you
274 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
275 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
276 * CSE can later notice that those loads are all the same and eliminate
277 * the redundant ones.
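    *
    * For example, with const_offset == 6 the ADD below computes
    * vec4_offset = varying_offset + 4, and the final MOV picks out
    * component (6 & 3) == 2 of the loaded vec4 (times scale).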
278 */
279 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
280 instructions.push_tail(ADD(vec4_offset,
281 varying_offset, const_offset & ~3));
282
283 int scale = 1;
284 if (brw->gen == 4 && dispatch_width == 8) {
285 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
286 * u, v, r) as parameters, or we can just use the SIMD16 message
287 * consisting of (header, u). We choose the second, at the cost of a
288 * longer return length.
289 */
290 scale = 2;
291 }
292
293 enum opcode op;
294 if (brw->gen >= 7)
295 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
296 else
297 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
298 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
299 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
300 inst->regs_written = 4 * scale;
301 instructions.push_tail(inst);
302
303 if (brw->gen < 7) {
304 inst->base_mrf = 13;
305 inst->header_present = true;
306 if (brw->gen == 4)
307 inst->mlen = 3;
308 else
309 inst->mlen = 1 + dispatch_width / 8;
310 }
311
312 vec4_result.reg_offset += (const_offset & 3) * scale;
313 instructions.push_tail(MOV(dst, vec4_result));
314
315 return instructions;
316 }
317
318 /**
319 * A helper for MOV generation for fixing up broken hardware SEND dependency
320 * handling.
321 */
322 fs_inst *
323 fs_visitor::DEP_RESOLVE_MOV(int grf)
324 {
325 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
326
327 inst->ir = NULL;
328 inst->annotation = "send dependency resolve";
329
330 /* The caller always wants uncompressed to emit the minimal extra
331 * dependencies, and to avoid having to deal with aligning its regs to 2.
332 */
333 inst->force_uncompressed = true;
334
335 return inst;
336 }
337
338 bool
339 fs_inst::equals(fs_inst *inst) const
340 {
341 return (opcode == inst->opcode &&
342 dst.equals(inst->dst) &&
343 src[0].equals(inst->src[0]) &&
344 src[1].equals(inst->src[1]) &&
345 src[2].equals(inst->src[2]) &&
346 saturate == inst->saturate &&
347 predicate == inst->predicate &&
348 conditional_mod == inst->conditional_mod &&
349 mlen == inst->mlen &&
350 base_mrf == inst->base_mrf &&
351 target == inst->target &&
352 eot == inst->eot &&
353 header_present == inst->header_present &&
354 shadow_compare == inst->shadow_compare &&
355 offset == inst->offset);
356 }
357
358 bool
359 fs_inst::overwrites_reg(const fs_reg &reg) const
360 {
361 return (reg.file == dst.file &&
362 reg.reg == dst.reg &&
363 reg.reg_offset >= dst.reg_offset &&
364 reg.reg_offset < dst.reg_offset + regs_written);
365 }
366
367 bool
368 fs_inst::is_send_from_grf() const
369 {
370 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
371 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
372 opcode == FS_OPCODE_INTERPOLATE_AT_CENTROID ||
373 opcode == FS_OPCODE_INTERPOLATE_AT_SAMPLE ||
374 opcode == FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET ||
375 opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET ||
376 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
377 src[1].file == GRF) ||
378 (is_tex() && src[0].file == GRF));
379 }
380
381 bool
382 fs_inst::can_do_source_mods(struct brw_context *brw)
383 {
384 if (brw->gen == 6 && is_math())
385 return false;
386
387 if (is_send_from_grf())
388 return false;
389
390 if (!backend_instruction::can_do_source_mods())
391 return false;
392
393 return true;
394 }
395
396 void
397 fs_reg::init()
398 {
399 memset(this, 0, sizeof(*this));
400 stride = 1;
401 }
402
403 /** Generic unset register constructor. */
404 fs_reg::fs_reg()
405 {
406 init();
407 this->file = BAD_FILE;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(float f)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_F;
416 this->fixed_hw_reg.dw1.f = f;
417 }
418
419 /** Immediate value constructor. */
420 fs_reg::fs_reg(int32_t i)
421 {
422 init();
423 this->file = IMM;
424 this->type = BRW_REGISTER_TYPE_D;
425 this->fixed_hw_reg.dw1.d = i;
426 }
427
428 /** Immediate value constructor. */
429 fs_reg::fs_reg(uint32_t u)
430 {
431 init();
432 this->file = IMM;
433 this->type = BRW_REGISTER_TYPE_UD;
434 this->fixed_hw_reg.dw1.ud = u;
435 }
436
437 /** Fixed brw_reg. */
438 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
439 {
440 init();
441 this->file = HW_REG;
442 this->fixed_hw_reg = fixed_hw_reg;
443 this->type = fixed_hw_reg.type;
444 }
445
446 bool
447 fs_reg::equals(const fs_reg &r) const
448 {
449 return (file == r.file &&
450 reg == r.reg &&
451 reg_offset == r.reg_offset &&
452 subreg_offset == r.subreg_offset &&
453 type == r.type &&
454 negate == r.negate &&
455 abs == r.abs &&
456 !reladdr && !r.reladdr &&
457 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
458 sizeof(fixed_hw_reg)) == 0 &&
459 stride == r.stride);
460 }
461
462 fs_reg &
463 fs_reg::apply_stride(unsigned stride)
464 {
465 assert((this->stride * stride) <= 4 &&
466 (is_power_of_two(stride) || stride == 0) &&
467 file != HW_REG && file != IMM);
468 this->stride *= stride;
469 return *this;
470 }
471
472 fs_reg &
473 fs_reg::set_smear(unsigned subreg)
474 {
475 assert(file != HW_REG && file != IMM);
476 subreg_offset = subreg * type_sz(type);
477 stride = 0;
478 return *this;
479 }
480
481 bool
482 fs_reg::is_contiguous() const
483 {
484 return stride == 1;
485 }
486
487 bool
488 fs_reg::is_valid_3src() const
489 {
490 return file == GRF || file == UNIFORM;
491 }
492
493 int
494 fs_visitor::type_size(const struct glsl_type *type)
495 {
496 unsigned int size, i;
497
498 switch (type->base_type) {
499 case GLSL_TYPE_UINT:
500 case GLSL_TYPE_INT:
501 case GLSL_TYPE_FLOAT:
502 case GLSL_TYPE_BOOL:
503 return type->components();
504 case GLSL_TYPE_ARRAY:
505 return type_size(type->fields.array) * type->length;
506 case GLSL_TYPE_STRUCT:
507 size = 0;
508 for (i = 0; i < type->length; i++) {
509 size += type_size(type->fields.structure[i].type);
510 }
511 return size;
512 case GLSL_TYPE_SAMPLER:
513 /* Samplers take up no register space, since they're baked in at
514 * link time.
515 */
516 return 0;
517 case GLSL_TYPE_ATOMIC_UINT:
518 return 0;
519 case GLSL_TYPE_IMAGE:
520 case GLSL_TYPE_VOID:
521 case GLSL_TYPE_ERROR:
522 case GLSL_TYPE_INTERFACE:
523 unreachable("not reached");
524 }
525
526 return 0;
527 }
528
529 fs_reg
530 fs_visitor::get_timestamp()
531 {
532 assert(brw->gen >= 7);
533
534 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
535 BRW_ARF_TIMESTAMP,
536 0),
537 BRW_REGISTER_TYPE_UD));
538
539 fs_reg dst = fs_reg(this, glsl_type::uint_type);
540
541 fs_inst *mov = emit(MOV(dst, ts));
542 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
543 * even if it's not enabled in the dispatch.
544 */
545 mov->force_writemask_all = true;
546 mov->force_uncompressed = true;
547
548 /* The caller wants the low 32 bits of the timestamp. Since it's running
549 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
550 * which is plenty of time for our purposes. It is identical across the
551 * EUs, but since it's tracking GPU core speed it will increment at a
552 * varying rate as render P-states change.
553 *
554 * The caller could also check if render P-states have changed (or anything
555 * else that might disrupt timing) by setting smear to 2 and checking if
556 * that field is != 0.
557 */
558 dst.set_smear(0);
559
560 return dst;
561 }
562
563 void
564 fs_visitor::emit_shader_time_begin()
565 {
566 current_annotation = "shader time start";
567 shader_start_time = get_timestamp();
568 }
569
570 void
571 fs_visitor::emit_shader_time_end()
572 {
573 current_annotation = "shader time end";
574
575 enum shader_time_shader_type type, written_type, reset_type;
576 if (dispatch_width == 8) {
577 type = ST_FS8;
578 written_type = ST_FS8_WRITTEN;
579 reset_type = ST_FS8_RESET;
580 } else {
581 assert(dispatch_width == 16);
582 type = ST_FS16;
583 written_type = ST_FS16_WRITTEN;
584 reset_type = ST_FS16_RESET;
585 }
586
587 fs_reg shader_end_time = get_timestamp();
588
589 /* Check that there weren't any timestamp reset events (assuming these
590 * were the only two timestamp reads that happened).
591 */
592 fs_reg reset = shader_end_time;
593 reset.set_smear(2);
594 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
595 test->conditional_mod = BRW_CONDITIONAL_Z;
596 emit(IF(BRW_PREDICATE_NORMAL));
597
598 push_force_uncompressed();
599 fs_reg start = shader_start_time;
600 start.negate = true;
601 fs_reg diff = fs_reg(this, glsl_type::uint_type);
602 emit(ADD(diff, start, shader_end_time));
603
604 /* If there were no instructions between the two timestamp gets, the diff
605 * is 2 cycles. Remove that overhead, so I can forget about that when
606 * trying to determine the time taken for single instructions.
607 */
608 emit(ADD(diff, diff, fs_reg(-2u)));
609
610 emit_shader_time_write(type, diff);
611 emit_shader_time_write(written_type, fs_reg(1u));
612 emit(BRW_OPCODE_ELSE);
613 emit_shader_time_write(reset_type, fs_reg(1u));
614 emit(BRW_OPCODE_ENDIF);
615
616 pop_force_uncompressed();
617 }
618
619 void
620 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
621 fs_reg value)
622 {
623 int shader_time_index =
624 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
625 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
626
627 fs_reg payload;
628 if (dispatch_width == 8)
629 payload = fs_reg(this, glsl_type::uvec2_type);
630 else
631 payload = fs_reg(this, glsl_type::uint_type);
632
633 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
634 fs_reg(), payload, offset, value));
635 }
636
637 void
638 fs_visitor::vfail(const char *format, va_list va)
639 {
640 char *msg;
641
642 if (failed)
643 return;
644
645 failed = true;
646
647 msg = ralloc_vasprintf(mem_ctx, format, va);
648 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
649
650 this->fail_msg = msg;
651
652 if (INTEL_DEBUG & DEBUG_WM) {
653 fprintf(stderr, "%s", msg);
654 }
655 }
656
657 void
658 fs_visitor::fail(const char *format, ...)
659 {
660 va_list va;
661
662 va_start(va, format);
663 vfail(format, va);
664 va_end(va);
665 }
666
667 /**
668 * Mark this program as impossible to compile in SIMD16 mode.
669 *
670 * During the SIMD8 compile (which happens first), we can detect and flag
671 * things that are unsupported in SIMD16 mode, so the compiler can skip
672 * the SIMD16 compile altogether.
673 *
674 * During a SIMD16 compile (if one happens anyway), this just calls fail().
675 */
676 void
677 fs_visitor::no16(const char *format, ...)
678 {
679 va_list va;
680
681 va_start(va, format);
682
683 if (dispatch_width == 16) {
684 vfail(format, va);
685 } else {
686 simd16_unsupported = true;
687
688 if (brw->perf_debug) {
689 if (no16_msg)
690 ralloc_vasprintf_append(&no16_msg, format, va);
691 else
692 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
693 }
694 }
695
696 va_end(va);
697 }
698
699 fs_inst *
700 fs_visitor::emit(enum opcode opcode)
701 {
702 return emit(new(mem_ctx) fs_inst(opcode));
703 }
704
705 fs_inst *
706 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
707 {
708 return emit(new(mem_ctx) fs_inst(opcode, dst));
709 }
710
711 fs_inst *
712 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
713 {
714 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
715 }
716
717 fs_inst *
718 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
719 const fs_reg &src1)
720 {
721 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
722 }
723
724 fs_inst *
725 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
726 const fs_reg &src1, const fs_reg &src2)
727 {
728 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
729 }
730
731 fs_inst *
732 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
733 fs_reg src[], int sources)
734 {
735 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
736 }
737
738 void
739 fs_visitor::push_force_uncompressed()
740 {
741 force_uncompressed_stack++;
742 }
743
744 void
745 fs_visitor::pop_force_uncompressed()
746 {
747 force_uncompressed_stack--;
748 assert(force_uncompressed_stack >= 0);
749 }
750
751 /**
752 * Returns true if the instruction has a flag that means it won't
753 * update an entire destination register.
754 *
755 * For example, dead code elimination and live variable analysis want to know
756 * when a write to a variable screens off any preceding values that were in
757 * it.
758 */
759 bool
760 fs_inst::is_partial_write() const
761 {
762 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
763 this->force_uncompressed ||
764 this->force_sechalf || !this->dst.is_contiguous());
765 }
766
767 int
768 fs_inst::regs_read(fs_visitor *v, int arg) const
769 {
770 if (is_tex() && arg == 0 && src[0].file == GRF) {
771 if (v->dispatch_width == 16)
772 return (mlen + 1) / 2;
773 else
774 return mlen;
775 }
776 return 1;
777 }
778
779 bool
780 fs_inst::reads_flag() const
781 {
782 return predicate;
783 }
784
785 bool
786 fs_inst::writes_flag() const
787 {
788 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
789 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
790 }
791
792 /**
793 * Returns how many MRFs an FS opcode will write over.
794 *
795 * Note that this is not the 0 or 1 implied writes in an actual gen
796 * instruction -- the FS opcodes often generate MOVs in addition.
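  *
  * For example, a SIMD16 SHADER_OPCODE_POW writes 2 * 16 / 8 = 4 MRFs
  * for its two operands, while the sampler opcodes below always report 1.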
797 */
798 int
799 fs_visitor::implied_mrf_writes(fs_inst *inst)
800 {
801 if (inst->mlen == 0)
802 return 0;
803
804 if (inst->base_mrf == -1)
805 return 0;
806
807 switch (inst->opcode) {
808 case SHADER_OPCODE_RCP:
809 case SHADER_OPCODE_RSQ:
810 case SHADER_OPCODE_SQRT:
811 case SHADER_OPCODE_EXP2:
812 case SHADER_OPCODE_LOG2:
813 case SHADER_OPCODE_SIN:
814 case SHADER_OPCODE_COS:
815 return 1 * dispatch_width / 8;
816 case SHADER_OPCODE_POW:
817 case SHADER_OPCODE_INT_QUOTIENT:
818 case SHADER_OPCODE_INT_REMAINDER:
819 return 2 * dispatch_width / 8;
820 case SHADER_OPCODE_TEX:
821 case FS_OPCODE_TXB:
822 case SHADER_OPCODE_TXD:
823 case SHADER_OPCODE_TXF:
824 case SHADER_OPCODE_TXF_CMS:
825 case SHADER_OPCODE_TXF_MCS:
826 case SHADER_OPCODE_TG4:
827 case SHADER_OPCODE_TG4_OFFSET:
828 case SHADER_OPCODE_TXL:
829 case SHADER_OPCODE_TXS:
830 case SHADER_OPCODE_LOD:
831 return 1;
832 case FS_OPCODE_FB_WRITE:
833 return 2;
834 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
835 case SHADER_OPCODE_GEN4_SCRATCH_READ:
836 return 1;
837 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
838 return inst->mlen;
839 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
840 return 2;
841 case SHADER_OPCODE_UNTYPED_ATOMIC:
842 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
843 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
844 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
845 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
846 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
847 return 0;
848 default:
849 unreachable("not reached");
850 }
851 }
852
853 int
854 fs_visitor::virtual_grf_alloc(int size)
855 {
856 if (virtual_grf_array_size <= virtual_grf_count) {
857 if (virtual_grf_array_size == 0)
858 virtual_grf_array_size = 16;
859 else
860 virtual_grf_array_size *= 2;
861 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
862 virtual_grf_array_size);
863 }
864 virtual_grf_sizes[virtual_grf_count] = size;
865 return virtual_grf_count++;
866 }
867
868 /** Fixed HW reg constructor. */
869 fs_reg::fs_reg(enum register_file file, int reg)
870 {
871 init();
872 this->file = file;
873 this->reg = reg;
874 this->type = BRW_REGISTER_TYPE_F;
875 }
876
877 /** Fixed HW reg constructor. */
878 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
879 {
880 init();
881 this->file = file;
882 this->reg = reg;
883 this->type = type;
884 }
885
886 /** Automatic reg constructor. */
887 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
888 {
889 init();
890
891 this->file = GRF;
892 this->reg = v->virtual_grf_alloc(v->type_size(type));
893 this->reg_offset = 0;
894 this->type = brw_type_for_base_type(type);
895 }
896
897 fs_reg *
898 fs_visitor::variable_storage(ir_variable *var)
899 {
900 return (fs_reg *)hash_table_find(this->variable_ht, var);
901 }
902
903 void
904 import_uniforms_callback(const void *key,
905 void *data,
906 void *closure)
907 {
908 struct hash_table *dst_ht = (struct hash_table *)closure;
909 const fs_reg *reg = (const fs_reg *)data;
910
911 if (reg->file != UNIFORM)
912 return;
913
914 hash_table_insert(dst_ht, data, key);
915 }
916
917 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
918  * This brings in those uniform definitions.
919 */
920 void
921 fs_visitor::import_uniforms(fs_visitor *v)
922 {
923 hash_table_call_foreach(v->variable_ht,
924 import_uniforms_callback,
925 variable_ht);
926 this->push_constant_loc = v->push_constant_loc;
927 this->pull_constant_loc = v->pull_constant_loc;
928 this->uniforms = v->uniforms;
929 this->param_size = v->param_size;
930 }
931
932 /* Our support for uniforms is piggy-backed on the struct
933 * gl_fragment_program, because that's where the values actually
934 * get stored, rather than in some global gl_shader_program uniform
935 * store.
936 */
937 void
938 fs_visitor::setup_uniform_values(ir_variable *ir)
939 {
940 int namelen = strlen(ir->name);
941
942 /* The data for our (non-builtin) uniforms is stored in a series of
943 * gl_uniform_driver_storage structs for each subcomponent that
944 * glGetUniformLocation() could name. We know it's been set up in the same
945 * order we'd walk the type, so walk the list of storage and find anything
946 * with our name, or the prefix of a component that starts with our name.
947 */
948 unsigned params_before = uniforms;
949 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
950 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
951
952 if (strncmp(ir->name, storage->name, namelen) != 0 ||
953 (storage->name[namelen] != 0 &&
954 storage->name[namelen] != '.' &&
955 storage->name[namelen] != '[')) {
956 continue;
957 }
958
959 unsigned slots = storage->type->component_slots();
960 if (storage->array_elements)
961 slots *= storage->array_elements;
962
963 for (unsigned i = 0; i < slots; i++) {
964 stage_prog_data->param[uniforms++] = &storage->storage[i];
965 }
966 }
967
968 /* Make sure we actually initialized the right amount of stuff here. */
969 assert(params_before + ir->type->component_slots() == uniforms);
970 (void)params_before;
971 }
972
973
974 /* Our support for builtin uniforms is even scarier than non-builtin.
975 * It sits on top of the PROG_STATE_VAR parameters that are
976 * automatically updated from GL context state.
977 */
978 void
979 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
980 {
981 const ir_state_slot *const slots = ir->state_slots;
982 assert(ir->state_slots != NULL);
983
984 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
985 /* This state reference has already been setup by ir_to_mesa, but we'll
986 * get the same index back here.
987 */
988 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
989 (gl_state_index *)slots[i].tokens);
990
991 /* Add each of the unique swizzles of the element as a parameter.
992 * This'll end up matching the expected layout of the
993 * array/matrix/structure we're trying to fill in.
994 */
995 int last_swiz = -1;
996 for (unsigned int j = 0; j < 4; j++) {
997 int swiz = GET_SWZ(slots[i].swizzle, j);
998 if (swiz == last_swiz)
999 break;
1000 last_swiz = swiz;
1001
1002 stage_prog_data->param[uniforms++] =
1003 &fp->Base.Parameters->ParameterValues[index][swiz];
1004 }
1005 }
1006 }
1007
1008 fs_reg *
1009 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1010 {
1011 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1012 fs_reg wpos = *reg;
1013 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
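   /* flip is set for origin_lower_left without render_to_fbo, or for
    * origin_upper_left with render_to_fbo.  When flipping, the ADD below
    * computes (drawable_height - 1 + offset) - pixel_y by using the
    * negate flag on pixel_y.
    */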
1014
1015 /* gl_FragCoord.x */
1016 if (ir->data.pixel_center_integer) {
1017 emit(MOV(wpos, this->pixel_x));
1018 } else {
1019 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1020 }
1021 wpos.reg_offset++;
1022
1023 /* gl_FragCoord.y */
1024 if (!flip && ir->data.pixel_center_integer) {
1025 emit(MOV(wpos, this->pixel_y));
1026 } else {
1027 fs_reg pixel_y = this->pixel_y;
1028 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1029
1030 if (flip) {
1031 pixel_y.negate = true;
1032 offset += key->drawable_height - 1.0;
1033 }
1034
1035 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1036 }
1037 wpos.reg_offset++;
1038
1039 /* gl_FragCoord.z */
1040 if (brw->gen >= 6) {
1041 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1042 } else {
1043 emit(FS_OPCODE_LINTERP, wpos,
1044 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1045 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1046 interp_reg(VARYING_SLOT_POS, 2));
1047 }
1048 wpos.reg_offset++;
1049
1050 /* gl_FragCoord.w: Already set up in emit_interpolation */
1051 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1052
1053 return reg;
1054 }
1055
1056 fs_inst *
1057 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1058 glsl_interp_qualifier interpolation_mode,
1059 bool is_centroid, bool is_sample)
1060 {
1061 brw_wm_barycentric_interp_mode barycoord_mode;
1062 if (brw->gen >= 6) {
1063 if (is_centroid) {
1064 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1065 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1066 else
1067 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1068 } else if (is_sample) {
1069 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1070 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1071 else
1072 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1073 } else {
1074 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1075 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1076 else
1077 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1078 }
1079 } else {
1080 /* On Ironlake and below, there is only one interpolation mode.
1081 * Centroid interpolation doesn't mean anything on this hardware --
1082 * there is no multisampling.
1083 */
1084 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1085 }
1086 return emit(FS_OPCODE_LINTERP, attr,
1087 this->delta_x[barycoord_mode],
1088 this->delta_y[barycoord_mode], interp);
1089 }
1090
1091 fs_reg *
1092 fs_visitor::emit_general_interpolation(ir_variable *ir)
1093 {
1094 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1095 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1096 fs_reg attr = *reg;
1097
1098 unsigned int array_elements;
1099 const glsl_type *type;
1100
1101 if (ir->type->is_array()) {
1102 array_elements = ir->type->length;
1103 if (array_elements == 0) {
1104 fail("dereferenced array '%s' has length 0\n", ir->name);
1105 }
1106 type = ir->type->fields.array;
1107 } else {
1108 array_elements = 1;
1109 type = ir->type;
1110 }
1111
1112 glsl_interp_qualifier interpolation_mode =
1113 ir->determine_interpolation_mode(key->flat_shade);
1114
1115 int location = ir->data.location;
1116 for (unsigned int i = 0; i < array_elements; i++) {
1117 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1118 if (prog_data->urb_setup[location] == -1) {
1119 /* If there's no incoming setup data for this slot, don't
1120 * emit interpolation for it.
1121 */
1122 attr.reg_offset += type->vector_elements;
1123 location++;
1124 continue;
1125 }
1126
1127 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1128 /* Constant interpolation (flat shading) case. The SF has
1129 * handed us defined values in only the constant offset
1130 * field of the setup reg.
1131 */
1132 for (unsigned int k = 0; k < type->vector_elements; k++) {
1133 struct brw_reg interp = interp_reg(location, k);
1134 interp = suboffset(interp, 3);
1135 interp.type = reg->type;
1136 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1137 attr.reg_offset++;
1138 }
1139 } else {
1140 /* Smooth/noperspective interpolation case. */
1141 for (unsigned int k = 0; k < type->vector_elements; k++) {
1142 struct brw_reg interp = interp_reg(location, k);
1143 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1144 /* Get the pixel/sample mask into f0 so that we know
1145 * which pixels are lit. Then, for each channel that is
1146 * unlit, replace the centroid data with non-centroid
1147 * data.
1148 */
1149 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1150
1151 fs_inst *inst;
1152 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1153 false, false);
1154 inst->predicate = BRW_PREDICATE_NORMAL;
1155 inst->predicate_inverse = true;
1156 if (brw->has_pln)
1157 inst->no_dd_clear = true;
1158
1159 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1160 ir->data.centroid && !key->persample_shading,
1161 ir->data.sample || key->persample_shading);
1162 inst->predicate = BRW_PREDICATE_NORMAL;
1163 inst->predicate_inverse = false;
1164 if (brw->has_pln)
1165 inst->no_dd_check = true;
1166
1167 } else {
1168 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1169 ir->data.centroid && !key->persample_shading,
1170 ir->data.sample || key->persample_shading);
1171 }
1172 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1173 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1174 }
1175 attr.reg_offset++;
1176 }
1177
1178 }
1179 location++;
1180 }
1181 }
1182
1183 return reg;
1184 }
1185
1186 fs_reg *
1187 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1188 {
1189 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1190
1191 /* The frontfacing comes in as a bit in the thread payload. */
1192 if (brw->gen >= 6) {
1193 emit(BRW_OPCODE_ASR, *reg,
1194 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1195 fs_reg(15));
1196 emit(BRW_OPCODE_NOT, *reg, *reg);
1197 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1198 } else {
1199 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1200 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1201 * us front face
1202 */
1203 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1204 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1205 }
1206
1207 return reg;
1208 }
1209
1210 void
1211 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1212 {
1213 assert(dst.type == BRW_REGISTER_TYPE_F);
1214
1215 if (key->compute_pos_offset) {
1216 /* Convert int_sample_pos to floating point */
1217 emit(MOV(dst, int_sample_pos));
1218 /* Scale to the range [0, 1] */
1219 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1220 }
1221 else {
1222 /* From ARB_sample_shading specification:
1223 * "When rendering to a non-multisample buffer, or if multisample
1224 * rasterization is disabled, gl_SamplePosition will always be
1225 * (0.5, 0.5)."
1226 */
1227 emit(MOV(dst, fs_reg(0.5f)));
1228 }
1229 }
1230
1231 fs_reg *
1232 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1233 {
1234 assert(brw->gen >= 6);
1235 assert(ir->type == glsl_type::vec2_type);
1236
1237 this->current_annotation = "compute sample position";
1238 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1239 fs_reg pos = *reg;
1240 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1241 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1242
1243 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1244 * mode will be enabled.
1245 *
1246 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1247 * R31.1:0 Position Offset X/Y for Slot[3:0]
1248 * R31.3:2 Position Offset X/Y for Slot[7:4]
1249 * .....
1250 *
1251 * The X, Y sample positions come in as bytes in thread payload. So, read
1252 * the positions using vstride=16, width=8, hstride=2.
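       *
       * The byte values are sample offsets in 1/16-pixel units, which is
       * why compute_sample_position() scales the result by 1/16 to get a
       * position in [0, 1] (e.g. a payload value of 8 maps to 0.5).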
1253 */
1254 struct brw_reg sample_pos_reg =
1255 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1256 BRW_REGISTER_TYPE_B), 16, 8, 2);
1257
1258 fs_inst *inst = emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1259 if (dispatch_width == 16) {
1260 inst->force_uncompressed = true;
1261 inst = emit(MOV(half(int_sample_x, 1),
1262 fs_reg(suboffset(sample_pos_reg, 16))));
1263 inst->force_sechalf = true;
1264 }
1265 /* Compute gl_SamplePosition.x */
1266 compute_sample_position(pos, int_sample_x);
1267 pos.reg_offset++;
1268 inst = emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1269 if (dispatch_width == 16) {
1270 inst->force_uncompressed = true;
1271 inst = emit(MOV(half(int_sample_y, 1),
1272 fs_reg(suboffset(sample_pos_reg, 17))));
1273 inst->force_sechalf = true;
1274 }
1275 /* Compute gl_SamplePosition.y */
1276 compute_sample_position(pos, int_sample_y);
1277 return reg;
1278 }
1279
1280 fs_reg *
1281 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1282 {
1283 assert(brw->gen >= 6);
1284
1285 this->current_annotation = "compute sample id";
1286 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1287
1288 if (key->compute_sample_id) {
1289 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1290 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1291 t2.type = BRW_REGISTER_TYPE_UW;
1292
1293 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1294 * 8x multisampling, subspan 0 will represent sample N (where N
1295 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1296 * 7. We can find the value of N by looking at R0.0 bits 7:6
1297 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1298 * (since samples are always delivered in pairs). That is, we
1299 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1300 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1301 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1302 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1303 * populating a temporary variable with the sequence (0, 1, 2, 3),
1304 * and then reading from it using vstride=1, width=4, hstride=0.
1305 * These computations hold good for 4x multisampling as well.
1306 *
1307 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1308 * the first four slots are sample 0 of subspan 0; the next four
1309 * are sample 1 of subspan 0; the third group is sample 0 of
1310 * subspan 1, and finally sample 1 of subspan 1.
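       *
       * For example, with 8x MSAA and R0.0 bits 7:6 == 0b10, t1 below
       * becomes (0x80 >> 5) == 4, so the SIMD8 ADD produces sample IDs
       * 4, 4, 4, 4, 5, 5, 5, 5 across the eight channels.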
1311 */
1312 fs_inst *inst;
1313 inst = emit(BRW_OPCODE_AND, t1,
1314 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1315 fs_reg(0xc0));
1316 inst->force_writemask_all = true;
1317 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1318 inst->force_writemask_all = true;
1319 /* This works for both SIMD8 and SIMD16 */
1320 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1321 inst->force_writemask_all = true;
1322 /* This special instruction takes care of setting vstride=1,
1323 * width=4, hstride=0 of t2 during an ADD instruction.
1324 */
1325 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1326 } else {
1327 /* As per GL_ARB_sample_shading specification:
1328 * "When rendering to a non-multisample buffer, or if multisample
1329 * rasterization is disabled, gl_SampleID will always be zero."
1330 */
1331 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1332 }
1333
1334 return reg;
1335 }
1336
1337 fs_reg
1338 fs_visitor::fix_math_operand(fs_reg src)
1339 {
1340 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1341 * might be able to do better by doing execsize = 1 math and then
1342 * expanding that result out, but we would need to be careful with
1343 * masking.
1344 *
1345 * The hardware ignores source modifiers (negate and abs) on math
1346 * instructions, so we also move to a temp to set those up.
1347 */
1348 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1349 !src.abs && !src.negate)
1350 return src;
1351
1352 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1353 * operands to math
1354 */
1355 if (brw->gen >= 7 && src.file != IMM)
1356 return src;
1357
1358 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1359 expanded.type = src.type;
1360 emit(BRW_OPCODE_MOV, expanded, src);
1361 return expanded;
1362 }
1363
1364 fs_inst *
1365 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1366 {
1367 switch (opcode) {
1368 case SHADER_OPCODE_RCP:
1369 case SHADER_OPCODE_RSQ:
1370 case SHADER_OPCODE_SQRT:
1371 case SHADER_OPCODE_EXP2:
1372 case SHADER_OPCODE_LOG2:
1373 case SHADER_OPCODE_SIN:
1374 case SHADER_OPCODE_COS:
1375 break;
1376 default:
1377 unreachable("not reached: bad math opcode");
1378 }
1379
1380 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1381 * might be able to do better by doing execsize = 1 math and then
1382 * expanding that result out, but we would need to be careful with
1383 * masking.
1384 *
1385 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1386 * instructions, so we also move to a temp to set those up.
1387 */
1388 if (brw->gen == 6 || brw->gen == 7)
1389 src = fix_math_operand(src);
1390
1391 fs_inst *inst = emit(opcode, dst, src);
1392
1393 if (brw->gen < 6) {
1394 inst->base_mrf = 2;
1395 inst->mlen = dispatch_width / 8;
1396 }
1397
1398 return inst;
1399 }
1400
1401 fs_inst *
1402 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1403 {
1404 int base_mrf = 2;
1405 fs_inst *inst;
1406
1407 if (brw->gen >= 8) {
1408 inst = emit(opcode, dst, src0, src1);
1409 } else if (brw->gen >= 6) {
1410 src0 = fix_math_operand(src0);
1411 src1 = fix_math_operand(src1);
1412
1413 inst = emit(opcode, dst, src0, src1);
1414 } else {
1415 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1416 * "Message Payload":
1417 *
1418 * "Operand0[7]. For the INT DIV functions, this operand is the
1419 * denominator."
1420 * ...
1421 * "Operand1[7]. For the INT DIV functions, this operand is the
1422 * numerator."
1423 */
1424 bool is_int_div = opcode != SHADER_OPCODE_POW;
1425 fs_reg &op0 = is_int_div ? src1 : src0;
1426 fs_reg &op1 = is_int_div ? src0 : src1;
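      /* i.e. for INT_QUOTIENT/INT_REMAINDER the denominator (src1) becomes
       * the instruction's src0 (the first message operand) and the
       * numerator (src0) is moved into the second MRF below, matching the
       * payload layout quoted above.
       */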
1427
1428 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1429 inst = emit(opcode, dst, op0, reg_null_f);
1430
1431 inst->base_mrf = base_mrf;
1432 inst->mlen = 2 * dispatch_width / 8;
1433 }
1434 return inst;
1435 }
1436
1437 void
1438 fs_visitor::assign_curb_setup()
1439 {
1440 if (dispatch_width == 8) {
1441 prog_data->base.dispatch_grf_start_reg = payload.num_regs;
1442 } else {
1443 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1444 }
1445
1446 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1447
1448 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1449 foreach_in_list(fs_inst, inst, &instructions) {
1450 for (unsigned int i = 0; i < inst->sources; i++) {
1451 if (inst->src[i].file == UNIFORM) {
1452 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1453 int constant_nr;
1454 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1455 constant_nr = push_constant_loc[uniform_nr];
1456 } else {
1457 /* Section 5.11 of the OpenGL 4.1 spec says:
1458 * "Out-of-bounds reads return undefined values, which include
1459 * values from other variables of the active program or zero."
1460 * Just return the first push constant.
1461 */
1462 constant_nr = 0;
1463 }
1464
1465 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1466 constant_nr / 8,
1467 constant_nr % 8);
1468
1469 inst->src[i].file = HW_REG;
1470 inst->src[i].fixed_hw_reg = byte_offset(
1471 retype(brw_reg, inst->src[i].type),
1472 inst->src[i].subreg_offset);
1473 }
1474 }
1475 }
1476 }
1477
1478 void
1479 fs_visitor::calculate_urb_setup()
1480 {
1481 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1482 prog_data->urb_setup[i] = -1;
1483 }
1484
1485 int urb_next = 0;
1486 /* Figure out where each of the incoming setup attributes lands. */
1487 if (brw->gen >= 6) {
1488 if (_mesa_bitcount_64(fp->Base.InputsRead &
1489 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1490 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1491 * first 16 varying inputs, so we can put them wherever we want.
1492 * Just put them in order.
1493 *
1494 * This is useful because it means that (a) inputs not used by the
1495 * fragment shader won't take up valuable register space, and (b) we
1496 * won't have to recompile the fragment shader if it gets paired with
1497 * a different vertex (or geometry) shader.
1498 */
1499 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1500 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1501 BITFIELD64_BIT(i)) {
1502 prog_data->urb_setup[i] = urb_next++;
1503 }
1504 }
1505 } else {
1506 /* We have enough input varyings that the SF/SBE pipeline stage can't
1507 * arbitrarily rearrange them to suit our whim; we have to put them
1508 * in an order that matches the output of the previous pipeline stage
1509 * (geometry or vertex shader).
1510 */
1511 struct brw_vue_map prev_stage_vue_map;
1512 brw_compute_vue_map(brw, &prev_stage_vue_map,
1513 key->input_slots_valid);
1514 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1515 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1516 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1517 slot++) {
1518 int varying = prev_stage_vue_map.slot_to_varying[slot];
1519 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1520 * unused.
1521 */
1522 if (varying != BRW_VARYING_SLOT_COUNT &&
1523 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1524 BITFIELD64_BIT(varying))) {
1525 prog_data->urb_setup[varying] = slot - first_slot;
1526 }
1527 }
1528 urb_next = prev_stage_vue_map.num_slots - first_slot;
1529 }
1530 } else {
1531 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1532 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1533 /* Point size is packed into the header, not as a general attribute */
1534 if (i == VARYING_SLOT_PSIZ)
1535 continue;
1536
1537 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1538 /* The back color slot is skipped when the front color is
1539 * also written to. In addition, some slots can be
1540 * written in the vertex shader and not read in the
1541 * fragment shader. So the register number must always be
1542 * incremented, mapped or not.
1543 */
1544 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1545 prog_data->urb_setup[i] = urb_next;
1546 urb_next++;
1547 }
1548 }
1549
1550 /*
1551 * It's an FS-only attribute, and we did interpolation for this attribute
1552 * in the SF thread. So, count it here, too.
1553 *
1554 * See compile_sf_prog() for more info.
1555 */
1556 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1557 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1558 }
1559
1560 prog_data->num_varying_inputs = urb_next;
1561 }
1562
1563 void
1564 fs_visitor::assign_urb_setup()
1565 {
1566 int urb_start = payload.num_regs + prog_data->curb_read_length;
1567
1568 /* Offset all the urb_setup[] index by the actual position of the
1569 * setup regs, now that the location of the constants has been chosen.
1570 */
1571 foreach_in_list(fs_inst, inst, &instructions) {
1572 if (inst->opcode == FS_OPCODE_LINTERP) {
1573 assert(inst->src[2].file == HW_REG);
1574 inst->src[2].fixed_hw_reg.nr += urb_start;
1575 }
1576
1577 if (inst->opcode == FS_OPCODE_CINTERP) {
1578 assert(inst->src[0].file == HW_REG);
1579 inst->src[0].fixed_hw_reg.nr += urb_start;
1580 }
1581 }
1582
1583 /* Each attribute is 4 setup channels, each of which is half a reg. */
1584 this->first_non_payload_grf =
1585 urb_start + prog_data->num_varying_inputs * 2;
1586 }
1587
1588 /**
1589 * Split large virtual GRFs into separate components if we can.
1590 *
1591 * This is mostly duplicated with what brw_fs_vector_splitting does,
1592 * but that's really conservative because it's afraid of doing
1593 * splitting that doesn't result in real progress after the rest of
1594 * the optimization phases, which would cause infinite looping in
1595 * optimization. We can do it once here, safely. This also has the
1596 * opportunity to split interpolated values, or maybe even uniforms,
1597 * which we don't have at the IR level.
1598 *
1599 * We want to split, because virtual GRFs are what we register
1600 * allocate and spill (due to contiguousness requirements for some
1601 * instructions), and they're what we naturally generate in the
1602 * codegen process, but most virtual GRFs don't actually need to be
1603 * contiguous sets of GRFs. If we split, we'll end up with reduced
1604 * live intervals and better dead code elimination and coalescing.
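  *
  * For example, a size-4 VGRF holding a vec4 temporary is split into four
  * independent size-1 VGRFs, unless an instruction writes it as a
  * contiguous multi-register destination or a send-from-GRF reads it
  * (see the checks below).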
1605 */
1606 void
1607 fs_visitor::split_virtual_grfs()
1608 {
1609 int num_vars = this->virtual_grf_count;
1610 bool split_grf[num_vars];
1611 int new_virtual_grf[num_vars];
1612
1613 /* Try to split anything > 0 sized. */
1614 for (int i = 0; i < num_vars; i++) {
1615 if (this->virtual_grf_sizes[i] != 1)
1616 split_grf[i] = true;
1617 else
1618 split_grf[i] = false;
1619 }
1620
1621 if (brw->has_pln &&
1622 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1623 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1624 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1625 * Gen6, that was the only supported interpolation mode, and since Gen6,
1626 * delta_x and delta_y are in fixed hardware registers.
1627 */
1628 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1629 false;
1630 }
1631
1632 foreach_in_list(fs_inst, inst, &instructions) {
1633 /* If there's a SEND message that requires contiguous destination
1634 * registers, no splitting is allowed.
1635 */
1636 if (inst->regs_written > 1) {
1637 split_grf[inst->dst.reg] = false;
1638 }
1639
1640 /* If we're sending from a GRF, don't split it, on the assumption that
1641 * the send is reading the whole thing.
1642 */
1643 if (inst->is_send_from_grf()) {
1644 for (int i = 0; i < inst->sources; i++) {
1645 if (inst->src[i].file == GRF) {
1646 split_grf[inst->src[i].reg] = false;
1647 }
1648 }
1649 }
1650 }
1651
1652 /* Allocate new space for split regs. Note that the virtual
1653 * numbers will be contiguous.
1654 */
1655 for (int i = 0; i < num_vars; i++) {
1656 if (split_grf[i]) {
1657 new_virtual_grf[i] = virtual_grf_alloc(1);
1658 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1659 int reg = virtual_grf_alloc(1);
1660 assert(reg == new_virtual_grf[i] + j - 1);
1661 (void) reg;
1662 }
1663 this->virtual_grf_sizes[i] = 1;
1664 }
1665 }
1666
1667 foreach_in_list(fs_inst, inst, &instructions) {
1668 if (inst->dst.file == GRF &&
1669 split_grf[inst->dst.reg] &&
1670 inst->dst.reg_offset != 0) {
1671 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1672 inst->dst.reg_offset - 1);
1673 inst->dst.reg_offset = 0;
1674 }
1675 for (int i = 0; i < inst->sources; i++) {
1676 if (inst->src[i].file == GRF &&
1677 split_grf[inst->src[i].reg] &&
1678 inst->src[i].reg_offset != 0) {
1679 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1680 inst->src[i].reg_offset - 1);
1681 inst->src[i].reg_offset = 0;
1682 }
1683 }
1684 }
1685 invalidate_live_intervals();
1686 }
1687
1688 /**
1689 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1690 *
1691 * During code generation, we create tons of temporary variables, many of
1692 * which get immediately killed and are never used again. Yet, in later
1693 * optimization and analysis passes, such as compute_live_intervals, we need
1694 * to loop over all the virtual GRFs. Compacting them can save a lot of
1695 * overhead.
1696 */
1697 void
1698 fs_visitor::compact_virtual_grfs()
1699 {
1700 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER))
1701 return;
1702
1703 /* Mark which virtual GRFs are used, and count how many. */
1704 int remap_table[this->virtual_grf_count];
1705 memset(remap_table, -1, sizeof(remap_table));
1706
1707 foreach_in_list(const fs_inst, inst, &instructions) {
1708 if (inst->dst.file == GRF)
1709 remap_table[inst->dst.reg] = 0;
1710
1711 for (int i = 0; i < inst->sources; i++) {
1712 if (inst->src[i].file == GRF)
1713 remap_table[inst->src[i].reg] = 0;
1714 }
1715 }
1716
1717 /* Compact the GRF arrays. */
1718 int new_index = 0;
1719 for (int i = 0; i < this->virtual_grf_count; i++) {
1720 if (remap_table[i] != -1) {
1721 remap_table[i] = new_index;
1722 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1723 invalidate_live_intervals();
1724 ++new_index;
1725 }
1726 }
1727
1728 this->virtual_grf_count = new_index;
1729
1730 /* Patch all the instructions to use the newly renumbered registers */
1731 foreach_in_list(fs_inst, inst, &instructions) {
1732 if (inst->dst.file == GRF)
1733 inst->dst.reg = remap_table[inst->dst.reg];
1734
1735 for (int i = 0; i < inst->sources; i++) {
1736 if (inst->src[i].file == GRF)
1737 inst->src[i].reg = remap_table[inst->src[i].reg];
1738 }
1739 }
1740
1741 /* Patch all the references to delta_x/delta_y, since they're used in
1742 * register allocation.
1743 */
1744 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
1745 if (delta_x[i].file == GRF && remap_table[delta_x[i].reg] != -1) {
1746 delta_x[i].reg = remap_table[delta_x[i].reg];
1747 }
1748 }
1749 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
1750 if (delta_y[i].file == GRF && remap_table[delta_y[i].reg] != -1) {
1751 delta_y[i].reg = remap_table[delta_y[i].reg];
1752 }
1753 }
1754 }
1755
1756 /*
1757 * Implements array access of uniforms by inserting a
1758 * PULL_CONSTANT_LOAD instruction.
1759 *
1760 * Unlike temporary GRF array access (where we don't support it due to
1761 * the difficulty of doing relative addressing on instruction
1762 * destinations), we could potentially do array access of uniforms
1763 * that were loaded in GRF space as push constants. In real-world
1764 * usage we've seen, though, the arrays being used are always larger
1765 * than we could load as push constants, so just always move all
1766 * uniform array access out to a pull constant buffer.
1767 */
1768 void
1769 fs_visitor::move_uniform_array_access_to_pull_constants()
1770 {
1771 if (dispatch_width != 8)
1772 return;
1773
1774 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1775
1776 for (unsigned int i = 0; i < uniforms; i++) {
1777 pull_constant_loc[i] = -1;
1778 }
1779
1780 /* Walk through and find array access of uniforms. Put a copy of that
1781 * uniform in the pull constant buffer.
1782 *
1783 * Note that we don't move constant-indexed accesses to arrays. No
1784 * testing has been done of the performance impact of this choice.
1785 */
1786 foreach_in_list_safe(fs_inst, inst, &instructions) {
1787 for (int i = 0 ; i < inst->sources; i++) {
1788 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1789 continue;
1790
1791 int uniform = inst->src[i].reg;
1792
1793 /* If this array isn't already present in the pull constant buffer,
1794 * add it.
1795 */
1796 if (pull_constant_loc[uniform] == -1) {
1797 const gl_constant_value **values = &stage_prog_data->param[uniform];
1798
1799 assert(param_size[uniform]);
1800
1801 for (int j = 0; j < param_size[uniform]; j++) {
1802 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
1803
1804 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
1805 values[j];
1806 }
1807 }
1808 }
1809 }
1810 }
1811
1812 /**
1813 * Assign UNIFORM file registers to either push constants or pull constants.
1814 *
1815 * We allow a fragment shader to use more than the specified minimum
1816 * value of the maximum number of fragment shader uniform components (64).
1817 * If there are too many, they'd fill up all of the register space.
1818 * So, this will push some of them out to the pull constant buffer and
1819 * update the program to load them.
1820 */
1821 void
1822 fs_visitor::assign_constant_locations()
1823 {
1824 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
1825 if (dispatch_width != 8)
1826 return;
1827
1828 /* Find which UNIFORM registers are still in use. */
1829 bool is_live[uniforms];
1830 for (unsigned int i = 0; i < uniforms; i++) {
1831 is_live[i] = false;
1832 }
1833
1834 foreach_in_list(fs_inst, inst, &instructions) {
1835 for (int i = 0; i < inst->sources; i++) {
1836 if (inst->src[i].file != UNIFORM)
1837 continue;
1838
1839 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1840 if (constant_nr >= 0 && constant_nr < (int) uniforms)
1841 is_live[constant_nr] = true;
1842 }
1843 }
1844
1845 /* Only allow 16 registers (128 uniform components) as push constants.
1846 *
1847 * Just demote the end of the list. We could probably do better
1848 * here, demoting things that are rarely used in the program first.
1849 *
1850 * If changing this value, note the limitation about total_regs in
1851 * brw_curbe.c.
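    *
    * For example, a program with 200 live uniform components keeps the
    * first 128 of them as push constants and demotes the remaining 72 to
    * the pull constant buffer below.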
1852 */
1853 unsigned int max_push_components = 16 * 8;
1854 unsigned int num_push_constants = 0;
1855
1856 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
1857
1858 for (unsigned int i = 0; i < uniforms; i++) {
1859 if (!is_live[i] || pull_constant_loc[i] != -1) {
1860 /* This UNIFORM register is either dead, or has already been demoted
1861 * to a pull const. Mark it as no longer living in the param[] array.
1862 */
1863 push_constant_loc[i] = -1;
1864 continue;
1865 }
1866
1867 if (num_push_constants < max_push_components) {
1868 /* Retain as a push constant. Record the location in the param[]
1869 * array.
1870 */
1871 push_constant_loc[i] = num_push_constants++;
1872 } else {
1873 /* Demote to a pull constant. */
1874 push_constant_loc[i] = -1;
1875
1876 int pull_index = stage_prog_data->nr_pull_params++;
1877 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
1878 pull_constant_loc[i] = pull_index;
1879 }
1880 }
1881
1882 stage_prog_data->nr_params = num_push_constants;
1883
1884 /* Up until now, the param[] array has been indexed by reg + reg_offset
1885 * of UNIFORM registers. Condense it to only contain the uniforms we
1886 * chose to upload as push constants.
1887 */
1888 for (unsigned int i = 0; i < uniforms; i++) {
1889 int remapped = push_constant_loc[i];
1890
1891 if (remapped == -1)
1892 continue;
1893
1894 assert(remapped <= (int)i);
1895 stage_prog_data->param[remapped] = stage_prog_data->param[i];
1896 }
1897 }
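
/* Example of the split performed above (illustrative numbers): with
 * max_push_components == 128, a shader with 150 live uniform components and
 * nothing already demoted keeps components 0..127 as push constants
 * (push_constant_loc[i] == 0..127), while components 128..149 get
 * push_constant_loc[i] == -1 and are appended to pull_param[], with
 * pull_constant_loc[i] recording their pull slots. nr_params ends up as 128
 * and param[] is compacted so that param[push_constant_loc[i]] holds what
 * param[i] used to.
 */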
1898
1899 /**
1900 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
1901 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
1902 */
1903 void
1904 fs_visitor::demote_pull_constants()
1905 {
1906 foreach_in_list(fs_inst, inst, &instructions) {
1907 for (int i = 0; i < inst->sources; i++) {
1908 if (inst->src[i].file != UNIFORM)
1909 continue;
1910
1911 int pull_index = pull_constant_loc[inst->src[i].reg +
1912 inst->src[i].reg_offset];
1913 if (pull_index == -1)
1914 continue;
1915
1916 /* Set up the annotation tracking for newly generated instructions. */
1917 base_ir = inst->ir;
1918 current_annotation = inst->annotation;
1919
1920 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
1921 fs_reg dst = fs_reg(this, glsl_type::float_type);
1922
1923 /* Generate a pull load into dst. */
1924 if (inst->src[i].reladdr) {
1925 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
1926 surf_index,
1927 *inst->src[i].reladdr,
1928 pull_index);
1929 inst->insert_before(&list);
1930 inst->src[i].reladdr = NULL;
1931 } else {
1932 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1933 fs_inst *pull =
1934 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1935 dst, surf_index, offset);
1936 inst->insert_before(pull);
1937 inst->src[i].set_smear(pull_index & 3);
1938 }
1939
1940 /* Rewrite the instruction to use the temporary VGRF. */
1941 inst->src[i].file = GRF;
1942 inst->src[i].reg = dst.reg;
1943 inst->src[i].reg_offset = 0;
1944 }
1945 }
1946 invalidate_live_intervals();
1947 }
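
/* Worked example for the constant-indexed (no reladdr) path above: with
 * pull_index == 6, the byte offset is (6 * 4) & ~15 == 16, so the
 * UNIFORM_PULL_CONSTANT_LOAD fetches the vec4-aligned block containing
 * elements 4..7, and set_smear(6 & 3) == set_smear(2) then reads component 2
 * of that block, i.e. element 6. The source is rewritten to point at the new
 * float temporary.
 */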
1948
1949 bool
1950 fs_visitor::opt_algebraic()
1951 {
1952 bool progress = false;
1953
1954 foreach_in_list(fs_inst, inst, &instructions) {
1955 switch (inst->opcode) {
1956 case BRW_OPCODE_MUL:
1957 if (inst->src[1].file != IMM)
1958 continue;
1959
1960 /* a * 1.0 = a */
1961 if (inst->src[1].is_one()) {
1962 inst->opcode = BRW_OPCODE_MOV;
1963 inst->src[1] = reg_undef;
1964 progress = true;
1965 break;
1966 }
1967
1968 /* a * 0.0 = 0.0 */
1969 if (inst->src[1].is_zero()) {
1970 inst->opcode = BRW_OPCODE_MOV;
1971 inst->src[0] = inst->src[1];
1972 inst->src[1] = reg_undef;
1973 progress = true;
1974 break;
1975 }
1976
1977 break;
1978 case BRW_OPCODE_ADD:
1979 if (inst->src[1].file != IMM)
1980 continue;
1981
1982 /* a + 0.0 = a */
1983 if (inst->src[1].is_zero()) {
1984 inst->opcode = BRW_OPCODE_MOV;
1985 inst->src[1] = reg_undef;
1986 progress = true;
1987 break;
1988 }
1989 break;
1990 case BRW_OPCODE_OR:
1991 if (inst->src[0].equals(inst->src[1])) {
1992 inst->opcode = BRW_OPCODE_MOV;
1993 inst->src[1] = reg_undef;
1994 progress = true;
1995 break;
1996 }
1997 break;
1998 case BRW_OPCODE_LRP:
1999 if (inst->src[1].equals(inst->src[2])) {
2000 inst->opcode = BRW_OPCODE_MOV;
2001 inst->src[0] = inst->src[1];
2002 inst->src[1] = reg_undef;
2003 inst->src[2] = reg_undef;
2004 progress = true;
2005 break;
2006 }
2007 break;
2008 case BRW_OPCODE_SEL:
2009 if (inst->src[0].equals(inst->src[1])) {
2010 inst->opcode = BRW_OPCODE_MOV;
2011 inst->src[1] = reg_undef;
2012 inst->predicate = BRW_PREDICATE_NONE;
2013 inst->predicate_inverse = false;
2014 progress = true;
2015 } else if (inst->saturate && inst->src[1].file == IMM) {
2016 switch (inst->conditional_mod) {
2017 case BRW_CONDITIONAL_LE:
2018 case BRW_CONDITIONAL_L:
2019 switch (inst->src[1].type) {
2020 case BRW_REGISTER_TYPE_F:
2021 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2022 inst->opcode = BRW_OPCODE_MOV;
2023 inst->src[1] = reg_undef;
2024 progress = true;
2025 }
2026 break;
2027 default:
2028 break;
2029 }
2030 break;
2031 case BRW_CONDITIONAL_GE:
2032 case BRW_CONDITIONAL_G:
2033 switch (inst->src[1].type) {
2034 case BRW_REGISTER_TYPE_F:
2035 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2036 inst->opcode = BRW_OPCODE_MOV;
2037 inst->src[1] = reg_undef;
2038 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2039 progress = true;
2040 }
2041 break;
2042 default:
2043 break;
2044 }
break;
2045 default:
2046 break;
2047 }
2048 }
2049 break;
2050 default:
2051 break;
2052 }
2053 }
2054
2055 return progress;
2056 }
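
/* A few of the rewrites performed above, in shorthand (illustrative):
 *
 *    mul dst, a, 1.0f        -> mov dst, a
 *    mul dst, a, 0.0f        -> mov dst, 0.0f
 *    add dst, a, 0.0f        -> mov dst, a
 *    or  dst, a, a           -> mov dst, a
 *    sel.l.sat dst, a, 2.0f  -> mov.sat dst, a
 *
 * The last one holds because taking the minimum against an immediate >= 1.0
 * is a no-op once the result is saturated to [0, 1]; the symmetric case
 * handles maxima against immediates <= 0.0.
 */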
2057
2058 bool
2059 fs_visitor::opt_register_renaming()
2060 {
2061 bool progress = false;
2062 int depth = 0;
2063
2064 int remap[virtual_grf_count];
2065 memset(remap, -1, sizeof(int) * virtual_grf_count);
2066
2067 foreach_in_list(fs_inst, inst, &this->instructions) {
2068 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2069 depth++;
2070 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2071 inst->opcode == BRW_OPCODE_WHILE) {
2072 depth--;
2073 }
2074
2075 /* Rewrite instruction sources. */
2076 for (int i = 0; i < inst->sources; i++) {
2077 if (inst->src[i].file == GRF &&
2078 remap[inst->src[i].reg] != -1 &&
2079 remap[inst->src[i].reg] != inst->src[i].reg) {
2080 inst->src[i].reg = remap[inst->src[i].reg];
2081 progress = true;
2082 }
2083 }
2084
2085 const int dst = inst->dst.reg;
2086
2087 if (depth == 0 &&
2088 inst->dst.file == GRF &&
2089 virtual_grf_sizes[inst->dst.reg] == 1 &&
2090 !inst->is_partial_write()) {
2091 if (remap[dst] == -1) {
2092 remap[dst] = dst;
2093 } else {
2094 remap[dst] = virtual_grf_alloc(1);
2095 inst->dst.reg = remap[dst];
2096 progress = true;
2097 }
2098 } else if (inst->dst.file == GRF &&
2099 remap[dst] != -1 &&
2100 remap[dst] != dst) {
2101 inst->dst.reg = remap[dst];
2102 progress = true;
2103 }
2104 }
2105
2106 if (progress) {
2107 invalidate_live_intervals();
2108
2109 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2110 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2111 delta_x[i].reg = remap[delta_x[i].reg];
2112 }
2113 }
2114 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2115 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2116 delta_y[i].reg = remap[delta_y[i].reg];
2117 }
2118 }
2119 }
2120
2121 return progress;
2122 }
2123
2124 bool
2125 fs_visitor::compute_to_mrf()
2126 {
2127 bool progress = false;
2128 int next_ip = 0;
2129
2130 calculate_live_intervals();
2131
2132 foreach_in_list_safe(fs_inst, inst, &instructions) {
2133 int ip = next_ip;
2134 next_ip++;
2135
2136 if (inst->opcode != BRW_OPCODE_MOV ||
2137 inst->is_partial_write() ||
2138 inst->dst.file != MRF || inst->src[0].file != GRF ||
2139 inst->dst.type != inst->src[0].type ||
2140 inst->src[0].abs || inst->src[0].negate ||
2141 !inst->src[0].is_contiguous() ||
2142 inst->src[0].subreg_offset)
2143 continue;
2144
2145 /* Work out which hardware MRF registers are written by this
2146 * instruction.
2147 */
2148 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2149 int mrf_high;
2150 if (inst->dst.reg & BRW_MRF_COMPR4) {
2151 mrf_high = mrf_low + 4;
2152 } else if (dispatch_width == 16 &&
2153 (!inst->force_uncompressed && !inst->force_sechalf)) {
2154 mrf_high = mrf_low + 1;
2155 } else {
2156 mrf_high = mrf_low;
2157 }
2158
2159 /* Can't compute-to-MRF this GRF if someone else was going to
2160 * read it later.
2161 */
2162 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2163 continue;
2164
2165 /* Found a move of a GRF to an MRF. Let's see if we can
2166 * rewrite the instruction that generated this GRF to write into the MRF instead.
2167 */
2168 fs_inst *scan_inst;
2169 for (scan_inst = (fs_inst *)inst->prev;
2170 !scan_inst->is_head_sentinel();
2171 scan_inst = (fs_inst *)scan_inst->prev) {
2172 if (scan_inst->dst.file == GRF &&
2173 scan_inst->dst.reg == inst->src[0].reg) {
2174 /* Found the last instruction to write the register we want to turn
2175 * into a compute-to-MRF.
2176 */
2177
2178 /* If this one instruction didn't populate all the
2179 * channels, bail. We might be able to rewrite everything
2180 * that writes that reg, but it would require smarter
2181 * tracking to delay the rewriting until complete success.
2182 */
2183 if (scan_inst->is_partial_write())
2184 break;
2185
2186 /* Instructions that write more than one register would need us to
2187 * understand coalescing out more than one MOV at a time.
2188 */
2189 if (scan_inst->regs_written > 1)
2190 break;
2191
2192 /* SEND instructions can't have MRF as a destination. */
2193 if (scan_inst->mlen)
2194 break;
2195
2196 if (brw->gen == 6) {
2197 /* gen6 math instructions must have the destination be
2198 * GRF, so no compute-to-MRF for them.
2199 */
2200 if (scan_inst->is_math()) {
2201 break;
2202 }
2203 }
2204
2205 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2206 /* Found the creator of our MRF's source value. */
2207 scan_inst->dst.file = MRF;
2208 scan_inst->dst.reg = inst->dst.reg;
2209 scan_inst->saturate |= inst->saturate;
2210 inst->remove();
2211 progress = true;
2212 }
2213 break;
2214 }
2215
2216 /* We don't handle control flow here. Most computation of
2217 * values that end up in MRFs happens shortly before the MRF
2218 * write anyway.
2219 */
2220 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2221 break;
2222
2223 /* You can't read from an MRF, so if someone else reads our
2224 * MRF's source GRF that we wanted to rewrite, that stops us.
2225 */
2226 bool interfered = false;
2227 for (int i = 0; i < scan_inst->sources; i++) {
2228 if (scan_inst->src[i].file == GRF &&
2229 scan_inst->src[i].reg == inst->src[0].reg &&
2230 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2231 interfered = true;
2232 }
2233 }
2234 if (interfered)
2235 break;
2236
2237 if (scan_inst->dst.file == MRF) {
2238 /* If somebody else writes our MRF here, we can't
2239 * compute-to-MRF before that.
2240 */
2241 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2242 int scan_mrf_high;
2243
2244 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2245 scan_mrf_high = scan_mrf_low + 4;
2246 } else if (dispatch_width == 16 &&
2247 (!scan_inst->force_uncompressed &&
2248 !scan_inst->force_sechalf)) {
2249 scan_mrf_high = scan_mrf_low + 1;
2250 } else {
2251 scan_mrf_high = scan_mrf_low;
2252 }
2253
2254 if (mrf_low == scan_mrf_low ||
2255 mrf_low == scan_mrf_high ||
2256 mrf_high == scan_mrf_low ||
2257 mrf_high == scan_mrf_high) {
2258 break;
2259 }
2260 }
2261
2262 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2263 /* Found a SEND instruction, which means that there are
2264 * live values in MRFs from base_mrf to base_mrf +
2265 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2266 * above it.
2267 */
2268 if (mrf_low >= scan_inst->base_mrf &&
2269 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2270 break;
2271 }
2272 if (mrf_high >= scan_inst->base_mrf &&
2273 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2274 break;
2275 }
2276 }
2277 }
2278 }
2279
2280 if (progress)
2281 invalidate_live_intervals();
2282
2283 return progress;
2284 }
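
/* Shape of the rewrite performed above (illustrative), assuming vgrf7 is not
 * read again after the MOV and all the restrictions checked above hold (full
 * register write, matching types, no source modifiers):
 *
 *    add vgrf7, vgrf5, vgrf6
 *    mov m4, vgrf7
 *
 * becomes
 *
 *    add m4, vgrf5, vgrf6
 *
 * i.e. the generating instruction writes the MRF directly and the MOV is
 * removed.
 */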
2285
2286 /**
2287 * Walks through basic blocks, looking for repeated MRF writes and
2288 * removing the later ones.
2289 */
2290 bool
2291 fs_visitor::remove_duplicate_mrf_writes()
2292 {
2293 fs_inst *last_mrf_move[16];
2294 bool progress = false;
2295
2296 /* In SIMD16 mode we would need to update the MRF tracking for compressed instructions, so skip this pass. */
2297 if (dispatch_width == 16)
2298 return false;
2299
2300 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2301
2302 foreach_in_list_safe(fs_inst, inst, &instructions) {
2303 if (inst->is_control_flow()) {
2304 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2305 }
2306
2307 if (inst->opcode == BRW_OPCODE_MOV &&
2308 inst->dst.file == MRF) {
2309 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2310 if (prev_inst && inst->equals(prev_inst)) {
2311 inst->remove();
2312 progress = true;
2313 continue;
2314 }
2315 }
2316
2317 /* Clear out the last-write records for MRFs that were overwritten. */
2318 if (inst->dst.file == MRF) {
2319 last_mrf_move[inst->dst.reg] = NULL;
2320 }
2321
2322 if (inst->mlen > 0 && inst->base_mrf != -1) {
2323 /* Found a SEND instruction, which will include two or fewer
2324 * implied MRF writes. We could do better here.
2325 */
2326 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2327 last_mrf_move[inst->base_mrf + i] = NULL;
2328 }
2329 }
2330
2331 /* Clear out any MRF move records whose sources got overwritten. */
2332 if (inst->dst.file == GRF) {
2333 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2334 if (last_mrf_move[i] &&
2335 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2336 last_mrf_move[i] = NULL;
2337 }
2338 }
2339 }
2340
2341 if (inst->opcode == BRW_OPCODE_MOV &&
2342 inst->dst.file == MRF &&
2343 inst->src[0].file == GRF &&
2344 !inst->is_partial_write()) {
2345 last_mrf_move[inst->dst.reg] = inst;
2346 }
2347 }
2348
2349 if (progress)
2350 invalidate_live_intervals();
2351
2352 return progress;
2353 }
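
/* Example of what gets cleaned up above (illustrative): two identical
 *
 *    mov m2, vgrf3
 *
 * instructions with no intervening control flow and no intervening write to
 * m2 or vgrf3 -- the second MOV is removed.
 */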
2354
2355 static void
2356 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2357 int first_grf, int grf_len)
2358 {
2359 bool inst_simd16 = (dispatch_width > 8 &&
2360 !inst->force_uncompressed &&
2361 !inst->force_sechalf);
2362
2363 /* Clear the flag for registers that actually got read (as expected). */
2364 for (int i = 0; i < inst->sources; i++) {
2365 int grf;
2366 if (inst->src[i].file == GRF) {
2367 grf = inst->src[i].reg;
2368 } else if (inst->src[i].file == HW_REG &&
2369 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2370 grf = inst->src[i].fixed_hw_reg.nr;
2371 } else {
2372 continue;
2373 }
2374
2375 if (grf >= first_grf &&
2376 grf < first_grf + grf_len) {
2377 deps[grf - first_grf] = false;
2378 if (inst_simd16)
2379 deps[grf - first_grf + 1] = false;
2380 }
2381 }
2382 }
2383
2384 /**
2385 * Implements this workaround for the original 965:
2386 *
2387 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2388 * check for post destination dependencies on this instruction, software
2389 * must ensure that there is no destination hazard for the case of ‘write
2390 * followed by a posted write’ shown in the following example.
2391 *
2392 * 1. mov r3 0
2393 * 2. send r3.xy <rest of send instruction>
2394 * 3. mov r2 r3
2395 *
2396 * Due to no post-destination dependency check on the ‘send’, the above
2397 * code sequence could have two instructions (1 and 2) in flight at the
2398 * same time that both consider ‘r3’ as the target of their final writes.
2399 */
2400 void
2401 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2402 {
2403 int reg_size = dispatch_width / 8;
2404 int write_len = inst->regs_written * reg_size;
2405 int first_write_grf = inst->dst.reg;
2406 bool needs_dep[BRW_MAX_MRF];
2407 assert(write_len < (int)sizeof(needs_dep) - 1);
2408
2409 memset(needs_dep, false, sizeof(needs_dep));
2410 memset(needs_dep, true, write_len);
2411
2412 clear_deps_for_inst_src(inst, dispatch_width,
2413 needs_dep, first_write_grf, write_len);
2414
2415 /* Walk backwards looking for writes to the registers we're writing that
2416 * haven't been read since they were written. If we hit the start of the program,
2417 * we assume that there are no outstanding dependencies on entry to the
2418 * program.
2419 */
2420 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2421 !scan_inst->is_head_sentinel();
2422 scan_inst = (fs_inst *)scan_inst->prev) {
2423
2424 /* If we hit control flow, assume that there *are* outstanding
2425 * dependencies, and force their cleanup before our instruction.
2426 */
2427 if (scan_inst->is_control_flow()) {
2428 for (int i = 0; i < write_len; i++) {
2429 if (needs_dep[i]) {
2430 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2431 }
2432 }
2433 return;
2434 }
2435
2436 bool scan_inst_simd16 = (dispatch_width > 8 &&
2437 !scan_inst->force_uncompressed &&
2438 !scan_inst->force_sechalf);
2439
2440 /* We insert our reads as late as possible on the assumption that any
2441 * instruction other than a MOV that might have left us an outstanding
2442 * dependency has more latency than a MOV.
2443 */
2444 if (scan_inst->dst.file == GRF) {
2445 for (int i = 0; i < scan_inst->regs_written; i++) {
2446 int reg = scan_inst->dst.reg + i * reg_size;
2447
2448 if (reg >= first_write_grf &&
2449 reg < first_write_grf + write_len &&
2450 needs_dep[reg - first_write_grf]) {
2451 inst->insert_before(DEP_RESOLVE_MOV(reg));
2452 needs_dep[reg - first_write_grf] = false;
2453 if (scan_inst_simd16)
2454 needs_dep[reg - first_write_grf + 1] = false;
2455 }
2456 }
2457 }
2458
2459 /* Clear the flag for registers that actually got read (as expected). */
2460 clear_deps_for_inst_src(scan_inst, dispatch_width,
2461 needs_dep, first_write_grf, write_len);
2462
2463 /* Continue the loop only if we haven't resolved all the dependencies */
2464 int i;
2465 for (i = 0; i < write_len; i++) {
2466 if (needs_dep[i])
2467 break;
2468 }
2469 if (i == write_len)
2470 return;
2471 }
2472 }
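
/* For the example sequence in the comment above, the workaround inserts a
 * DEP_RESOLVE_MOV on r3 immediately before the send, forcing the earlier
 * "mov r3 0" to be complete before the send's posted write is issued.
 */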
2473
2474 /**
2475 * Implements this workaround for the original 965:
2476 *
2477 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2478 * used as a destination register until after it has been sourced by an
2479 * instruction with a different destination register."
2480 */
2481 void
2482 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2483 {
2484 int write_len = inst->regs_written * dispatch_width / 8;
2485 int first_write_grf = inst->dst.reg;
2486 bool needs_dep[BRW_MAX_MRF];
2487 assert(write_len < (int)sizeof(needs_dep) - 1);
2488
2489 memset(needs_dep, false, sizeof(needs_dep));
2490 memset(needs_dep, true, write_len);
2491 /* Walk forwards looking for writes to the registers we're writing that haven't
2492 * been read before being written again.
2493 */
2494 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2495 !scan_inst->is_tail_sentinel();
2496 scan_inst = (fs_inst *)scan_inst->next) {
2497 /* If we hit control flow, force resolve all remaining dependencies. */
2498 if (scan_inst->is_control_flow()) {
2499 for (int i = 0; i < write_len; i++) {
2500 if (needs_dep[i])
2501 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2502 }
2503 return;
2504 }
2505
2506 /* Clear the flag for registers that actually got read (as expected). */
2507 clear_deps_for_inst_src(scan_inst, dispatch_width,
2508 needs_dep, first_write_grf, write_len);
2509
2510 /* We insert our reads as late as possible since they're reading the
2511 * result of a SEND, which has massive latency.
2512 */
2513 if (scan_inst->dst.file == GRF &&
2514 scan_inst->dst.reg >= first_write_grf &&
2515 scan_inst->dst.reg < first_write_grf + write_len &&
2516 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2517 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2518 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2519 }
2520
2521 /* Continue the loop only if we haven't resolved all the dependencies */
2522 int i;
2523 for (i = 0; i < write_len; i++) {
2524 if (needs_dep[i])
2525 break;
2526 }
2527 if (i == write_len)
2528 return;
2529 }
2530
2531 /* If we hit the end of the program, resolve all remaining dependencies out
2532 * of paranoia.
2533 */
2534 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2535 assert(last_inst->eot);
2536 for (int i = 0; i < write_len; i++) {
2537 if (needs_dep[i])
2538 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2539 }
2540 }
2541
2542 void
2543 fs_visitor::insert_gen4_send_dependency_workarounds()
2544 {
2545 if (brw->gen != 4 || brw->is_g4x)
2546 return;
2547
2548 bool progress = false;
2549
2550 /* Note that we're done with register allocation, so GRF fs_regs always
2551 * have a .reg_offset of 0.
2552 */
2553
2554 foreach_in_list_safe(fs_inst, inst, &instructions) {
2555 if (inst->mlen != 0 && inst->dst.file == GRF) {
2556 insert_gen4_pre_send_dependency_workarounds(inst);
2557 insert_gen4_post_send_dependency_workarounds(inst);
2558 progress = true;
2559 }
2560 }
2561
2562 if (progress)
2563 invalidate_live_intervals();
2564 }
2565
2566 /**
2567 * Turns the generic expression-style uniform pull constant load instruction
2568 * into a hardware-specific series of instructions for loading a pull
2569 * constant.
2570 *
2571 * The expression style allows the CSE pass before this to optimize out
2572 * repeated loads from the same offset, and gives the pre-register-allocation
2573 * scheduling full flexibility, while the conversion to native instructions
2574 * allows the post-register-allocation scheduler the best information
2575 * possible.
2576 *
2577 * Note that execution masking for setting up pull constant loads is special:
2578 * the channels that need to be written are unrelated to the current execution
2579 * mask, since a later instruction will use one of the result channels as a
2580 * source operand for all 8 or 16 of its channels.
2581 */
2582 void
2583 fs_visitor::lower_uniform_pull_constant_loads()
2584 {
2585 foreach_in_list(fs_inst, inst, &instructions) {
2586 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2587 continue;
2588
2589 if (brw->gen >= 7) {
2590 /* Up to this point the offset arg is a vec4-aligned byte offset. We need
2591 * to turn it into a dword offset.
2592 */
2593 fs_reg const_offset_reg = inst->src[1];
2594 assert(const_offset_reg.file == IMM &&
2595 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2596 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2597 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2598
2599 /* This is actually going to be a MOV, but since only the first dword
2600 * is accessed, we have a special opcode to do just that one. Note
2601 * that this needs to be an operation that will be considered a def
2602 * by live variable analysis, or register allocation will explode.
2603 */
2604 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2605 payload, const_offset_reg);
2606 setup->force_writemask_all = true;
2607
2608 setup->ir = inst->ir;
2609 setup->annotation = inst->annotation;
2610 inst->insert_before(setup);
2611
2612 /* Similarly, this will only populate the first 4 channels of the
2613 * result register (since we only use smear values from 0-3), but we
2614 * don't tell the optimizer.
2615 */
2616 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2617 inst->src[1] = payload;
2618
2619 invalidate_live_intervals();
2620 } else {
2621 /* Before register allocation, we didn't tell the scheduler about the
2622 * MRF we use. We know it's safe to use this MRF because nothing
2623 * else does except for register spill/unspill, which generates and
2624 * uses its MRF within a single IR instruction.
2625 */
2626 inst->base_mrf = 14;
2627 inst->mlen = 1;
2628 }
2629 }
2630 }
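
/* Illustrative gen7 lowering (values chosen for exposition): a load whose
 * src[1] is the immediate byte offset 48 has that offset rescaled to 12
 * dwords, a SET_SIMD4X2_OFFSET with force_writemask_all writes 12 into a new
 * payload VGRF, and the opcode becomes
 * FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 with that payload as src[1]. On
 * earlier generations the instruction is left as-is and simply gets
 * base_mrf = 14 and mlen = 1.
 */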
2631
2632 bool
2633 fs_visitor::lower_load_payload()
2634 {
2635 bool progress = false;
2636
2637 foreach_in_list_safe(fs_inst, inst, &instructions) {
2638 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
2639 fs_reg dst = inst->dst;
2640
2641 /* src[0] represents the (optional) message header. */
2642 if (inst->src[0].file != BAD_FILE) {
2643 inst->insert_before(MOV(dst, inst->src[0]));
2644 }
2645 dst.reg_offset++;
2646
2647 for (int i = 1; i < inst->sources; i++) {
2648 inst->insert_before(MOV(dst, inst->src[i]));
2649 dst.reg_offset++;
2650 }
2651
2652 inst->remove();
2653 progress = true;
2654 }
2655 }
2656
2657 if (progress)
2658 invalidate_live_intervals();
2659
2660 return progress;
2661 }
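
/* Shape of the lowering above (illustrative): a three-source
 *
 *    LOAD_PAYLOAD vgrf8, header, a, b
 *
 * becomes
 *
 *    mov vgrf8+0, header
 *    mov vgrf8+1, a
 *    mov vgrf8+2, b
 *
 * When there is no header (src[0] is BAD_FILE) the first MOV is omitted, but
 * the remaining sources still start at reg_offset 1.
 */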
2662
2663 void
2664 fs_visitor::dump_instructions()
2665 {
2666 dump_instructions(NULL);
2667 }
2668
2669 void
2670 fs_visitor::dump_instructions(const char *name)
2671 {
2672 calculate_register_pressure();
2673 FILE *file = stderr;
2674 if (name && geteuid() != 0) {
2675 file = fopen(name, "w");
2676 if (!file)
2677 file = stderr;
2678 }
2679
2680 int ip = 0, max_pressure = 0;
2681 foreach_in_list(backend_instruction, inst, &instructions) {
2682 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
2683 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
2684 dump_instruction(inst, file);
2685 ++ip;
2686 }
2687 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
2688
2689 if (file != stderr) {
2690 fclose(file);
2691 }
2692 }
2693
2694 void
2695 fs_visitor::dump_instruction(backend_instruction *be_inst)
2696 {
2697 dump_instruction(be_inst, stderr);
2698 }
2699
2700 void
2701 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
2702 {
2703 fs_inst *inst = (fs_inst *)be_inst;
2704
2705 if (inst->predicate) {
2706 fprintf(file, "(%cf0.%d) ",
2707 inst->predicate_inverse ? '-' : '+',
2708 inst->flag_subreg);
2709 }
2710
2711 fprintf(file, "%s", brw_instruction_name(inst->opcode));
2712 if (inst->saturate)
2713 fprintf(file, ".sat");
2714 if (inst->conditional_mod) {
2715 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
2716 if (!inst->predicate &&
2717 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2718 inst->opcode != BRW_OPCODE_IF &&
2719 inst->opcode != BRW_OPCODE_WHILE))) {
2720 fprintf(file, ".f0.%d", inst->flag_subreg);
2721 }
2722 }
2723 fprintf(file, " ");
2724
2725
2726 switch (inst->dst.file) {
2727 case GRF:
2728 fprintf(file, "vgrf%d", inst->dst.reg);
2729 if (virtual_grf_sizes[inst->dst.reg] != 1 ||
2730 inst->dst.subreg_offset)
2731 fprintf(file, "+%d.%d",
2732 inst->dst.reg_offset, inst->dst.subreg_offset);
2733 break;
2734 case MRF:
2735 fprintf(file, "m%d", inst->dst.reg);
2736 break;
2737 case BAD_FILE:
2738 fprintf(file, "(null)");
2739 break;
2740 case UNIFORM:
2741 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
2742 break;
2743 case HW_REG:
2744 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2745 switch (inst->dst.fixed_hw_reg.nr) {
2746 case BRW_ARF_NULL:
2747 fprintf(file, "null");
2748 break;
2749 case BRW_ARF_ADDRESS:
2750 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
2751 break;
2752 case BRW_ARF_ACCUMULATOR:
2753 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
2754 break;
2755 case BRW_ARF_FLAG:
2756 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2757 inst->dst.fixed_hw_reg.subnr);
2758 break;
2759 default:
2760 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
2761 inst->dst.fixed_hw_reg.subnr);
2762 break;
2763 }
2764 } else {
2765 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
2766 }
2767 if (inst->dst.fixed_hw_reg.subnr)
2768 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
2769 break;
2770 default:
2771 fprintf(file, "???");
2772 break;
2773 }
2774 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
2775
2776 for (int i = 0; i < inst->sources && inst->src[i].file != BAD_FILE; i++) {
2777 if (inst->src[i].negate)
2778 fprintf(file, "-");
2779 if (inst->src[i].abs)
2780 fprintf(file, "|");
2781 switch (inst->src[i].file) {
2782 case GRF:
2783 fprintf(file, "vgrf%d", inst->src[i].reg);
2784 if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2785 inst->src[i].subreg_offset)
2786 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2787 inst->src[i].subreg_offset);
2788 break;
2789 case MRF:
2790 fprintf(file, "***m%d***", inst->src[i].reg);
2791 break;
2792 case UNIFORM:
2793 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
2794 if (inst->src[i].reladdr) {
2795 fprintf(file, "+reladdr");
2796 } else if (virtual_grf_sizes[inst->src[i].reg] != 1 ||
2797 inst->src[i].subreg_offset) {
2798 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
2799 inst->src[i].subreg_offset);
2800 }
2801 break;
2802 case BAD_FILE:
2803 fprintf(file, "(null)");
2804 break;
2805 case IMM:
2806 switch (inst->src[i].type) {
2807 case BRW_REGISTER_TYPE_F:
2808 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
2809 break;
2810 case BRW_REGISTER_TYPE_D:
2811 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
2812 break;
2813 case BRW_REGISTER_TYPE_UD:
2814 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
2815 break;
2816 default:
2817 fprintf(file, "???");
2818 break;
2819 }
2820 break;
2821 case HW_REG:
2822 if (inst->src[i].fixed_hw_reg.negate)
2823 fprintf(file, "-");
2824 if (inst->src[i].fixed_hw_reg.abs)
2825 fprintf(file, "|");
2826 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
2827 switch (inst->src[i].fixed_hw_reg.nr) {
2828 case BRW_ARF_NULL:
2829 fprintf(file, "null");
2830 break;
2831 case BRW_ARF_ADDRESS:
2832 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
2833 break;
2834 case BRW_ARF_ACCUMULATOR:
2835 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
2836 break;
2837 case BRW_ARF_FLAG:
2838 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2839 inst->src[i].fixed_hw_reg.subnr);
2840 break;
2841 default:
2842 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
2843 inst->src[i].fixed_hw_reg.subnr);
2844 break;
2845 }
2846 } else {
2847 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2848 }
2849 if (inst->src[i].fixed_hw_reg.subnr)
2850 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
2851 if (inst->src[i].fixed_hw_reg.abs)
2852 fprintf(file, "|");
2853 break;
2854 default:
2855 fprintf(file, "???");
2856 break;
2857 }
2858 if (inst->src[i].abs)
2859 fprintf(file, "|");
2860
2861 if (inst->src[i].file != IMM) {
2862 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
2863 }
2864
2865 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
2866 fprintf(file, ", ");
2867 }
2868
2869 fprintf(file, " ");
2870
2871 if (inst->force_uncompressed)
2872 fprintf(file, "1sthalf ");
2873
2874 if (inst->force_sechalf)
2875 fprintf(file, "2ndhalf ");
2876
2877 fprintf(file, "\n");
2878 }
2879
2880 /**
2881 * Possibly returns an instruction that set up @param reg.
2882 *
2883 * Sometimes we want to take the result of some expression/variable
2884 * dereference tree and rewrite the instruction generating the result
2885 * of the tree. When processing the tree, we know that the
2886 * instructions generated are all writing temporaries that are dead
2887 * outside of this tree. So, if we have some instructions that write
2888 * a temporary, we're free to point that temp write somewhere else.
2889 *
2890 * Note that this doesn't guarantee that the returned instruction wrote
2891 * only reg -- it might be the size=4 destination of a texture instruction.
2892 */
2893 fs_inst *
2894 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2895 fs_inst *end,
2896 const fs_reg &reg)
2897 {
2898 if (end == start ||
2899 end->is_partial_write() ||
2900 reg.reladdr ||
2901 !reg.equals(end->dst)) {
2902 return NULL;
2903 } else {
2904 return end;
2905 }
2906 }
2907
2908 void
2909 fs_visitor::setup_payload_gen6()
2910 {
2911 bool uses_depth =
2912 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2913 unsigned barycentric_interp_modes = prog_data->barycentric_interp_modes;
2914
2915 assert(brw->gen >= 6);
2916
2917 /* R0-1: masks, pixel X/Y coordinates. */
2918 payload.num_regs = 2;
2919 /* R2: only for 32-pixel dispatch. */
2920
2921 /* R3-26: barycentric interpolation coordinates. These appear in the
2922 * same order that they appear in the brw_wm_barycentric_interp_mode
2923 * enum. Each set of coordinates occupies 2 registers if dispatch width
2924 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2925 * appear if they were enabled using the "Barycentric Interpolation
2926 * Mode" bits in WM_STATE.
2927 */
2928 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2929 if (barycentric_interp_modes & (1 << i)) {
2930 payload.barycentric_coord_reg[i] = payload.num_regs;
2931 payload.num_regs += 2;
2932 if (dispatch_width == 16) {
2933 payload.num_regs += 2;
2934 }
2935 }
2936 }
2937
2938 /* R27: interpolated depth if uses source depth */
2939 if (uses_depth) {
2940 payload.source_depth_reg = payload.num_regs;
2941 payload.num_regs++;
2942 if (dispatch_width == 16) {
2943 /* R28: interpolated depth if not SIMD8. */
2944 payload.num_regs++;
2945 }
2946 }
2947 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2948 if (uses_depth) {
2949 payload.source_w_reg = payload.num_regs;
2950 payload.num_regs++;
2951 if (dispatch_width == 16) {
2952 /* R30: interpolated W if not SIMD8. */
2953 payload.num_regs++;
2954 }
2955 }
2956
2957 prog_data->uses_pos_offset = key->compute_pos_offset;
2958 /* R31: MSAA position offsets. */
2959 if (prog_data->uses_pos_offset) {
2960 payload.sample_pos_reg = payload.num_regs;
2961 payload.num_regs++;
2962 }
2963
2964 /* R32: MSAA input coverage mask */
2965 if (fp->Base.SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
2966 assert(brw->gen >= 7);
2967 payload.sample_mask_in_reg = payload.num_regs;
2968 payload.num_regs++;
2969 if (dispatch_width == 16) {
2970 /* R33: input coverage mask if not SIMD8. */
2971 payload.num_regs++;
2972 }
2973 }
2974
2975 /* R34-: bary for 32-pixel. */
2976 /* R58-59: interp W for 32-pixel. */
2977
2978 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2979 source_depth_to_render_target = true;
2980 }
2981 }
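
/* Example layout produced above (illustrative): SIMD8, one barycentric mode
 * enabled, source depth/W used, no position offsets or coverage mask:
 *
 *    r0-r1: masks, pixel X/Y        (payload.num_regs starts at 2)
 *    r2-r3: barycentric coordinates (barycentric_coord_reg[mode] == 2)
 *    r4:    interpolated depth      (source_depth_reg == 4)
 *    r5:    interpolated W          (source_w_reg == 5)
 *
 * leaving payload.num_regs == 6. In SIMD16 the barycentric, depth and W
 * entries each take twice as many registers.
 */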
2982
2983 void
2984 fs_visitor::assign_binding_table_offsets()
2985 {
2986 uint32_t next_binding_table_offset = 0;
2987
2988 /* If there are no color regions, we still perform an FB write to a null
2989 * renderbuffer, which we place at surface index 0.
2990 */
2991 prog_data->binding_table.render_target_start = next_binding_table_offset;
2992 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
2993
2994 assign_common_binding_table_offsets(next_binding_table_offset);
2995 }
2996
2997 void
2998 fs_visitor::calculate_register_pressure()
2999 {
3000 invalidate_live_intervals();
3001 calculate_live_intervals();
3002
3003 unsigned num_instructions = instructions.length();
3004
3005 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3006
3007 for (int reg = 0; reg < virtual_grf_count; reg++) {
3008 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3009 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3010 }
3011 }
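
/* Illustrative example: a virtual GRF of size 2 that is live from ip 5
 * through ip 9 contributes 2 to regs_live_at_ip[5..9]; the per-ip totals are
 * the register pressure figures printed by dump_instructions().
 */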
3012
3013 /**
3014 * Look for repeated FS_OPCODE_MOV_DISPATCH_TO_FLAGS and drop the later ones.
3015 *
3016 * The needs_unlit_centroid_workaround ends up producing one of these per
3017 * channel of centroid input, so it's good to clean them up.
3018 *
3019 * An assumption here is that nothing ever modifies the dispatched pixels
3020 * value that FS_OPCODE_MOV_DISPATCH_TO_FLAGS reads from, but the hardware
3021 * dictates that anyway.
3022 */
3023 void
3024 fs_visitor::opt_drop_redundant_mov_to_flags()
3025 {
3026 bool flag_mov_found[2] = {false};
3027
3028 foreach_in_list_safe(fs_inst, inst, &instructions) {
3029 if (inst->is_control_flow()) {
3030 memset(flag_mov_found, 0, sizeof(flag_mov_found));
3031 } else if (inst->opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS) {
3032 if (!flag_mov_found[inst->flag_subreg])
3033 flag_mov_found[inst->flag_subreg] = true;
3034 else
3035 inst->remove();
3036 } else if (inst->writes_flag()) {
3037 flag_mov_found[inst->flag_subreg] = false;
3038 }
3039 }
3040 }
3041
3042 bool
3043 fs_visitor::run()
3044 {
3045 sanity_param_count = fp->Base.Parameters->NumParameters;
3046 bool allocated_without_spills;
3047
3048 assign_binding_table_offsets();
3049
3050 if (brw->gen >= 6)
3051 setup_payload_gen6();
3052 else
3053 setup_payload_gen4();
3054
3055 if (0) {
3056 emit_dummy_fs();
3057 } else {
3058 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3059 emit_shader_time_begin();
3060
3061 calculate_urb_setup();
3062 if (fp->Base.InputsRead > 0) {
3063 if (brw->gen < 6)
3064 emit_interpolation_setup_gen4();
3065 else
3066 emit_interpolation_setup_gen6();
3067 }
3068
3069 /* We handle discards by keeping track of the still-live pixels in f0.1.
3070 * Initialize it with the dispatched pixels.
3071 */
3072 if (fp->UsesKill || key->alpha_test_func) {
3073 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3074 discard_init->flag_subreg = 1;
3075 }
3076
3077 /* Generate FS IR for main(). (The visitor only descends into
3078 * functions called "main".)
3079 */
3080 if (shader) {
3081 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3082 base_ir = ir;
3083 this->result = reg_undef;
3084 ir->accept(this);
3085 }
3086 } else {
3087 emit_fragment_program_code();
3088 }
3089 base_ir = NULL;
3090 if (failed)
3091 return false;
3092
3093 emit(FS_OPCODE_PLACEHOLDER_HALT);
3094
3095 if (key->alpha_test_func)
3096 emit_alpha_test();
3097
3098 emit_fb_writes();
3099
3100 split_virtual_grfs();
3101
3102 move_uniform_array_access_to_pull_constants();
3103 assign_constant_locations();
3104 demote_pull_constants();
3105
3106 opt_drop_redundant_mov_to_flags();
3107
3108 #define OPT(pass, args...) do { \
3109 pass_num++; \
3110 bool this_progress = pass(args); \
3111 \
3112 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3113 char filename[64]; \
3114 snprintf(filename, 64, "fs%d-%04d-%02d-%02d-" #pass, \
3115 dispatch_width, shader_prog->Name, iteration, pass_num); \
3116 \
3117 backend_visitor::dump_instructions(filename); \
3118 } \
3119 \
3120 progress = progress || this_progress; \
3121 } while (false)
3122
3123 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3124 char filename[64];
3125 snprintf(filename, 64, "fs%d-%04d-00-start",
3126 dispatch_width, shader_prog->Name);
3127
3128 backend_visitor::dump_instructions(filename);
3129 }
3130
3131 bool progress;
3132 int iteration = 0;
3133 do {
3134 progress = false;
3135 iteration++;
3136 int pass_num = 0;
3137
3138 compact_virtual_grfs();
3139
3140 OPT(remove_duplicate_mrf_writes);
3141
3142 OPT(opt_algebraic);
3143 OPT(opt_cse);
3144 OPT(opt_copy_propagate);
3145 OPT(opt_peephole_predicated_break);
3146 OPT(dead_code_eliminate);
3147 OPT(opt_peephole_sel);
3148 OPT(dead_control_flow_eliminate, this);
3149 OPT(opt_register_renaming);
3150 OPT(opt_saturate_propagation);
3151 OPT(register_coalesce);
3152 OPT(compute_to_mrf);
3153 } while (progress);
3154
3155 if (lower_load_payload()) {
3156 register_coalesce();
3157 dead_code_eliminate();
3158 }
3159
3160 lower_uniform_pull_constant_loads();
3161
3162 assign_curb_setup();
3163 assign_urb_setup();
3164
3165 static enum instruction_scheduler_mode pre_modes[] = {
3166 SCHEDULE_PRE,
3167 SCHEDULE_PRE_NON_LIFO,
3168 SCHEDULE_PRE_LIFO,
3169 };
3170
3171 /* Try each scheduling heuristic to see if it can successfully register
3172 * allocate without spilling. They should be ordered by decreasing
3173 * performance but increasing likelihood of allocating.
3174 */
3175 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3176 schedule_instructions(pre_modes[i]);
3177
3178 if (0) {
3179 assign_regs_trivial();
3180 allocated_without_spills = true;
3181 } else {
3182 allocated_without_spills = assign_regs(false);
3183 }
3184 if (allocated_without_spills)
3185 break;
3186 }
3187
3188 if (!allocated_without_spills) {
3189 /* We assume that any spilling is worse than just dropping back to
3190 * SIMD8. There's probably actually some intermediate point where
3191 * SIMD16 with a couple of spills is still better.
3192 */
3193 if (dispatch_width == 16) {
3194 fail("Failure to register allocate. Reduce number of "
3195 "live scalar values to avoid this.");
3196 } else {
3197 perf_debug("Fragment shader triggered register spilling. "
3198 "Try reducing the number of live scalar values to "
3199 "improve performance.\n");
3200 }
3201
3202 /* Since we're out of heuristics, just go spill registers until we
3203 * get an allocation.
3204 */
3205 while (!assign_regs(true)) {
3206 if (failed)
3207 break;
3208 }
3209 }
3210 }
3211 assert(force_uncompressed_stack == 0);
3212
3213 /* This must come after all optimization and register allocation, since
3214 * it inserts dead code that happens to have side effects, and it does
3215 * so based on the actual physical registers in use.
3216 */
3217 insert_gen4_send_dependency_workarounds();
3218
3219 if (failed)
3220 return false;
3221
3222 if (!allocated_without_spills)
3223 schedule_instructions(SCHEDULE_POST);
3224
3225 if (last_scratch > 0) {
3226 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3227 }
3228
3229 if (dispatch_width == 8)
3230 prog_data->reg_blocks = brw_register_blocks(grf_used);
3231 else
3232 prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3233
3234 /* If any state parameters were appended, then ParameterValues could have
3235 * been realloced, in which case the driver uniform storage set up by
3236 * _mesa_associate_uniform_storage() would point to freed memory. Make
3237 * sure that didn't happen.
3238 */
3239 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3240
3241 return !failed;
3242 }
3243
3244 const unsigned *
3245 brw_wm_fs_emit(struct brw_context *brw,
3246 void *mem_ctx,
3247 const struct brw_wm_prog_key *key,
3248 struct brw_wm_prog_data *prog_data,
3249 struct gl_fragment_program *fp,
3250 struct gl_shader_program *prog,
3251 unsigned *final_assembly_size)
3252 {
3253 bool start_busy = false;
3254 double start_time = 0;
3255
3256 if (unlikely(brw->perf_debug)) {
3257 start_busy = (brw->batch.last_bo &&
3258 drm_intel_bo_busy(brw->batch.last_bo));
3259 start_time = get_time();
3260 }
3261
3262 struct brw_shader *shader = NULL;
3263 if (prog)
3264 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3265
3266 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3267 brw_dump_ir(brw, "fragment", prog, &shader->base, &fp->Base);
3268
3269 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3270 */
3271 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3272 if (!v.run()) {
3273 if (prog) {
3274 prog->LinkStatus = false;
3275 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3276 }
3277
3278 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3279 v.fail_msg);
3280
3281 return NULL;
3282 }
3283
3284 exec_list *simd16_instructions = NULL;
3285 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3286 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3287 if (!v.simd16_unsupported) {
3288 /* Try a SIMD16 compile */
3289 v2.import_uniforms(&v);
3290 if (!v2.run()) {
3291 perf_debug("SIMD16 shader failed to compile, falling back to "
3292 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3293 } else {
3294 simd16_instructions = &v2.instructions;
3295 }
3296 } else {
3297 perf_debug("SIMD16 shader unsupported, falling back to "
3298 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3299 }
3300 }
3301
3302 exec_list *simd8_instructions;
3303 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3304 if (no_simd8 && simd16_instructions) {
3305 simd8_instructions = NULL;
3306 prog_data->no_8 = true;
3307 } else {
3308 simd8_instructions = &v.instructions;
3309 prog_data->no_8 = false;
3310 }
3311
3312 const unsigned *assembly = NULL;
3313 fs_generator g(brw, mem_ctx, key, prog_data, prog, fp,
3314 v.runtime_check_aads_emit, INTEL_DEBUG & DEBUG_WM);
3315 assembly = g.generate_assembly(simd8_instructions, simd16_instructions,
3316 final_assembly_size);
3317
3318 if (unlikely(brw->perf_debug) && shader) {
3319 if (shader->compiled_once)
3320 brw_wm_debug_recompile(brw, prog, key);
3321 shader->compiled_once = true;
3322
3323 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3324 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3325 (get_time() - start_time) * 1000);
3326 }
3327 }
3328
3329 return assembly;
3330 }
3331
3332 bool
3333 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3334 {
3335 struct brw_context *brw = brw_context(ctx);
3336 struct brw_wm_prog_key key;
3337
3338 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3339 return true;
3340
3341 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3342 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3343 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3344 bool program_uses_dfdy = fp->UsesDFdy;
3345
3346 memset(&key, 0, sizeof(key));
3347
3348 if (brw->gen < 6) {
3349 if (fp->UsesKill)
3350 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3351
3352 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3353 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3354
3355 /* Just assume depth testing. */
3356 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3357 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3358 }
3359
3360 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3361 BRW_FS_VARYING_INPUT_MASK) > 16)
3362 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3363
3364 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3365 for (unsigned i = 0; i < sampler_count; i++) {
3366 if (fp->Base.ShadowSamplers & (1 << i)) {
3367 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3368 key.tex.swizzles[i] =
3369 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3370 } else {
3371 /* Color sampler: assume no swizzling. */
3372 key.tex.swizzles[i] = SWIZZLE_XYZW;
3373 }
3374 }
3375
3376 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3377 key.drawable_height = ctx->DrawBuffer->Height;
3378 }
3379
3380 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3381 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3382 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3383
3384 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3385 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3386 key.nr_color_regions > 1;
3387 }
3388
3389 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3390 * quality of the derivatives is likely to be determined by the driconf
3391 * option.
3392 */
3393 key.high_quality_derivatives = brw->disable_derivative_optimization;
3394
3395 key.program_string_id = bfp->id;
3396
3397 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3398 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3399
3400 bool success = do_wm_prog(brw, prog, bfp, &key);
3401
3402 brw->wm.base.prog_offset = old_prog_offset;
3403 brw->wm.prog_data = old_prog_data;
3404
3405 return success;
3406 }