1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
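      /* For example: an 8-wide float destination with stride 1 covers
       * 8 * 1 * 4 = 32 bytes and so writes one register, while a 16-wide
       * float destination covers 64 bytes and writes two.
       */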
127 break;
128 case BAD_FILE:
129 this->regs_written = 0;
130 break;
131 case IMM:
132 case UNIFORM:
133 unreachable("Invalid destination register file");
134 default:
135 unreachable("Invalid register file");
136 }
137
138 this->writes_accumulator = false;
139 }
140
141 fs_inst::fs_inst()
142 {
143 fs_reg *src = ralloc_array(this, fs_reg, 3);
144 init(BRW_OPCODE_NOP, 8, dst, src, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 fs_reg *src = ralloc_array(this, fs_reg, 3);
150 init(opcode, exec_size, reg_undef, src, 0);
151 }
152
153 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
154 {
155 fs_reg *src = ralloc_array(this, fs_reg, 3);
156 init(opcode, 0, dst, src, 0);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0)
161 {
162 fs_reg *src = ralloc_array(this, fs_reg, 3);
163 src[0] = src0;
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 fs_reg *src = ralloc_array(this, fs_reg, 3);
170 src[0] = src0;
171 init(opcode, 0, dst, src, 1);
172 }
173
174 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
175 const fs_reg &src0, const fs_reg &src1)
176 {
177 fs_reg *src = ralloc_array(this, fs_reg, 3);
178 src[0] = src0;
179 src[1] = src1;
180 init(opcode, exec_size, dst, src, 2);
181 }
182
183 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
184 const fs_reg &src1)
185 {
186 fs_reg *src = ralloc_array(this, fs_reg, 3);
187 src[0] = src0;
188 src[1] = src1;
189 init(opcode, 0, dst, src, 2);
190 }
191
192 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
193 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
194 {
195 fs_reg *src = ralloc_array(this, fs_reg, 3);
196 src[0] = src0;
197 src[1] = src1;
198 src[2] = src2;
199 init(opcode, exec_size, dst, src, 3);
200 }
201
202 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
203 const fs_reg &src1, const fs_reg &src2)
204 {
205 fs_reg *src = ralloc_array(this, fs_reg, 3);
206 src[0] = src0;
207 src[1] = src1;
208 src[2] = src2;
209 init(opcode, 0, dst, src, 3);
210 }
211
212 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
213 {
214 init(opcode, 0, dst, src, sources);
215 }
216
217 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
218 fs_reg src[], int sources)
219 {
220 init(opcode, exec_width, dst, src, sources);
221 }
222
223 fs_inst::fs_inst(const fs_inst &that)
224 {
225 memcpy(this, &that, sizeof(that));
226
227 this->src = ralloc_array(this, fs_reg, that.sources);
228
229 for (int i = 0; i < that.sources; i++)
230 this->src[i] = that.src[i];
231 }
232
233 void
234 fs_inst::resize_sources(uint8_t num_sources)
235 {
236 if (this->sources != num_sources) {
237 this->src = reralloc(this, this->src, fs_reg, num_sources);
238 this->sources = num_sources;
239 }
240 }
241
242 #define ALU1(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
245 { \
246 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
247 }
248
249 #define ALU2(op) \
250 fs_inst * \
251 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
252 const fs_reg &src1) \
253 { \
254 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
255 }
256
257 #define ALU2_ACC(op) \
258 fs_inst * \
259 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
260 const fs_reg &src1) \
261 { \
262 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
263 inst->writes_accumulator = true; \
264 return inst; \
265 }
266
267 #define ALU3(op) \
268 fs_inst * \
269 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
270 const fs_reg &src1, const fs_reg &src2) \
271 { \
272 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
273 }
274
275 ALU1(NOT)
276 ALU1(MOV)
277 ALU1(FRC)
278 ALU1(RNDD)
279 ALU1(RNDE)
280 ALU1(RNDZ)
281 ALU2(ADD)
282 ALU2(MUL)
283 ALU2_ACC(MACH)
284 ALU2(AND)
285 ALU2(OR)
286 ALU2(XOR)
287 ALU2(SHL)
288 ALU2(SHR)
289 ALU2(ASR)
290 ALU3(LRP)
291 ALU1(BFREV)
292 ALU3(BFE)
293 ALU2(BFI1)
294 ALU3(BFI2)
295 ALU1(FBH)
296 ALU1(FBL)
297 ALU1(CBIT)
298 ALU3(MAD)
299 ALU2_ACC(ADDC)
300 ALU2_ACC(SUBB)
301 ALU2(SEL)
302 ALU2(MAC)
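
/* Illustrative use of the generated helpers (a sketch; the register names
 * are hypothetical): inside the visitor, something like
 *
 *    emit(ADD(dst, src0, src1));
 *    emit(MUL(dst, dst, this->pixel_w));
 *
 * allocates each instruction out of mem_ctx and appends it with emit().
 */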
303
304 /** Gen4 predicated IF. */
305 fs_inst *
306 fs_visitor::IF(enum brw_predicate predicate)
307 {
308 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
309 inst->predicate = predicate;
310 return inst;
311 }
312
313 /** Gen6 IF with embedded comparison. */
314 fs_inst *
315 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
316 enum brw_conditional_mod condition)
317 {
318 assert(brw->gen == 6);
319 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
320 reg_null_d, src0, src1);
321 inst->conditional_mod = condition;
322 return inst;
323 }
324
325 /**
326 * CMP: Sets the low bit of the destination channels with the result
327 * of the comparison, while the upper bits are undefined, and updates
328 * the flag register with the packed 16 bits of the result.
329 */
330 fs_inst *
331 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
332 enum brw_conditional_mod condition)
333 {
334 fs_inst *inst;
335
336 /* Take the instruction:
337 *
338 * CMP null<d> src0<f> src1<f>
339 *
340 * Original gen4 does type conversion to the destination type before
341 * comparison, producing garbage results for floating point comparisons.
342 * gen5 does the comparison on the execution type (resolved source types),
343 * so dst type doesn't matter. gen6 does comparison and then uses the
344 * result as if it was the dst type with no conversion, which happens to
345 * mostly work out for float-interpreted-as-int since our comparisons are
346 * for >0, =0, <0.
347 */
348 if (brw->gen == 4) {
349 dst.type = src0.type;
350 if (dst.file == HW_REG)
351 dst.fixed_hw_reg.type = dst.type;
352 }
353
354 resolve_ud_negate(&src0);
355 resolve_ud_negate(&src1);
356
357 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
358 inst->conditional_mod = condition;
359
360 return inst;
361 }
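
/* Sketch of typical use (hypothetical operands): emit a CMP that is only
 * needed for its flag result, then predicate a following IF on it, much
 * like the AND + IF(BRW_PREDICATE_NORMAL) sequence in emit_shader_time_end():
 *
 *    emit(CMP(reg_null_d, a, b, BRW_CONDITIONAL_GE));
 *    emit(IF(BRW_PREDICATE_NORMAL));
 */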
362
363 fs_inst *
364 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
365 {
366 uint8_t exec_size = dst.width;
367 for (int i = 0; i < sources; ++i) {
368 assert(src[i].width % dst.width == 0);
369 if (src[i].width > exec_size)
370 exec_size = src[i].width;
371 }
372
373 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
374 dst, src, sources);
375 inst->regs_written = 0;
376 for (int i = 0; i < sources; ++i) {
377 /* The LOAD_PAYLOAD instruction only really makes sense if we are
378 * dealing with whole registers. If this ever changes, we can deal
379 * with it later.
380 */
381 int size = src[i].effective_width * type_sz(src[i].type);
382 assert(size % 32 == 0);
383 inst->regs_written += (size + 31) / 32;
384 }
385
386 return inst;
387 }
388
389 exec_list
390 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
391 const fs_reg &surf_index,
392 const fs_reg &varying_offset,
393 uint32_t const_offset)
394 {
395 exec_list instructions;
396 fs_inst *inst;
397
398 /* We have our constant surface use a pitch of 4 bytes, so our index can
399 * be any component of a vector, and then we load 4 contiguous
400 * components starting from that.
401 *
402 * We break down the const_offset to a portion added to the variable
403 * offset and a portion done using reg_offset, which means that if you
404 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
405 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
406 * CSE can later notice that those loads are all the same and eliminate
407 * the redundant ones.
408 */
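   /* For example: with const_offset == 18, the variable part below becomes
    * varying_offset + 16 (== 18 & ~3), and the leftover component index
    * 2 (== 18 & 3) is applied at the end when picking the result out of
    * vec4_result.
    */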
409 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
410 instructions.push_tail(ADD(vec4_offset,
411 varying_offset, fs_reg(const_offset & ~3)));
412
413 int scale = 1;
414 if (brw->gen == 4 && dst.width == 8) {
415 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
416 * u, v, r) as parameters, or we can just use the SIMD16 message
417 * consisting of (header, u). We choose the second, at the cost of a
418 * longer return length.
419 */
420 scale = 2;
421 }
422
423 enum opcode op;
424 if (brw->gen >= 7)
425 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
426 else
427 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
428
429 assert(dst.width % 8 == 0);
430 int regs_written = 4 * (dst.width / 8) * scale;
431 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
432 dst.type, dst.width);
433 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
434 inst->regs_written = regs_written;
435 instructions.push_tail(inst);
436
437 if (brw->gen < 7) {
438 inst->base_mrf = 13;
439 inst->header_present = true;
440 if (brw->gen == 4)
441 inst->mlen = 3;
442 else
443 inst->mlen = 1 + dispatch_width / 8;
444 }
445
446 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
447 instructions.push_tail(MOV(dst, result));
448
449 return instructions;
450 }
451
452 /**
453 * A helper that generates a MOV for working around broken hardware SEND
454 * dependency handling.
455 */
456 fs_inst *
457 fs_visitor::DEP_RESOLVE_MOV(int grf)
458 {
459 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
460
461 inst->ir = NULL;
462 inst->annotation = "send dependency resolve";
463
464 /* The caller always wants uncompressed to emit the minimal extra
465 * dependencies, and to avoid having to deal with aligning its regs to 2.
466 */
467 inst->exec_size = 8;
468
469 return inst;
470 }
471
472 bool
473 fs_inst::equals(fs_inst *inst) const
474 {
475 return (opcode == inst->opcode &&
476 dst.equals(inst->dst) &&
477 src[0].equals(inst->src[0]) &&
478 src[1].equals(inst->src[1]) &&
479 src[2].equals(inst->src[2]) &&
480 saturate == inst->saturate &&
481 predicate == inst->predicate &&
482 conditional_mod == inst->conditional_mod &&
483 mlen == inst->mlen &&
484 base_mrf == inst->base_mrf &&
485 target == inst->target &&
486 eot == inst->eot &&
487 header_present == inst->header_present &&
488 shadow_compare == inst->shadow_compare &&
489 exec_size == inst->exec_size &&
490 offset == inst->offset);
491 }
492
493 bool
494 fs_inst::overwrites_reg(const fs_reg &reg) const
495 {
496 return (reg.file == dst.file &&
497 reg.reg == dst.reg &&
498 reg.reg_offset >= dst.reg_offset &&
499 reg.reg_offset < dst.reg_offset + regs_written);
500 }
501
502 bool
503 fs_inst::is_send_from_grf() const
504 {
505 switch (opcode) {
506 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
507 case SHADER_OPCODE_SHADER_TIME_ADD:
508 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
509 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
510 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
511 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
512 case SHADER_OPCODE_UNTYPED_ATOMIC:
513 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
514 case SHADER_OPCODE_URB_WRITE_SIMD8:
515 return true;
516 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
517 return src[1].file == GRF;
518 case FS_OPCODE_FB_WRITE:
519 return src[0].file == GRF;
520 default:
521 if (is_tex())
522 return src[0].file == GRF;
523
524 return false;
525 }
526 }
527
528 bool
529 fs_inst::can_do_source_mods(struct brw_context *brw)
530 {
531 if (brw->gen == 6 && is_math())
532 return false;
533
534 if (is_send_from_grf())
535 return false;
536
537 if (!backend_instruction::can_do_source_mods())
538 return false;
539
540 return true;
541 }
542
543 void
544 fs_reg::init()
545 {
546 memset(this, 0, sizeof(*this));
547 stride = 1;
548 }
549
550 /** Generic unset register constructor. */
551 fs_reg::fs_reg()
552 {
553 init();
554 this->file = BAD_FILE;
555 }
556
557 /** Immediate value constructor. */
558 fs_reg::fs_reg(float f)
559 {
560 init();
561 this->file = IMM;
562 this->type = BRW_REGISTER_TYPE_F;
563 this->fixed_hw_reg.dw1.f = f;
564 this->width = 1;
565 }
566
567 /** Immediate value constructor. */
568 fs_reg::fs_reg(int32_t i)
569 {
570 init();
571 this->file = IMM;
572 this->type = BRW_REGISTER_TYPE_D;
573 this->fixed_hw_reg.dw1.d = i;
574 this->width = 1;
575 }
576
577 /** Immediate value constructor. */
578 fs_reg::fs_reg(uint32_t u)
579 {
580 init();
581 this->file = IMM;
582 this->type = BRW_REGISTER_TYPE_UD;
583 this->fixed_hw_reg.dw1.ud = u;
584 this->width = 1;
585 }
586
587 /** Vector float immediate value constructor. */
588 fs_reg::fs_reg(uint8_t vf[4])
589 {
590 init();
591 this->file = IMM;
592 this->type = BRW_REGISTER_TYPE_VF;
593 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
594 }
595
596 /** Vector float immediate value constructor. */
597 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
598 {
599 init();
600 this->file = IMM;
601 this->type = BRW_REGISTER_TYPE_VF;
602 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
603 (vf1 << 8) |
604 (vf2 << 16) |
605 (vf3 << 24);
606 }
607
608 /** Fixed brw_reg. */
609 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
610 {
611 init();
612 this->file = HW_REG;
613 this->fixed_hw_reg = fixed_hw_reg;
614 this->type = fixed_hw_reg.type;
615 this->width = 1 << fixed_hw_reg.width;
616 }
617
618 bool
619 fs_reg::equals(const fs_reg &r) const
620 {
621 return (file == r.file &&
622 reg == r.reg &&
623 reg_offset == r.reg_offset &&
624 subreg_offset == r.subreg_offset &&
625 type == r.type &&
626 negate == r.negate &&
627 abs == r.abs &&
628 !reladdr && !r.reladdr &&
629 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
630 width == r.width &&
631 stride == r.stride);
632 }
633
634 fs_reg &
635 fs_reg::set_smear(unsigned subreg)
636 {
637 assert(file != HW_REG && file != IMM);
638 subreg_offset = subreg * type_sz(type);
639 stride = 0;
640 return *this;
641 }
642
643 bool
644 fs_reg::is_contiguous() const
645 {
646 return stride == 1;
647 }
648
649 int
650 fs_visitor::type_size(const struct glsl_type *type)
651 {
652 unsigned int size, i;
653
654 switch (type->base_type) {
655 case GLSL_TYPE_UINT:
656 case GLSL_TYPE_INT:
657 case GLSL_TYPE_FLOAT:
658 case GLSL_TYPE_BOOL:
659 return type->components();
660 case GLSL_TYPE_ARRAY:
661 return type_size(type->fields.array) * type->length;
662 case GLSL_TYPE_STRUCT:
663 size = 0;
664 for (i = 0; i < type->length; i++) {
665 size += type_size(type->fields.structure[i].type);
666 }
667 return size;
668 case GLSL_TYPE_SAMPLER:
669 /* Samplers take up no register space, since they're baked in at
670 * link time.
671 */
672 return 0;
673 case GLSL_TYPE_ATOMIC_UINT:
674 return 0;
675 case GLSL_TYPE_IMAGE:
676 case GLSL_TYPE_VOID:
677 case GLSL_TYPE_ERROR:
678 case GLSL_TYPE_INTERFACE:
679 unreachable("not reached");
680 }
681
682 return 0;
683 }
684
685 fs_reg
686 fs_visitor::get_timestamp()
687 {
688 assert(brw->gen >= 7);
689
690 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
691 BRW_ARF_TIMESTAMP,
692 0),
693 BRW_REGISTER_TYPE_UD));
694
695 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
696
697 fs_inst *mov = emit(MOV(dst, ts));
698 /* We want to read the 3 fields we care about even if it's not enabled in
699 * the dispatch.
700 */
701 mov->force_writemask_all = true;
702
703 /* The caller wants the low 32 bits of the timestamp. Since it's running
704 * at the GPU clock rate of ~1.2GHz, it will roll over every ~3 seconds,
705 * which is plenty of time for our purposes. It is identical across the
706 * EUs, but since it's tracking GPU core speed it will increment at a
707 * varying rate as render P-states change.
708 *
709 * The caller could also check if render P-states have changed (or anything
710 * else that might disrupt timing) by setting smear to 2 and checking if
711 * that field is != 0.
712 */
713 dst.set_smear(0);
714
715 return dst;
716 }
717
718 void
719 fs_visitor::emit_shader_time_begin()
720 {
721 current_annotation = "shader time start";
722 shader_start_time = get_timestamp();
723 }
724
725 void
726 fs_visitor::emit_shader_time_end()
727 {
728 current_annotation = "shader time end";
729
730 enum shader_time_shader_type type, written_type, reset_type;
731 if (dispatch_width == 8) {
732 type = ST_FS8;
733 written_type = ST_FS8_WRITTEN;
734 reset_type = ST_FS8_RESET;
735 } else {
736 assert(dispatch_width == 16);
737 type = ST_FS16;
738 written_type = ST_FS16_WRITTEN;
739 reset_type = ST_FS16_RESET;
740 }
741
742 fs_reg shader_end_time = get_timestamp();
743
744 /* Check that there weren't any timestamp reset events (assuming these
745 * were the only two timestamp reads that happened).
746 */
747 fs_reg reset = shader_end_time;
748 reset.set_smear(2);
749 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
750 test->conditional_mod = BRW_CONDITIONAL_Z;
751 emit(IF(BRW_PREDICATE_NORMAL));
752
753 fs_reg start = shader_start_time;
754 start.negate = true;
755 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
756 emit(ADD(diff, start, shader_end_time));
757
758 /* If there were no instructions between the two timestamp gets, the diff
759 * is 2 cycles. Remove that overhead, so I can forget about that when
760 * trying to determine the time taken for single instructions.
761 */
762 emit(ADD(diff, diff, fs_reg(-2u)));
763
764 emit_shader_time_write(type, diff);
765 emit_shader_time_write(written_type, fs_reg(1u));
766 emit(BRW_OPCODE_ELSE);
767 emit_shader_time_write(reset_type, fs_reg(1u));
768 emit(BRW_OPCODE_ENDIF);
769 }
770
771 void
772 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
773 fs_reg value)
774 {
775 int shader_time_index =
776 brw_get_shader_time_index(brw, shader_prog, prog, type);
777 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
778
779 fs_reg payload;
780 if (dispatch_width == 8)
781 payload = fs_reg(this, glsl_type::uvec2_type);
782 else
783 payload = fs_reg(this, glsl_type::uint_type);
784
785 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
786 fs_reg(), payload, offset, value));
787 }
788
789 void
790 fs_visitor::vfail(const char *format, va_list va)
791 {
792 char *msg;
793
794 if (failed)
795 return;
796
797 failed = true;
798
799 msg = ralloc_vasprintf(mem_ctx, format, va);
800 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
801
802 this->fail_msg = msg;
803
804 if (INTEL_DEBUG & DEBUG_WM) {
805 fprintf(stderr, "%s", msg);
806 }
807 }
808
809 void
810 fs_visitor::fail(const char *format, ...)
811 {
812 va_list va;
813
814 va_start(va, format);
815 vfail(format, va);
816 va_end(va);
817 }
818
819 /**
820 * Mark this program as impossible to compile in SIMD16 mode.
821 *
822 * During the SIMD8 compile (which happens first), we can detect and flag
823 * things that are unsupported in SIMD16 mode, so the compiler can skip
824 * the SIMD16 compile altogether.
825 *
826 * During a SIMD16 compile (if one happens anyway), this just calls fail().
827 */
828 void
829 fs_visitor::no16(const char *format, ...)
830 {
831 va_list va;
832
833 va_start(va, format);
834
835 if (dispatch_width == 16) {
836 vfail(format, va);
837 } else {
838 simd16_unsupported = true;
839
840 if (brw->perf_debug) {
841 if (no16_msg)
842 ralloc_vasprintf_append(&no16_msg, format, va);
843 else
844 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
845 }
846 }
847
848 va_end(va);
849 }
850
851 fs_inst *
852 fs_visitor::emit(enum opcode opcode)
853 {
854 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
855 }
856
857 fs_inst *
858 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
859 {
860 return emit(new(mem_ctx) fs_inst(opcode, dst));
861 }
862
863 fs_inst *
864 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
865 {
866 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
867 }
868
869 fs_inst *
870 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
871 const fs_reg &src1)
872 {
873 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
874 }
875
876 fs_inst *
877 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
878 const fs_reg &src1, const fs_reg &src2)
879 {
880 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
881 }
882
883 fs_inst *
884 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
885 fs_reg src[], int sources)
886 {
887 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
888 }
889
890 /**
891 * Returns true if the instruction has a flag that means it won't
892 * update an entire destination register.
893 *
894 * For example, dead code elimination and live variable analysis want to know
895 * when a write to a variable screens off any preceding values that were in
896 * it.
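 *
 * Illustrative cases: a predicated write that is not a SEL, a destination
 * with a non-unit stride, and an 8-wide UW destination (8 * 2 = 16 bytes,
 * half a register) all count as partial writes.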
897 */
898 bool
899 fs_inst::is_partial_write() const
900 {
901 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
902 (this->dst.width * type_sz(this->dst.type)) < 32 ||
903 !this->dst.is_contiguous());
904 }
905
906 int
907 fs_inst::regs_read(fs_visitor *v, int arg) const
908 {
909 if (is_tex() && arg == 0 && src[0].file == GRF) {
910 return mlen;
911 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
912 return mlen;
913 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
914 return mlen;
915 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
916 return mlen;
917 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
918 return mlen;
919 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
920 return mlen;
921 }
922
923 switch (src[arg].file) {
924 case BAD_FILE:
925 case UNIFORM:
926 case IMM:
927 return 1;
928 case GRF:
929 case HW_REG:
930 if (src[arg].stride == 0) {
931 return 1;
932 } else {
933 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
934 return (size + 31) / 32;
935 }
936 case MRF:
937 unreachable("MRF registers are not allowed as sources");
938 default:
939 unreachable("Invalid register file");
940 }
941 }
942
943 bool
944 fs_inst::reads_flag() const
945 {
946 return predicate;
947 }
948
949 bool
950 fs_inst::writes_flag() const
951 {
952 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
953 opcode != BRW_OPCODE_IF &&
954 opcode != BRW_OPCODE_WHILE)) ||
955 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
956 }
957
958 /**
959 * Returns how many MRFs an FS opcode will write over.
960 *
961 * Note that this is not the 0 or 1 implied writes in an actual gen
962 * instruction -- the FS opcodes often generate MOVs in addition.
963 */
964 int
965 fs_visitor::implied_mrf_writes(fs_inst *inst)
966 {
967 if (inst->mlen == 0)
968 return 0;
969
970 if (inst->base_mrf == -1)
971 return 0;
972
973 switch (inst->opcode) {
974 case SHADER_OPCODE_RCP:
975 case SHADER_OPCODE_RSQ:
976 case SHADER_OPCODE_SQRT:
977 case SHADER_OPCODE_EXP2:
978 case SHADER_OPCODE_LOG2:
979 case SHADER_OPCODE_SIN:
980 case SHADER_OPCODE_COS:
981 return 1 * dispatch_width / 8;
982 case SHADER_OPCODE_POW:
983 case SHADER_OPCODE_INT_QUOTIENT:
984 case SHADER_OPCODE_INT_REMAINDER:
985 return 2 * dispatch_width / 8;
986 case SHADER_OPCODE_TEX:
987 case FS_OPCODE_TXB:
988 case SHADER_OPCODE_TXD:
989 case SHADER_OPCODE_TXF:
990 case SHADER_OPCODE_TXF_CMS:
991 case SHADER_OPCODE_TXF_MCS:
992 case SHADER_OPCODE_TG4:
993 case SHADER_OPCODE_TG4_OFFSET:
994 case SHADER_OPCODE_TXL:
995 case SHADER_OPCODE_TXS:
996 case SHADER_OPCODE_LOD:
997 return 1;
998 case FS_OPCODE_FB_WRITE:
999 return 2;
1000 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1001 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1002 return 1;
1003 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1004 return inst->mlen;
1005 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1006 return 2;
1007 case SHADER_OPCODE_UNTYPED_ATOMIC:
1008 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1009 case SHADER_OPCODE_URB_WRITE_SIMD8:
1010 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1011 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1012 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1013 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1014 return 0;
1015 default:
1016 unreachable("not reached");
1017 }
1018 }
1019
1020 int
1021 fs_visitor::virtual_grf_alloc(int size)
1022 {
1023 if (virtual_grf_array_size <= virtual_grf_count) {
1024 if (virtual_grf_array_size == 0)
1025 virtual_grf_array_size = 16;
1026 else
1027 virtual_grf_array_size *= 2;
1028 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1029 virtual_grf_array_size);
1030 }
1031 virtual_grf_sizes[virtual_grf_count] = size;
1032 return virtual_grf_count++;
1033 }
1034
1035 /** Fixed HW reg constructor. */
1036 fs_reg::fs_reg(enum register_file file, int reg)
1037 {
1038 init();
1039 this->file = file;
1040 this->reg = reg;
1041 this->type = BRW_REGISTER_TYPE_F;
1042
1043 switch (file) {
1044 case UNIFORM:
1045 this->width = 1;
1046 break;
1047 default:
1048 this->width = 8;
1049 }
1050 }
1051
1052 /** Fixed HW reg constructor. */
1053 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1054 {
1055 init();
1056 this->file = file;
1057 this->reg = reg;
1058 this->type = type;
1059
1060 switch (file) {
1061 case UNIFORM:
1062 this->width = 1;
1063 break;
1064 default:
1065 this->width = 8;
1066 }
1067 }
1068
1069 /** Fixed HW reg constructor. */
1070 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1071 uint8_t width)
1072 {
1073 init();
1074 this->file = file;
1075 this->reg = reg;
1076 this->type = type;
1077 this->width = width;
1078 }
1079
1080 /** Automatic reg constructor. */
1081 fs_reg::fs_reg(fs_visitor *v, const struct glsl_type *type)
1082 {
1083 init();
1084 int reg_width = v->dispatch_width / 8;
1085
1086 this->file = GRF;
1087 this->reg = v->virtual_grf_alloc(v->type_size(type) * reg_width);
1088 this->reg_offset = 0;
1089 this->type = brw_type_for_base_type(type);
1090 this->width = v->dispatch_width;
1091 assert(this->width == 8 || this->width == 16);
1092 }
1093
1094 fs_reg *
1095 fs_visitor::variable_storage(ir_variable *var)
1096 {
1097 return (fs_reg *)hash_table_find(this->variable_ht, var);
1098 }
1099
1100 void
1101 import_uniforms_callback(const void *key,
1102 void *data,
1103 void *closure)
1104 {
1105 struct hash_table *dst_ht = (struct hash_table *)closure;
1106 const fs_reg *reg = (const fs_reg *)data;
1107
1108 if (reg->file != UNIFORM)
1109 return;
1110
1111 hash_table_insert(dst_ht, data, key);
1112 }
1113
1114 /* For SIMD16, we need to follow the uniform setup from the SIMD8 dispatch.
1115 * This brings in those uniform definitions.
1116 */
1117 void
1118 fs_visitor::import_uniforms(fs_visitor *v)
1119 {
1120 hash_table_call_foreach(v->variable_ht,
1121 import_uniforms_callback,
1122 variable_ht);
1123 this->push_constant_loc = v->push_constant_loc;
1124 this->pull_constant_loc = v->pull_constant_loc;
1125 this->uniforms = v->uniforms;
1126 this->param_size = v->param_size;
1127 }
1128
1129 /* Our support for uniforms is piggy-backed on the struct
1130 * gl_fragment_program, because that's where the values actually
1131 * get stored, rather than in some global gl_shader_program uniform
1132 * store.
1133 */
1134 void
1135 fs_visitor::setup_uniform_values(ir_variable *ir)
1136 {
1137 int namelen = strlen(ir->name);
1138
1139 /* The data for our (non-builtin) uniforms is stored in a series of
1140 * gl_uniform_driver_storage structs for each subcomponent that
1141 * glGetUniformLocation() could name. We know it's been set up in the same
1142 * order we'd walk the type, so walk the list of storage and find anything
1143 * with our name, or any component whose name starts with our name.
1144 */
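   /* For instance (illustrative): for "uniform struct { vec4 a; float b; } s;"
    * the storage entries are named "s.a" and "s.b"; both match the prefix
    * "s" followed by '.', contributing 4 and 1 slots respectively.
    */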
1145 unsigned params_before = uniforms;
1146 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1147 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1148
1149 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1150 (storage->name[namelen] != 0 &&
1151 storage->name[namelen] != '.' &&
1152 storage->name[namelen] != '[')) {
1153 continue;
1154 }
1155
1156 unsigned slots = storage->type->component_slots();
1157 if (storage->array_elements)
1158 slots *= storage->array_elements;
1159
1160 for (unsigned i = 0; i < slots; i++) {
1161 stage_prog_data->param[uniforms++] = &storage->storage[i];
1162 }
1163 }
1164
1165 /* Make sure we actually initialized the right amount of stuff here. */
1166 assert(params_before + ir->type->component_slots() == uniforms);
1167 (void)params_before;
1168 }
1169
1170
1171 /* Our support for builtin uniforms is even scarier than non-builtin.
1172 * It sits on top of the PROG_STATE_VAR parameters that are
1173 * automatically updated from GL context state.
1174 */
1175 void
1176 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1177 {
1178 const ir_state_slot *const slots = ir->get_state_slots();
1179 assert(slots != NULL);
1180
1181 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1182 /* This state reference has already been setup by ir_to_mesa, but we'll
1183 * get the same index back here.
1184 */
1185 int index = _mesa_add_state_reference(this->prog->Parameters,
1186 (gl_state_index *)slots[i].tokens);
1187
1188 /* Add each of the unique swizzles of the element as a parameter.
1189 * This'll end up matching the expected layout of the
1190 * array/matrix/structure we're trying to fill in.
1191 */
1192 int last_swiz = -1;
1193 for (unsigned int j = 0; j < 4; j++) {
1194 int swiz = GET_SWZ(slots[i].swizzle, j);
1195 if (swiz == last_swiz)
1196 break;
1197 last_swiz = swiz;
1198
1199 stage_prog_data->param[uniforms++] =
1200 &prog->Parameters->ParameterValues[index][swiz];
1201 }
1202 }
1203 }
1204
1205 fs_reg *
1206 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
1207 {
1208 assert(stage == MESA_SHADER_FRAGMENT);
1209 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1210 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1211 fs_reg wpos = *reg;
1212 bool flip = !ir->data.origin_upper_left ^ key->render_to_fbo;
1213
1214 /* gl_FragCoord.x */
1215 if (ir->data.pixel_center_integer) {
1216 emit(MOV(wpos, this->pixel_x));
1217 } else {
1218 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1219 }
1220 wpos = offset(wpos, 1);
1221
1222 /* gl_FragCoord.y */
1223 if (!flip && ir->data.pixel_center_integer) {
1224 emit(MOV(wpos, this->pixel_y));
1225 } else {
1226 fs_reg pixel_y = this->pixel_y;
1227 float offset = (ir->data.pixel_center_integer ? 0.0 : 0.5);
1228
1229 if (flip) {
1230 pixel_y.negate = true;
1231 offset += key->drawable_height - 1.0;
1232 }
1233
1234 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1235 }
1236 wpos = offset(wpos, 1);
1237
1238 /* gl_FragCoord.z */
1239 if (brw->gen >= 6) {
1240 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1241 } else {
1242 emit(FS_OPCODE_LINTERP, wpos,
1243 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1244 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1245 interp_reg(VARYING_SLOT_POS, 2));
1246 }
1247 wpos = offset(wpos, 1);
1248
1249 /* gl_FragCoord.w: Already set up in emit_interpolation */
1250 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1251
1252 return reg;
1253 }
1254
1255 fs_inst *
1256 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1257 glsl_interp_qualifier interpolation_mode,
1258 bool is_centroid, bool is_sample)
1259 {
1260 brw_wm_barycentric_interp_mode barycoord_mode;
1261 if (brw->gen >= 6) {
1262 if (is_centroid) {
1263 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1264 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1265 else
1266 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1267 } else if (is_sample) {
1268 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1269 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1270 else
1271 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1272 } else {
1273 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1274 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1275 else
1276 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1277 }
1278 } else {
1279 /* On Ironlake and below, there is only one interpolation mode.
1280 * Centroid interpolation doesn't mean anything on this hardware --
1281 * there is no multisampling.
1282 */
1283 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1284 }
1285 return emit(FS_OPCODE_LINTERP, attr,
1286 this->delta_x[barycoord_mode],
1287 this->delta_y[barycoord_mode], interp);
1288 }
1289
1290 fs_reg *
1291 fs_visitor::emit_general_interpolation(ir_variable *ir)
1292 {
1293 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1294 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1295 fs_reg attr = *reg;
1296
1297 assert(stage == MESA_SHADER_FRAGMENT);
1298 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1299 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1300
1301 unsigned int array_elements;
1302 const glsl_type *type;
1303
1304 if (ir->type->is_array()) {
1305 array_elements = ir->type->length;
1306 if (array_elements == 0) {
1307 fail("dereferenced array '%s' has length 0\n", ir->name);
1308 }
1309 type = ir->type->fields.array;
1310 } else {
1311 array_elements = 1;
1312 type = ir->type;
1313 }
1314
1315 glsl_interp_qualifier interpolation_mode =
1316 ir->determine_interpolation_mode(key->flat_shade);
1317
1318 int location = ir->data.location;
1319 for (unsigned int i = 0; i < array_elements; i++) {
1320 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1321 if (prog_data->urb_setup[location] == -1) {
1322 /* If there's no incoming setup data for this slot, don't
1323 * emit interpolation for it.
1324 */
1325 attr = offset(attr, type->vector_elements);
1326 location++;
1327 continue;
1328 }
1329
1330 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1331 /* Constant interpolation (flat shading) case. The SF has
1332 * handed us defined values in only the constant offset
1333 * field of the setup reg.
1334 */
1335 for (unsigned int k = 0; k < type->vector_elements; k++) {
1336 struct brw_reg interp = interp_reg(location, k);
1337 interp = suboffset(interp, 3);
1338 interp.type = reg->type;
1339 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1340 attr = offset(attr, 1);
1341 }
1342 } else {
1343 /* Smooth/noperspective interpolation case. */
1344 for (unsigned int k = 0; k < type->vector_elements; k++) {
1345 struct brw_reg interp = interp_reg(location, k);
1346 if (brw->needs_unlit_centroid_workaround && ir->data.centroid) {
1347 /* Get the pixel/sample mask into f0 so that we know
1348 * which pixels are lit. Then, for each channel that is
1349 * unlit, replace the centroid data with non-centroid
1350 * data.
1351 */
1352 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1353
1354 fs_inst *inst;
1355 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1356 false, false);
1357 inst->predicate = BRW_PREDICATE_NORMAL;
1358 inst->predicate_inverse = true;
1359 if (brw->has_pln)
1360 inst->no_dd_clear = true;
1361
1362 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1363 ir->data.centroid && !key->persample_shading,
1364 ir->data.sample || key->persample_shading);
1365 inst->predicate = BRW_PREDICATE_NORMAL;
1366 inst->predicate_inverse = false;
1367 if (brw->has_pln)
1368 inst->no_dd_check = true;
1369
1370 } else {
1371 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1372 ir->data.centroid && !key->persample_shading,
1373 ir->data.sample || key->persample_shading);
1374 }
1375 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1376 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1377 }
1378 attr = offset(attr, 1);
1379 }
1380
1381 }
1382 location++;
1383 }
1384 }
1385
1386 return reg;
1387 }
1388
1389 fs_reg *
1390 fs_visitor::emit_frontfacing_interpolation()
1391 {
1392 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::bool_type);
1393
1394 if (brw->gen >= 6) {
1395 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1396 * a boolean result from this (~0/true or 0/false).
1397 *
1398 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1399 * this task in only one instruction:
1400 * - a negation source modifier will flip the bit; and
1401 * - a W -> D type conversion will sign extend the bit into the high
1402 * word of the destination.
1403 *
1404 * An ASR 15 fills the low word of the destination.
1405 */
1406 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1407 g0.negate = true;
1408
1409 emit(ASR(*reg, g0, fs_reg(15)));
1410 } else {
1411 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1412 * a boolean result from this (1/true or 0/false).
1413 *
1414 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1415 * the negation source modifier to flip it. Unfortunately the SHR
1416 * instruction only operates on UD (or D with an abs source modifier)
1417 * sources without negation.
1418 *
1419 * Instead, use ASR (which will give ~0/true or 0/false).
1420 */
1421 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1422 g1_6.negate = true;
1423
1424 emit(ASR(*reg, g1_6, fs_reg(31)));
1425 }
1426
1427 return reg;
1428 }
1429
1430 void
1431 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1432 {
1433 assert(stage == MESA_SHADER_FRAGMENT);
1434 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1435 assert(dst.type == BRW_REGISTER_TYPE_F);
1436
1437 if (key->compute_pos_offset) {
1438 /* Convert int_sample_pos to floating point */
1439 emit(MOV(dst, int_sample_pos));
1440 /* Scale to the range [0, 1] */
1441 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
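      /* For example: an integer sample position of 8 scales to 8 / 16 = 0.5
       * here (illustrative arithmetic).
       */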
1442 }
1443 else {
1444 /* From ARB_sample_shading specification:
1445 * "When rendering to a non-multisample buffer, or if multisample
1446 * rasterization is disabled, gl_SamplePosition will always be
1447 * (0.5, 0.5)."
1448 */
1449 emit(MOV(dst, fs_reg(0.5f)));
1450 }
1451 }
1452
1453 fs_reg *
1454 fs_visitor::emit_samplepos_setup()
1455 {
1456 assert(brw->gen >= 6);
1457
1458 this->current_annotation = "compute sample position";
1459 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::vec2_type);
1460 fs_reg pos = *reg;
1461 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1462 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1463
1464 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1465 * mode will be enabled.
1466 *
1467 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1468 * R31.1:0 Position Offset X/Y for Slot[3:0]
1469 * R31.3:2 Position Offset X/Y for Slot[7:4]
1470 * .....
1471 *
1472 * The X, Y sample positions come in as bytes in thread payload. So, read
1473 * the positions using vstride=16, width=8, hstride=2.
1474 */
1475 struct brw_reg sample_pos_reg =
1476 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1477 BRW_REGISTER_TYPE_B), 16, 8, 2);
1478
1479 if (dispatch_width == 8) {
1480 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1481 } else {
1482 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1483 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1484 ->force_sechalf = true;
1485 }
1486 /* Compute gl_SamplePosition.x */
1487 compute_sample_position(pos, int_sample_x);
1488 pos = offset(pos, 1);
1489 if (dispatch_width == 8) {
1490 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1491 } else {
1492 emit(MOV(half(int_sample_y, 0),
1493 fs_reg(suboffset(sample_pos_reg, 1))));
1494 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1495 ->force_sechalf = true;
1496 }
1497 /* Compute gl_SamplePosition.y */
1498 compute_sample_position(pos, int_sample_y);
1499 return reg;
1500 }
1501
1502 fs_reg *
1503 fs_visitor::emit_sampleid_setup()
1504 {
1505 assert(stage == MESA_SHADER_FRAGMENT);
1506 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1507 assert(brw->gen >= 6);
1508
1509 this->current_annotation = "compute sample id";
1510 fs_reg *reg = new(this->mem_ctx) fs_reg(this, glsl_type::int_type);
1511
1512 if (key->compute_sample_id) {
1513 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1514 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1515 t2.type = BRW_REGISTER_TYPE_UW;
1516
1517 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1518 * 8x multisampling, subspan 0 will represent sample N (where N
1519 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1520 * 7. We can find the value of N by looking at R0.0 bits 7:6
1521 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1522 * (since samples are always delivered in pairs). That is, we
1523 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1524 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1525 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1526 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1527 * populating a temporary variable with the sequence (0, 1, 2, 3),
1528 * and then reading from it using vstride=1, width=4, hstride=0.
1529 * These computations hold good for 4x multisampling as well.
1530 *
1531 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1532 * the first four slots are sample 0 of subspan 0; the next four
1533 * are sample 1 of subspan 0; the third group is sample 0 of
1534 * subspan 1, and finally sample 1 of subspan 1.
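       *
       * As a worked example (illustrative): if R0.0 bits 7:6 read 2, then
       * (R0.0 & 0xc0) >> 5 == 4, and adding the SIMD8 sequence
       * (0, 0, 0, 0, 1, 1, 1, 1) gives sample IDs (4, 4, 4, 4, 5, 5, 5, 5).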
1535 */
1536 fs_inst *inst;
1537 inst = emit(BRW_OPCODE_AND, t1,
1538 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1539 fs_reg(0xc0));
1540 inst->force_writemask_all = true;
1541 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1542 inst->force_writemask_all = true;
1543 /* This works for both SIMD8 and SIMD16 */
1544 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1545 inst->force_writemask_all = true;
1546 /* This special instruction takes care of setting vstride=1,
1547 * width=4, hstride=0 of t2 during an ADD instruction.
1548 */
1549 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1550 } else {
1551 /* As per GL_ARB_sample_shading specification:
1552 * "When rendering to a non-multisample buffer, or if multisample
1553 * rasterization is disabled, gl_SampleID will always be zero."
1554 */
1555 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1556 }
1557
1558 return reg;
1559 }
1560
1561 fs_reg
1562 fs_visitor::fix_math_operand(fs_reg src)
1563 {
1564 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1565 * might be able to do better by doing execsize = 1 math and then
1566 * expanding that result out, but we would need to be careful with
1567 * masking.
1568 *
1569 * The hardware ignores source modifiers (negate and abs) on math
1570 * instructions, so we also move to a temp to set those up.
1571 */
1572 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1573 !src.abs && !src.negate)
1574 return src;
1575
1576 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1577 * operands to math
1578 */
1579 if (brw->gen >= 7 && src.file != IMM)
1580 return src;
1581
1582 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1583 expanded.type = src.type;
1584 emit(BRW_OPCODE_MOV, expanded, src);
1585 return expanded;
1586 }
1587
1588 fs_inst *
1589 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1590 {
1591 switch (opcode) {
1592 case SHADER_OPCODE_RCP:
1593 case SHADER_OPCODE_RSQ:
1594 case SHADER_OPCODE_SQRT:
1595 case SHADER_OPCODE_EXP2:
1596 case SHADER_OPCODE_LOG2:
1597 case SHADER_OPCODE_SIN:
1598 case SHADER_OPCODE_COS:
1599 break;
1600 default:
1601 unreachable("not reached: bad math opcode");
1602 }
1603
1604 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1605 * might be able to do better by doing execsize = 1 math and then
1606 * expanding that result out, but we would need to be careful with
1607 * masking.
1608 *
1609 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1610 * instructions, so we also move to a temp to set those up.
1611 */
1612 if (brw->gen == 6 || brw->gen == 7)
1613 src = fix_math_operand(src);
1614
1615 fs_inst *inst = emit(opcode, dst, src);
1616
1617 if (brw->gen < 6) {
1618 inst->base_mrf = 2;
1619 inst->mlen = dispatch_width / 8;
1620 }
1621
1622 return inst;
1623 }
1624
1625 fs_inst *
1626 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1627 {
1628 int base_mrf = 2;
1629 fs_inst *inst;
1630
1631 if (brw->gen >= 8) {
1632 inst = emit(opcode, dst, src0, src1);
1633 } else if (brw->gen >= 6) {
1634 src0 = fix_math_operand(src0);
1635 src1 = fix_math_operand(src1);
1636
1637 inst = emit(opcode, dst, src0, src1);
1638 } else {
1639 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1640 * "Message Payload":
1641 *
1642 * "Operand0[7]. For the INT DIV functions, this operand is the
1643 * denominator."
1644 * ...
1645 * "Operand1[7]. For the INT DIV functions, this operand is the
1646 * numerator."
1647 */
1648 bool is_int_div = opcode != SHADER_OPCODE_POW;
1649 fs_reg &op0 = is_int_div ? src1 : src0;
1650 fs_reg &op1 = is_int_div ? src0 : src1;
1651
1652 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1653 inst = emit(opcode, dst, op0, reg_null_f);
1654
1655 inst->base_mrf = base_mrf;
1656 inst->mlen = 2 * dispatch_width / 8;
1657 }
1658 return inst;
1659 }
1660
1661 void
1662 fs_visitor::assign_curb_setup()
1663 {
1664 if (dispatch_width == 8) {
1665 prog_data->dispatch_grf_start_reg = payload.num_regs;
1666 } else {
1667 assert(stage == MESA_SHADER_FRAGMENT);
1668 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1669 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1670 }
1671
1672 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1673
1674 /* Map the offsets in the UNIFORM file to fixed HW regs. */
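   /* For example (illustrative): a uniform whose push_constant_loc is 11
    * lands at channel 11 % 8 == 3 of GRF payload.num_regs + 11 / 8, i.e. in
    * the second CURBE register.
    */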
1675 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1676 for (unsigned int i = 0; i < inst->sources; i++) {
1677 if (inst->src[i].file == UNIFORM) {
1678 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1679 int constant_nr;
1680 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1681 constant_nr = push_constant_loc[uniform_nr];
1682 } else {
1683 /* Section 5.11 of the OpenGL 4.1 spec says:
1684 * "Out-of-bounds reads return undefined values, which include
1685 * values from other variables of the active program or zero."
1686 * Just return the first push constant.
1687 */
1688 constant_nr = 0;
1689 }
1690
1691 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1692 constant_nr / 8,
1693 constant_nr % 8);
1694
1695 inst->src[i].file = HW_REG;
1696 inst->src[i].fixed_hw_reg = byte_offset(
1697 retype(brw_reg, inst->src[i].type),
1698 inst->src[i].subreg_offset);
1699 }
1700 }
1701 }
1702 }
1703
1704 void
1705 fs_visitor::calculate_urb_setup()
1706 {
1707 assert(stage == MESA_SHADER_FRAGMENT);
1708 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1709 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1710
1711 memset(prog_data->urb_setup, -1,
1712 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1713
1714 int urb_next = 0;
1715 /* Figure out where each of the incoming setup attributes lands. */
1716 if (brw->gen >= 6) {
1717 if (_mesa_bitcount_64(prog->InputsRead &
1718 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1719 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1720 * first 16 varying inputs, so we can put them wherever we want.
1721 * Just put them in order.
1722 *
1723 * This is useful because it means that (a) inputs not used by the
1724 * fragment shader won't take up valuable register space, and (b) we
1725 * won't have to recompile the fragment shader if it gets paired with
1726 * a different vertex (or geometry) shader.
1727 */
1728 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1729 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1730 BITFIELD64_BIT(i)) {
1731 prog_data->urb_setup[i] = urb_next++;
1732 }
1733 }
1734 } else {
1735 /* We have enough input varyings that the SF/SBE pipeline stage can't
1736 * arbitrarily rearrange them to suit our whim; we have to put them
1737 * in an order that matches the output of the previous pipeline stage
1738 * (geometry or vertex shader).
1739 */
1740 struct brw_vue_map prev_stage_vue_map;
1741 brw_compute_vue_map(brw, &prev_stage_vue_map,
1742 key->input_slots_valid);
1743 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1744 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1745 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1746 slot++) {
1747 int varying = prev_stage_vue_map.slot_to_varying[slot];
1748 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1749 * unused.
1750 */
1751 if (varying != BRW_VARYING_SLOT_COUNT &&
1752 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1753 BITFIELD64_BIT(varying))) {
1754 prog_data->urb_setup[varying] = slot - first_slot;
1755 }
1756 }
1757 urb_next = prev_stage_vue_map.num_slots - first_slot;
1758 }
1759 } else {
1760 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1761 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1762 /* Point size is packed into the header, not as a general attribute */
1763 if (i == VARYING_SLOT_PSIZ)
1764 continue;
1765
1766 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1767 /* The back color slot is skipped when the front color is
1768 * also written to. In addition, some slots can be
1769 * written in the vertex shader and not read in the
1770 * fragment shader. So the register number must always be
1771 * incremented, mapped or not.
1772 */
1773 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1774 prog_data->urb_setup[i] = urb_next;
1775 urb_next++;
1776 }
1777 }
1778
1779 /*
1780 * It's an FS-only attribute, and we did the interpolation for it in the
1781 * SF thread, so count it here, too.
1782 *
1783 * See compile_sf_prog() for more info.
1784 */
1785 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1786 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1787 }
1788
1789 prog_data->num_varying_inputs = urb_next;
1790 }
1791
1792 void
1793 fs_visitor::assign_urb_setup()
1794 {
1795 assert(stage == MESA_SHADER_FRAGMENT);
1796 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1797
1798 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1799
1800 /* Offset all the urb_setup[] indices by the actual position of the
1801 * setup regs, now that the location of the constants has been chosen.
1802 */
1803 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1804 if (inst->opcode == FS_OPCODE_LINTERP) {
1805 assert(inst->src[2].file == HW_REG);
1806 inst->src[2].fixed_hw_reg.nr += urb_start;
1807 }
1808
1809 if (inst->opcode == FS_OPCODE_CINTERP) {
1810 assert(inst->src[0].file == HW_REG);
1811 inst->src[0].fixed_hw_reg.nr += urb_start;
1812 }
1813 }
1814
1815 /* Each attribute is 4 setup channels, each of which is half a reg. */
1816 this->first_non_payload_grf =
1817 urb_start + prog_data->num_varying_inputs * 2;
1818 }
1819
1820 void
1821 fs_visitor::assign_vs_urb_setup()
1822 {
1823 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1824 int grf, count, slot, channel, attr;
1825
1826 assert(stage == MESA_SHADER_VERTEX);
1827 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1828 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1829 count++;
1830
1831 /* Each attribute is 4 regs. */
1832 this->first_non_payload_grf =
1833 payload.num_regs + prog_data->curb_read_length + count * 4;
1834
1835 unsigned vue_entries =
1836 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1837
1838 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1839 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1840
1841 assert(vs_prog_data->base.urb_read_length <= 15);
1842
1843 /* Rewrite all ATTR file references to the hw grf that they land in. */
1844 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1845 for (int i = 0; i < inst->sources; i++) {
1846 if (inst->src[i].file == ATTR) {
1847
1848 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1849 slot = count - 1;
1850 } else {
1851 /* Attributes come in a contiguous block, ordered by their
1852 * gl_vert_attrib value. That means we can compute the slot
1853 * number for an attribute by masking out the enabled
1854 * attributes before it and counting the bits.
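 *
 * For example (illustrative): if inputs_read has only VERT_ATTRIB_POS
 * (bit 0) and VERT_ATTRIB_GENERIC0 (bit 16) set, GENERIC0 masks down to
 * a single lower bit and lands in slot 1, right after position in slot 0.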
1855 */
1856 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1857 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1858 BITFIELD64_MASK(attr));
1859 }
1860
1861 channel = inst->src[i].reg_offset & 3;
1862
1863 grf = payload.num_regs +
1864 prog_data->curb_read_length +
1865 slot * 4 + channel;
1866
1867 inst->src[i].file = HW_REG;
1868 inst->src[i].fixed_hw_reg =
1869 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1870 }
1871 }
1872 }
1873 }
1874
1875 /**
1876 * Split large virtual GRFs into separate components if we can.
1877 *
1878 * This is mostly duplicated with what brw_fs_vector_splitting does,
1879 * but that's really conservative because it's afraid of doing
1880 * splitting that doesn't result in real progress after the rest of
1881 * the optimization phases, which would cause infinite looping in
1882 * optimization. We can do it once here, safely. This also has the
1883 * opportunity to split interpolated values, or maybe even uniforms,
1884 * which we don't have at the IR level.
1885 *
1886 * We want to split, because virtual GRFs are what we register
1887 * allocate and spill (due to contiguousness requirements for some
1888 * instructions), and they're what we naturally generate in the
1889 * codegen process, but most virtual GRFs don't actually need to be
1890 * contiguous sets of GRFs. If we split, we'll end up with reduced
1891 * live intervals and better dead code elimination and coalescing.
1892 */
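/* A sketch of the idea, not taken from the pass itself: a 4-register virtual
 * GRF whose slots are only ever accessed as pairs {0,1} and {2,3} ends up
 * with the split points at slots 1 and 3 cleared but slot 2 still set, so it
 * gets broken into two 2-register VGRFs with independent live intervals.
 */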
1893 void
1894 fs_visitor::split_virtual_grfs()
1895 {
1896 int num_vars = this->virtual_grf_count;
1897
1898 /* Count the total number of registers */
1899 int reg_count = 0;
1900 int vgrf_to_reg[num_vars];
1901 for (int i = 0; i < num_vars; i++) {
1902 vgrf_to_reg[i] = reg_count;
1903 reg_count += virtual_grf_sizes[i];
1904 }
1905
1906 /* An array of "split points". For each register slot, this indicates
1907 * if this slot can be separated from the previous slot. Every time an
1908 * instruction uses multiple elements of a register (as a source or
1909 * destination), we mark the used slots as inseparable. Then we go
1910 * through and split the registers into the smallest pieces we can.
1911 */
1912 bool split_points[reg_count];
1913 memset(split_points, 0, sizeof(split_points));
1914
1915 /* Mark all used registers as fully splittable */
1916 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1917 if (inst->dst.file == GRF) {
1918 int reg = vgrf_to_reg[inst->dst.reg];
1919 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1920 split_points[reg + j] = true;
1921 }
1922
1923 for (int i = 0; i < inst->sources; i++) {
1924 if (inst->src[i].file == GRF) {
1925 int reg = vgrf_to_reg[inst->src[i].reg];
1926 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1927 split_points[reg + j] = true;
1928 }
1929 }
1930 }
1931
1932 if (brw->has_pln &&
1933 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1934 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1935 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1936 * Gen6, that was the only supported interpolation mode, and since Gen6,
1937 * delta_x and delta_y are in fixed hardware registers.
1938 */
1939 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1940 split_points[vgrf_to_reg[vgrf] + 1] = false;
1941 }
1942
1943 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1944 if (inst->dst.file == GRF) {
1945 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1946 for (int j = 1; j < inst->regs_written; j++)
1947 split_points[reg + j] = false;
1948 }
1949 for (int i = 0; i < inst->sources; i++) {
1950 if (inst->src[i].file == GRF) {
1951 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1952 for (int j = 1; j < inst->regs_read(this, i); j++)
1953 split_points[reg + j] = false;
1954 }
1955 }
1956 }
1957
1958 int new_virtual_grf[reg_count];
1959 int new_reg_offset[reg_count];
1960
1961 int reg = 0;
1962 for (int i = 0; i < num_vars; i++) {
1963 /* The first one should always be 0 as a quick sanity check. */
1964 assert(split_points[reg] == false);
1965
1966 /* j = 0 case */
1967 new_reg_offset[reg] = 0;
1968 reg++;
1969 int offset = 1;
1970
1971 /* j > 0 case */
1972 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1973 /* If this is a split point, reset the offset to 0 and allocate a
1974 * new virtual GRF covering the previous 'offset' registers.
1975 */
1976 if (split_points[reg]) {
1977 assert(offset <= MAX_VGRF_SIZE);
1978 int grf = virtual_grf_alloc(offset);
1979 for (int k = reg - offset; k < reg; k++)
1980 new_virtual_grf[k] = grf;
1981 offset = 0;
1982 }
1983 new_reg_offset[reg] = offset;
1984 offset++;
1985 reg++;
1986 }
1987
1988 /* The last one gets the original register number */
1989 assert(offset <= MAX_VGRF_SIZE);
1990 virtual_grf_sizes[i] = offset;
1991 for (int k = reg - offset; k < reg; k++)
1992 new_virtual_grf[k] = i;
1993 }
1994 assert(reg == reg_count);
1995
1996 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1997 if (inst->dst.file == GRF) {
1998 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1999 inst->dst.reg = new_virtual_grf[reg];
2000 inst->dst.reg_offset = new_reg_offset[reg];
2001 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2002 }
2003 for (int i = 0; i < inst->sources; i++) {
2004 if (inst->src[i].file == GRF) {
2005 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2006 inst->src[i].reg = new_virtual_grf[reg];
2007 inst->src[i].reg_offset = new_reg_offset[reg];
2008 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2009 }
2010 }
2011 }
2012 invalidate_live_intervals();
2013 }
2014
2015 /**
2016 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2017 *
2018 * During code generation, we create tons of temporary variables, many of
2019 * which get immediately killed and are never used again. Yet, in later
2020 * optimization and analysis passes, such as compute_live_intervals, we need
2021 * to loop over all the virtual GRFs. Compacting them can save a lot of
2022 * overhead.
2023 */
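/* Illustrative only: with four VGRFs where vgrf1 is never referenced,
 * remap_table ends up as {0, -1, 1, 2}, virtual_grf_count drops from 4 to 3,
 * and every GRF reference below is renumbered through the table.
 */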
2024 bool
2025 fs_visitor::compact_virtual_grfs()
2026 {
2027 bool progress = false;
2028 int remap_table[this->virtual_grf_count];
2029 memset(remap_table, -1, sizeof(remap_table));
2030
2031 /* Mark which virtual GRFs are used. */
2032 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2033 if (inst->dst.file == GRF)
2034 remap_table[inst->dst.reg] = 0;
2035
2036 for (int i = 0; i < inst->sources; i++) {
2037 if (inst->src[i].file == GRF)
2038 remap_table[inst->src[i].reg] = 0;
2039 }
2040 }
2041
2042 /* Compact the GRF arrays. */
2043 int new_index = 0;
2044 for (int i = 0; i < this->virtual_grf_count; i++) {
2045 if (remap_table[i] == -1) {
2046 /* We just found an unused register. This means that we are
2047 * actually going to compact something.
2048 */
2049 progress = true;
2050 } else {
2051 remap_table[i] = new_index;
2052 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2053 invalidate_live_intervals();
2054 ++new_index;
2055 }
2056 }
2057
2058 this->virtual_grf_count = new_index;
2059
2060 /* Patch all the instructions to use the newly renumbered registers */
2061 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2062 if (inst->dst.file == GRF)
2063 inst->dst.reg = remap_table[inst->dst.reg];
2064
2065 for (int i = 0; i < inst->sources; i++) {
2066 if (inst->src[i].file == GRF)
2067 inst->src[i].reg = remap_table[inst->src[i].reg];
2068 }
2069 }
2070
2071 /* Patch all the references to delta_x/delta_y, since they're used in
2072 * register allocation. If they're unused, switch them to BAD_FILE so
2073 * we don't think some random VGRF is delta_x/delta_y.
2074 */
2075 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2076 if (delta_x[i].file == GRF) {
2077 if (remap_table[delta_x[i].reg] != -1) {
2078 delta_x[i].reg = remap_table[delta_x[i].reg];
2079 } else {
2080 delta_x[i].file = BAD_FILE;
2081 }
2082 }
2083 }
2084 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2085 if (delta_y[i].file == GRF) {
2086 if (remap_table[delta_y[i].reg] != -1) {
2087 delta_y[i].reg = remap_table[delta_y[i].reg];
2088 } else {
2089 delta_y[i].file = BAD_FILE;
2090 }
2091 }
2092 }
2093
2094 return progress;
2095 }
2096
2097 /*
2098 * Implements array access of uniforms by inserting a
2099 * PULL_CONSTANT_LOAD instruction.
2100 *
2101 * Unlike temporary GRF array access (which we don't support, due to
2102 * the difficulty of doing relative addressing on instruction
2103 * destinations), we could potentially do array access of uniforms
2104 * that were loaded in GRF space as push constants. In real-world
2105 * usage we've seen, though, the arrays being used are always larger
2106 * than we could load as push constants, so just always move all
2107 * uniform array access out to a pull constant buffer.
2108 */
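/* A GLSL-level illustration (hypothetical shader, not from this change): for
 * "uniform vec4 colors[32]; ... colors[i]" with a variable index i, every
 * element of colors[] is appended to pull_param[], and the reladdr access is
 * later replaced by a VARYING_PULL_CONSTANT_LOAD in demote_pull_constants().
 */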
2109 void
2110 fs_visitor::move_uniform_array_access_to_pull_constants()
2111 {
2112 if (dispatch_width != 8)
2113 return;
2114
2115 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2116 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2117
2118 /* Walk through and find array access of uniforms. Put a copy of that
2119 * uniform in the pull constant buffer.
2120 *
2121 * Note that we don't move constant-indexed accesses to arrays. No
2122 * testing has been done of the performance impact of this choice.
2123 */
2124 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2125 for (int i = 0 ; i < inst->sources; i++) {
2126 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2127 continue;
2128
2129 int uniform = inst->src[i].reg;
2130
2131 /* If this array isn't already present in the pull constant buffer,
2132 * add it.
2133 */
2134 if (pull_constant_loc[uniform] == -1) {
2135 const gl_constant_value **values = &stage_prog_data->param[uniform];
2136
2137 assert(param_size[uniform]);
2138
2139 for (int j = 0; j < param_size[uniform]; j++) {
2140 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2141
2142 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2143 values[j];
2144 }
2145 }
2146 }
2147 }
2148 }
2149
2150 /**
2151 * Assign UNIFORM file registers to either push constants or pull constants.
2152 *
2153 * We allow a fragment shader to have more than the specified minimum
2154 * maximum number of fragment shader uniform components (64). If
2155 * there are too many of these, they'd fill up all of register space.
2156 * So, this will push some of them out to the pull constant buffer and
2157 * update the program to load them.
2158 */
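/* Rough numbers, assuming the 16-register budget chosen below: a shader with
 * 200 live uniform floats keeps the first 128 as push constants and the
 * remaining 72 are demoted to the pull constant buffer.
 */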
2159 void
2160 fs_visitor::assign_constant_locations()
2161 {
2162 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2163 if (dispatch_width != 8)
2164 return;
2165
2166 /* Find which UNIFORM registers are still in use. */
2167 bool is_live[uniforms];
2168 for (unsigned int i = 0; i < uniforms; i++) {
2169 is_live[i] = false;
2170 }
2171
2172 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2173 for (int i = 0; i < inst->sources; i++) {
2174 if (inst->src[i].file != UNIFORM)
2175 continue;
2176
2177 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2178 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2179 is_live[constant_nr] = true;
2180 }
2181 }
2182
2183 /* Only allow 16 registers (128 uniform components) as push constants.
2184 *
2185 * Just demote the end of the list. We could probably do better
2186 * here, demoting things that are rarely used in the program first.
2187 *
2188 * If changing this value, note the limitation about total_regs in
2189 * brw_curbe.c.
2190 */
2191 unsigned int max_push_components = 16 * 8;
2192 unsigned int num_push_constants = 0;
2193
2194 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2195
2196 for (unsigned int i = 0; i < uniforms; i++) {
2197 if (!is_live[i] || pull_constant_loc[i] != -1) {
2198 /* This UNIFORM register is either dead, or has already been demoted
2199 * to a pull const. Mark it as no longer living in the param[] array.
2200 */
2201 push_constant_loc[i] = -1;
2202 continue;
2203 }
2204
2205 if (num_push_constants < max_push_components) {
2206 /* Retain as a push constant. Record the location in the param[]
2207 * array.
2208 */
2209 push_constant_loc[i] = num_push_constants++;
2210 } else {
2211 /* Demote to a pull constant. */
2212 push_constant_loc[i] = -1;
2213
2214 int pull_index = stage_prog_data->nr_pull_params++;
2215 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2216 pull_constant_loc[i] = pull_index;
2217 }
2218 }
2219
2220 stage_prog_data->nr_params = num_push_constants;
2221
2222 /* Up until now, the param[] array has been indexed by reg + reg_offset
2223 * of UNIFORM registers. Condense it to only contain the uniforms we
2224 * chose to upload as push constants.
2225 */
2226 for (unsigned int i = 0; i < uniforms; i++) {
2227 int remapped = push_constant_loc[i];
2228
2229 if (remapped == -1)
2230 continue;
2231
2232 assert(remapped <= (int)i);
2233 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2234 }
2235 }
2236
2237 /**
2238 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2239 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2240 */
2241 void
2242 fs_visitor::demote_pull_constants()
2243 {
2244 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2245 for (int i = 0; i < inst->sources; i++) {
2246 if (inst->src[i].file != UNIFORM)
2247 continue;
2248
2249 int pull_index = pull_constant_loc[inst->src[i].reg +
2250 inst->src[i].reg_offset];
2251 if (pull_index == -1)
2252 continue;
2253
2254 /* Set up the annotation tracking for newly generated instructions. */
2255 base_ir = inst->ir;
2256 current_annotation = inst->annotation;
2257
2258 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2259 fs_reg dst = fs_reg(this, glsl_type::float_type);
2260
2261 /* Generate a pull load into dst. */
2262 if (inst->src[i].reladdr) {
2263 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2264 surf_index,
2265 *inst->src[i].reladdr,
2266 pull_index);
2267 inst->insert_before(block, &list);
2268 inst->src[i].reladdr = NULL;
2269 } else {
2270 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2271 fs_inst *pull =
2272 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2273 dst, surf_index, offset);
2274 inst->insert_before(block, pull);
2275 inst->src[i].set_smear(pull_index & 3);
2276 }
2277
2278 /* Rewrite the instruction to use the temporary VGRF. */
2279 inst->src[i].file = GRF;
2280 inst->src[i].reg = dst.reg;
2281 inst->src[i].reg_offset = 0;
2282 inst->src[i].width = dispatch_width;
2283 }
2284 }
2285 invalidate_live_intervals();
2286 }
2287
2288 bool
2289 fs_visitor::opt_algebraic()
2290 {
2291 bool progress = false;
2292
2293 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2294 switch (inst->opcode) {
2295 case BRW_OPCODE_MOV:
2296 if (inst->src[0].file != IMM)
2297 break;
2298
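            /* Hedged example of the fold below: a saturating MOV of an
             * immediate, e.g. "mov.sat vgrf3:F, 2.0f", can have the clamp
             * applied at compile time, becoming "mov vgrf3:F, 1.0f" with the
             * saturate flag dropped (assuming brw_saturate_immediate()
             * handles the immediate's type).
             */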
2299 if (inst->saturate) {
2300 if (inst->dst.type != inst->src[0].type)
2301 assert(!"unimplemented: saturate mixed types");
2302
2303 if (brw_saturate_immediate(inst->dst.type,
2304 &inst->src[0].fixed_hw_reg)) {
2305 inst->saturate = false;
2306 progress = true;
2307 }
2308 }
2309 break;
2310
2311 case BRW_OPCODE_MUL:
2312 if (inst->src[1].file != IMM)
2313 continue;
2314
2315 /* a * 1.0 = a */
2316 if (inst->src[1].is_one()) {
2317 inst->opcode = BRW_OPCODE_MOV;
2318 inst->src[1] = reg_undef;
2319 progress = true;
2320 break;
2321 }
2322
2323 /* a * 0.0 = 0.0 */
2324 if (inst->src[1].is_zero()) {
2325 inst->opcode = BRW_OPCODE_MOV;
2326 inst->src[0] = inst->src[1];
2327 inst->src[1] = reg_undef;
2328 progress = true;
2329 break;
2330 }
2331
2332 break;
2333 case BRW_OPCODE_ADD:
2334 if (inst->src[1].file != IMM)
2335 continue;
2336
2337 /* a + 0.0 = a */
2338 if (inst->src[1].is_zero()) {
2339 inst->opcode = BRW_OPCODE_MOV;
2340 inst->src[1] = reg_undef;
2341 progress = true;
2342 break;
2343 }
2344 break;
2345 case BRW_OPCODE_OR:
2346 if (inst->src[0].equals(inst->src[1])) {
2347 inst->opcode = BRW_OPCODE_MOV;
2348 inst->src[1] = reg_undef;
2349 progress = true;
2350 break;
2351 }
2352 break;
2353 case BRW_OPCODE_LRP:
2354 if (inst->src[1].equals(inst->src[2])) {
2355 inst->opcode = BRW_OPCODE_MOV;
2356 inst->src[0] = inst->src[1];
2357 inst->src[1] = reg_undef;
2358 inst->src[2] = reg_undef;
2359 progress = true;
2360 break;
2361 }
2362 break;
2363 case BRW_OPCODE_SEL:
2364 if (inst->src[0].equals(inst->src[1])) {
2365 inst->opcode = BRW_OPCODE_MOV;
2366 inst->src[1] = reg_undef;
2367 inst->predicate = BRW_PREDICATE_NONE;
2368 inst->predicate_inverse = false;
2369 progress = true;
2370 } else if (inst->saturate && inst->src[1].file == IMM) {
2371 switch (inst->conditional_mod) {
2372 case BRW_CONDITIONAL_LE:
2373 case BRW_CONDITIONAL_L:
2374 switch (inst->src[1].type) {
2375 case BRW_REGISTER_TYPE_F:
2376 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2377 inst->opcode = BRW_OPCODE_MOV;
2378 inst->src[1] = reg_undef;
2379 progress = true;
2380 }
2381 break;
2382 default:
2383 break;
2384 }
2385 break;
2386 case BRW_CONDITIONAL_GE:
2387 case BRW_CONDITIONAL_G:
2388 switch (inst->src[1].type) {
2389 case BRW_REGISTER_TYPE_F:
2390 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2391 inst->opcode = BRW_OPCODE_MOV;
2392 inst->src[1] = reg_undef;
2393 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2394 progress = true;
2395 }
2396 break;
2397 default:
2398 break;
2399 }
2400 default:
2401 break;
2402 }
2403 }
2404 break;
2405 case SHADER_OPCODE_RCP: {
2406 fs_inst *prev = (fs_inst *)inst->prev;
2407 if (prev->opcode == SHADER_OPCODE_SQRT) {
2408 if (inst->src[0].equals(prev->dst)) {
2409 inst->opcode = SHADER_OPCODE_RSQ;
2410 inst->src[0] = prev->src[0];
2411 progress = true;
2412 }
2413 }
2414 break;
2415 }
2416 default:
2417 break;
2418 }
2419 }
2420
2421 return progress;
2422 }
2423
2424 bool
2425 fs_visitor::opt_register_renaming()
2426 {
2427 bool progress = false;
2428 int depth = 0;
2429
2430 int remap[virtual_grf_count];
2431 memset(remap, -1, sizeof(int) * virtual_grf_count);
2432
2433 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2434 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2435 depth++;
2436 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2437 inst->opcode == BRW_OPCODE_WHILE) {
2438 depth--;
2439 }
2440
2441 /* Rewrite instruction sources. */
2442 for (int i = 0; i < inst->sources; i++) {
2443 if (inst->src[i].file == GRF &&
2444 remap[inst->src[i].reg] != -1 &&
2445 remap[inst->src[i].reg] != inst->src[i].reg) {
2446 inst->src[i].reg = remap[inst->src[i].reg];
2447 progress = true;
2448 }
2449 }
2450
2451 const int dst = inst->dst.reg;
2452
2453 if (depth == 0 &&
2454 inst->dst.file == GRF &&
2455 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2456 !inst->is_partial_write()) {
2457 if (remap[dst] == -1) {
2458 remap[dst] = dst;
2459 } else {
2460 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2461 inst->dst.reg = remap[dst];
2462 progress = true;
2463 }
2464 } else if (inst->dst.file == GRF &&
2465 remap[dst] != -1 &&
2466 remap[dst] != dst) {
2467 inst->dst.reg = remap[dst];
2468 progress = true;
2469 }
2470 }
2471
2472 if (progress) {
2473 invalidate_live_intervals();
2474
2475 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2476 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2477 delta_x[i].reg = remap[delta_x[i].reg];
2478 }
2479 }
2480 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2481 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2482 delta_y[i].reg = remap[delta_y[i].reg];
2483 }
2484 }
2485 }
2486
2487 return progress;
2488 }
2489
2490 bool
2491 fs_visitor::compute_to_mrf()
2492 {
2493 bool progress = false;
2494 int next_ip = 0;
2495
2496 /* No MRFs on Gen >= 7. */
2497 if (brw->gen >= 7)
2498 return false;
2499
2500 calculate_live_intervals();
2501
2502 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2503 int ip = next_ip;
2504 next_ip++;
2505
2506 if (inst->opcode != BRW_OPCODE_MOV ||
2507 inst->is_partial_write() ||
2508 inst->dst.file != MRF || inst->src[0].file != GRF ||
2509 inst->dst.type != inst->src[0].type ||
2510 inst->src[0].abs || inst->src[0].negate ||
2511 !inst->src[0].is_contiguous() ||
2512 inst->src[0].subreg_offset)
2513 continue;
2514
2515 /* Work out which hardware MRF registers are written by this
2516 * instruction.
2517 */
2518 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2519 int mrf_high;
2520 if (inst->dst.reg & BRW_MRF_COMPR4) {
2521 mrf_high = mrf_low + 4;
2522 } else if (inst->exec_size == 16) {
2523 mrf_high = mrf_low + 1;
2524 } else {
2525 mrf_high = mrf_low;
2526 }
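      /* Worked example (illustrative register numbers): a SIMD16 write to m2
       * spans m2..m3, so mrf_low == 2 and mrf_high == 3; a COMPR4 write to m2
       * is treated as touching m2 and m6 (mrf_high == mrf_low + 4).
       */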
2527
2528 /* Can't compute-to-MRF this GRF if someone else was going to
2529 * read it later.
2530 */
2531 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2532 continue;
2533
2534 /* Found a move of a GRF to a MRF. Let's see if we can go
2535 * rewrite the thing that made this GRF to write into the MRF.
2536 */
2537 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2538 if (scan_inst->dst.file == GRF &&
2539 scan_inst->dst.reg == inst->src[0].reg) {
2540 /* Found the last thing to write our reg we want to turn
2541 * into a compute-to-MRF.
2542 */
2543
2544 /* If this one instruction didn't populate all the
2545 * channels, bail. We might be able to rewrite everything
2546 * that writes that reg, but it would require smarter
2547 * tracking to delay the rewriting until complete success.
2548 */
2549 if (scan_inst->is_partial_write())
2550 break;
2551
2552 /* Things returning more than one register would need us to
2553 * understand coalescing out more than one MOV at a time.
2554 */
2555 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2556 break;
2557
2558 /* SEND instructions can't have MRF as a destination. */
2559 if (scan_inst->mlen)
2560 break;
2561
2562 if (brw->gen == 6) {
2563 /* gen6 math instructions must have the destination be
2564 * GRF, so no compute-to-MRF for them.
2565 */
2566 if (scan_inst->is_math()) {
2567 break;
2568 }
2569 }
2570
2571 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2572 /* Found the creator of our MRF's source value. */
2573 scan_inst->dst.file = MRF;
2574 scan_inst->dst.reg = inst->dst.reg;
2575 scan_inst->saturate |= inst->saturate;
2576 inst->remove(block);
2577 progress = true;
2578 }
2579 break;
2580 }
2581
2582 /* We don't handle control flow here. Most computation of
2583 * values that end up in MRFs happens shortly before the MRF
2584 * write anyway.
2585 */
2586 if (block->start() == scan_inst)
2587 break;
2588
2589 /* You can't read from an MRF, so if someone else reads our
2590 * MRF's source GRF that we wanted to rewrite, that stops us.
2591 */
2592 bool interfered = false;
2593 for (int i = 0; i < scan_inst->sources; i++) {
2594 if (scan_inst->src[i].file == GRF &&
2595 scan_inst->src[i].reg == inst->src[0].reg &&
2596 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2597 interfered = true;
2598 }
2599 }
2600 if (interfered)
2601 break;
2602
2603 if (scan_inst->dst.file == MRF) {
2604 /* If somebody else writes our MRF here, we can't
2605 * compute-to-MRF before that.
2606 */
2607 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2608 int scan_mrf_high;
2609
2610 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2611 scan_mrf_high = scan_mrf_low + 4;
2612 } else if (scan_inst->exec_size == 16) {
2613 scan_mrf_high = scan_mrf_low + 1;
2614 } else {
2615 scan_mrf_high = scan_mrf_low;
2616 }
2617
2618 if (mrf_low == scan_mrf_low ||
2619 mrf_low == scan_mrf_high ||
2620 mrf_high == scan_mrf_low ||
2621 mrf_high == scan_mrf_high) {
2622 break;
2623 }
2624 }
2625
2626 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2627 /* Found a SEND instruction, which means that there are
2628 * live values in MRFs from base_mrf to base_mrf +
2629 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2630 * above it.
2631 */
2632 if (mrf_low >= scan_inst->base_mrf &&
2633 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2634 break;
2635 }
2636 if (mrf_high >= scan_inst->base_mrf &&
2637 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2638 break;
2639 }
2640 }
2641 }
2642 }
2643
2644 if (progress)
2645 invalidate_live_intervals();
2646
2647 return progress;
2648 }
2649
2650 /**
2651 * Emit a minimal "replicated data" clear shader: a MOV of the clear color
2652 * from uniform 0 followed by one FS_OPCODE_REP_FB_WRITE per color region.
2653 */
2654 void
2655 fs_visitor::emit_repclear_shader()
2656 {
2657 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2658 int base_mrf = 1;
2659 int color_mrf = base_mrf + 2;
2660
2661 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2662 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2663 mov->force_writemask_all = true;
2664
2665 fs_inst *write;
2666 if (key->nr_color_regions == 1) {
2667 write = emit(FS_OPCODE_REP_FB_WRITE);
2668 write->saturate = key->clamp_fragment_color;
2669 write->base_mrf = color_mrf;
2670 write->target = 0;
2671 write->header_present = false;
2672 write->mlen = 1;
2673 } else {
2674 assume(key->nr_color_regions > 0);
2675 for (int i = 0; i < key->nr_color_regions; ++i) {
2676 write = emit(FS_OPCODE_REP_FB_WRITE);
2677 write->saturate = key->clamp_fragment_color;
2678 write->base_mrf = base_mrf;
2679 write->target = i;
2680 write->header_present = true;
2681 write->mlen = 3;
2682 }
2683 }
2684 write->eot = true;
2685
2686 calculate_cfg();
2687
2688 assign_constant_locations();
2689 assign_curb_setup();
2690
2691 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2692 assert(mov->src[0].file == HW_REG);
2693 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2694 }
2695
2696 /**
2697 * Walks through basic blocks, looking for repeated MRF writes and
2698 * removing the later ones.
2699 */
2700 bool
2701 fs_visitor::remove_duplicate_mrf_writes()
2702 {
2703 fs_inst *last_mrf_move[16];
2704 bool progress = false;
2705
2706 /* We would need to update the MRF tracking for compressed instructions. */
2707 if (dispatch_width == 16)
2708 return false;
2709
2710 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2711
2712 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2713 if (inst->is_control_flow()) {
2714 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2715 }
2716
2717 if (inst->opcode == BRW_OPCODE_MOV &&
2718 inst->dst.file == MRF) {
2719 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2720 if (prev_inst && inst->equals(prev_inst)) {
2721 inst->remove(block);
2722 progress = true;
2723 continue;
2724 }
2725 }
2726
2727 /* Clear out the last-write records for MRFs that were overwritten. */
2728 if (inst->dst.file == MRF) {
2729 last_mrf_move[inst->dst.reg] = NULL;
2730 }
2731
2732 if (inst->mlen > 0 && inst->base_mrf != -1) {
2733 /* Found a SEND instruction, which will include two or fewer
2734 * implied MRF writes. We could do better here.
2735 */
2736 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2737 last_mrf_move[inst->base_mrf + i] = NULL;
2738 }
2739 }
2740
2741 /* Clear out any MRF move records whose sources got overwritten. */
2742 if (inst->dst.file == GRF) {
2743 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2744 if (last_mrf_move[i] &&
2745 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2746 last_mrf_move[i] = NULL;
2747 }
2748 }
2749 }
2750
2751 if (inst->opcode == BRW_OPCODE_MOV &&
2752 inst->dst.file == MRF &&
2753 inst->src[0].file == GRF &&
2754 !inst->is_partial_write()) {
2755 last_mrf_move[inst->dst.reg] = inst;
2756 }
2757 }
2758
2759 if (progress)
2760 invalidate_live_intervals();
2761
2762 return progress;
2763 }
2764
2765 static void
2766 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2767 int first_grf, int grf_len)
2768 {
2769 /* Clear the flag for registers that actually got read (as expected). */
2770 for (int i = 0; i < inst->sources; i++) {
2771 int grf;
2772 if (inst->src[i].file == GRF) {
2773 grf = inst->src[i].reg;
2774 } else if (inst->src[i].file == HW_REG &&
2775 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2776 grf = inst->src[i].fixed_hw_reg.nr;
2777 } else {
2778 continue;
2779 }
2780
2781 if (grf >= first_grf &&
2782 grf < first_grf + grf_len) {
2783 deps[grf - first_grf] = false;
2784 if (inst->exec_size == 16)
2785 deps[grf - first_grf + 1] = false;
2786 }
2787 }
2788 }
2789
2790 /**
2791 * Implements this workaround for the original 965:
2792 *
2793 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2794 * check for post destination dependencies on this instruction, software
2795 * must ensure that there is no destination hazard for the case of ‘write
2796 * followed by a posted write’ shown in the following example.
2797 *
2798 * 1. mov r3 0
2799 * 2. send r3.xy <rest of send instruction>
2800 * 3. mov r2 r3
2801 *
2802 * Due to no post-destination dependency check on the ‘send’, the above
2803 * code sequence could have two instructions (1 and 2) in flight at the
2804 * same time that both consider ‘r3’ as the target of their final writes.
2805 */
2806 void
2807 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2808 fs_inst *inst)
2809 {
2810 int write_len = inst->regs_written;
2811 int first_write_grf = inst->dst.reg;
2812 bool needs_dep[BRW_MAX_MRF];
2813 assert(write_len < (int)sizeof(needs_dep) - 1);
2814
2815 memset(needs_dep, false, sizeof(needs_dep));
2816 memset(needs_dep, true, write_len);
2817
2818 clear_deps_for_inst_src(inst, dispatch_width,
2819 needs_dep, first_write_grf, write_len);
2820
2821 /* Walk backwards looking for writes to registers we're writing which
2822 * aren't read since being written. If we hit the start of the program,
2823 * we assume that there are no outstanding dependencies on entry to the
2824 * program.
2825 */
2826 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2827 /* If we hit control flow, assume that there *are* outstanding
2828 * dependencies, and force their cleanup before our instruction.
2829 */
2830 if (block->start() == scan_inst) {
2831 for (int i = 0; i < write_len; i++) {
2832 if (needs_dep[i]) {
2833 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2834 }
2835 }
2836 return;
2837 }
2838
2839 /* We insert our reads as late as possible on the assumption that any
2840 * instruction but a MOV that might have left us an outstanding
2841 * dependency has more latency than a MOV.
2842 */
2843 if (scan_inst->dst.file == GRF) {
2844 for (int i = 0; i < scan_inst->regs_written; i++) {
2845 int reg = scan_inst->dst.reg + i;
2846
2847 if (reg >= first_write_grf &&
2848 reg < first_write_grf + write_len &&
2849 needs_dep[reg - first_write_grf]) {
2850 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2851 needs_dep[reg - first_write_grf] = false;
2852 if (scan_inst->exec_size == 16)
2853 needs_dep[reg - first_write_grf + 1] = false;
2854 }
2855 }
2856 }
2857
2858 /* Clear the flag for registers that actually got read (as expected). */
2859 clear_deps_for_inst_src(scan_inst, dispatch_width,
2860 needs_dep, first_write_grf, write_len);
2861
2862 /* Continue the loop only if we haven't resolved all the dependencies */
2863 int i;
2864 for (i = 0; i < write_len; i++) {
2865 if (needs_dep[i])
2866 break;
2867 }
2868 if (i == write_len)
2869 return;
2870 }
2871 }
2872
2873 /**
2874 * Implements this workaround for the original 965:
2875 *
2876 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2877 * used as a destination register until after it has been sourced by an
2878 * instruction with a different destination register.
2879 */
2880 void
2881 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2882 {
2883 int write_len = inst->regs_written;
2884 int first_write_grf = inst->dst.reg;
2885 bool needs_dep[BRW_MAX_MRF];
2886 assert(write_len < (int)sizeof(needs_dep) - 1);
2887
2888 memset(needs_dep, false, sizeof(needs_dep));
2889 memset(needs_dep, true, write_len);
2890 /* Walk forwards looking for writes to registers we're writing which aren't
2891 * read before being written.
2892 */
2893 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2894 /* If we hit control flow, force resolve all remaining dependencies. */
2895 if (block->end() == scan_inst) {
2896 for (int i = 0; i < write_len; i++) {
2897 if (needs_dep[i])
2898 scan_inst->insert_before(block,
2899 DEP_RESOLVE_MOV(first_write_grf + i));
2900 }
2901 return;
2902 }
2903
2904 /* Clear the flag for registers that actually got read (as expected). */
2905 clear_deps_for_inst_src(scan_inst, dispatch_width,
2906 needs_dep, first_write_grf, write_len);
2907
2908 /* We insert our reads as late as possible since they're reading the
2909 * result of a SEND, which has massive latency.
2910 */
2911 if (scan_inst->dst.file == GRF &&
2912 scan_inst->dst.reg >= first_write_grf &&
2913 scan_inst->dst.reg < first_write_grf + write_len &&
2914 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2915 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2916 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2917 }
2918
2919 /* Continue the loop only if we haven't resolved all the dependencies */
2920 int i;
2921 for (i = 0; i < write_len; i++) {
2922 if (needs_dep[i])
2923 break;
2924 }
2925 if (i == write_len)
2926 return;
2927 }
2928
2929 /* If we hit the end of the program, resolve all remaining dependencies out
2930 * of paranoia.
2931 */
2932 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2933 assert(last_inst->eot);
2934 for (int i = 0; i < write_len; i++) {
2935 if (needs_dep[i])
2936 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2937 }
2938 }
2939
2940 void
2941 fs_visitor::insert_gen4_send_dependency_workarounds()
2942 {
2943 if (brw->gen != 4 || brw->is_g4x)
2944 return;
2945
2946 bool progress = false;
2947
2948 /* Note that we're done with register allocation, so GRF fs_regs always
2949 * have a .reg_offset of 0.
2950 */
2951
2952 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2953 if (inst->mlen != 0 && inst->dst.file == GRF) {
2954 insert_gen4_pre_send_dependency_workarounds(block, inst);
2955 insert_gen4_post_send_dependency_workarounds(block, inst);
2956 progress = true;
2957 }
2958 }
2959
2960 if (progress)
2961 invalidate_live_intervals();
2962 }
2963
2964 /**
2965 * Turns the generic expression-style uniform pull constant load instruction
2966 * into a hardware-specific series of instructions for loading a pull
2967 * constant.
2968 *
2969 * The expression style allows the CSE pass before this to optimize out
2970 * repeated loads from the same offset, and gives the pre-register-allocation
2971 * scheduling full flexibility, while the conversion to native instructions
2972 * allows the post-register-allocation scheduler the best information
2973 * possible.
2974 *
2975 * Note that execution masking for setting up pull constant loads is special:
2976 * the channels that need to be written are unrelated to the current execution
2977 * mask, since a later instruction will use one of the result channels as a
2978 * source operand for all 8 or 16 of its channels.
2979 */
2980 void
2981 fs_visitor::lower_uniform_pull_constant_loads()
2982 {
2983 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2984 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2985 continue;
2986
2987 if (brw->gen >= 7) {
2988 /* The offset arg before was a vec4-aligned byte offset. We need to
2989 * turn it into a dword offset.
2990 */
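         /* Purely illustrative: a vec4-aligned byte offset of 48 becomes
          * dword offset 12 after the divide below.
          */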
2991 fs_reg const_offset_reg = inst->src[1];
2992 assert(const_offset_reg.file == IMM &&
2993 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2994 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
2995 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2996
2997 /* This is actually going to be a MOV, but since only the first dword
2998 * is accessed, we have a special opcode to do just that one. Note
2999 * that this needs to be an operation that will be considered a def
3000 * by live variable analysis, or register allocation will explode.
3001 */
3002 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3003 8, payload, const_offset_reg);
3004 setup->force_writemask_all = true;
3005
3006 setup->ir = inst->ir;
3007 setup->annotation = inst->annotation;
3008 inst->insert_before(block, setup);
3009
3010 /* Similarly, this will only populate the first 4 channels of the
3011 * result register (since we only use smear values from 0-3), but we
3012 * don't tell the optimizer.
3013 */
3014 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3015 inst->src[1] = payload;
3016
3017 invalidate_live_intervals();
3018 } else {
3019 /* Before register allocation, we didn't tell the scheduler about the
3020 * MRF we use. We know it's safe to use this MRF because nothing
3021 * else does except for register spill/unspill, which generates and
3022 * uses its MRF within a single IR instruction.
3023 */
3024 inst->base_mrf = 14;
3025 inst->mlen = 1;
3026 }
3027 }
3028 }
3029
3030 bool
3031 fs_visitor::lower_load_payload()
3032 {
3033 bool progress = false;
3034
3035 int vgrf_to_reg[virtual_grf_count];
3036 int reg_count = 16; /* Leave room for MRF */
3037 for (int i = 0; i < virtual_grf_count; ++i) {
3038 vgrf_to_reg[i] = reg_count;
3039 reg_count += virtual_grf_sizes[i];
3040 }
3041
3042 struct {
3043 bool written:1; /* Whether this register has ever been written */
3044 bool force_writemask_all:1;
3045 bool force_sechalf:1;
3046 } metadata[reg_count];
3047 memset(metadata, 0, sizeof(metadata));
3048
3049 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3050 int dst_reg;
3051 if (inst->dst.file == GRF) {
3052 dst_reg = vgrf_to_reg[inst->dst.reg];
3053 } else {
3054 /* MRF */
3055 dst_reg = inst->dst.reg;
3056 }
3057
3058 if (inst->dst.file == MRF || inst->dst.file == GRF) {
3059 bool force_sechalf = inst->force_sechalf;
3060 bool toggle_sechalf = inst->dst.width == 16 &&
3061 type_sz(inst->dst.type) == 4;
3062 for (int i = 0; i < inst->regs_written; ++i) {
3063 metadata[dst_reg + i].written = true;
3064 metadata[dst_reg + i].force_sechalf = force_sechalf;
3065 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3066 force_sechalf = (toggle_sechalf != force_sechalf);
3067 }
3068 }
3069
3070 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3071 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3072 fs_reg dst = inst->dst;
3073
3074 for (int i = 0; i < inst->sources; i++) {
3075 dst.width = inst->src[i].effective_width;
3076 dst.type = inst->src[i].type;
3077
3078 if (inst->src[i].file == BAD_FILE) {
3079 /* Do nothing but otherwise increment as normal */
3080 } else if (dst.file == MRF &&
3081 dst.width == 8 &&
3082 brw->has_compr4 &&
3083 i + 4 < inst->sources &&
3084 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3085 fs_reg compr4_dst = dst;
3086 compr4_dst.reg += BRW_MRF_COMPR4;
3087 compr4_dst.width = 16;
3088 fs_reg compr4_src = inst->src[i];
3089 compr4_src.width = 16;
3090 fs_inst *mov = MOV(compr4_dst, compr4_src);
3091 mov->force_writemask_all = true;
3092 inst->insert_before(block, mov);
3093 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3094 inst->src[i + 4].file = BAD_FILE;
3095 } else {
3096 fs_inst *mov = MOV(dst, inst->src[i]);
3097 if (inst->src[i].file == GRF) {
3098 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3099 inst->src[i].reg_offset;
3100 mov->force_sechalf = metadata[src_reg].force_sechalf;
3101 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3102 metadata[dst_reg] = metadata[src_reg];
3103 if (dst.width * type_sz(dst.type) > 32) {
3104 assert((!metadata[src_reg].written ||
3105 !metadata[src_reg].force_sechalf) &&
3106 (!metadata[src_reg + 1].written ||
3107 metadata[src_reg + 1].force_sechalf));
3108 metadata[dst_reg + 1] = metadata[src_reg + 1];
3109 }
3110 } else {
3111 metadata[dst_reg].force_writemask_all = false;
3112 metadata[dst_reg].force_sechalf = false;
3113 if (dst.width == 16) {
3114 metadata[dst_reg + 1].force_writemask_all = false;
3115 metadata[dst_reg + 1].force_sechalf = true;
3116 }
3117 }
3118 inst->insert_before(block, mov);
3119 }
3120
3121 dst = offset(dst, 1);
3122 }
3123
3124 inst->remove(block);
3125 progress = true;
3126 }
3127 }
3128
3129 if (progress)
3130 invalidate_live_intervals();
3131
3132 return progress;
3133 }
3134
3135 void
3136 fs_visitor::dump_instructions()
3137 {
3138 dump_instructions(NULL);
3139 }
3140
3141 void
3142 fs_visitor::dump_instructions(const char *name)
3143 {
3144 calculate_register_pressure();
3145 FILE *file = stderr;
3146 if (name && geteuid() != 0) {
3147 file = fopen(name, "w");
3148 if (!file)
3149 file = stderr;
3150 }
3151
3152 int ip = 0, max_pressure = 0;
3153 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3154 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3155 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3156 dump_instruction(inst, file);
3157 ++ip;
3158 }
3159 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3160
3161 if (file != stderr) {
3162 fclose(file);
3163 }
3164 }
3165
3166 void
3167 fs_visitor::dump_instruction(backend_instruction *be_inst)
3168 {
3169 dump_instruction(be_inst, stderr);
3170 }
3171
3172 void
3173 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3174 {
3175 fs_inst *inst = (fs_inst *)be_inst;
3176
3177 if (inst->predicate) {
3178 fprintf(file, "(%cf0.%d) ",
3179 inst->predicate_inverse ? '-' : '+',
3180 inst->flag_subreg);
3181 }
3182
3183 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3184 if (inst->saturate)
3185 fprintf(file, ".sat");
3186 if (inst->conditional_mod) {
3187 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3188 if (!inst->predicate &&
3189 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3190 inst->opcode != BRW_OPCODE_IF &&
3191 inst->opcode != BRW_OPCODE_WHILE))) {
3192 fprintf(file, ".f0.%d", inst->flag_subreg);
3193 }
3194 }
3195 fprintf(file, "(%d) ", inst->exec_size);
3196
3197
3198 switch (inst->dst.file) {
3199 case GRF:
3200 fprintf(file, "vgrf%d", inst->dst.reg);
3201 if (inst->dst.width != dispatch_width)
3202 fprintf(file, "@%d", inst->dst.width);
3203 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3204 inst->dst.subreg_offset)
3205 fprintf(file, "+%d.%d",
3206 inst->dst.reg_offset, inst->dst.subreg_offset);
3207 break;
3208 case MRF:
3209 fprintf(file, "m%d", inst->dst.reg);
3210 break;
3211 case BAD_FILE:
3212 fprintf(file, "(null)");
3213 break;
3214 case UNIFORM:
3215 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3216 break;
3217 case ATTR:
3218 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3219 break;
3220 case HW_REG:
3221 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3222 switch (inst->dst.fixed_hw_reg.nr) {
3223 case BRW_ARF_NULL:
3224 fprintf(file, "null");
3225 break;
3226 case BRW_ARF_ADDRESS:
3227 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3228 break;
3229 case BRW_ARF_ACCUMULATOR:
3230 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3231 break;
3232 case BRW_ARF_FLAG:
3233 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3234 inst->dst.fixed_hw_reg.subnr);
3235 break;
3236 default:
3237 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3238 inst->dst.fixed_hw_reg.subnr);
3239 break;
3240 }
3241 } else {
3242 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3243 }
3244 if (inst->dst.fixed_hw_reg.subnr)
3245 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3246 break;
3247 default:
3248 fprintf(file, "???");
3249 break;
3250 }
3251 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3252
3253 for (int i = 0; i < inst->sources; i++) {
3254 if (inst->src[i].negate)
3255 fprintf(file, "-");
3256 if (inst->src[i].abs)
3257 fprintf(file, "|");
3258 switch (inst->src[i].file) {
3259 case GRF:
3260 fprintf(file, "vgrf%d", inst->src[i].reg);
3261 if (inst->src[i].width != dispatch_width)
3262 fprintf(file, "@%d", inst->src[i].width);
3263 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3264 inst->src[i].subreg_offset)
3265 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3266 inst->src[i].subreg_offset);
3267 break;
3268 case MRF:
3269 fprintf(file, "***m%d***", inst->src[i].reg);
3270 break;
3271 case ATTR:
3272 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3273 break;
3274 case UNIFORM:
3275 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3276 if (inst->src[i].reladdr) {
3277 fprintf(file, "+reladdr");
3278 } else if (inst->src[i].subreg_offset) {
3279 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3280 inst->src[i].subreg_offset);
3281 }
3282 break;
3283 case BAD_FILE:
3284 fprintf(file, "(null)");
3285 break;
3286 case IMM:
3287 switch (inst->src[i].type) {
3288 case BRW_REGISTER_TYPE_F:
3289 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3290 break;
3291 case BRW_REGISTER_TYPE_D:
3292 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3293 break;
3294 case BRW_REGISTER_TYPE_UD:
3295 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3296 break;
3297 case BRW_REGISTER_TYPE_VF:
3298 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3299 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3300 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3301 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3302 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3303 break;
3304 default:
3305 fprintf(file, "???");
3306 break;
3307 }
3308 break;
3309 case HW_REG:
3310 if (inst->src[i].fixed_hw_reg.negate)
3311 fprintf(file, "-");
3312 if (inst->src[i].fixed_hw_reg.abs)
3313 fprintf(file, "|");
3314 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3315 switch (inst->src[i].fixed_hw_reg.nr) {
3316 case BRW_ARF_NULL:
3317 fprintf(file, "null");
3318 break;
3319 case BRW_ARF_ADDRESS:
3320 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3321 break;
3322 case BRW_ARF_ACCUMULATOR:
3323 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3324 break;
3325 case BRW_ARF_FLAG:
3326 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3327 inst->src[i].fixed_hw_reg.subnr);
3328 break;
3329 default:
3330 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3331 inst->src[i].fixed_hw_reg.subnr);
3332 break;
3333 }
3334 } else {
3335 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3336 }
3337 if (inst->src[i].fixed_hw_reg.subnr)
3338 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3339 if (inst->src[i].fixed_hw_reg.abs)
3340 fprintf(file, "|");
3341 break;
3342 default:
3343 fprintf(file, "???");
3344 break;
3345 }
3346 if (inst->src[i].abs)
3347 fprintf(file, "|");
3348
3349 if (inst->src[i].file != IMM) {
3350 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3351 }
3352
3353 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3354 fprintf(file, ", ");
3355 }
3356
3357 fprintf(file, " ");
3358
3359 if (dispatch_width == 16 && inst->exec_size == 8) {
3360 if (inst->force_sechalf)
3361 fprintf(file, "2ndhalf ");
3362 else
3363 fprintf(file, "1sthalf ");
3364 }
3365
3366 fprintf(file, "\n");
3367 }
3368
3369 /**
3370 * Possibly returns an instruction that set up @param reg.
3371 *
3372 * Sometimes we want to take the result of some expression/variable
3373 * dereference tree and rewrite the instruction generating the result
3374 * of the tree. When processing the tree, we know that the
3375 * instructions generated are all writing temporaries that are dead
3376 * outside of this tree. So, if we have some instructions that write
3377 * a temporary, we're free to point that temp write somewhere else.
3378 *
3379 * Note that this doesn't guarantee that the instruction generated
3380 * only reg -- it might be the size=4 destination of a texture instruction.
3381 */
3382 fs_inst *
3383 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3384 fs_inst *end,
3385 const fs_reg &reg)
3386 {
3387 if (end == start ||
3388 end->is_partial_write() ||
3389 reg.reladdr ||
3390 !reg.equals(end->dst)) {
3391 return NULL;
3392 } else {
3393 return end;
3394 }
3395 }
3396
3397 void
3398 fs_visitor::setup_payload_gen6()
3399 {
3400 bool uses_depth =
3401 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3402 unsigned barycentric_interp_modes =
3403 (stage == MESA_SHADER_FRAGMENT) ?
3404 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3405
3406 assert(brw->gen >= 6);
3407
3408 /* R0-1: masks, pixel X/Y coordinates. */
3409 payload.num_regs = 2;
3410 /* R2: only for 32-pixel dispatch. */
3411
3412 /* R3-26: barycentric interpolation coordinates. These appear in the
3413 * same order that they appear in the brw_wm_barycentric_interp_mode
3414 * enum. Each set of coordinates occupies 2 registers if dispatch width
3415 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3416 * appear if they were enabled using the "Barycentric Interpolation
3417 * Mode" bits in WM_STATE.
3418 */
3419 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3420 if (barycentric_interp_modes & (1 << i)) {
3421 payload.barycentric_coord_reg[i] = payload.num_regs;
3422 payload.num_regs += 2;
3423 if (dispatch_width == 16) {
3424 payload.num_regs += 2;
3425 }
3426 }
3427 }
3428
3429 /* R27: interpolated depth if uses source depth */
3430 if (uses_depth) {
3431 payload.source_depth_reg = payload.num_regs;
3432 payload.num_regs++;
3433 if (dispatch_width == 16) {
3434 /* R28: interpolated depth if not SIMD8. */
3435 payload.num_regs++;
3436 }
3437 }
3438 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3439 if (uses_depth) {
3440 payload.source_w_reg = payload.num_regs;
3441 payload.num_regs++;
3442 if (dispatch_width == 16) {
3443 /* R30: interpolated W if not SIMD8. */
3444 payload.num_regs++;
3445 }
3446 }
3447
3448 if (stage == MESA_SHADER_FRAGMENT) {
3449 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3450 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3451 prog_data->uses_pos_offset = key->compute_pos_offset;
3452 /* R31: MSAA position offsets. */
3453 if (prog_data->uses_pos_offset) {
3454 payload.sample_pos_reg = payload.num_regs;
3455 payload.num_regs++;
3456 }
3457 }
3458
3459 /* R32: MSAA input coverage mask */
3460 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3461 assert(brw->gen >= 7);
3462 payload.sample_mask_in_reg = payload.num_regs;
3463 payload.num_regs++;
3464 if (dispatch_width == 16) {
3465 /* R33: input coverage mask if not SIMD8. */
3466 payload.num_regs++;
3467 }
3468 }
3469
3470 /* R34-: bary for 32-pixel. */
3471 /* R58-59: interp W for 32-pixel. */
3472
3473 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3474 source_depth_to_render_target = true;
3475 }
3476 }
3477
3478 void
3479 fs_visitor::setup_vs_payload()
3480 {
3481 /* R0: thread header, R1: urb handles */
3482 payload.num_regs = 2;
3483 }
3484
3485 void
3486 fs_visitor::assign_binding_table_offsets()
3487 {
3488 assert(stage == MESA_SHADER_FRAGMENT);
3489 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3490 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3491 uint32_t next_binding_table_offset = 0;
3492
3493 /* If there are no color regions, we still perform an FB write to a null
3494 * renderbuffer, which we place at surface index 0.
3495 */
3496 prog_data->binding_table.render_target_start = next_binding_table_offset;
3497 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3498
3499 assign_common_binding_table_offsets(next_binding_table_offset);
3500 }
3501
3502 void
3503 fs_visitor::calculate_register_pressure()
3504 {
3505 invalidate_live_intervals();
3506 calculate_live_intervals();
3507
3508 unsigned num_instructions = 0;
3509 foreach_block(block, cfg)
3510 num_instructions += block->instructions.length();
3511
3512 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3513
3514 for (int reg = 0; reg < virtual_grf_count; reg++) {
3515 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3516 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3517 }
3518 }
3519
3520 void
3521 fs_visitor::optimize()
3522 {
3523 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3524
3525 calculate_cfg();
3526
3527 split_virtual_grfs();
3528
3529 move_uniform_array_access_to_pull_constants();
3530 assign_constant_locations();
3531 demote_pull_constants();
3532
3533 #define OPT(pass, args...) do { \
3534 pass_num++; \
3535 bool this_progress = pass(args); \
3536 \
3537 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3538 char filename[64]; \
3539 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3540 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3541 \
3542 backend_visitor::dump_instructions(filename); \
3543 } \
3544 \
3545 progress = progress || this_progress; \
3546 } while (false)
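 
 /* When DEBUG_OPTIMIZER is set in INTEL_DEBUG, each pass that makes progress
  * dumps the IR to a file named from the format above, e.g.
  * "fs16-0004-01-03-opt_copy_propagate" (stage and width, GL program name,
  * iteration, pass number; the concrete name here is only illustrative).
  */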
3547
3548 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3549 char filename[64];
3550 snprintf(filename, 64, "%s%d-%04d-00-start",
3551 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3552
3553 backend_visitor::dump_instructions(filename);
3554 }
3555
3556 bool progress;
3557 int iteration = 0;
3558 do {
3559 progress = false;
3560 iteration++;
3561 int pass_num = 0;
3562
3563 OPT(remove_duplicate_mrf_writes);
3564
3565 OPT(opt_algebraic);
3566 OPT(opt_cse);
3567 OPT(opt_copy_propagate);
3568 OPT(opt_peephole_predicated_break);
3569 OPT(dead_code_eliminate);
3570 OPT(opt_peephole_sel);
3571 OPT(dead_control_flow_eliminate, this);
3572 OPT(opt_register_renaming);
3573 OPT(opt_saturate_propagation);
3574 OPT(register_coalesce);
3575 OPT(compute_to_mrf);
3576
3577 OPT(compact_virtual_grfs);
3578 } while (progress);
3579
3580 if (lower_load_payload()) {
3581 split_virtual_grfs();
3582 register_coalesce();
3583 compute_to_mrf();
3584 dead_code_eliminate();
3585 }
3586
3587 lower_uniform_pull_constant_loads();
3588 }
3589
3590 void
3591 fs_visitor::allocate_registers()
3592 {
3593 bool allocated_without_spills;
3594
3595 static const enum instruction_scheduler_mode pre_modes[] = {
3596 SCHEDULE_PRE,
3597 SCHEDULE_PRE_NON_LIFO,
3598 SCHEDULE_PRE_LIFO,
3599 };
3600
3601 /* Try each scheduling heuristic to see if it can successfully register
3602 * allocate without spilling. They should be ordered by decreasing
3603 * performance but increasing likelihood of allocating.
3604 */
3605 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3606 schedule_instructions(pre_modes[i]);
3607
3608 if (0) {
3609 assign_regs_trivial();
3610 allocated_without_spills = true;
3611 } else {
3612 allocated_without_spills = assign_regs(false);
3613 }
3614 if (allocated_without_spills)
3615 break;
3616 }
3617
3618 if (!allocated_without_spills) {
3619 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3620 "Vertex" : "Fragment";
3621
3622 /* We assume that any spilling is worse than just dropping back to
3623 * SIMD8. There's probably actually some intermediate point where
3624 * SIMD16 with a couple of spills is still better.
3625 */
3626 if (dispatch_width == 16) {
3627 fail("Failure to register allocate. Reduce number of "
3628 "live scalar values to avoid this.");
3629 } else {
3630 perf_debug("%s shader triggered register spilling. "
3631 "Try reducing the number of live scalar values to "
3632 "improve performance.\n", stage_name);
3633 }
3634
3635 /* Since we're out of heuristics, just go spill registers until we
3636 * get an allocation.
3637 */
3638 while (!assign_regs(true)) {
3639 if (failed)
3640 break;
3641 }
3642 }
3643
3644 /* This must come after all optimization and register allocation, since
3645 * it inserts dead code that happens to have side effects, and it does
3646 * so based on the actual physical registers in use.
3647 */
3648 insert_gen4_send_dependency_workarounds();
3649
3650 if (failed)
3651 return;
3652
3653 if (!allocated_without_spills)
3654 schedule_instructions(SCHEDULE_POST);
3655
3656 if (last_scratch > 0)
3657 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3658 }
3659
3660 bool
3661 fs_visitor::run_vs()
3662 {
3663 assert(stage == MESA_SHADER_VERTEX);
3664
3665 assign_common_binding_table_offsets(0);
3666 setup_vs_payload();
3667
3668 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3669 emit_shader_time_begin();
3670
3671 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3672 base_ir = ir;
3673 this->result = reg_undef;
3674 ir->accept(this);
3675 }
3676 base_ir = NULL;
3677 if (failed)
3678 return false;
3679
3680 emit_urb_writes();
3681
3682 optimize();
3683
3684 assign_curb_setup();
3685 assign_vs_urb_setup();
3686
3687 allocate_registers();
3688
3689 return !failed;
3690 }
3691
3692 bool
3693 fs_visitor::run_fs()
3694 {
3695 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3696 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3697
3698 assert(stage == MESA_SHADER_FRAGMENT);
3699
3700 sanity_param_count = prog->Parameters->NumParameters;
3701
3702 assign_binding_table_offsets();
3703
3704 if (brw->gen >= 6)
3705 setup_payload_gen6();
3706 else
3707 setup_payload_gen4();
3708
3709 if (0) {
3710 emit_dummy_fs();
3711 } else if (brw->use_rep_send && dispatch_width == 16) {
3712 emit_repclear_shader();
3713 } else {
3714 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3715 emit_shader_time_begin();
3716
3717 calculate_urb_setup();
3718 if (prog->InputsRead > 0) {
3719 if (brw->gen < 6)
3720 emit_interpolation_setup_gen4();
3721 else
3722 emit_interpolation_setup_gen6();
3723 }
3724
3725 /* We handle discards by keeping track of the still-live pixels in f0.1.
3726 * Initialize it with the dispatched pixels.
3727 */
3728 if (wm_prog_data->uses_kill) {
3729 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3730 discard_init->flag_subreg = 1;
3731 }
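/* flag_subreg = 1 selects f0.1, so the dispatched-pixel mask lands there;
* discard instructions emitted later clear bits in that flag as pixels are
* killed.
*/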
3732
3733 /* Generate FS IR for main().  (The visitor only descends into
3734 * functions called "main".)
3735 */
3736 if (shader) {
3737 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3738 base_ir = ir;
3739 this->result = reg_undef;
3740 ir->accept(this);
3741 }
3742 } else {
3743 emit_fragment_program_code();
3744 }
3745 base_ir = NULL;
3746 if (failed)
3747 return false;
3748
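/* PLACEHOLDER_HALT marks where discarded pixels rejoin just before the
* framebuffer writes; the generator later patches the discard jumps to
* target this point, or drops it when no discards were emitted.
*/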
3749 emit(FS_OPCODE_PLACEHOLDER_HALT);
3750
3751 if (wm_key->alpha_test_func)
3752 emit_alpha_test();
3753
3754 emit_fb_writes();
3755
3756 optimize();
3757
3758 assign_curb_setup();
3759 assign_urb_setup();
3760
3761 allocate_registers();
3762
3763 if (failed)
3764 return false;
3765 }
3766
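/* Record how many register blocks this program uses so the fixed-function
* state can be programmed later; separate counts are kept because the SIMD8
* and SIMD16 programs may need different amounts of register space.
*/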
3767 if (dispatch_width == 8)
3768 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3769 else
3770 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3771
3772 /* If any state parameters were appended, then ParameterValues could have
3773 * been realloced, in which case the driver uniform storage set up by
3774 * _mesa_associate_uniform_storage() would point to freed memory. Make
3775 * sure that didn't happen.
3776 */
3777 assert(sanity_param_count == prog->Parameters->NumParameters);
3778
3779 return !failed;
3780 }
3781
3782 const unsigned *
3783 brw_wm_fs_emit(struct brw_context *brw,
3784 void *mem_ctx,
3785 const struct brw_wm_prog_key *key,
3786 struct brw_wm_prog_data *prog_data,
3787 struct gl_fragment_program *fp,
3788 struct gl_shader_program *prog,
3789 unsigned *final_assembly_size)
3790 {
3791 bool start_busy = false;
3792 double start_time = 0;
3793
3794 if (unlikely(brw->perf_debug)) {
3795 start_busy = (brw->batch.last_bo &&
3796 drm_intel_bo_busy(brw->batch.last_bo));
3797 start_time = get_time();
3798 }
3799
3800 struct brw_shader *shader = NULL;
3801 if (prog)
3802 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3803
3804 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3805 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3806
3807 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3808 */
3809 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3810 if (!v.run_fs()) {
3811 if (prog) {
3812 prog->LinkStatus = false;
3813 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3814 }
3815
3816 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3817 v.fail_msg);
3818
3819 return NULL;
3820 }
3821
3822 cfg_t *simd16_cfg = NULL;
3823 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3824 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3825 brw->use_rep_send)) {
3826 if (!v.simd16_unsupported) {
3827 /* Try a SIMD16 compile */
3828 v2.import_uniforms(&v);
3829 if (!v2.run_fs()) {
3830 perf_debug("SIMD16 shader failed to compile, falling back to "
3831 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3832 } else {
3833 simd16_cfg = v2.cfg;
3834 }
3835 } else {
3836 perf_debug("SIMD16 shader unsupported, falling back to "
3837 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3838 }
3839 }
3840
3841 cfg_t *simd8_cfg;
3842 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3843 if (no_simd8 && simd16_cfg) {
3844 simd8_cfg = NULL;
3845 prog_data->no_8 = true;
3846 } else {
3847 simd8_cfg = v.cfg;
3848 prog_data->no_8 = false;
3849 }
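/* At this point up to two programs exist.  INTEL_DEBUG=no8 (or
* brw->no_simd8, set when SIMD8 dispatch should be avoided on the current
* hardware) drops the SIMD8 program whenever a SIMD16 one is available, and
* prog_data->no_8 tells the state setup not to enable 8-wide dispatch.
*/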
3850
3851 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3852 &fp->Base, v.runtime_check_aads_emit);
3853
3854 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3855 char *name;
3856 if (prog)
3857 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3858 prog->Label ? prog->Label : "unnamed",
3859 prog->Name);
3860 else
3861 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3862
3863 g.enable_debug(name);
3864 }
3865
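/* Both programs are emitted into a single block of assembly: the SIMD16
* code is appended after the SIMD8 code, and prog_offset_16 records where
* it starts so the kernel start pointers can be programmed later.
*/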
3866 if (simd8_cfg)
3867 g.generate_code(simd8_cfg, 8);
3868 if (simd16_cfg)
3869 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3870
3871 if (unlikely(brw->perf_debug) && shader) {
3872 if (shader->compiled_once)
3873 brw_wm_debug_recompile(brw, prog, key);
3874 shader->compiled_once = true;
3875
3876 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3877 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3878 (get_time() - start_time) * 1000);
3879 }
3880 }
3881
3882 return g.get_assembly(final_assembly_size);
3883 }
3884
3885 extern "C" bool
3886 brw_fs_precompile(struct gl_context *ctx,
3887 struct gl_shader_program *shader_prog,
3888 struct gl_program *prog)
3889 {
3890 struct brw_context *brw = brw_context(ctx);
3891 struct brw_wm_prog_key key;
3892
3893 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3894 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3895 bool program_uses_dfdy = fp->UsesDFdy;
3896
3897 memset(&key, 0, sizeof(key));
3898
3899 if (brw->gen < 6) {
3900 if (fp->UsesKill)
3901 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3902
3903 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3904 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3905
3906 /* Just assume depth testing. */
3907 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3908 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3909 }
3910
3911 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3912 BRW_FS_VARYING_INPUT_MASK) > 16)
3913 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3914
3915 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3916 for (unsigned i = 0; i < sampler_count; i++) {
3917 if (fp->Base.ShadowSamplers & (1 << i)) {
3918 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3919 key.tex.swizzles[i] =
3920 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3921 } else {
3922 /* Color sampler: assume no swizzling. */
3923 key.tex.swizzles[i] = SWIZZLE_XYZW;
3924 }
3925 }
3926
3927 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3928 key.drawable_height = ctx->DrawBuffer->Height;
3929 }
3930
3931 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3932 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3933 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3934
3935 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3936 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3937 key.nr_color_regions > 1;
3938 }
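/* render_to_fbo affects the Y orientation that gl_FragCoord and dFdy()
* compensation depend on.  Like the rest of this key, it is only a guess
* made at link time; if the guess turns out wrong, the real draw-time key
* simply triggers a recompile.
*/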
3939
3940 key.program_string_id = bfp->id;
3941
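/* do_wm_prog() both compiles the program and points brw->wm at the result,
* so save and restore the current program state around the call; the
* precompile must not disturb whatever is bound for rendering.
*/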
3942 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3943 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3944
3945 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3946
3947 brw->wm.base.prog_offset = old_prog_offset;
3948 brw->wm.prog_data = old_prog_data;
3949
3950 return success;
3951 }