i965: Fix INTEL_DEBUG=shader_time for SIMD8 VS (and GS).
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
126 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
127 break;
128 case BAD_FILE:
129 this->regs_written = 0;
130 break;
131 case IMM:
132 case UNIFORM:
133 unreachable("Invalid destination register file");
134 default:
135 unreachable("Invalid register file");
136 }
137
138 this->writes_accumulator = false;
139 }
140
141 fs_inst::fs_inst()
142 {
143 fs_reg *src = ralloc_array(this, fs_reg, 3);
144 init(BRW_OPCODE_NOP, 8, dst, src, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 fs_reg *src = ralloc_array(this, fs_reg, 3);
150 init(opcode, exec_size, reg_undef, src, 0);
151 }
152
153 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
154 {
155 fs_reg *src = ralloc_array(this, fs_reg, 3);
156 init(opcode, 0, dst, src, 0);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0)
161 {
162 fs_reg *src = ralloc_array(this, fs_reg, 3);
163 src[0] = src0;
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 fs_reg *src = ralloc_array(this, fs_reg, 3);
170 src[0] = src0;
171 init(opcode, 0, dst, src, 1);
172 }
173
174 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
175 const fs_reg &src0, const fs_reg &src1)
176 {
177 fs_reg *src = ralloc_array(this, fs_reg, 3);
178 src[0] = src0;
179 src[1] = src1;
180 init(opcode, exec_size, dst, src, 2);
181 }
182
183 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
184 const fs_reg &src1)
185 {
186 fs_reg *src = ralloc_array(this, fs_reg, 3);
187 src[0] = src0;
188 src[1] = src1;
189 init(opcode, 0, dst, src, 2);
190 }
191
192 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
193 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
194 {
195 fs_reg *src = ralloc_array(this, fs_reg, 3);
196 src[0] = src0;
197 src[1] = src1;
198 src[2] = src2;
199 init(opcode, exec_size, dst, src, 3);
200 }
201
202 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
203 const fs_reg &src1, const fs_reg &src2)
204 {
205 fs_reg *src = ralloc_array(this, fs_reg, 3);
206 src[0] = src0;
207 src[1] = src1;
208 src[2] = src2;
209 init(opcode, 0, dst, src, 3);
210 }
211
212 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
213 {
214 init(opcode, 0, dst, src, sources);
215 }
216
217 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
218 fs_reg src[], int sources)
219 {
220 init(opcode, exec_width, dst, src, sources);
221 }
222
223 fs_inst::fs_inst(const fs_inst &that)
224 {
225 memcpy(this, &that, sizeof(that));
226
227 this->src = ralloc_array(this, fs_reg, that.sources);
228
229 for (int i = 0; i < that.sources; i++)
230 this->src[i] = that.src[i];
231 }
232
233 void
234 fs_inst::resize_sources(uint8_t num_sources)
235 {
236 if (this->sources != num_sources) {
237 this->src = reralloc(this, this->src, fs_reg, num_sources);
238 this->sources = num_sources;
239 }
240 }
241
242 #define ALU1(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
245 { \
246 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
247 }
248
249 #define ALU2(op) \
250 fs_inst * \
251 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
252 const fs_reg &src1) \
253 { \
254 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
255 }
256
257 #define ALU2_ACC(op) \
258 fs_inst * \
259 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
260 const fs_reg &src1) \
261 { \
262 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
263 inst->writes_accumulator = true; \
264 return inst; \
265 }
266
267 #define ALU3(op) \
268 fs_inst * \
269 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
270 const fs_reg &src1, const fs_reg &src2) \
271 { \
272 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
273 }
274
275 ALU1(NOT)
276 ALU1(MOV)
277 ALU1(FRC)
278 ALU1(RNDD)
279 ALU1(RNDE)
280 ALU1(RNDZ)
281 ALU2(ADD)
282 ALU2(MUL)
283 ALU2_ACC(MACH)
284 ALU2(AND)
285 ALU2(OR)
286 ALU2(XOR)
287 ALU2(SHL)
288 ALU2(SHR)
289 ALU2(ASR)
290 ALU3(LRP)
291 ALU1(BFREV)
292 ALU3(BFE)
293 ALU2(BFI1)
294 ALU3(BFI2)
295 ALU1(FBH)
296 ALU1(FBL)
297 ALU1(CBIT)
298 ALU3(MAD)
299 ALU2_ACC(ADDC)
300 ALU2_ACC(SUBB)
301 ALU2(SEL)
302 ALU2(MAC)
303
304 /** Gen4 predicated IF. */
305 fs_inst *
306 fs_visitor::IF(enum brw_predicate predicate)
307 {
308 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
309 inst->predicate = predicate;
310 return inst;
311 }
312
313 /** Gen6 IF with embedded comparison. */
314 fs_inst *
315 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
316 enum brw_conditional_mod condition)
317 {
318 assert(brw->gen == 6);
319 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
320 reg_null_d, src0, src1);
321 inst->conditional_mod = condition;
322 return inst;
323 }
324
325 /**
326 * CMP: Sets the low bit of the destination channels with the result
327 * of the comparison, while the upper bits are undefined, and updates
328 * the flag register with the packed 16 bits of the result.
329 */
330 fs_inst *
331 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
332 enum brw_conditional_mod condition)
333 {
334 fs_inst *inst;
335
336 /* Take the instruction:
337 *
338 * CMP null<d> src0<f> src1<f>
339 *
340 * Original gen4 does type conversion to the destination type before
341 * comparison, producing garbage results for floating point comparisons.
342 *
343 * The destination type doesn't matter on newer generations, so we set the
344 * type to match src0 so we can compact the instruction.
345 */
346 dst.type = src0.type;
347 if (dst.file == HW_REG)
348 dst.fixed_hw_reg.type = dst.type;
349
350 resolve_ud_negate(&src0);
351 resolve_ud_negate(&src1);
352
353 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
354 inst->conditional_mod = condition;
355
356 return inst;
357 }
358
359 fs_inst *
360 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
361 {
362 uint8_t exec_size = dst.width;
363 for (int i = 0; i < sources; ++i) {
364 assert(src[i].width % dst.width == 0);
365 if (src[i].width > exec_size)
366 exec_size = src[i].width;
367 }
368
369 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
370 dst, src, sources);
371 inst->regs_written = 0;
372 for (int i = 0; i < sources; ++i) {
373 /* The LOAD_PAYLOAD instruction only really makes sense if we are
374 * dealing with whole registers. If this ever changes, we can deal
375 * with it later.
376 */
377 int size = src[i].effective_width * type_sz(src[i].type);
378 assert(size % 32 == 0);
379 inst->regs_written += (size + 31) / 32;
380 }
381
382 return inst;
383 }
384
385 exec_list
386 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
387 const fs_reg &surf_index,
388 const fs_reg &varying_offset,
389 uint32_t const_offset)
390 {
391 exec_list instructions;
392 fs_inst *inst;
393
394 /* We have our constant surface use a pitch of 4 bytes, so our index can
395 * be any component of a vector, and then we load 4 contiguous
396 * components starting from that.
397 *
398 * We break down the const_offset to a portion added to the variable
399 * offset and a portion done using reg_offset, which means that if you
400 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
401 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
402 * CSE can later notice that those loads are all the same and eliminate
403 * the redundant ones.
404 */
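/* Illustrative example (editor's note, not in the original source): with
 * const_offset = 6, (6 & ~3) = 4 is added to the varying offset here, and
 * the remaining (6 & 3) = 2 selects the component (times 'scale') when the
 * result is read back below.
 */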
405 fs_reg vec4_offset = vgrf(glsl_type::int_type);
406 instructions.push_tail(ADD(vec4_offset,
407 varying_offset, fs_reg(const_offset & ~3)));
408
409 int scale = 1;
410 if (brw->gen == 4 && dst.width == 8) {
411 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
412 * u, v, r) as parameters, or we can just use the SIMD16 message
413 * consisting of (header, u). We choose the second, at the cost of a
414 * longer return length.
415 */
416 scale = 2;
417 }
418
419 enum opcode op;
420 if (brw->gen >= 7)
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
422 else
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
424
425 assert(dst.width % 8 == 0);
426 int regs_written = 4 * (dst.width / 8) * scale;
427 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
428 dst.type, dst.width);
429 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
430 inst->regs_written = regs_written;
431 instructions.push_tail(inst);
432
433 if (brw->gen < 7) {
434 inst->base_mrf = 13;
435 inst->header_present = true;
436 if (brw->gen == 4)
437 inst->mlen = 3;
438 else
439 inst->mlen = 1 + dispatch_width / 8;
440 }
441
442 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
443 instructions.push_tail(MOV(dst, result));
444
445 return instructions;
446 }
447
448 /**
449 * A helper for MOV generation for fixing up broken hardware SEND dependency
450 * handling.
451 */
452 fs_inst *
453 fs_visitor::DEP_RESOLVE_MOV(int grf)
454 {
455 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
456
457 inst->ir = NULL;
458 inst->annotation = "send dependency resolve";
459
460 /* The caller always wants this uncompressed, to emit the minimal extra
461 * dependencies and to avoid having to deal with aligning its regs to 2.
462 */
463 inst->exec_size = 8;
464
465 return inst;
466 }
467
468 bool
469 fs_inst::equals(fs_inst *inst) const
470 {
471 return (opcode == inst->opcode &&
472 dst.equals(inst->dst) &&
473 src[0].equals(inst->src[0]) &&
474 src[1].equals(inst->src[1]) &&
475 src[2].equals(inst->src[2]) &&
476 saturate == inst->saturate &&
477 predicate == inst->predicate &&
478 conditional_mod == inst->conditional_mod &&
479 mlen == inst->mlen &&
480 base_mrf == inst->base_mrf &&
481 target == inst->target &&
482 eot == inst->eot &&
483 header_present == inst->header_present &&
484 shadow_compare == inst->shadow_compare &&
485 exec_size == inst->exec_size &&
486 offset == inst->offset);
487 }
488
489 bool
490 fs_inst::overwrites_reg(const fs_reg &reg) const
491 {
492 return (reg.file == dst.file &&
493 reg.reg == dst.reg &&
494 reg.reg_offset >= dst.reg_offset &&
495 reg.reg_offset < dst.reg_offset + regs_written);
496 }
497
498 bool
499 fs_inst::is_send_from_grf() const
500 {
501 switch (opcode) {
502 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
503 case SHADER_OPCODE_SHADER_TIME_ADD:
504 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
505 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
506 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
507 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
508 case SHADER_OPCODE_UNTYPED_ATOMIC:
509 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
510 case SHADER_OPCODE_URB_WRITE_SIMD8:
511 return true;
512 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
513 return src[1].file == GRF;
514 case FS_OPCODE_FB_WRITE:
515 return src[0].file == GRF;
516 default:
517 if (is_tex())
518 return src[0].file == GRF;
519
520 return false;
521 }
522 }
523
524 bool
525 fs_inst::can_do_source_mods(struct brw_context *brw)
526 {
527 if (brw->gen == 6 && is_math())
528 return false;
529
530 if (is_send_from_grf())
531 return false;
532
533 if (!backend_instruction::can_do_source_mods())
534 return false;
535
536 return true;
537 }
538
539 void
540 fs_reg::init()
541 {
542 memset(this, 0, sizeof(*this));
543 stride = 1;
544 }
545
546 /** Generic unset register constructor. */
547 fs_reg::fs_reg()
548 {
549 init();
550 this->file = BAD_FILE;
551 }
552
553 /** Immediate value constructor. */
554 fs_reg::fs_reg(float f)
555 {
556 init();
557 this->file = IMM;
558 this->type = BRW_REGISTER_TYPE_F;
559 this->fixed_hw_reg.dw1.f = f;
560 this->width = 1;
561 }
562
563 /** Immediate value constructor. */
564 fs_reg::fs_reg(int32_t i)
565 {
566 init();
567 this->file = IMM;
568 this->type = BRW_REGISTER_TYPE_D;
569 this->fixed_hw_reg.dw1.d = i;
570 this->width = 1;
571 }
572
573 /** Immediate value constructor. */
574 fs_reg::fs_reg(uint32_t u)
575 {
576 init();
577 this->file = IMM;
578 this->type = BRW_REGISTER_TYPE_UD;
579 this->fixed_hw_reg.dw1.ud = u;
580 this->width = 1;
581 }
582
583 /** Vector float immediate value constructor. */
584 fs_reg::fs_reg(uint8_t vf[4])
585 {
586 init();
587 this->file = IMM;
588 this->type = BRW_REGISTER_TYPE_VF;
589 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
590 }
591
592 /** Vector float immediate value constructor. */
593 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
594 {
595 init();
596 this->file = IMM;
597 this->type = BRW_REGISTER_TYPE_VF;
598 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
599 (vf1 << 8) |
600 (vf2 << 16) |
601 (vf3 << 24);
602 }
603
604 /** Fixed brw_reg. */
605 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
606 {
607 init();
608 this->file = HW_REG;
609 this->fixed_hw_reg = fixed_hw_reg;
610 this->type = fixed_hw_reg.type;
611 this->width = 1 << fixed_hw_reg.width;
612 }
613
614 bool
615 fs_reg::equals(const fs_reg &r) const
616 {
617 return (file == r.file &&
618 reg == r.reg &&
619 reg_offset == r.reg_offset &&
620 subreg_offset == r.subreg_offset &&
621 type == r.type &&
622 negate == r.negate &&
623 abs == r.abs &&
624 !reladdr && !r.reladdr &&
625 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
626 width == r.width &&
627 stride == r.stride);
628 }
629
630 fs_reg &
631 fs_reg::set_smear(unsigned subreg)
632 {
633 assert(file != HW_REG && file != IMM);
634 subreg_offset = subreg * type_sz(type);
635 stride = 0;
636 return *this;
637 }
638
639 bool
640 fs_reg::is_contiguous() const
641 {
642 return stride == 1;
643 }
644
645 int
646 fs_visitor::type_size(const struct glsl_type *type)
647 {
648 unsigned int size, i;
649
650 switch (type->base_type) {
651 case GLSL_TYPE_UINT:
652 case GLSL_TYPE_INT:
653 case GLSL_TYPE_FLOAT:
654 case GLSL_TYPE_BOOL:
655 return type->components();
656 case GLSL_TYPE_ARRAY:
657 return type_size(type->fields.array) * type->length;
658 case GLSL_TYPE_STRUCT:
659 size = 0;
660 for (i = 0; i < type->length; i++) {
661 size += type_size(type->fields.structure[i].type);
662 }
663 return size;
664 case GLSL_TYPE_SAMPLER:
665 /* Samplers take up no register space, since they're baked in at
666 * link time.
667 */
668 return 0;
669 case GLSL_TYPE_ATOMIC_UINT:
670 return 0;
671 case GLSL_TYPE_IMAGE:
672 case GLSL_TYPE_VOID:
673 case GLSL_TYPE_ERROR:
674 case GLSL_TYPE_INTERFACE:
675 unreachable("not reached");
676 }
677
678 return 0;
679 }
680
681 fs_reg
682 fs_visitor::get_timestamp()
683 {
684 assert(brw->gen >= 7);
685
686 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
687 BRW_ARF_TIMESTAMP,
688 0),
689 BRW_REGISTER_TYPE_UD));
690
691 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
692
693 fs_inst *mov = emit(MOV(dst, ts));
694 /* We want to read the 3 fields we care about even for channels that aren't
695 * enabled in the dispatch.
696 */
697 mov->force_writemask_all = true;
698
699 /* The caller wants the low 32 bits of the timestamp. Since it's running
700 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
701 * which is plenty of time for our purposes. It is identical across the
702 * EUs, but since it's tracking GPU core speed it will increment at a
703 * varying rate as render P-states change.
704 *
705 * The caller could also check if render P-states have changed (or anything
706 * else that might disrupt timing) by setting smear to 2 and checking if
707 * that field is != 0.
708 */
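/* (Editor's note: roughly 2^32 cycles / ~1.2e9 Hz ≈ 3.6 seconds, hence the
 * ~3 second roll-over figure above.)
 */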
709 dst.set_smear(0);
710
711 return dst;
712 }
713
714 void
715 fs_visitor::emit_shader_time_begin()
716 {
717 current_annotation = "shader time start";
718 shader_start_time = get_timestamp();
719 }
720
721 void
722 fs_visitor::emit_shader_time_end()
723 {
724 current_annotation = "shader time end";
725
726 enum shader_time_shader_type type, written_type, reset_type;
727 switch (stage) {
728 case MESA_SHADER_VERTEX:
729 type = ST_VS;
730 written_type = ST_VS_WRITTEN;
731 reset_type = ST_VS_RESET;
732 break;
733 case MESA_SHADER_GEOMETRY:
734 type = ST_GS;
735 written_type = ST_GS_WRITTEN;
736 reset_type = ST_GS_RESET;
737 break;
738 case MESA_SHADER_FRAGMENT:
739 if (dispatch_width == 8) {
740 type = ST_FS8;
741 written_type = ST_FS8_WRITTEN;
742 reset_type = ST_FS8_RESET;
743 } else {
744 assert(dispatch_width == 16);
745 type = ST_FS16;
746 written_type = ST_FS16_WRITTEN;
747 reset_type = ST_FS16_RESET;
748 }
749 break;
750 default:
751 unreachable("fs_visitor::emit_shader_time_end missing code");
752 }
753
754 fs_reg shader_end_time = get_timestamp();
755
756 /* Check that there weren't any timestamp reset events (assuming these
757 * were the only two timestamp reads that happened).
758 */
759 fs_reg reset = shader_end_time;
760 reset.set_smear(2);
761 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
762 test->conditional_mod = BRW_CONDITIONAL_Z;
763 emit(IF(BRW_PREDICATE_NORMAL));
764
765 fs_reg start = shader_start_time;
766 start.negate = true;
767 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
768 emit(ADD(diff, start, shader_end_time));
769
770 /* If there were no instructions between the two timestamp gets, the diff
771 * is 2 cycles. Remove that overhead, so I can forget about that when
772 * trying to determine the time taken for single instructions.
773 */
774 emit(ADD(diff, diff, fs_reg(-2u)));
775
776 emit_shader_time_write(type, diff);
777 emit_shader_time_write(written_type, fs_reg(1u));
778 emit(BRW_OPCODE_ELSE);
779 emit_shader_time_write(reset_type, fs_reg(1u));
780 emit(BRW_OPCODE_ENDIF);
781 }
782
783 void
784 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
785 fs_reg value)
786 {
787 int shader_time_index =
788 brw_get_shader_time_index(brw, shader_prog, prog, type);
789 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
790
791 fs_reg payload;
792 if (dispatch_width == 8)
793 payload = vgrf(glsl_type::uvec2_type);
794 else
795 payload = vgrf(glsl_type::uint_type);
796
797 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
798 fs_reg(), payload, offset, value));
799 }
800
801 void
802 fs_visitor::vfail(const char *format, va_list va)
803 {
804 char *msg;
805
806 if (failed)
807 return;
808
809 failed = true;
810
811 msg = ralloc_vasprintf(mem_ctx, format, va);
812 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
813
814 this->fail_msg = msg;
815
816 if (INTEL_DEBUG & DEBUG_WM) {
817 fprintf(stderr, "%s", msg);
818 }
819 }
820
821 void
822 fs_visitor::fail(const char *format, ...)
823 {
824 va_list va;
825
826 va_start(va, format);
827 vfail(format, va);
828 va_end(va);
829 }
830
831 /**
832 * Mark this program as impossible to compile in SIMD16 mode.
833 *
834 * During the SIMD8 compile (which happens first), we can detect and flag
835 * things that are unsupported in SIMD16 mode, so the compiler can skip
836 * the SIMD16 compile altogether.
837 *
838 * During a SIMD16 compile (if one happens anyway), this just calls fail().
839 */
840 void
841 fs_visitor::no16(const char *format, ...)
842 {
843 va_list va;
844
845 va_start(va, format);
846
847 if (dispatch_width == 16) {
848 vfail(format, va);
849 } else {
850 simd16_unsupported = true;
851
852 if (brw->perf_debug) {
853 if (no16_msg)
854 ralloc_vasprintf_append(&no16_msg, format, va);
855 else
856 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
857 }
858 }
859
860 va_end(va);
861 }
862
863 fs_inst *
864 fs_visitor::emit(enum opcode opcode)
865 {
866 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
867 }
868
869 fs_inst *
870 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
871 {
872 return emit(new(mem_ctx) fs_inst(opcode, dst));
873 }
874
875 fs_inst *
876 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
877 {
878 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
879 }
880
881 fs_inst *
882 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
883 const fs_reg &src1)
884 {
885 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
886 }
887
888 fs_inst *
889 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
890 const fs_reg &src1, const fs_reg &src2)
891 {
892 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
893 }
894
895 fs_inst *
896 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
897 fs_reg src[], int sources)
898 {
899 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
900 }
901
902 /**
903 * Returns true if the instruction has a flag that means it won't
904 * update an entire destination register.
905 *
906 * For example, dead code elimination and live variable analysis want to know
907 * when a write to a variable screens off any preceding values that were in
908 * it.
909 */
910 bool
911 fs_inst::is_partial_write() const
912 {
913 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
914 (this->dst.width * type_sz(this->dst.type)) < 32 ||
915 !this->dst.is_contiguous());
916 }
917
918 int
919 fs_inst::regs_read(fs_visitor *v, int arg) const
920 {
921 if (is_tex() && arg == 0 && src[0].file == GRF) {
922 return mlen;
923 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
924 return mlen;
925 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
926 return mlen;
927 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
928 return mlen;
929 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
930 return mlen;
931 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
932 return mlen;
933 }
934
935 switch (src[arg].file) {
936 case BAD_FILE:
937 case UNIFORM:
938 case IMM:
939 return 1;
940 case GRF:
941 case HW_REG:
942 if (src[arg].stride == 0) {
943 return 1;
944 } else {
945 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
946 return (size + 31) / 32;
947 }
948 case MRF:
949 unreachable("MRF registers are not allowed as sources");
950 default:
951 unreachable("Invalid register file");
952 }
953 }
954
955 bool
956 fs_inst::reads_flag() const
957 {
958 return predicate;
959 }
960
961 bool
962 fs_inst::writes_flag() const
963 {
964 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
965 opcode != BRW_OPCODE_IF &&
966 opcode != BRW_OPCODE_WHILE)) ||
967 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
968 }
969
970 /**
971 * Returns how many MRFs an FS opcode will write over.
972 *
973 * Note that this is not the 0 or 1 implied writes in an actual gen
974 * instruction -- the FS opcodes often generate MOVs in addition.
975 */
976 int
977 fs_visitor::implied_mrf_writes(fs_inst *inst)
978 {
979 if (inst->mlen == 0)
980 return 0;
981
982 if (inst->base_mrf == -1)
983 return 0;
984
985 switch (inst->opcode) {
986 case SHADER_OPCODE_RCP:
987 case SHADER_OPCODE_RSQ:
988 case SHADER_OPCODE_SQRT:
989 case SHADER_OPCODE_EXP2:
990 case SHADER_OPCODE_LOG2:
991 case SHADER_OPCODE_SIN:
992 case SHADER_OPCODE_COS:
993 return 1 * dispatch_width / 8;
994 case SHADER_OPCODE_POW:
995 case SHADER_OPCODE_INT_QUOTIENT:
996 case SHADER_OPCODE_INT_REMAINDER:
997 return 2 * dispatch_width / 8;
998 case SHADER_OPCODE_TEX:
999 case FS_OPCODE_TXB:
1000 case SHADER_OPCODE_TXD:
1001 case SHADER_OPCODE_TXF:
1002 case SHADER_OPCODE_TXF_CMS:
1003 case SHADER_OPCODE_TXF_MCS:
1004 case SHADER_OPCODE_TG4:
1005 case SHADER_OPCODE_TG4_OFFSET:
1006 case SHADER_OPCODE_TXL:
1007 case SHADER_OPCODE_TXS:
1008 case SHADER_OPCODE_LOD:
1009 return 1;
1010 case FS_OPCODE_FB_WRITE:
1011 return 2;
1012 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
1013 case SHADER_OPCODE_GEN4_SCRATCH_READ:
1014 return 1;
1015 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1016 return inst->mlen;
1017 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1018 return 2;
1019 case SHADER_OPCODE_UNTYPED_ATOMIC:
1020 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1021 case SHADER_OPCODE_URB_WRITE_SIMD8:
1022 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1023 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1024 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1025 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1026 return 0;
1027 default:
1028 unreachable("not reached");
1029 }
1030 }
1031
1032 int
1033 fs_visitor::virtual_grf_alloc(int size)
1034 {
1035 if (virtual_grf_array_size <= virtual_grf_count) {
1036 if (virtual_grf_array_size == 0)
1037 virtual_grf_array_size = 16;
1038 else
1039 virtual_grf_array_size *= 2;
1040 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1041 virtual_grf_array_size);
1042 }
1043 virtual_grf_sizes[virtual_grf_count] = size;
1044 return virtual_grf_count++;
1045 }
1046
1047 fs_reg
1048 fs_visitor::vgrf(const glsl_type *const type)
1049 {
1050 int reg_width = dispatch_width / 8;
1051 return fs_reg(GRF, virtual_grf_alloc(type_size(type) * reg_width),
1052 brw_type_for_base_type(type), dispatch_width);
1053 }
1054
1055 fs_reg
1056 fs_visitor::vgrf(int num_components)
1057 {
1058 int reg_width = dispatch_width / 8;
1059 return fs_reg(GRF, virtual_grf_alloc(num_components * reg_width),
1060 BRW_REGISTER_TYPE_F, dispatch_width);
1061 }
1062
1063 /** Fixed HW reg constructor. */
1064 fs_reg::fs_reg(enum register_file file, int reg)
1065 {
1066 init();
1067 this->file = file;
1068 this->reg = reg;
1069 this->type = BRW_REGISTER_TYPE_F;
1070
1071 switch (file) {
1072 case UNIFORM:
1073 this->width = 1;
1074 break;
1075 default:
1076 this->width = 8;
1077 }
1078 }
1079
1080 /** Fixed HW reg constructor. */
1081 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1082 {
1083 init();
1084 this->file = file;
1085 this->reg = reg;
1086 this->type = type;
1087
1088 switch (file) {
1089 case UNIFORM:
1090 this->width = 1;
1091 break;
1092 default:
1093 this->width = 8;
1094 }
1095 }
1096
1097 /** Fixed HW reg constructor. */
1098 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1099 uint8_t width)
1100 {
1101 init();
1102 this->file = file;
1103 this->reg = reg;
1104 this->type = type;
1105 this->width = width;
1106 }
1107
1108 fs_reg *
1109 fs_visitor::variable_storage(ir_variable *var)
1110 {
1111 return (fs_reg *)hash_table_find(this->variable_ht, var);
1112 }
1113
1114 void
1115 import_uniforms_callback(const void *key,
1116 void *data,
1117 void *closure)
1118 {
1119 struct hash_table *dst_ht = (struct hash_table *)closure;
1120 const fs_reg *reg = (const fs_reg *)data;
1121
1122 if (reg->file != UNIFORM)
1123 return;
1124
1125 hash_table_insert(dst_ht, data, key);
1126 }
1127
1128 /* For SIMD16, we need to follow the uniform setup of the SIMD8 dispatch.
1129 * This brings in those uniform definitions.
1130 */
1131 void
1132 fs_visitor::import_uniforms(fs_visitor *v)
1133 {
1134 hash_table_call_foreach(v->variable_ht,
1135 import_uniforms_callback,
1136 variable_ht);
1137 this->push_constant_loc = v->push_constant_loc;
1138 this->pull_constant_loc = v->pull_constant_loc;
1139 this->uniforms = v->uniforms;
1140 this->param_size = v->param_size;
1141 }
1142
1143 /* Our support for uniforms is piggy-backed on the struct
1144 * gl_fragment_program, because that's where the values actually
1145 * get stored, rather than in some global gl_shader_program uniform
1146 * store.
1147 */
1148 void
1149 fs_visitor::setup_uniform_values(ir_variable *ir)
1150 {
1151 int namelen = strlen(ir->name);
1152
1153 /* The data for our (non-builtin) uniforms is stored in a series of
1154 * gl_uniform_driver_storage structs for each subcomponent that
1155 * glGetUniformLocation() could name. We know it's been set up in the same
1156 * order we'd walk the type, so walk the list of storage and find anything
1157 * with our name, or the prefix of a component that starts with our name.
1158 */
1159 unsigned params_before = uniforms;
1160 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1161 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1162
1163 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1164 (storage->name[namelen] != 0 &&
1165 storage->name[namelen] != '.' &&
1166 storage->name[namelen] != '[')) {
1167 continue;
1168 }
1169
1170 unsigned slots = storage->type->component_slots();
1171 if (storage->array_elements)
1172 slots *= storage->array_elements;
1173
1174 for (unsigned i = 0; i < slots; i++) {
1175 stage_prog_data->param[uniforms++] = &storage->storage[i];
1176 }
1177 }
1178
1179 /* Make sure we actually initialized the right amount of stuff here. */
1180 assert(params_before + ir->type->component_slots() == uniforms);
1181 (void)params_before;
1182 }
1183
1184
1185 /* Our support for builtin uniforms is even scarier than non-builtin.
1186 * It sits on top of the PROG_STATE_VAR parameters that are
1187 * automatically updated from GL context state.
1188 */
1189 void
1190 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1191 {
1192 const ir_state_slot *const slots = ir->get_state_slots();
1193 assert(slots != NULL);
1194
1195 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1196 /* This state reference has already been setup by ir_to_mesa, but we'll
1197 * get the same index back here.
1198 */
1199 int index = _mesa_add_state_reference(this->prog->Parameters,
1200 (gl_state_index *)slots[i].tokens);
1201
1202 /* Add each of the unique swizzles of the element as a parameter.
1203 * This'll end up matching the expected layout of the
1204 * array/matrix/structure we're trying to fill in.
1205 */
1206 int last_swiz = -1;
1207 for (unsigned int j = 0; j < 4; j++) {
1208 int swiz = GET_SWZ(slots[i].swizzle, j);
1209 if (swiz == last_swiz)
1210 break;
1211 last_swiz = swiz;
1212
1213 stage_prog_data->param[uniforms++] =
1214 &prog->Parameters->ParameterValues[index][swiz];
1215 }
1216 }
1217 }
1218
1219 fs_reg *
1220 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1221 bool origin_upper_left)
1222 {
1223 assert(stage == MESA_SHADER_FRAGMENT);
1224 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1225 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1226 fs_reg wpos = *reg;
1227 bool flip = !origin_upper_left ^ key->render_to_fbo;
1228
1229 /* gl_FragCoord.x */
1230 if (pixel_center_integer) {
1231 emit(MOV(wpos, this->pixel_x));
1232 } else {
1233 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1234 }
1235 wpos = offset(wpos, 1);
1236
1237 /* gl_FragCoord.y */
1238 if (!flip && pixel_center_integer) {
1239 emit(MOV(wpos, this->pixel_y));
1240 } else {
1241 fs_reg pixel_y = this->pixel_y;
1242 float offset = (pixel_center_integer ? 0.0 : 0.5);
1243
1244 if (flip) {
1245 pixel_y.negate = true;
1246 offset += key->drawable_height - 1.0;
1247 }
1248
1249 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1250 }
1251 wpos = offset(wpos, 1);
1252
1253 /* gl_FragCoord.z */
1254 if (brw->gen >= 6) {
1255 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1256 } else {
1257 emit(FS_OPCODE_LINTERP, wpos,
1258 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1259 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1260 interp_reg(VARYING_SLOT_POS, 2));
1261 }
1262 wpos = offset(wpos, 1);
1263
1264 /* gl_FragCoord.w: Already set up in emit_interpolation */
1265 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1266
1267 return reg;
1268 }
1269
1270 fs_inst *
1271 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1272 glsl_interp_qualifier interpolation_mode,
1273 bool is_centroid, bool is_sample)
1274 {
1275 brw_wm_barycentric_interp_mode barycoord_mode;
1276 if (brw->gen >= 6) {
1277 if (is_centroid) {
1278 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1279 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1280 else
1281 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1282 } else if (is_sample) {
1283 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1284 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1285 else
1286 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1287 } else {
1288 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1289 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1290 else
1291 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1292 }
1293 } else {
1294 /* On Ironlake and below, there is only one interpolation mode.
1295 * Centroid interpolation doesn't mean anything on this hardware --
1296 * there is no multisampling.
1297 */
1298 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1299 }
1300 return emit(FS_OPCODE_LINTERP, attr,
1301 this->delta_x[barycoord_mode],
1302 this->delta_y[barycoord_mode], interp);
1303 }
1304
1305 void
1306 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1307 const glsl_type *type,
1308 glsl_interp_qualifier interpolation_mode,
1309 int location, bool mod_centroid,
1310 bool mod_sample)
1311 {
1312 attr.type = brw_type_for_base_type(type->get_scalar_type());
1313
1314 assert(stage == MESA_SHADER_FRAGMENT);
1315 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1316 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1317
1318 unsigned int array_elements;
1319
1320 if (type->is_array()) {
1321 array_elements = type->length;
1322 if (array_elements == 0) {
1323 fail("dereferenced array '%s' has length 0\n", name);
1324 }
1325 type = type->fields.array;
1326 } else {
1327 array_elements = 1;
1328 }
1329
1330 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1331 bool is_gl_Color =
1332 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1333 if (key->flat_shade && is_gl_Color) {
1334 interpolation_mode = INTERP_QUALIFIER_FLAT;
1335 } else {
1336 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1337 }
1338 }
1339
1340 for (unsigned int i = 0; i < array_elements; i++) {
1341 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1342 if (prog_data->urb_setup[location] == -1) {
1343 /* If there's no incoming setup data for this slot, don't
1344 * emit interpolation for it.
1345 */
1346 attr = offset(attr, type->vector_elements);
1347 location++;
1348 continue;
1349 }
1350
1351 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1352 /* Constant interpolation (flat shading) case. The SF has
1353 * handed us defined values in only the constant offset
1354 * field of the setup reg.
1355 */
1356 for (unsigned int k = 0; k < type->vector_elements; k++) {
1357 struct brw_reg interp = interp_reg(location, k);
1358 interp = suboffset(interp, 3);
1359 interp.type = attr.type;
1360 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1361 attr = offset(attr, 1);
1362 }
1363 } else {
1364 /* Smooth/noperspective interpolation case. */
1365 for (unsigned int k = 0; k < type->vector_elements; k++) {
1366 struct brw_reg interp = interp_reg(location, k);
1367 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1368 /* Get the pixel/sample mask into f0 so that we know
1369 * which pixels are lit. Then, for each channel that is
1370 * unlit, replace the centroid data with non-centroid
1371 * data.
1372 */
1373 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1374
1375 fs_inst *inst;
1376 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1377 false, false);
1378 inst->predicate = BRW_PREDICATE_NORMAL;
1379 inst->predicate_inverse = true;
1380 if (brw->has_pln)
1381 inst->no_dd_clear = true;
1382
1383 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1384 mod_centroid && !key->persample_shading,
1385 mod_sample || key->persample_shading);
1386 inst->predicate = BRW_PREDICATE_NORMAL;
1387 inst->predicate_inverse = false;
1388 if (brw->has_pln)
1389 inst->no_dd_check = true;
1390
1391 } else {
1392 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1393 mod_centroid && !key->persample_shading,
1394 mod_sample || key->persample_shading);
1395 }
1396 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1397 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1398 }
1399 attr = offset(attr, 1);
1400 }
1401
1402 }
1403 location++;
1404 }
1405 }
1406 }
1407
1408 fs_reg *
1409 fs_visitor::emit_frontfacing_interpolation()
1410 {
1411 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1412
1413 if (brw->gen >= 6) {
1414 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1415 * a boolean result from this (~0/true or 0/false).
1416 *
1417 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1418 * this task in only one instruction:
1419 * - a negation source modifier will flip the bit; and
1420 * - a W -> D type conversion will sign extend the bit into the high
1421 * word of the destination.
1422 *
1423 * An ASR 15 fills the low word of the destination.
1424 */
1425 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1426 g0.negate = true;
1427
1428 emit(ASR(*reg, g0, fs_reg(15)));
1429 } else {
1430 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1431 * a boolean result from this (1/true or 0/false).
1432 *
1433 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1434 * the negation source modifier to flip it. Unfortunately the SHR
1435 * instruction only operates on UD (or D with an abs source modifier)
1436 * sources without negation.
1437 *
1438 * Instead, use ASR (which will give ~0/true or 0/false).
1439 */
1440 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1441 g1_6.negate = true;
1442
1443 emit(ASR(*reg, g1_6, fs_reg(31)));
1444 }
1445
1446 return reg;
1447 }
1448
1449 void
1450 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1451 {
1452 assert(stage == MESA_SHADER_FRAGMENT);
1453 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1454 assert(dst.type == BRW_REGISTER_TYPE_F);
1455
1456 if (key->compute_pos_offset) {
1457 /* Convert int_sample_pos to floating point */
1458 emit(MOV(dst, int_sample_pos));
1459 /* Scale to the range [0, 1] */
1460 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1461 }
1462 else {
1463 /* From ARB_sample_shading specification:
1464 * "When rendering to a non-multisample buffer, or if multisample
1465 * rasterization is disabled, gl_SamplePosition will always be
1466 * (0.5, 0.5)."
1467 */
1468 emit(MOV(dst, fs_reg(0.5f)));
1469 }
1470 }
1471
1472 fs_reg *
1473 fs_visitor::emit_samplepos_setup()
1474 {
1475 assert(brw->gen >= 6);
1476
1477 this->current_annotation = "compute sample position";
1478 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1479 fs_reg pos = *reg;
1480 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1481 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1482
1483 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1484 * mode will be enabled.
1485 *
1486 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1487 * R31.1:0 Position Offset X/Y for Slot[3:0]
1488 * R31.3:2 Position Offset X/Y for Slot[7:4]
1489 * .....
1490 *
1491 * The X, Y sample positions come in as bytes in thread payload. So, read
1492 * the positions using vstride=16, width=8, hstride=2.
1493 */
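/* Editor's note on the region above: with the byte retype and a <16;8,2>
 * region, the eight channels of a SIMD8 read pick up bytes 0, 2, 4, ..., 14
 * of the payload register -- the interleaved X offsets for slots 0-7 -- and
 * the suboffset(..., 1) reads below start at byte 1 to pick up the Y offsets.
 */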
1494 struct brw_reg sample_pos_reg =
1495 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1496 BRW_REGISTER_TYPE_B), 16, 8, 2);
1497
1498 if (dispatch_width == 8) {
1499 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1500 } else {
1501 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1502 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1503 ->force_sechalf = true;
1504 }
1505 /* Compute gl_SamplePosition.x */
1506 compute_sample_position(pos, int_sample_x);
1507 pos = offset(pos, 1);
1508 if (dispatch_width == 8) {
1509 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1510 } else {
1511 emit(MOV(half(int_sample_y, 0),
1512 fs_reg(suboffset(sample_pos_reg, 1))));
1513 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1514 ->force_sechalf = true;
1515 }
1516 /* Compute gl_SamplePosition.y */
1517 compute_sample_position(pos, int_sample_y);
1518 return reg;
1519 }
1520
1521 fs_reg *
1522 fs_visitor::emit_sampleid_setup()
1523 {
1524 assert(stage == MESA_SHADER_FRAGMENT);
1525 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1526 assert(brw->gen >= 6);
1527
1528 this->current_annotation = "compute sample id";
1529 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1530
1531 if (key->compute_sample_id) {
1532 fs_reg t1 = vgrf(glsl_type::int_type);
1533 fs_reg t2 = vgrf(glsl_type::int_type);
1534 t2.type = BRW_REGISTER_TYPE_UW;
1535
1536 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1537 * 8x multisampling, subspan 0 will represent sample N (where N
1538 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1539 * 7. We can find the value of N by looking at R0.0 bits 7:6
1540 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1541 * (since samples are always delivered in pairs). That is, we
1542 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1543 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1544 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1545 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1546 * populating a temporary variable with the sequence (0, 1, 2, 3),
1547 * and then reading from it using vstride=1, width=4, hstride=0.
1548 * These computations hold good for 4x multisampling as well.
1549 *
1550 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1551 * the first four slots are sample 0 of subspan 0; the next four
1552 * are sample 1 of subspan 0; the third group is sample 0 of
1553 * subspan 1, and finally sample 1 of subspan 1.
1554 */
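/* Illustrative example (editor's note): with 8x MSAA and SSPI = 2 (R0.0
 * bits 7:6 == 10b), (R0.0 & 0xc0) >> 5 = 0x80 >> 5 = 4 = 2*SSPI, and adding
 * the SIMD8 sequence (0, 0, 0, 0, 1, 1, 1, 1) yields per-channel sample IDs
 * (4, 4, 4, 4, 5, 5, 5, 5).
 */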
1555 fs_inst *inst;
1556 inst = emit(BRW_OPCODE_AND, t1,
1557 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1558 fs_reg(0xc0));
1559 inst->force_writemask_all = true;
1560 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1561 inst->force_writemask_all = true;
1562 /* This works for both SIMD8 and SIMD16 */
1563 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1564 inst->force_writemask_all = true;
1565 /* This special instruction takes care of setting vstride=1,
1566 * width=4, hstride=0 of t2 during an ADD instruction.
1567 */
1568 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1569 } else {
1570 /* As per GL_ARB_sample_shading specification:
1571 * "When rendering to a non-multisample buffer, or if multisample
1572 * rasterization is disabled, gl_SampleID will always be zero."
1573 */
1574 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1575 }
1576
1577 return reg;
1578 }
1579
1580 fs_reg
1581 fs_visitor::fix_math_operand(fs_reg src)
1582 {
1583 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1584 * might be able to do better by doing execsize = 1 math and then
1585 * expanding that result out, but we would need to be careful with
1586 * masking.
1587 *
1588 * The hardware ignores source modifiers (negate and abs) on math
1589 * instructions, so we also move to a temp to set those up.
1590 */
1591 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1592 !src.abs && !src.negate)
1593 return src;
1594
1595 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1596 * operands to math
1597 */
1598 if (brw->gen >= 7 && src.file != IMM)
1599 return src;
1600
1601 fs_reg expanded = vgrf(glsl_type::float_type);
1602 expanded.type = src.type;
1603 emit(BRW_OPCODE_MOV, expanded, src);
1604 return expanded;
1605 }
1606
1607 fs_inst *
1608 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1609 {
1610 switch (opcode) {
1611 case SHADER_OPCODE_RCP:
1612 case SHADER_OPCODE_RSQ:
1613 case SHADER_OPCODE_SQRT:
1614 case SHADER_OPCODE_EXP2:
1615 case SHADER_OPCODE_LOG2:
1616 case SHADER_OPCODE_SIN:
1617 case SHADER_OPCODE_COS:
1618 break;
1619 default:
1620 unreachable("not reached: bad math opcode");
1621 }
1622
1623 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1624 * might be able to do better by doing execsize = 1 math and then
1625 * expanding that result out, but we would need to be careful with
1626 * masking.
1627 *
1628 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1629 * instructions, so we also move to a temp to set those up.
1630 */
1631 if (brw->gen == 6 || brw->gen == 7)
1632 src = fix_math_operand(src);
1633
1634 fs_inst *inst = emit(opcode, dst, src);
1635
1636 if (brw->gen < 6) {
1637 inst->base_mrf = 2;
1638 inst->mlen = dispatch_width / 8;
1639 }
1640
1641 return inst;
1642 }
1643
1644 fs_inst *
1645 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1646 {
1647 int base_mrf = 2;
1648 fs_inst *inst;
1649
1650 if (brw->gen >= 8) {
1651 inst = emit(opcode, dst, src0, src1);
1652 } else if (brw->gen >= 6) {
1653 src0 = fix_math_operand(src0);
1654 src1 = fix_math_operand(src1);
1655
1656 inst = emit(opcode, dst, src0, src1);
1657 } else {
1658 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1659 * "Message Payload":
1660 *
1661 * "Operand0[7]. For the INT DIV functions, this operand is the
1662 * denominator."
1663 * ...
1664 * "Operand1[7]. For the INT DIV functions, this operand is the
1665 * numerator."
1666 */
1667 bool is_int_div = opcode != SHADER_OPCODE_POW;
1668 fs_reg &op0 = is_int_div ? src1 : src0;
1669 fs_reg &op1 = is_int_div ? src0 : src1;
1670
1671 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1672 inst = emit(opcode, dst, op0, reg_null_f);
1673
1674 inst->base_mrf = base_mrf;
1675 inst->mlen = 2 * dispatch_width / 8;
1676 }
1677 return inst;
1678 }
1679
1680 void
1681 fs_visitor::assign_curb_setup()
1682 {
1683 if (dispatch_width == 8) {
1684 prog_data->dispatch_grf_start_reg = payload.num_regs;
1685 } else {
1686 assert(stage == MESA_SHADER_FRAGMENT);
1687 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1688 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1689 }
1690
1691 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1692
1693 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1694 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1695 for (unsigned int i = 0; i < inst->sources; i++) {
1696 if (inst->src[i].file == UNIFORM) {
1697 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1698 int constant_nr;
1699 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1700 constant_nr = push_constant_loc[uniform_nr];
1701 } else {
1702 /* Section 5.11 of the OpenGL 4.1 spec says:
1703 * "Out-of-bounds reads return undefined values, which include
1704 * values from other variables of the active program or zero."
1705 * Just return the first push constant.
1706 */
1707 constant_nr = 0;
1708 }
1709
1710 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1711 constant_nr / 8,
1712 constant_nr % 8);
1713
1714 inst->src[i].file = HW_REG;
1715 inst->src[i].fixed_hw_reg = byte_offset(
1716 retype(brw_reg, inst->src[i].type),
1717 inst->src[i].subreg_offset);
1718 }
1719 }
1720 }
1721 }
1722
1723 void
1724 fs_visitor::calculate_urb_setup()
1725 {
1726 assert(stage == MESA_SHADER_FRAGMENT);
1727 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1728 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1729
1730 memset(prog_data->urb_setup, -1,
1731 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1732
1733 int urb_next = 0;
1734 /* Figure out where each of the incoming setup attributes lands. */
1735 if (brw->gen >= 6) {
1736 if (_mesa_bitcount_64(prog->InputsRead &
1737 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1738 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1739 * first 16 varying inputs, so we can put them wherever we want.
1740 * Just put them in order.
1741 *
1742 * This is useful because it means that (a) inputs not used by the
1743 * fragment shader won't take up valuable register space, and (b) we
1744 * won't have to recompile the fragment shader if it gets paired with
1745 * a different vertex (or geometry) shader.
1746 */
1747 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1748 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1749 BITFIELD64_BIT(i)) {
1750 prog_data->urb_setup[i] = urb_next++;
1751 }
1752 }
1753 } else {
1754 /* We have enough input varyings that the SF/SBE pipeline stage can't
1755 * arbitrarily rearrange them to suit our whim; we have to put them
1756 * in an order that matches the output of the previous pipeline stage
1757 * (geometry or vertex shader).
1758 */
1759 struct brw_vue_map prev_stage_vue_map;
1760 brw_compute_vue_map(brw, &prev_stage_vue_map,
1761 key->input_slots_valid);
1762 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1763 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1764 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1765 slot++) {
1766 int varying = prev_stage_vue_map.slot_to_varying[slot];
1767 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1768 * unused.
1769 */
1770 if (varying != BRW_VARYING_SLOT_COUNT &&
1771 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1772 BITFIELD64_BIT(varying))) {
1773 prog_data->urb_setup[varying] = slot - first_slot;
1774 }
1775 }
1776 urb_next = prev_stage_vue_map.num_slots - first_slot;
1777 }
1778 } else {
1779 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1780 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1781 /* Point size is packed into the header, not as a general attribute */
1782 if (i == VARYING_SLOT_PSIZ)
1783 continue;
1784
1785 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1786 /* The back color slot is skipped when the front color is
1787 * also written to. In addition, some slots can be
1788 * written in the vertex shader and not read in the
1789 * fragment shader. So the register number must always be
1790 * incremented, mapped or not.
1791 */
1792 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1793 prog_data->urb_setup[i] = urb_next;
1794 urb_next++;
1795 }
1796 }
1797
1798 /*
1799 * It's an FS-only attribute, and we did interpolation for this attribute
1800 * in the SF thread. So, count it here, too.
1801 *
1802 * See compile_sf_prog() for more info.
1803 */
1804 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1805 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1806 }
1807
1808 prog_data->num_varying_inputs = urb_next;
1809 }
1810
1811 void
1812 fs_visitor::assign_urb_setup()
1813 {
1814 assert(stage == MESA_SHADER_FRAGMENT);
1815 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1816
1817 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1818
1819 /* Offset all the urb_setup[] index by the actual position of the
1820 * setup regs, now that the location of the constants has been chosen.
1821 */
1822 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1823 if (inst->opcode == FS_OPCODE_LINTERP) {
1824 assert(inst->src[2].file == HW_REG);
1825 inst->src[2].fixed_hw_reg.nr += urb_start;
1826 }
1827
1828 if (inst->opcode == FS_OPCODE_CINTERP) {
1829 assert(inst->src[0].file == HW_REG);
1830 inst->src[0].fixed_hw_reg.nr += urb_start;
1831 }
1832 }
1833
1834 /* Each attribute is 4 setup channels, each of which is half a reg. */
1835 this->first_non_payload_grf =
1836 urb_start + prog_data->num_varying_inputs * 2;
1837 }
1838
1839 void
1840 fs_visitor::assign_vs_urb_setup()
1841 {
1842 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1843 int grf, count, slot, channel, attr;
1844
1845 assert(stage == MESA_SHADER_VERTEX);
1846 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1847 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1848 count++;
1849
1850 /* Each attribute is 4 regs. */
1851 this->first_non_payload_grf =
1852 payload.num_regs + prog_data->curb_read_length + count * 4;
1853
1854 unsigned vue_entries =
1855 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1856
1857 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1858 vs_prog_data->base.urb_read_length = (count + 1) / 2;
1859
1860 assert(vs_prog_data->base.urb_read_length <= 15);
1861
1862 /* Rewrite all ATTR file references to the hw grf that they land in. */
1863 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1864 for (int i = 0; i < inst->sources; i++) {
1865 if (inst->src[i].file == ATTR) {
1866
1867 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1868 slot = count - 1;
1869 } else {
1870 /* Attributes come in a contiguous block, ordered by their
1871 * gl_vert_attrib value. That means we can compute the slot
1872 * number for an attribute by masking out the enabled
1873 * attributes before it and counting the bits.
1874 */
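/* Illustrative example (editor's note): if inputs_read has bits 0, 3 and 4
 * set and this source reads attribute 4, BITFIELD64_MASK(4) keeps bits 0-3,
 * the AND leaves bits 0 and 3, and _mesa_bitcount_64 returns 2 -- attribute
 * 4 is the third enabled attribute and lands in slot 2.
 */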
1875 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1876 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1877 BITFIELD64_MASK(attr));
1878 }
1879
1880 channel = inst->src[i].reg_offset & 3;
1881
1882 grf = payload.num_regs +
1883 prog_data->curb_read_length +
1884 slot * 4 + channel;
1885
1886 inst->src[i].file = HW_REG;
1887 inst->src[i].fixed_hw_reg =
1888 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1889 }
1890 }
1891 }
1892 }
1893
1894 /**
1895 * Split large virtual GRFs into separate components if we can.
1896 *
1897 * This mostly duplicates what brw_fs_vector_splitting does,
1898 * but that's really conservative because it's afraid of doing
1899 * splitting that doesn't result in real progress after the rest of
1900 * the optimization phases, which would cause infinite looping in
1901 * optimization. We can do it once here, safely. This also has the
1902 * opportunity to split interpolated values, or maybe even uniforms,
1903 * which we don't have at the IR level.
1904 *
1905 * We want to split, because virtual GRFs are what we register
1906 * allocate and spill (due to contiguousness requirements for some
1907 * instructions), and they're what we naturally generate in the
1908 * codegen process, but most virtual GRFs don't actually need to be
1909 * contiguous sets of GRFs. If we split, we'll end up with reduced
1910 * live intervals and better dead code elimination and coalescing.
1911 */
1912 void
1913 fs_visitor::split_virtual_grfs()
1914 {
1915 int num_vars = this->virtual_grf_count;
1916
1917 /* Count the total number of registers */
1918 int reg_count = 0;
1919 int vgrf_to_reg[num_vars];
1920 for (int i = 0; i < num_vars; i++) {
1921 vgrf_to_reg[i] = reg_count;
1922 reg_count += virtual_grf_sizes[i];
1923 }
1924
1925 /* An array of "split points". For each register slot, this indicates
1926 * if this slot can be separated from the previous slot. Every time an
1927 * instruction uses multiple elements of a register (as a source or
1928 * destination), we mark the used slots as inseparable. Then we go
1929 * through and split the registers into the smallest pieces we can.
1930 */
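/* Worked example (illustration only, not from the original source): a
 * 4-slot VGRF that is only ever accessed one register at a time keeps its
 * split points at slots 1..3 and becomes four 1-slot VGRFs; if a single
 * instruction writes slots 2..3 together, the split point at slot 3 is
 * cleared and the result is three VGRFs of sizes 1, 1 and 2.
 */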
1931 bool split_points[reg_count];
1932 memset(split_points, 0, sizeof(split_points));
1933
1934 /* Mark all used registers as fully splittable */
1935 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1936 if (inst->dst.file == GRF) {
1937 int reg = vgrf_to_reg[inst->dst.reg];
1938 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1939 split_points[reg + j] = true;
1940 }
1941
1942 for (int i = 0; i < inst->sources; i++) {
1943 if (inst->src[i].file == GRF) {
1944 int reg = vgrf_to_reg[inst->src[i].reg];
1945 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1946 split_points[reg + j] = true;
1947 }
1948 }
1949 }
1950
1951 if (brw->has_pln &&
1952 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1953 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1954 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1955 * Gen6, that was the only supported interpolation mode, and since Gen6,
1956 * delta_x and delta_y are in fixed hardware registers.
1957 */
1958 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1959 split_points[vgrf_to_reg[vgrf] + 1] = false;
1960 }
1961
1962 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1963 if (inst->dst.file == GRF) {
1964 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1965 for (int j = 1; j < inst->regs_written; j++)
1966 split_points[reg + j] = false;
1967 }
1968 for (int i = 0; i < inst->sources; i++) {
1969 if (inst->src[i].file == GRF) {
1970 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1971 for (int j = 1; j < inst->regs_read(this, i); j++)
1972 split_points[reg + j] = false;
1973 }
1974 }
1975 }
1976
1977 int new_virtual_grf[reg_count];
1978 int new_reg_offset[reg_count];
1979
1980 int reg = 0;
1981 for (int i = 0; i < num_vars; i++) {
1982 /* The first one should always be 0 as a quick sanity check. */
1983 assert(split_points[reg] == false);
1984
1985 /* j = 0 case */
1986 new_reg_offset[reg] = 0;
1987 reg++;
1988 int offset = 1;
1989
1990 /* j > 0 case */
1991 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1992 /* If this is a split point, reset the offset to 0 and allocate a
1993 * new virtual GRF for the preceding `offset' registers.
1994 */
1995 if (split_points[reg]) {
1996 assert(offset <= MAX_VGRF_SIZE);
1997 int grf = virtual_grf_alloc(offset);
1998 for (int k = reg - offset; k < reg; k++)
1999 new_virtual_grf[k] = grf;
2000 offset = 0;
2001 }
2002 new_reg_offset[reg] = offset;
2003 offset++;
2004 reg++;
2005 }
2006
2007 /* The last one gets the original register number */
2008 assert(offset <= MAX_VGRF_SIZE);
2009 virtual_grf_sizes[i] = offset;
2010 for (int k = reg - offset; k < reg; k++)
2011 new_virtual_grf[k] = i;
2012 }
2013 assert(reg == reg_count);
2014
2015 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2016 if (inst->dst.file == GRF) {
2017 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2018 inst->dst.reg = new_virtual_grf[reg];
2019 inst->dst.reg_offset = new_reg_offset[reg];
2020 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2021 }
2022 for (int i = 0; i < inst->sources; i++) {
2023 if (inst->src[i].file == GRF) {
2024 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2025 inst->src[i].reg = new_virtual_grf[reg];
2026 inst->src[i].reg_offset = new_reg_offset[reg];
2027 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2028 }
2029 }
2030 }
2031 invalidate_live_intervals();
2032 }
2033
2034 /**
2035 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2036 *
2037 * During code generation, we create tons of temporary variables, many of
2038 * which get immediately killed and are never used again. Yet, in later
2039 * optimization and analysis passes, such as compute_live_intervals, we need
2040 * to loop over all the virtual GRFs. Compacting them can save a lot of
2041 * overhead.
2042 */
2043 bool
2044 fs_visitor::compact_virtual_grfs()
2045 {
2046 bool progress = false;
2047 int remap_table[this->virtual_grf_count];
2048 memset(remap_table, -1, sizeof(remap_table));
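/* For illustration (hypothetical numbering, not from the original source):
 * if only vgrf0, vgrf2 and vgrf5 are still referenced, the compaction below
 * ends with remap_table = {0, -1, 1, -1, -1, 2} and virtual_grf_count = 3,
 * and all GRF references (plus delta_x/delta_y) are rewritten through it.
 */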
2049
2050 /* Mark which virtual GRFs are used. */
2051 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2052 if (inst->dst.file == GRF)
2053 remap_table[inst->dst.reg] = 0;
2054
2055 for (int i = 0; i < inst->sources; i++) {
2056 if (inst->src[i].file == GRF)
2057 remap_table[inst->src[i].reg] = 0;
2058 }
2059 }
2060
2061 /* Compact the GRF arrays. */
2062 int new_index = 0;
2063 for (int i = 0; i < this->virtual_grf_count; i++) {
2064 if (remap_table[i] == -1) {
2065 /* We just found an unused register. This means that we are
2066 * actually going to compact something.
2067 */
2068 progress = true;
2069 } else {
2070 remap_table[i] = new_index;
2071 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2072 invalidate_live_intervals();
2073 ++new_index;
2074 }
2075 }
2076
2077 this->virtual_grf_count = new_index;
2078
2079 /* Patch all the instructions to use the newly renumbered registers */
2080 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2081 if (inst->dst.file == GRF)
2082 inst->dst.reg = remap_table[inst->dst.reg];
2083
2084 for (int i = 0; i < inst->sources; i++) {
2085 if (inst->src[i].file == GRF)
2086 inst->src[i].reg = remap_table[inst->src[i].reg];
2087 }
2088 }
2089
2090 /* Patch all the references to delta_x/delta_y, since they're used in
2091 * register allocation. If they're unused, switch them to BAD_FILE so
2092 * we don't think some random VGRF is delta_x/delta_y.
2093 */
2094 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2095 if (delta_x[i].file == GRF) {
2096 if (remap_table[delta_x[i].reg] != -1) {
2097 delta_x[i].reg = remap_table[delta_x[i].reg];
2098 } else {
2099 delta_x[i].file = BAD_FILE;
2100 }
2101 }
2102 }
2103 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2104 if (delta_y[i].file == GRF) {
2105 if (remap_table[delta_y[i].reg] != -1) {
2106 delta_y[i].reg = remap_table[delta_y[i].reg];
2107 } else {
2108 delta_y[i].file = BAD_FILE;
2109 }
2110 }
2111 }
2112
2113 return progress;
2114 }
2115
2116 /*
2117 * Implements array access of uniforms by inserting a
2118 * PULL_CONSTANT_LOAD instruction.
2119 *
2120 * Unlike temporary GRF array access (which we don't support, due to
2121 * the difficulty of doing relative addressing on instruction
2122 * destinations), we could potentially do array access of uniforms
2123 * that were loaded in GRF space as push constants. In real-world
2124 * usage we've seen, though, the arrays being used are always larger
2125 * than we could load as push constants, so just always move all
2126 * uniform array access out to a pull constant buffer.
2127 */
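/* A minimal sketch of the kind of access this targets (hypothetical GLSL,
 * not from the original source):
 *
 *    uniform vec4 colors[64];
 *    ...
 *    gl_FragColor = colors[index];    // reladdr access -> pull constants
 *
 * Constant-indexed reads such as colors[3] are left alone by this pass.
 */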
2128 void
2129 fs_visitor::move_uniform_array_access_to_pull_constants()
2130 {
2131 if (dispatch_width != 8)
2132 return;
2133
2134 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2135 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
2136
2137 /* Walk through and find array access of uniforms. Put a copy of that
2138 * uniform in the pull constant buffer.
2139 *
2140 * Note that we don't move constant-indexed accesses to arrays. No
2141 * testing has been done of the performance impact of this choice.
2142 */
2143 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2144 for (int i = 0 ; i < inst->sources; i++) {
2145 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2146 continue;
2147
2148 int uniform = inst->src[i].reg;
2149
2150 /* If this array isn't already present in the pull constant buffer,
2151 * add it.
2152 */
2153 if (pull_constant_loc[uniform] == -1) {
2154 const gl_constant_value **values = &stage_prog_data->param[uniform];
2155
2156 assert(param_size[uniform]);
2157
2158 for (int j = 0; j < param_size[uniform]; j++) {
2159 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2160
2161 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2162 values[j];
2163 }
2164 }
2165 }
2166 }
2167 }
2168
2169 /**
2170 * Assign UNIFORM file registers to either push constants or pull constants.
2171 *
2172 * We allow a fragment shader to have more than the specified minimum
2173 * maximum number of fragment shader uniform components (64). If
2174 * there are too many of these, they'd fill up all of the register space.
2175 * So, this will push some of them out to the pull constant buffer and
2176 * update the program to load them.
2177 */
2178 void
2179 fs_visitor::assign_constant_locations()
2180 {
2181 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2182 if (dispatch_width != 8)
2183 return;
2184
2185 /* Find which UNIFORM registers are still in use. */
2186 bool is_live[uniforms];
2187 for (unsigned int i = 0; i < uniforms; i++) {
2188 is_live[i] = false;
2189 }
2190
2191 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2192 for (int i = 0; i < inst->sources; i++) {
2193 if (inst->src[i].file != UNIFORM)
2194 continue;
2195
2196 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2197 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2198 is_live[constant_nr] = true;
2199 }
2200 }
2201
2202 /* Only allow 16 registers (128 uniform components) as push constants.
2203 *
2204 * Just demote the end of the list. We could probably do better
2205 * here, demoting things that are rarely used in the program first.
2206 *
2207 * If changing this value, note the limitation about total_regs in
2208 * brw_curbe.c.
2209 */
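/* For a sense of scale (illustrative numbers only): with
 * max_push_components = 16 * 8 = 128 below, a shader with 200 live uniform
 * components keeps the first 128 as push constants and demotes the
 * remaining 72 to the pull constant buffer.
 */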
2210 unsigned int max_push_components = 16 * 8;
2211 unsigned int num_push_constants = 0;
2212
2213 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2214
2215 for (unsigned int i = 0; i < uniforms; i++) {
2216 if (!is_live[i] || pull_constant_loc[i] != -1) {
2217 /* This UNIFORM register is either dead, or has already been demoted
2218 * to a pull const. Mark it as no longer living in the param[] array.
2219 */
2220 push_constant_loc[i] = -1;
2221 continue;
2222 }
2223
2224 if (num_push_constants < max_push_components) {
2225 /* Retain as a push constant. Record the location in the params[]
2226 * array.
2227 */
2228 push_constant_loc[i] = num_push_constants++;
2229 } else {
2230 /* Demote to a pull constant. */
2231 push_constant_loc[i] = -1;
2232
2233 int pull_index = stage_prog_data->nr_pull_params++;
2234 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2235 pull_constant_loc[i] = pull_index;
2236 }
2237 }
2238
2239 stage_prog_data->nr_params = num_push_constants;
2240
2241 /* Up until now, the param[] array has been indexed by reg + reg_offset
2242 * of UNIFORM registers. Condense it to only contain the uniforms we
2243 * chose to upload as push constants.
2244 */
2245 for (unsigned int i = 0; i < uniforms; i++) {
2246 int remapped = push_constant_loc[i];
2247
2248 if (remapped == -1)
2249 continue;
2250
2251 assert(remapped <= (int)i);
2252 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2253 }
2254 }
2255
2256 /**
2257 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2258 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2259 */
2260 void
2261 fs_visitor::demote_pull_constants()
2262 {
2263 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2264 for (int i = 0; i < inst->sources; i++) {
2265 if (inst->src[i].file != UNIFORM)
2266 continue;
2267
2268 int pull_index = pull_constant_loc[inst->src[i].reg +
2269 inst->src[i].reg_offset];
2270 if (pull_index == -1)
2271 continue;
2272
2273 /* Set up the annotation tracking for newly generated instructions. */
2274 base_ir = inst->ir;
2275 current_annotation = inst->annotation;
2276
2277 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2278 fs_reg dst = vgrf(glsl_type::float_type);
2279
2280 /* Generate a pull load into dst. */
2281 if (inst->src[i].reladdr) {
2282 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2283 surf_index,
2284 *inst->src[i].reladdr,
2285 pull_index);
2286 inst->insert_before(block, &list);
2287 inst->src[i].reladdr = NULL;
2288 } else {
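/* Sketch (illustration only): for pull_index 6 this computes a
 * vec4-aligned byte offset of (6 * 4) & ~15 = 16, and
 * set_smear(6 & 3) == set_smear(2) selects the third component of the
 * loaded vec4.
 */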
2289 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2290 fs_inst *pull =
2291 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2292 dst, surf_index, offset);
2293 inst->insert_before(block, pull);
2294 inst->src[i].set_smear(pull_index & 3);
2295 }
2296
2297 /* Rewrite the instruction to use the temporary VGRF. */
2298 inst->src[i].file = GRF;
2299 inst->src[i].reg = dst.reg;
2300 inst->src[i].reg_offset = 0;
2301 inst->src[i].width = dispatch_width;
2302 }
2303 }
2304 invalidate_live_intervals();
2305 }
2306
2307 bool
2308 fs_visitor::opt_algebraic()
2309 {
2310 bool progress = false;
2311
2312 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2313 switch (inst->opcode) {
2314 case BRW_OPCODE_MOV:
2315 if (inst->src[0].file != IMM)
2316 break;
2317
2318 if (inst->saturate) {
2319 if (inst->dst.type != inst->src[0].type)
2320 assert(!"unimplemented: saturate mixed types");
2321
2322 if (brw_saturate_immediate(inst->dst.type,
2323 &inst->src[0].fixed_hw_reg)) {
2324 inst->saturate = false;
2325 progress = true;
2326 }
2327 }
2328 break;
2329
2330 case BRW_OPCODE_MUL:
2331 if (inst->src[1].file != IMM)
2332 continue;
2333
2334 /* a * 1.0 = a */
2335 if (inst->src[1].is_one()) {
2336 inst->opcode = BRW_OPCODE_MOV;
2337 inst->src[1] = reg_undef;
2338 progress = true;
2339 break;
2340 }
2341
2342 /* a * 0.0 = 0.0 */
2343 if (inst->src[1].is_zero()) {
2344 inst->opcode = BRW_OPCODE_MOV;
2345 inst->src[0] = inst->src[1];
2346 inst->src[1] = reg_undef;
2347 progress = true;
2348 break;
2349 }
2350
2351 break;
2352 case BRW_OPCODE_ADD:
2353 if (inst->src[1].file != IMM)
2354 continue;
2355
2356 /* a + 0.0 = a */
2357 if (inst->src[1].is_zero()) {
2358 inst->opcode = BRW_OPCODE_MOV;
2359 inst->src[1] = reg_undef;
2360 progress = true;
2361 break;
2362 }
2363 break;
2364 case BRW_OPCODE_OR:
2365 if (inst->src[0].equals(inst->src[1])) {
2366 inst->opcode = BRW_OPCODE_MOV;
2367 inst->src[1] = reg_undef;
2368 progress = true;
2369 break;
2370 }
2371 break;
2372 case BRW_OPCODE_LRP:
2373 if (inst->src[1].equals(inst->src[2])) {
2374 inst->opcode = BRW_OPCODE_MOV;
2375 inst->src[0] = inst->src[1];
2376 inst->src[1] = reg_undef;
2377 inst->src[2] = reg_undef;
2378 progress = true;
2379 break;
2380 }
2381 break;
2382 case BRW_OPCODE_CMP:
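/* Rough rationale for the rewrite below (added note): -|x| >= 0 can only
 * hold when x == 0, so cmp.ge -|x|, 0.0 is equivalent to cmp.z x, 0.0.
 */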
2383 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2384 inst->src[0].abs &&
2385 inst->src[0].negate &&
2386 inst->src[1].is_zero()) {
2387 inst->src[0].abs = false;
2388 inst->src[0].negate = false;
2389 inst->conditional_mod = BRW_CONDITIONAL_Z;
2390 progress = true;
2391 break;
2392 }
2393 break;
2394 case BRW_OPCODE_SEL:
2395 if (inst->src[0].equals(inst->src[1])) {
2396 inst->opcode = BRW_OPCODE_MOV;
2397 inst->src[1] = reg_undef;
2398 inst->predicate = BRW_PREDICATE_NONE;
2399 inst->predicate_inverse = false;
2400 progress = true;
2401 } else if (inst->saturate && inst->src[1].file == IMM) {
2402 switch (inst->conditional_mod) {
2403 case BRW_CONDITIONAL_LE:
2404 case BRW_CONDITIONAL_L:
2405 switch (inst->src[1].type) {
2406 case BRW_REGISTER_TYPE_F:
2407 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2408 inst->opcode = BRW_OPCODE_MOV;
2409 inst->src[1] = reg_undef;
2410 progress = true;
2411 }
2412 break;
2413 default:
2414 break;
2415 }
2416 break;
2417 case BRW_CONDITIONAL_GE:
2418 case BRW_CONDITIONAL_G:
2419 switch (inst->src[1].type) {
2420 case BRW_REGISTER_TYPE_F:
2421 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2422 inst->opcode = BRW_OPCODE_MOV;
2423 inst->src[1] = reg_undef;
2424 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2425 progress = true;
2426 }
2427 break;
2428 default:
2429 break;
2430 }
2431 default:
2432 break;
2433 }
2434 }
2435 break;
2436 case SHADER_OPCODE_RCP: {
2437 fs_inst *prev = (fs_inst *)inst->prev;
2438 if (prev->opcode == SHADER_OPCODE_SQRT) {
2439 if (inst->src[0].equals(prev->dst)) {
2440 inst->opcode = SHADER_OPCODE_RSQ;
2441 inst->src[0] = prev->src[0];
2442 progress = true;
2443 }
2444 }
2445 break;
2446 }
2447 default:
2448 break;
2449 }
2450 }
2451
2452 return progress;
2453 }
2454
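/* Added description (the original had no comment here): when an instruction
 * completely overwrites a VGRF that already has a definition, rename its
 * destination to a fresh VGRF and rewrite later reads, which shortens live
 * ranges. New names are only introduced outside control flow (depth == 0),
 * but reads are patched everywhere.
 */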
2455 bool
2456 fs_visitor::opt_register_renaming()
2457 {
2458 bool progress = false;
2459 int depth = 0;
2460
2461 int remap[virtual_grf_count];
2462 memset(remap, -1, sizeof(int) * virtual_grf_count);
2463
2464 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2465 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2466 depth++;
2467 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2468 inst->opcode == BRW_OPCODE_WHILE) {
2469 depth--;
2470 }
2471
2472 /* Rewrite instruction sources. */
2473 for (int i = 0; i < inst->sources; i++) {
2474 if (inst->src[i].file == GRF &&
2475 remap[inst->src[i].reg] != -1 &&
2476 remap[inst->src[i].reg] != inst->src[i].reg) {
2477 inst->src[i].reg = remap[inst->src[i].reg];
2478 progress = true;
2479 }
2480 }
2481
2482 const int dst = inst->dst.reg;
2483
2484 if (depth == 0 &&
2485 inst->dst.file == GRF &&
2486 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2487 !inst->is_partial_write()) {
2488 if (remap[dst] == -1) {
2489 remap[dst] = dst;
2490 } else {
2491 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2492 inst->dst.reg = remap[dst];
2493 progress = true;
2494 }
2495 } else if (inst->dst.file == GRF &&
2496 remap[dst] != -1 &&
2497 remap[dst] != dst) {
2498 inst->dst.reg = remap[dst];
2499 progress = true;
2500 }
2501 }
2502
2503 if (progress) {
2504 invalidate_live_intervals();
2505
2506 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2507 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2508 delta_x[i].reg = remap[delta_x[i].reg];
2509 }
2510 }
2511 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2512 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2513 delta_y[i].reg = remap[delta_y[i].reg];
2514 }
2515 }
2516 }
2517
2518 return progress;
2519 }
2520
2521 bool
2522 fs_visitor::compute_to_mrf()
2523 {
2524 bool progress = false;
2525 int next_ip = 0;
2526
2527 /* No MRFs on Gen >= 7. */
2528 if (brw->gen >= 7)
2529 return false;
2530
2531 calculate_live_intervals();
2532
2533 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2534 int ip = next_ip;
2535 next_ip++;
2536
2537 if (inst->opcode != BRW_OPCODE_MOV ||
2538 inst->is_partial_write() ||
2539 inst->dst.file != MRF || inst->src[0].file != GRF ||
2540 inst->dst.type != inst->src[0].type ||
2541 inst->src[0].abs || inst->src[0].negate ||
2542 !inst->src[0].is_contiguous() ||
2543 inst->src[0].subreg_offset)
2544 continue;
2545
2546 /* Work out which hardware MRF registers are written by this
2547 * instruction.
2548 */
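/* Rough illustration (not from the original source): for a COMPR4 write to
 * m2 this tracks mrf_low = 2 and mrf_high = 6; for a SIMD16 write,
 * mrf_high = mrf_low + 1; otherwise both refer to the same register.
 */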
2549 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2550 int mrf_high;
2551 if (inst->dst.reg & BRW_MRF_COMPR4) {
2552 mrf_high = mrf_low + 4;
2553 } else if (inst->exec_size == 16) {
2554 mrf_high = mrf_low + 1;
2555 } else {
2556 mrf_high = mrf_low;
2557 }
2558
2559 /* Can't compute-to-MRF this GRF if someone else was going to
2560 * read it later.
2561 */
2562 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2563 continue;
2564
2565 /* Found a move of a GRF to a MRF. Let's see if we can go
2566 * rewrite the thing that made this GRF to write into the MRF.
2567 */
2568 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2569 if (scan_inst->dst.file == GRF &&
2570 scan_inst->dst.reg == inst->src[0].reg) {
2571 /* Found the last thing to write our reg we want to turn
2572 * into a compute-to-MRF.
2573 */
2574
2575 /* If this one instruction didn't populate all the
2576 * channels, bail. We might be able to rewrite everything
2577 * that writes that reg, but it would require smarter
2578 * tracking to delay the rewriting until complete success.
2579 */
2580 if (scan_inst->is_partial_write())
2581 break;
2582
2583 /* Things returning more than one register would need us to
2584 * understand coalescing out more than one MOV at a time.
2585 */
2586 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2587 break;
2588
2589 /* SEND instructions can't have MRF as a destination. */
2590 if (scan_inst->mlen)
2591 break;
2592
2593 if (brw->gen == 6) {
2594 /* gen6 math instructions must have the destination be
2595 * GRF, so no compute-to-MRF for them.
2596 */
2597 if (scan_inst->is_math()) {
2598 break;
2599 }
2600 }
2601
2602 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2603 /* Found the creator of our MRF's source value. */
2604 scan_inst->dst.file = MRF;
2605 scan_inst->dst.reg = inst->dst.reg;
2606 scan_inst->saturate |= inst->saturate;
2607 inst->remove(block);
2608 progress = true;
2609 }
2610 break;
2611 }
2612
2613 /* We don't handle control flow here. Most computation of
2614 * values that end up in MRFs happens shortly before the MRF
2615 * write anyway.
2616 */
2617 if (block->start() == scan_inst)
2618 break;
2619
2620 /* You can't read from an MRF, so if someone else reads our
2621 * MRF's source GRF that we wanted to rewrite, that stops us.
2622 */
2623 bool interfered = false;
2624 for (int i = 0; i < scan_inst->sources; i++) {
2625 if (scan_inst->src[i].file == GRF &&
2626 scan_inst->src[i].reg == inst->src[0].reg &&
2627 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2628 interfered = true;
2629 }
2630 }
2631 if (interfered)
2632 break;
2633
2634 if (scan_inst->dst.file == MRF) {
2635 /* If somebody else writes our MRF here, we can't
2636 * compute-to-MRF before that.
2637 */
2638 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2639 int scan_mrf_high;
2640
2641 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2642 scan_mrf_high = scan_mrf_low + 4;
2643 } else if (scan_inst->exec_size == 16) {
2644 scan_mrf_high = scan_mrf_low + 1;
2645 } else {
2646 scan_mrf_high = scan_mrf_low;
2647 }
2648
2649 if (mrf_low == scan_mrf_low ||
2650 mrf_low == scan_mrf_high ||
2651 mrf_high == scan_mrf_low ||
2652 mrf_high == scan_mrf_high) {
2653 break;
2654 }
2655 }
2656
2657 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2658 /* Found a SEND instruction, which means that there are
2659 * live values in MRFs from base_mrf to base_mrf +
2660 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2661 * above it.
2662 */
2663 if (mrf_low >= scan_inst->base_mrf &&
2664 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2665 break;
2666 }
2667 if (mrf_high >= scan_inst->base_mrf &&
2668 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2669 break;
2670 }
2671 }
2672 }
2673 }
2674
2675 if (progress)
2676 invalidate_live_intervals();
2677
2678 return progress;
2679 }
2680
2681 /**
2682 * Emit a dedicated "repclear" shader: a constant-color MOV followed by one
2683 * FS_OPCODE_REP_FB_WRITE per render target, replicating the clear color to all pixels.
2684 */
2685 void
2686 fs_visitor::emit_repclear_shader()
2687 {
2688 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2689 int base_mrf = 1;
2690 int color_mrf = base_mrf + 2;
2691
2692 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2693 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2694 mov->force_writemask_all = true;
2695
2696 fs_inst *write;
2697 if (key->nr_color_regions == 1) {
2698 write = emit(FS_OPCODE_REP_FB_WRITE);
2699 write->saturate = key->clamp_fragment_color;
2700 write->base_mrf = color_mrf;
2701 write->target = 0;
2702 write->header_present = false;
2703 write->mlen = 1;
2704 } else {
2705 assume(key->nr_color_regions > 0);
2706 for (int i = 0; i < key->nr_color_regions; ++i) {
2707 write = emit(FS_OPCODE_REP_FB_WRITE);
2708 write->saturate = key->clamp_fragment_color;
2709 write->base_mrf = base_mrf;
2710 write->target = i;
2711 write->header_present = true;
2712 write->mlen = 3;
2713 }
2714 }
2715 write->eot = true;
2716
2717 calculate_cfg();
2718
2719 assign_constant_locations();
2720 assign_curb_setup();
2721
2722 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2723 assert(mov->src[0].file == HW_REG);
2724 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2725 }
2726
2727 /**
2728 * Walks through basic blocks, looking for repeated MRF writes and
2729 * removing the later ones.
2730 */
2731 bool
2732 fs_visitor::remove_duplicate_mrf_writes()
2733 {
2734 fs_inst *last_mrf_move[16];
2735 bool progress = false;
2736
2737 /* We'd need to update the MRF tracking for compressed instructions, so skip SIMD16. */
2738 if (dispatch_width == 16)
2739 return false;
2740
2741 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2742
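/* e.g. (illustration only): two identical "mov m3, vgrf5" instructions in
 * the same block, with no intervening write to m3 or vgrf5 and no SEND
 * touching m3, cause the second MOV to be removed.
 */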
2743 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2744 if (inst->is_control_flow()) {
2745 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2746 }
2747
2748 if (inst->opcode == BRW_OPCODE_MOV &&
2749 inst->dst.file == MRF) {
2750 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2751 if (prev_inst && inst->equals(prev_inst)) {
2752 inst->remove(block);
2753 progress = true;
2754 continue;
2755 }
2756 }
2757
2758 /* Clear out the last-write records for MRFs that were overwritten. */
2759 if (inst->dst.file == MRF) {
2760 last_mrf_move[inst->dst.reg] = NULL;
2761 }
2762
2763 if (inst->mlen > 0 && inst->base_mrf != -1) {
2764 /* Found a SEND instruction, which will include two or fewer
2765 * implied MRF writes. We could do better here.
2766 */
2767 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2768 last_mrf_move[inst->base_mrf + i] = NULL;
2769 }
2770 }
2771
2772 /* Clear out any MRF move records whose sources got overwritten. */
2773 if (inst->dst.file == GRF) {
2774 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2775 if (last_mrf_move[i] &&
2776 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2777 last_mrf_move[i] = NULL;
2778 }
2779 }
2780 }
2781
2782 if (inst->opcode == BRW_OPCODE_MOV &&
2783 inst->dst.file == MRF &&
2784 inst->src[0].file == GRF &&
2785 !inst->is_partial_write()) {
2786 last_mrf_move[inst->dst.reg] = inst;
2787 }
2788 }
2789
2790 if (progress)
2791 invalidate_live_intervals();
2792
2793 return progress;
2794 }
2795
2796 static void
2797 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2798 int first_grf, int grf_len)
2799 {
2800 /* Clear the flag for registers that actually got read (as expected). */
2801 for (int i = 0; i < inst->sources; i++) {
2802 int grf;
2803 if (inst->src[i].file == GRF) {
2804 grf = inst->src[i].reg;
2805 } else if (inst->src[i].file == HW_REG &&
2806 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2807 grf = inst->src[i].fixed_hw_reg.nr;
2808 } else {
2809 continue;
2810 }
2811
2812 if (grf >= first_grf &&
2813 grf < first_grf + grf_len) {
2814 deps[grf - first_grf] = false;
2815 if (inst->exec_size == 16)
2816 deps[grf - first_grf + 1] = false;
2817 }
2818 }
2819 }
2820
2821 /**
2822 * Implements this workaround for the original 965:
2823 *
2824 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2825 * check for post destination dependencies on this instruction, software
2826 * must ensure that there is no destination hazard for the case of ‘write
2827 * followed by a posted write’ shown in the following example.
2828 *
2829 * 1. mov r3 0
2830 * 2. send r3.xy <rest of send instruction>
2831 * 3. mov r2 r3
2832 *
2833 * Due to no post-destination dependency check on the ‘send’, the above
2834 * code sequence could have two instructions (1 and 2) in flight at the
2835 * same time that both consider ‘r3’ as the target of their final writes.
2836 */
2837 void
2838 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2839 fs_inst *inst)
2840 {
2841 int write_len = inst->regs_written;
2842 int first_write_grf = inst->dst.reg;
2843 bool needs_dep[BRW_MAX_MRF];
2844 assert(write_len < (int)sizeof(needs_dep) - 1);
2845
2846 memset(needs_dep, false, sizeof(needs_dep));
2847 memset(needs_dep, true, write_len);
2848
2849 clear_deps_for_inst_src(inst, dispatch_width,
2850 needs_dep, first_write_grf, write_len);
2851
2852 /* Walk backwards looking for writes to registers we're writing which
2853 * aren't read since being written. If we hit the start of the program,
2854 * we assume that there are no outstanding dependencies on entry to the
2855 * program.
2856 */
2857 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2858 /* If we hit control flow, assume that there *are* outstanding
2859 * dependencies, and force their cleanup before our instruction.
2860 */
2861 if (block->start() == scan_inst) {
2862 for (int i = 0; i < write_len; i++) {
2863 if (needs_dep[i]) {
2864 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2865 }
2866 }
2867 return;
2868 }
2869
2870 /* We insert our reads as late as possible on the assumption that any
2871 * instruction other than a MOV that might have left us an outstanding
2872 * dependency has more latency than a MOV.
2873 */
2874 if (scan_inst->dst.file == GRF) {
2875 for (int i = 0; i < scan_inst->regs_written; i++) {
2876 int reg = scan_inst->dst.reg + i;
2877
2878 if (reg >= first_write_grf &&
2879 reg < first_write_grf + write_len &&
2880 needs_dep[reg - first_write_grf]) {
2881 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2882 needs_dep[reg - first_write_grf] = false;
2883 if (scan_inst->exec_size == 16)
2884 needs_dep[reg - first_write_grf + 1] = false;
2885 }
2886 }
2887 }
2888
2889 /* Clear the flag for registers that actually got read (as expected). */
2890 clear_deps_for_inst_src(scan_inst, dispatch_width,
2891 needs_dep, first_write_grf, write_len);
2892
2893 /* Continue the loop only if we haven't resolved all the dependencies */
2894 int i;
2895 for (i = 0; i < write_len; i++) {
2896 if (needs_dep[i])
2897 break;
2898 }
2899 if (i == write_len)
2900 return;
2901 }
2902 }
2903
2904 /**
2905 * Implements this workaround for the original 965:
2906 *
2907 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2908 * used as a destination register until after it has been sourced by an
2909 * instruction with a different destination register.
2910 */
2911 void
2912 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2913 {
2914 int write_len = inst->regs_written;
2915 int first_write_grf = inst->dst.reg;
2916 bool needs_dep[BRW_MAX_MRF];
2917 assert(write_len < (int)sizeof(needs_dep) - 1);
2918
2919 memset(needs_dep, false, sizeof(needs_dep));
2920 memset(needs_dep, true, write_len);
2921 /* Walk forwards looking for writes to registers we're writing which aren't
2922 * read before being written.
2923 */
2924 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2925 /* If we hit control flow, force resolve all remaining dependencies. */
2926 if (block->end() == scan_inst) {
2927 for (int i = 0; i < write_len; i++) {
2928 if (needs_dep[i])
2929 scan_inst->insert_before(block,
2930 DEP_RESOLVE_MOV(first_write_grf + i));
2931 }
2932 return;
2933 }
2934
2935 /* Clear the flag for registers that actually got read (as expected). */
2936 clear_deps_for_inst_src(scan_inst, dispatch_width,
2937 needs_dep, first_write_grf, write_len);
2938
2939 /* We insert our reads as late as possible since they're reading the
2940 * result of a SEND, which has massive latency.
2941 */
2942 if (scan_inst->dst.file == GRF &&
2943 scan_inst->dst.reg >= first_write_grf &&
2944 scan_inst->dst.reg < first_write_grf + write_len &&
2945 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2946 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2947 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2948 }
2949
2950 /* Continue the loop only if we haven't resolved all the dependencies */
2951 int i;
2952 for (i = 0; i < write_len; i++) {
2953 if (needs_dep[i])
2954 break;
2955 }
2956 if (i == write_len)
2957 return;
2958 }
2959
2960 /* If we hit the end of the program, resolve all remaining dependencies out
2961 * of paranoia.
2962 */
2963 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2964 assert(last_inst->eot);
2965 for (int i = 0; i < write_len; i++) {
2966 if (needs_dep[i])
2967 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2968 }
2969 }
2970
2971 void
2972 fs_visitor::insert_gen4_send_dependency_workarounds()
2973 {
2974 if (brw->gen != 4 || brw->is_g4x)
2975 return;
2976
2977 bool progress = false;
2978
2979 /* Note that we're done with register allocation, so GRF fs_regs always
2980 * have a .reg_offset of 0.
2981 */
2982
2983 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2984 if (inst->mlen != 0 && inst->dst.file == GRF) {
2985 insert_gen4_pre_send_dependency_workarounds(block, inst);
2986 insert_gen4_post_send_dependency_workarounds(block, inst);
2987 progress = true;
2988 }
2989 }
2990
2991 if (progress)
2992 invalidate_live_intervals();
2993 }
2994
2995 /**
2996 * Turns the generic expression-style uniform pull constant load instruction
2997 * into a hardware-specific series of instructions for loading a pull
2998 * constant.
2999 *
3000 * The expression style allows the CSE pass before this to optimize out
3001 * repeated loads from the same offset, and gives the pre-register-allocation
3002 * scheduling full flexibility, while the conversion to native instructions
3003 * allows the post-register-allocation scheduler the best information
3004 * possible.
3005 *
3006 * Note that execution masking for setting up pull constant loads is special:
3007 * the channels that need to be written are unrelated to the current execution
3008 * mask, since a later instruction will use one of the result channels as a
3009 * source operand for all 8 or 16 of its channels.
3010 */
3011 void
3012 fs_visitor::lower_uniform_pull_constant_loads()
3013 {
3014 foreach_block_and_inst (block, fs_inst, inst, cfg) {
3015 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3016 continue;
3017
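/* Rough before/after sketch (illustrative pseudo-IR, not actual dumps):
 *
 *    before:   uniform_pull_const_load       dst, surf_index, byte_offset
 *    gen7+:    set_simd4x2_offset            payload, dword_offset
 *              uniform_pull_const_load_gen7  dst, surf_index, payload
 *    gen4-6:   the original opcode is kept but given base_mrf/mlen so the
 *              generator can emit the MRF-based message.
 */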
3018 if (brw->gen >= 7) {
3019 /* The offset arg before was a vec4-aligned byte offset. We need to
3020 * turn it into a dword offset.
3021 */
3022 fs_reg const_offset_reg = inst->src[1];
3023 assert(const_offset_reg.file == IMM &&
3024 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3025 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3026 fs_reg payload = vgrf(glsl_type::uint_type);
3027
3028 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3029 * Reserve space for the register.
3030 */
3031 if (brw->gen >= 9) {
3032 payload.reg_offset++;
3033 virtual_grf_sizes[payload.reg] = 2;
3034 }
3035
3036 /* This is actually going to be a MOV, but since only the first dword
3037 * is accessed, we have a special opcode to do just that one. Note
3038 * that this needs to be an operation that will be considered a def
3039 * by live variable analysis, or register allocation will explode.
3040 */
3041 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3042 8, payload, const_offset_reg);
3043 setup->force_writemask_all = true;
3044
3045 setup->ir = inst->ir;
3046 setup->annotation = inst->annotation;
3047 inst->insert_before(block, setup);
3048
3049 /* Similarly, this will only populate the first 4 channels of the
3050 * result register (since we only use smear values from 0-3), but we
3051 * don't tell the optimizer.
3052 */
3053 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3054 inst->src[1] = payload;
3055
3056 invalidate_live_intervals();
3057 } else {
3058 /* Before register allocation, we didn't tell the scheduler about the
3059 * MRF we use. We know it's safe to use this MRF because nothing
3060 * else does except for register spill/unspill, which generates and
3061 * uses its MRF within a single IR instruction.
3062 */
3063 inst->base_mrf = 14;
3064 inst->mlen = 1;
3065 }
3066 }
3067 }
3068
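/* Added note (the original had no comment here): SHADER_OPCODE_LOAD_PAYLOAD
 * is lowered into one MOV per source, walking the destination a register at
 * a time; on hardware with compr4, adjacent SIMD8 halves headed for MRFs are
 * folded into a single compressed SIMD16 MOV, and per-register
 * force_sechalf / force_writemask_all metadata is carried onto the new MOVs.
 */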
3069 bool
3070 fs_visitor::lower_load_payload()
3071 {
3072 bool progress = false;
3073
3074 int vgrf_to_reg[virtual_grf_count];
3075 int reg_count = 16; /* Leave room for MRF */
3076 for (int i = 0; i < virtual_grf_count; ++i) {
3077 vgrf_to_reg[i] = reg_count;
3078 reg_count += virtual_grf_sizes[i];
3079 }
3080
3081 struct {
3082 bool written:1; /* Whether this register has ever been written */
3083 bool force_writemask_all:1;
3084 bool force_sechalf:1;
3085 } metadata[reg_count];
3086 memset(metadata, 0, sizeof(metadata));
3087
3088 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3089 int dst_reg;
3090 if (inst->dst.file == GRF) {
3091 dst_reg = vgrf_to_reg[inst->dst.reg];
3092 } else {
3093 /* MRF */
3094 dst_reg = inst->dst.reg;
3095 }
3096
3097 if (inst->dst.file == MRF || inst->dst.file == GRF) {
3098 bool force_sechalf = inst->force_sechalf;
3099 bool toggle_sechalf = inst->dst.width == 16 &&
3100 type_sz(inst->dst.type) == 4;
3101 for (int i = 0; i < inst->regs_written; ++i) {
3102 metadata[dst_reg + i].written = true;
3103 metadata[dst_reg + i].force_sechalf = force_sechalf;
3104 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3105 force_sechalf = (toggle_sechalf != force_sechalf);
3106 }
3107 }
3108
3109 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3110 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3111 fs_reg dst = inst->dst;
3112
3113 for (int i = 0; i < inst->sources; i++) {
3114 dst.width = inst->src[i].effective_width;
3115 dst.type = inst->src[i].type;
3116
3117 if (inst->src[i].file == BAD_FILE) {
3118 /* Emit nothing, but still advance dst below as normal */
3119 } else if (dst.file == MRF &&
3120 dst.width == 8 &&
3121 brw->has_compr4 &&
3122 i + 4 < inst->sources &&
3123 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3124 fs_reg compr4_dst = dst;
3125 compr4_dst.reg += BRW_MRF_COMPR4;
3126 compr4_dst.width = 16;
3127 fs_reg compr4_src = inst->src[i];
3128 compr4_src.width = 16;
3129 fs_inst *mov = MOV(compr4_dst, compr4_src);
3130 mov->force_writemask_all = true;
3131 inst->insert_before(block, mov);
3132 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3133 inst->src[i + 4].file = BAD_FILE;
3134 } else {
3135 fs_inst *mov = MOV(dst, inst->src[i]);
3136 if (inst->src[i].file == GRF) {
3137 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3138 inst->src[i].reg_offset;
3139 mov->force_sechalf = metadata[src_reg].force_sechalf;
3140 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3141 metadata[dst_reg] = metadata[src_reg];
3142 if (dst.width * type_sz(dst.type) > 32) {
3143 assert((!metadata[src_reg].written ||
3144 !metadata[src_reg].force_sechalf) &&
3145 (!metadata[src_reg + 1].written ||
3146 metadata[src_reg + 1].force_sechalf));
3147 metadata[dst_reg + 1] = metadata[src_reg + 1];
3148 }
3149 } else {
3150 metadata[dst_reg].force_writemask_all = false;
3151 metadata[dst_reg].force_sechalf = false;
3152 if (dst.width == 16) {
3153 metadata[dst_reg + 1].force_writemask_all = false;
3154 metadata[dst_reg + 1].force_sechalf = true;
3155 }
3156 }
3157 inst->insert_before(block, mov);
3158 }
3159
3160 dst = offset(dst, 1);
3161 }
3162
3163 inst->remove(block);
3164 progress = true;
3165 }
3166 }
3167
3168 if (progress)
3169 invalidate_live_intervals();
3170
3171 return progress;
3172 }
3173
3174 void
3175 fs_visitor::dump_instructions()
3176 {
3177 dump_instructions(NULL);
3178 }
3179
3180 void
3181 fs_visitor::dump_instructions(const char *name)
3182 {
3183 calculate_register_pressure();
3184 FILE *file = stderr;
3185 if (name && geteuid() != 0) {
3186 file = fopen(name, "w");
3187 if (!file)
3188 file = stderr;
3189 }
3190
3191 int ip = 0, max_pressure = 0;
3192 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3193 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3194 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3195 dump_instruction(inst, file);
3196 ++ip;
3197 }
3198 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3199
3200 if (file != stderr) {
3201 fclose(file);
3202 }
3203 }
3204
3205 void
3206 fs_visitor::dump_instruction(backend_instruction *be_inst)
3207 {
3208 dump_instruction(be_inst, stderr);
3209 }
3210
3211 void
3212 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3213 {
3214 fs_inst *inst = (fs_inst *)be_inst;
3215
3216 if (inst->predicate) {
3217 fprintf(file, "(%cf0.%d) ",
3218 inst->predicate_inverse ? '-' : '+',
3219 inst->flag_subreg);
3220 }
3221
3222 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3223 if (inst->saturate)
3224 fprintf(file, ".sat");
3225 if (inst->conditional_mod) {
3226 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3227 if (!inst->predicate &&
3228 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3229 inst->opcode != BRW_OPCODE_IF &&
3230 inst->opcode != BRW_OPCODE_WHILE))) {
3231 fprintf(file, ".f0.%d", inst->flag_subreg);
3232 }
3233 }
3234 fprintf(file, "(%d) ", inst->exec_size);
3235
3236
3237 switch (inst->dst.file) {
3238 case GRF:
3239 fprintf(file, "vgrf%d", inst->dst.reg);
3240 if (inst->dst.width != dispatch_width)
3241 fprintf(file, "@%d", inst->dst.width);
3242 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3243 inst->dst.subreg_offset)
3244 fprintf(file, "+%d.%d",
3245 inst->dst.reg_offset, inst->dst.subreg_offset);
3246 break;
3247 case MRF:
3248 fprintf(file, "m%d", inst->dst.reg);
3249 break;
3250 case BAD_FILE:
3251 fprintf(file, "(null)");
3252 break;
3253 case UNIFORM:
3254 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3255 break;
3256 case ATTR:
3257 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3258 break;
3259 case HW_REG:
3260 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3261 switch (inst->dst.fixed_hw_reg.nr) {
3262 case BRW_ARF_NULL:
3263 fprintf(file, "null");
3264 break;
3265 case BRW_ARF_ADDRESS:
3266 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3267 break;
3268 case BRW_ARF_ACCUMULATOR:
3269 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3270 break;
3271 case BRW_ARF_FLAG:
3272 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3273 inst->dst.fixed_hw_reg.subnr);
3274 break;
3275 default:
3276 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3277 inst->dst.fixed_hw_reg.subnr);
3278 break;
3279 }
3280 } else {
3281 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3282 }
3283 if (inst->dst.fixed_hw_reg.subnr)
3284 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3285 break;
3286 default:
3287 fprintf(file, "???");
3288 break;
3289 }
3290 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3291
3292 for (int i = 0; i < inst->sources; i++) {
3293 if (inst->src[i].negate)
3294 fprintf(file, "-");
3295 if (inst->src[i].abs)
3296 fprintf(file, "|");
3297 switch (inst->src[i].file) {
3298 case GRF:
3299 fprintf(file, "vgrf%d", inst->src[i].reg);
3300 if (inst->src[i].width != dispatch_width)
3301 fprintf(file, "@%d", inst->src[i].width);
3302 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3303 inst->src[i].subreg_offset)
3304 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3305 inst->src[i].subreg_offset);
3306 break;
3307 case MRF:
3308 fprintf(file, "***m%d***", inst->src[i].reg);
3309 break;
3310 case ATTR:
3311 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3312 break;
3313 case UNIFORM:
3314 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3315 if (inst->src[i].reladdr) {
3316 fprintf(file, "+reladdr");
3317 } else if (inst->src[i].subreg_offset) {
3318 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3319 inst->src[i].subreg_offset);
3320 }
3321 break;
3322 case BAD_FILE:
3323 fprintf(file, "(null)");
3324 break;
3325 case IMM:
3326 switch (inst->src[i].type) {
3327 case BRW_REGISTER_TYPE_F:
3328 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3329 break;
3330 case BRW_REGISTER_TYPE_D:
3331 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3332 break;
3333 case BRW_REGISTER_TYPE_UD:
3334 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3335 break;
3336 case BRW_REGISTER_TYPE_VF:
3337 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3338 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3339 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3340 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3341 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3342 break;
3343 default:
3344 fprintf(file, "???");
3345 break;
3346 }
3347 break;
3348 case HW_REG:
3349 if (inst->src[i].fixed_hw_reg.negate)
3350 fprintf(file, "-");
3351 if (inst->src[i].fixed_hw_reg.abs)
3352 fprintf(file, "|");
3353 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3354 switch (inst->src[i].fixed_hw_reg.nr) {
3355 case BRW_ARF_NULL:
3356 fprintf(file, "null");
3357 break;
3358 case BRW_ARF_ADDRESS:
3359 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3360 break;
3361 case BRW_ARF_ACCUMULATOR:
3362 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3363 break;
3364 case BRW_ARF_FLAG:
3365 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3366 inst->src[i].fixed_hw_reg.subnr);
3367 break;
3368 default:
3369 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3370 inst->src[i].fixed_hw_reg.subnr);
3371 break;
3372 }
3373 } else {
3374 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3375 }
3376 if (inst->src[i].fixed_hw_reg.subnr)
3377 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3378 if (inst->src[i].fixed_hw_reg.abs)
3379 fprintf(file, "|");
3380 break;
3381 default:
3382 fprintf(file, "???");
3383 break;
3384 }
3385 if (inst->src[i].abs)
3386 fprintf(file, "|");
3387
3388 if (inst->src[i].file != IMM) {
3389 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3390 }
3391
3392 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3393 fprintf(file, ", ");
3394 }
3395
3396 fprintf(file, " ");
3397
3398 if (dispatch_width == 16 && inst->exec_size == 8) {
3399 if (inst->force_sechalf)
3400 fprintf(file, "2ndhalf ");
3401 else
3402 fprintf(file, "1sthalf ");
3403 }
3404
3405 fprintf(file, "\n");
3406 }
3407
3408 /**
3409 * Possibly returns an instruction that set up @param reg.
3410 *
3411 * Sometimes we want to take the result of some expression/variable
3412 * dereference tree and rewrite the instruction generating the result
3413 * of the tree. When processing the tree, we know that the
3414 * instructions generated are all writing temporaries that are dead
3415 * outside of this tree. So, if we have some instructions that write
3416 * a temporary, we're free to point that temp write somewhere else.
3417 *
3418 * Note that this doesn't guarantee that the instruction wrote
3419 * only reg -- it might be the size=4 destination of a texture instruction.
3420 */
3421 fs_inst *
3422 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3423 fs_inst *end,
3424 const fs_reg &reg)
3425 {
3426 if (end == start ||
3427 end->is_partial_write() ||
3428 reg.reladdr ||
3429 !reg.equals(end->dst)) {
3430 return NULL;
3431 } else {
3432 return end;
3433 }
3434 }
3435
3436 void
3437 fs_visitor::setup_payload_gen6()
3438 {
3439 bool uses_depth =
3440 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3441 unsigned barycentric_interp_modes =
3442 (stage == MESA_SHADER_FRAGMENT) ?
3443 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3444
3445 assert(brw->gen >= 6);
3446
3447 /* R0-1: masks, pixel X/Y coordinates. */
3448 payload.num_regs = 2;
3449 /* R2: only for 32-pixel dispatch. */
3450
3451 /* R3-26: barycentric interpolation coordinates. These appear in the
3452 * same order that they appear in the brw_wm_barycentric_interp_mode
3453 * enum. Each set of coordinates occupies 2 registers if dispatch width
3454 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3455 * appear if they were enabled using the "Barycentric Interpolation
3456 * Mode" bits in WM_STATE.
3457 */
3458 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3459 if (barycentric_interp_modes & (1 << i)) {
3460 payload.barycentric_coord_reg[i] = payload.num_regs;
3461 payload.num_regs += 2;
3462 if (dispatch_width == 16) {
3463 payload.num_regs += 2;
3464 }
3465 }
3466 }
3467
3468 /* R27: interpolated depth if uses source depth */
3469 if (uses_depth) {
3470 payload.source_depth_reg = payload.num_regs;
3471 payload.num_regs++;
3472 if (dispatch_width == 16) {
3473 /* R28: interpolated depth if not SIMD8. */
3474 payload.num_regs++;
3475 }
3476 }
3477 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3478 if (uses_depth) {
3479 payload.source_w_reg = payload.num_regs;
3480 payload.num_regs++;
3481 if (dispatch_width == 16) {
3482 /* R30: interpolated W if not SIMD8. */
3483 payload.num_regs++;
3484 }
3485 }
3486
3487 if (stage == MESA_SHADER_FRAGMENT) {
3488 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3489 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3490 prog_data->uses_pos_offset = key->compute_pos_offset;
3491 /* R31: MSAA position offsets. */
3492 if (prog_data->uses_pos_offset) {
3493 payload.sample_pos_reg = payload.num_regs;
3494 payload.num_regs++;
3495 }
3496 }
3497
3498 /* R32: MSAA input coverage mask */
3499 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3500 assert(brw->gen >= 7);
3501 payload.sample_mask_in_reg = payload.num_regs;
3502 payload.num_regs++;
3503 if (dispatch_width == 16) {
3504 /* R33: input coverage mask if not SIMD8. */
3505 payload.num_regs++;
3506 }
3507 }
3508
3509 /* R34-: bary for 32-pixel. */
3510 /* R58-59: interp W for 32-pixel. */
3511
3512 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3513 source_depth_to_render_target = true;
3514 }
3515 }
3516
3517 void
3518 fs_visitor::setup_vs_payload()
3519 {
3520 /* R0: thread header, R1: urb handles */
3521 payload.num_regs = 2;
3522 }
3523
3524 void
3525 fs_visitor::assign_binding_table_offsets()
3526 {
3527 assert(stage == MESA_SHADER_FRAGMENT);
3528 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3529 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3530 uint32_t next_binding_table_offset = 0;
3531
3532 /* If there are no color regions, we still perform an FB write to a null
3533 * renderbuffer, which we place at surface index 0.
3534 */
3535 prog_data->binding_table.render_target_start = next_binding_table_offset;
3536 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3537
3538 assign_common_binding_table_offsets(next_binding_table_offset);
3539 }
3540
3541 void
3542 fs_visitor::calculate_register_pressure()
3543 {
3544 invalidate_live_intervals();
3545 calculate_live_intervals();
3546
3547 unsigned num_instructions = 0;
3548 foreach_block(block, cfg)
3549 num_instructions += block->instructions.length();
3550
3551 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3552
3553 for (int reg = 0; reg < virtual_grf_count; reg++) {
3554 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3555 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3556 }
3557 }
3558
3559 void
3560 fs_visitor::optimize()
3561 {
3562 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3563
3564 calculate_cfg();
3565
3566 split_virtual_grfs();
3567
3568 move_uniform_array_access_to_pull_constants();
3569 assign_constant_locations();
3570 demote_pull_constants();
3571
3572 #define OPT(pass, args...) ({ \
3573 pass_num++; \
3574 bool this_progress = pass(args); \
3575 \
3576 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3577 char filename[64]; \
3578 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3579 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3580 \
3581 backend_visitor::dump_instructions(filename); \
3582 } \
3583 \
3584 progress = progress || this_progress; \
3585 this_progress; \
3586 })
3587
3588 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3589 char filename[64];
3590 snprintf(filename, 64, "%s%d-%04d-00-start",
3591 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3592
3593 backend_visitor::dump_instructions(filename);
3594 }
3595
3596 bool progress;
3597 int iteration = 0;
3598 int pass_num = 0;
3599 do {
3600 progress = false;
3601 pass_num = 0;
3602 iteration++;
3603
3604 OPT(remove_duplicate_mrf_writes);
3605
3606 OPT(opt_algebraic);
3607 OPT(opt_cse);
3608 OPT(opt_copy_propagate);
3609 OPT(opt_peephole_predicated_break);
3610 OPT(opt_cmod_propagation);
3611 OPT(dead_code_eliminate);
3612 OPT(opt_peephole_sel);
3613 OPT(dead_control_flow_eliminate, this);
3614 OPT(opt_register_renaming);
3615 OPT(opt_saturate_propagation);
3616 OPT(register_coalesce);
3617 OPT(compute_to_mrf);
3618
3619 OPT(compact_virtual_grfs);
3620 } while (progress);
3621
3622 pass_num = 0;
3623
3624 if (OPT(lower_load_payload)) {
3625 split_virtual_grfs();
3626 OPT(register_coalesce);
3627 OPT(compute_to_mrf);
3628 OPT(dead_code_eliminate);
3629 }
3630
3631 lower_uniform_pull_constant_loads();
3632 }
3633
3634 /**
3635 * Three-source instructions must have a GRF/MRF destination register.
3636 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3637 */
3638 void
3639 fs_visitor::fixup_3src_null_dest()
3640 {
3641 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3642 if (inst->is_3src() && inst->dst.is_null()) {
3643 inst->dst = fs_reg(GRF, virtual_grf_alloc(dispatch_width / 8),
3644 inst->dst.type);
3645 }
3646 }
3647 }
3648
3649 void
3650 fs_visitor::allocate_registers()
3651 {
3652 bool allocated_without_spills;
3653
3654 static const enum instruction_scheduler_mode pre_modes[] = {
3655 SCHEDULE_PRE,
3656 SCHEDULE_PRE_NON_LIFO,
3657 SCHEDULE_PRE_LIFO,
3658 };
3659
3660 /* Try each scheduling heuristic to see if it can successfully register
3661 * allocate without spilling. They should be ordered by decreasing
3662 * performance but increasing likelihood of allocating.
3663 */
3664 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3665 schedule_instructions(pre_modes[i]);
3666
3667 if (0) {
3668 assign_regs_trivial();
3669 allocated_without_spills = true;
3670 } else {
3671 allocated_without_spills = assign_regs(false);
3672 }
3673 if (allocated_without_spills)
3674 break;
3675 }
3676
3677 if (!allocated_without_spills) {
3678 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3679 "Vertex" : "Fragment";
3680
3681 /* We assume that any spilling is worse than just dropping back to
3682 * SIMD8. There's probably actually some intermediate point where
3683 * SIMD16 with a couple of spills is still better.
3684 */
3685 if (dispatch_width == 16) {
3686 fail("Failure to register allocate. Reduce number of "
3687 "live scalar values to avoid this.");
3688 } else {
3689 perf_debug("%s shader triggered register spilling. "
3690 "Try reducing the number of live scalar values to "
3691 "improve performance.\n", stage_name);
3692 }
3693
3694 /* Since we're out of heuristics, just go spill registers until we
3695 * get an allocation.
3696 */
3697 while (!assign_regs(true)) {
3698 if (failed)
3699 break;
3700 }
3701 }
3702
3703 /* This must come after all optimization and register allocation, since
3704 * it inserts dead code that happens to have side effects, and it does
3705 * so based on the actual physical registers in use.
3706 */
3707 insert_gen4_send_dependency_workarounds();
3708
3709 if (failed)
3710 return;
3711
3712 if (!allocated_without_spills)
3713 schedule_instructions(SCHEDULE_POST);
3714
3715 if (last_scratch > 0)
3716 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3717 }
3718
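/**
 * Compile a vertex shader through the scalar (SIMD8) backend: translate the
 * GLSL IR into FS IR, emit the URB writes for the outputs, then run the
 * optimizer and register allocator.
 */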
3719 bool
3720 fs_visitor::run_vs()
3721 {
3722 assert(stage == MESA_SHADER_VERTEX);
3723
3724 assign_common_binding_table_offsets(0);
3725 setup_vs_payload();
3726
3727 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3728 emit_shader_time_begin();
3729
3730 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3731 base_ir = ir;
3732 this->result = reg_undef;
3733 ir->accept(this);
3734 }
3735 base_ir = NULL;
3736 if (failed)
3737 return false;
3738
3739 emit_urb_writes();
3740
3741 optimize();
3742
3743 assign_curb_setup();
3744 assign_vs_urb_setup();
3745
3746 fixup_3src_null_dest();
3747 allocate_registers();
3748
3749 return !failed;
3750 }
3751
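/**
 * Compile a fragment shader at the current dispatch width: set up the
 * payload, emit FS IR for the program (or the repclear/dummy shaders in the
 * special cases below), add the framebuffer writes, then optimize and
 * allocate registers.
 */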
3752 bool
3753 fs_visitor::run_fs()
3754 {
3755 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3756 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3757
3758 assert(stage == MESA_SHADER_FRAGMENT);
3759
3760 sanity_param_count = prog->Parameters->NumParameters;
3761
3762 assign_binding_table_offsets();
3763
3764 if (brw->gen >= 6)
3765 setup_payload_gen6();
3766 else
3767 setup_payload_gen4();
3768
3769 if (0) {
3770 emit_dummy_fs();
3771 } else if (brw->use_rep_send && dispatch_width == 16) {
3772 emit_repclear_shader();
3773 } else {
3774 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3775 emit_shader_time_begin();
3776
3777 calculate_urb_setup();
3778 if (prog->InputsRead > 0) {
3779 if (brw->gen < 6)
3780 emit_interpolation_setup_gen4();
3781 else
3782 emit_interpolation_setup_gen6();
3783 }
3784
3785 /* We handle discards by keeping track of the still-live pixels in f0.1.
3786 * Initialize it with the dispatched pixels.
3787 */
3788 if (wm_prog_data->uses_kill) {
3789 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3790 discard_init->flag_subreg = 1;
3791 }
3792
3793 /* Generate FS IR for main(). (The visitor only descends into
3794 * functions called "main".)
3795 */
3796 if (shader) {
3797 if (getenv("INTEL_USE_NIR") != NULL) {
3798 emit_nir_code();
3799 } else {
3800 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3801 base_ir = ir;
3802 this->result = reg_undef;
3803 ir->accept(this);
3804 }
3805 }
3806 } else {
3807 emit_fragment_program_code();
3808 }
3809 base_ir = NULL;
3810 if (failed)
3811 return false;
3812
3813 emit(FS_OPCODE_PLACEHOLDER_HALT);
3814
3815 if (wm_key->alpha_test_func)
3816 emit_alpha_test();
3817
3818 emit_fb_writes();
3819
3820 optimize();
3821
3822 assign_curb_setup();
3823 assign_urb_setup();
3824
3825 fixup_3src_null_dest();
3826 allocate_registers();
3827
3828 if (failed)
3829 return false;
3830 }
3831
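   /* Record the register footprint of this variant; the SIMD8 and SIMD16
    * compiles report it in separate fields so the later state setup can
    * program each dispatch mode independently.
    */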
3832 if (dispatch_width == 8)
3833 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3834 else
3835 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3836
3837 /* If any state parameters were appended, then ParameterValues could have
3838 * been realloced, in which case the driver uniform storage set up by
3839 * _mesa_associate_uniform_storage() would point to freed memory. Make
3840 * sure that didn't happen.
3841 */
3842 assert(sanity_param_count == prog->Parameters->NumParameters);
3843
3844 return !failed;
3845 }
3846
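/**
 * Compile a fragment shader: always run a SIMD8 compile, additionally try a
 * SIMD16 compile when the hardware and debug flags allow it, and hand the
 * resulting CFG(s) to the generator to produce a single block of assembly.
 */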
3847 const unsigned *
3848 brw_wm_fs_emit(struct brw_context *brw,
3849 void *mem_ctx,
3850 const struct brw_wm_prog_key *key,
3851 struct brw_wm_prog_data *prog_data,
3852 struct gl_fragment_program *fp,
3853 struct gl_shader_program *prog,
3854 unsigned *final_assembly_size)
3855 {
3856 bool start_busy = false;
3857 double start_time = 0;
3858
3859 if (unlikely(brw->perf_debug)) {
3860 start_busy = (brw->batch.last_bo &&
3861 drm_intel_bo_busy(brw->batch.last_bo));
3862 start_time = get_time();
3863 }
3864
3865 struct brw_shader *shader = NULL;
3866 if (prog)
3867 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3868
3869 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3870 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3871
3872 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3873 */
3874 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3875 if (!v.run_fs()) {
3876 if (prog) {
3877 prog->LinkStatus = false;
3878 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3879 }
3880
3881 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3882 v.fail_msg);
3883
3884 return NULL;
3885 }
3886
3887 cfg_t *simd16_cfg = NULL;
3888 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3889 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3890 brw->use_rep_send)) {
3891 if (!v.simd16_unsupported) {
3892 /* Try a SIMD16 compile */
3893 v2.import_uniforms(&v);
3894 if (!v2.run_fs()) {
3895 perf_debug("SIMD16 shader failed to compile, falling back to "
3896 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3897 } else {
3898 simd16_cfg = v2.cfg;
3899 }
3900 } else {
3901 perf_debug("SIMD16 shader unsupported, falling back to "
3902 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3903 }
3904 }
3905
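   /* Decide which variants to keep. SIMD8 can be disabled either by
    * INTEL_DEBUG=no8 or by brw->no_simd8; if a SIMD16 program is available
    * in that case, drop the SIMD8 CFG and flag the program so the later
    * dispatch setup knows only 16-wide code exists.
    */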
3906 cfg_t *simd8_cfg;
3907 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3908 if (no_simd8 && simd16_cfg) {
3909 simd8_cfg = NULL;
3910 prog_data->no_8 = true;
3911 } else {
3912 simd8_cfg = v.cfg;
3913 prog_data->no_8 = false;
3914 }
3915
3916 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3917 &fp->Base, v.runtime_check_aads_emit, "FS");
3918
3919 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3920 char *name;
3921 if (prog)
3922 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3923 prog->Label ? prog->Label : "unnamed",
3924 prog->Name);
3925 else
3926 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3927
3928 g.enable_debug(name);
3929 }
3930
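   /* The SIMD8 program (if any) is emitted first, so it lives at offset 0
    * of the assembly; the SIMD16 program follows it, and prog_offset_16
    * records where that second variant starts.
    */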
3931 if (simd8_cfg)
3932 g.generate_code(simd8_cfg, 8);
3933 if (simd16_cfg)
3934 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3935
3936 if (unlikely(brw->perf_debug) && shader) {
3937 if (shader->compiled_once)
3938 brw_wm_debug_recompile(brw, prog, key);
3939 shader->compiled_once = true;
3940
3941 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3942 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3943 (get_time() - start_time) * 1000);
3944 }
3945 }
3946
3947 return g.get_assembly(final_assembly_size);
3948 }
3949
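/**
 * Compile the fragment shader at link time with a guessed program key, so
 * that a binary is ready before the first draw that uses it. The guessed
 * key covers the state bits that normally feed brw_wm_prog_key (depth/alpha
 * test on gen4-5, sampler swizzles, color buffer count, and so on).
 */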
3950 extern "C" bool
3951 brw_fs_precompile(struct gl_context *ctx,
3952 struct gl_shader_program *shader_prog,
3953 struct gl_program *prog)
3954 {
3955 struct brw_context *brw = brw_context(ctx);
3956 struct brw_wm_prog_key key;
3957
3958 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3959 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3960 bool program_uses_dfdy = fp->UsesDFdy;
3961
3962 memset(&key, 0, sizeof(key));
3963
3964 if (brw->gen < 6) {
3965 if (fp->UsesKill)
3966 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3967
3968 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3969 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3970
3971 /* Just assume depth testing. */
3972 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3973 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3974 }
3975
3976 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3977 BRW_FS_VARYING_INPUT_MASK) > 16)
3978 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3979
3980 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
3981 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3982 for (unsigned i = 0; i < sampler_count; i++) {
3983 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
3984 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3985 key.tex.swizzles[i] =
3986 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3987 } else {
3988 /* Color sampler: assume no swizzling. */
3989 key.tex.swizzles[i] = SWIZZLE_XYZW;
3990 }
3991 }
3992
3993 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3994 key.drawable_height = ctx->DrawBuffer->Height;
3995 }
3996
3997 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3998 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3999 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
4000
4001 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
4002 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
4003 key.nr_color_regions > 1;
4004 }
4005
4006 key.program_string_id = bfp->id;
4007
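   /* do_wm_prog() uploads the program and updates brw->wm as a side effect;
    * save and restore that state so precompiling here doesn't disturb
    * whatever program is currently bound.
    */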
4008 uint32_t old_prog_offset = brw->wm.base.prog_offset;
4009 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
4010
4011 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
4012
4013 brw->wm.base.prog_offset = old_prog_offset;
4014 brw->wm.prog_data = old_prog_data;
4015
4016 return success;
4017 }