i965: Set CMP's destination type to src0's type.
[mesa.git] / src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "util/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "util/register_allocate.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_fs.h"
48 #include "brw_cfg.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53 #include "program/sampler.h"
54
55 void
56 fs_inst::init(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
57 fs_reg *src, int sources)
58 {
59 memset(this, 0, sizeof(*this));
60
61 this->opcode = opcode;
62 this->dst = dst;
63 this->src = src;
64 this->sources = sources;
65 this->exec_size = exec_size;
66
67 assert(dst.file != IMM && dst.file != UNIFORM);
68
69 /* If exec_size == 0, try to guess it from the registers. Since all
70 * manner of things may use hardware registers, we first try to guess
71 * based on GRF registers. If this fails, we will go ahead and take the
72 * width from the destination register.
73 */
74 if (this->exec_size == 0) {
75 if (dst.file == GRF) {
76 this->exec_size = dst.width;
77 } else {
78 for (int i = 0; i < sources; ++i) {
79 if (src[i].file != GRF && src[i].file != ATTR)
80 continue;
81
82 if (this->exec_size <= 1)
83 this->exec_size = src[i].width;
84 assert(src[i].width == 1 || src[i].width == this->exec_size);
85 }
86 }
87
88 if (this->exec_size == 0 && dst.file != BAD_FILE)
89 this->exec_size = dst.width;
90 }
91 assert(this->exec_size != 0);
92
93 for (int i = 0; i < sources; ++i) {
94 switch (this->src[i].file) {
95 case BAD_FILE:
96 this->src[i].effective_width = 8;
97 break;
98 case GRF:
99 case HW_REG:
100 case ATTR:
101 assert(this->src[i].width > 0);
102 if (this->src[i].width == 1) {
103 this->src[i].effective_width = this->exec_size;
104 } else {
105 this->src[i].effective_width = this->src[i].width;
106 }
107 break;
108 case IMM:
109 case UNIFORM:
110 this->src[i].effective_width = this->exec_size;
111 break;
112 default:
113 unreachable("Invalid source register file");
114 }
115 }
116 this->dst.effective_width = this->exec_size;
117
118 this->conditional_mod = BRW_CONDITIONAL_NONE;
119
120 /* This will be the case for almost all instructions. */
121 switch (dst.file) {
122 case GRF:
123 case HW_REG:
124 case MRF:
125 case ATTR:
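      /* Round the write size up to whole 32-byte GRFs: e.g. an 8-wide float
       * destination with stride 1 covers 8 * 1 * 4 = 32 bytes (one register),
       * while a 16-wide float destination covers two.
       */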
126 this->regs_written = (dst.width * dst.stride * type_sz(dst.type) + 31) / 32;
127 break;
128 case BAD_FILE:
129 this->regs_written = 0;
130 break;
131 case IMM:
132 case UNIFORM:
133 unreachable("Invalid destination register file");
134 default:
135 unreachable("Invalid register file");
136 }
137
138 this->writes_accumulator = false;
139 }
140
141 fs_inst::fs_inst()
142 {
143 fs_reg *src = ralloc_array(this, fs_reg, 3);
144 init(BRW_OPCODE_NOP, 8, dst, src, 0);
145 }
146
147 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size)
148 {
149 fs_reg *src = ralloc_array(this, fs_reg, 3);
150 init(opcode, exec_size, reg_undef, src, 0);
151 }
152
153 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst)
154 {
155 fs_reg *src = ralloc_array(this, fs_reg, 3);
156 init(opcode, 0, dst, src, 0);
157 }
158
159 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
160 const fs_reg &src0)
161 {
162 fs_reg *src = ralloc_array(this, fs_reg, 3);
163 src[0] = src0;
164 init(opcode, exec_size, dst, src, 1);
165 }
166
167 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
168 {
169 fs_reg *src = ralloc_array(this, fs_reg, 3);
170 src[0] = src0;
171 init(opcode, 0, dst, src, 1);
172 }
173
174 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
175 const fs_reg &src0, const fs_reg &src1)
176 {
177 fs_reg *src = ralloc_array(this, fs_reg, 3);
178 src[0] = src0;
179 src[1] = src1;
180 init(opcode, exec_size, dst, src, 2);
181 }
182
183 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
184 const fs_reg &src1)
185 {
186 fs_reg *src = ralloc_array(this, fs_reg, 3);
187 src[0] = src0;
188 src[1] = src1;
189 init(opcode, 0, dst, src, 2);
190 }
191
192 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_size, const fs_reg &dst,
193 const fs_reg &src0, const fs_reg &src1, const fs_reg &src2)
194 {
195 fs_reg *src = ralloc_array(this, fs_reg, 3);
196 src[0] = src0;
197 src[1] = src1;
198 src[2] = src2;
199 init(opcode, exec_size, dst, src, 3);
200 }
201
202 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
203 const fs_reg &src1, const fs_reg &src2)
204 {
205 fs_reg *src = ralloc_array(this, fs_reg, 3);
206 src[0] = src0;
207 src[1] = src1;
208 src[2] = src2;
209 init(opcode, 0, dst, src, 3);
210 }
211
212 fs_inst::fs_inst(enum opcode opcode, const fs_reg &dst, fs_reg src[], int sources)
213 {
214 init(opcode, 0, dst, src, sources);
215 }
216
217 fs_inst::fs_inst(enum opcode opcode, uint8_t exec_width, const fs_reg &dst,
218 fs_reg src[], int sources)
219 {
220 init(opcode, exec_width, dst, src, sources);
221 }
222
223 fs_inst::fs_inst(const fs_inst &that)
224 {
225 memcpy(this, &that, sizeof(that));
226
227 this->src = ralloc_array(this, fs_reg, that.sources);
228
229 for (int i = 0; i < that.sources; i++)
230 this->src[i] = that.src[i];
231 }
232
233 void
234 fs_inst::resize_sources(uint8_t num_sources)
235 {
236 if (this->sources != num_sources) {
237 this->src = reralloc(this, this->src, fs_reg, num_sources);
238 this->sources = num_sources;
239 }
240 }
241
242 #define ALU1(op) \
243 fs_inst * \
244 fs_visitor::op(const fs_reg &dst, const fs_reg &src0) \
245 { \
246 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
247 }
248
249 #define ALU2(op) \
250 fs_inst * \
251 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
252 const fs_reg &src1) \
253 { \
254 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
255 }
256
257 #define ALU2_ACC(op) \
258 fs_inst * \
259 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
260 const fs_reg &src1) \
261 { \
262 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);\
263 inst->writes_accumulator = true; \
264 return inst; \
265 }
266
267 #define ALU3(op) \
268 fs_inst * \
269 fs_visitor::op(const fs_reg &dst, const fs_reg &src0, \
270 const fs_reg &src1, const fs_reg &src2) \
271 { \
272 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
273 }
274
275 ALU1(NOT)
276 ALU1(MOV)
277 ALU1(FRC)
278 ALU1(RNDD)
279 ALU1(RNDE)
280 ALU1(RNDZ)
281 ALU2(ADD)
282 ALU2(MUL)
283 ALU2_ACC(MACH)
284 ALU2(AND)
285 ALU2(OR)
286 ALU2(XOR)
287 ALU2(SHL)
288 ALU2(SHR)
289 ALU2(ASR)
290 ALU3(LRP)
291 ALU1(BFREV)
292 ALU3(BFE)
293 ALU2(BFI1)
294 ALU3(BFI2)
295 ALU1(FBH)
296 ALU1(FBL)
297 ALU1(CBIT)
298 ALU3(MAD)
299 ALU2_ACC(ADDC)
300 ALU2_ACC(SUBB)
301 ALU2(SEL)
302 ALU2(MAC)
303
304 /** Gen4 predicated IF. */
305 fs_inst *
306 fs_visitor::IF(enum brw_predicate predicate)
307 {
308 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width);
309 inst->predicate = predicate;
310 return inst;
311 }
312
313 /** Gen6 IF with embedded comparison. */
314 fs_inst *
315 fs_visitor::IF(const fs_reg &src0, const fs_reg &src1,
316 enum brw_conditional_mod condition)
317 {
318 assert(brw->gen == 6);
319 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF, dispatch_width,
320 reg_null_d, src0, src1);
321 inst->conditional_mod = condition;
322 return inst;
323 }
324
325 /**
326 * CMP: Sets the low bit of the destination channels with the result
327 * of the comparison, while the upper bits are undefined, and updates
328 * the flag register with the packed 16 bits of the result.
329 */
330 fs_inst *
331 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1,
332 enum brw_conditional_mod condition)
333 {
334 fs_inst *inst;
335
336 /* Take the instruction:
337 *
338 * CMP null<d> src0<f> src1<f>
339 *
340 * Original gen4 does type conversion to the destination type before
341 * comparison, producing garbage results for floating point comparisons.
342 *
343 * The destination type doesn't matter on newer generations, so we set the
344 * type to match src0 so we can compact the instruction.
345 */
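   /* E.g. a float comparison is emitted here as CMP null<F> src0<F> src1<F>
    * rather than keeping a D-typed null destination.
    */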
346 dst.type = src0.type;
347 if (dst.file == HW_REG)
348 dst.fixed_hw_reg.type = dst.type;
349
350 resolve_ud_negate(&src0);
351 resolve_ud_negate(&src1);
352
353 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
354 inst->conditional_mod = condition;
355
356 return inst;
357 }
358
359 fs_inst *
360 fs_visitor::LOAD_PAYLOAD(const fs_reg &dst, fs_reg *src, int sources)
361 {
362 uint8_t exec_size = dst.width;
363 for (int i = 0; i < sources; ++i) {
364 assert(src[i].width % dst.width == 0);
365 if (src[i].width > exec_size)
366 exec_size = src[i].width;
367 }
368
369 fs_inst *inst = new(mem_ctx) fs_inst(SHADER_OPCODE_LOAD_PAYLOAD, exec_size,
370 dst, src, sources);
371 inst->regs_written = 0;
372 for (int i = 0; i < sources; ++i) {
373 /* The LOAD_PAYLOAD instruction only really makes sense if we are
374 * dealing with whole registers. If this ever changes, we can deal
375 * with it later.
376 */
377 int size = src[i].effective_width * type_sz(src[i].type);
378 assert(size % 32 == 0);
379 inst->regs_written += (size + 31) / 32;
380 }
381
382 return inst;
383 }
384
385 exec_list
386 fs_visitor::VARYING_PULL_CONSTANT_LOAD(const fs_reg &dst,
387 const fs_reg &surf_index,
388 const fs_reg &varying_offset,
389 uint32_t const_offset)
390 {
391 exec_list instructions;
392 fs_inst *inst;
393
394 /* We have our constant surface use a pitch of 4 bytes, so our index can
395 * be any component of a vector, and then we load 4 contiguous
396 * components starting from that.
397 *
398 * We break down the const_offset to a portion added to the variable
399 * offset and a portion done using reg_offset, which means that if you
400 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
401 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
402 * CSE can later notice that those loads are all the same and eliminate
403 * the redundant ones.
404 */
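   /* Concretely, a const_offset of 6 contributes 4 to vec4_offset below and
    * then picks component 2 out of the loaded vec4 at the end.
    */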
405 fs_reg vec4_offset = vgrf(glsl_type::int_type);
406 instructions.push_tail(ADD(vec4_offset,
407 varying_offset, fs_reg(const_offset & ~3)));
408
409 int scale = 1;
410 if (brw->gen == 4 && dst.width == 8) {
411 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
412 * u, v, r) as parameters, or we can just use the SIMD16 message
413 * consisting of (header, u). We choose the second, at the cost of a
414 * longer return length.
415 */
416 scale = 2;
417 }
418
419 enum opcode op;
420 if (brw->gen >= 7)
421 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
422 else
423 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
424
425 assert(dst.width % 8 == 0);
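   /* The return payload is one register per vec4 component per 8 channels,
    * e.g. a SIMD8 load on gen4 (scale == 2) reserves 4 * 1 * 2 = 8 registers.
    */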
426 int regs_written = 4 * (dst.width / 8) * scale;
427 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(regs_written),
428 dst.type, dst.width);
429 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
430 inst->regs_written = regs_written;
431 instructions.push_tail(inst);
432
433 if (brw->gen < 7) {
434 inst->base_mrf = 13;
435 inst->header_present = true;
436 if (brw->gen == 4)
437 inst->mlen = 3;
438 else
439 inst->mlen = 1 + dispatch_width / 8;
440 }
441
442 fs_reg result = offset(vec4_result, (const_offset & 3) * scale);
443 instructions.push_tail(MOV(dst, result));
444
445 return instructions;
446 }
447
448 /**
449 * A helper for MOV generation for fixing up broken hardware SEND dependency
450 * handling.
451 */
452 fs_inst *
453 fs_visitor::DEP_RESOLVE_MOV(int grf)
454 {
455 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
456
457 inst->ir = NULL;
458 inst->annotation = "send dependency resolve";
459
460 /* The caller always wants uncompressed to emit the minimal extra
461 * dependencies, and to avoid having to deal with aligning its regs to 2.
462 */
463 inst->exec_size = 8;
464
465 return inst;
466 }
467
468 bool
469 fs_inst::equals(fs_inst *inst) const
470 {
471 return (opcode == inst->opcode &&
472 dst.equals(inst->dst) &&
473 src[0].equals(inst->src[0]) &&
474 src[1].equals(inst->src[1]) &&
475 src[2].equals(inst->src[2]) &&
476 saturate == inst->saturate &&
477 predicate == inst->predicate &&
478 conditional_mod == inst->conditional_mod &&
479 mlen == inst->mlen &&
480 base_mrf == inst->base_mrf &&
481 target == inst->target &&
482 eot == inst->eot &&
483 header_present == inst->header_present &&
484 shadow_compare == inst->shadow_compare &&
485 exec_size == inst->exec_size &&
486 offset == inst->offset);
487 }
488
489 bool
490 fs_inst::overwrites_reg(const fs_reg &reg) const
491 {
492 return (reg.file == dst.file &&
493 reg.reg == dst.reg &&
494 reg.reg_offset >= dst.reg_offset &&
495 reg.reg_offset < dst.reg_offset + regs_written);
496 }
497
498 bool
499 fs_inst::is_send_from_grf() const
500 {
501 switch (opcode) {
502 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7:
503 case SHADER_OPCODE_SHADER_TIME_ADD:
504 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
505 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
506 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
507 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
508 case SHADER_OPCODE_UNTYPED_ATOMIC:
509 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
510 case SHADER_OPCODE_URB_WRITE_SIMD8:
511 return true;
512 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
513 return src[1].file == GRF;
514 case FS_OPCODE_FB_WRITE:
515 return src[0].file == GRF;
516 default:
517 if (is_tex())
518 return src[0].file == GRF;
519
520 return false;
521 }
522 }
523
524 bool
525 fs_inst::can_do_source_mods(struct brw_context *brw)
526 {
527 if (brw->gen == 6 && is_math())
528 return false;
529
530 if (is_send_from_grf())
531 return false;
532
533 if (!backend_instruction::can_do_source_mods())
534 return false;
535
536 return true;
537 }
538
539 void
540 fs_reg::init()
541 {
542 memset(this, 0, sizeof(*this));
543 stride = 1;
544 }
545
546 /** Generic unset register constructor. */
547 fs_reg::fs_reg()
548 {
549 init();
550 this->file = BAD_FILE;
551 }
552
553 /** Immediate value constructor. */
554 fs_reg::fs_reg(float f)
555 {
556 init();
557 this->file = IMM;
558 this->type = BRW_REGISTER_TYPE_F;
559 this->fixed_hw_reg.dw1.f = f;
560 this->width = 1;
561 }
562
563 /** Immediate value constructor. */
564 fs_reg::fs_reg(int32_t i)
565 {
566 init();
567 this->file = IMM;
568 this->type = BRW_REGISTER_TYPE_D;
569 this->fixed_hw_reg.dw1.d = i;
570 this->width = 1;
571 }
572
573 /** Immediate value constructor. */
574 fs_reg::fs_reg(uint32_t u)
575 {
576 init();
577 this->file = IMM;
578 this->type = BRW_REGISTER_TYPE_UD;
579 this->fixed_hw_reg.dw1.ud = u;
580 this->width = 1;
581 }
582
583 /** Vector float immediate value constructor. */
584 fs_reg::fs_reg(uint8_t vf[4])
585 {
586 init();
587 this->file = IMM;
588 this->type = BRW_REGISTER_TYPE_VF;
589 memcpy(&this->fixed_hw_reg.dw1.ud, vf, sizeof(unsigned));
590 }
591
592 /** Vector float immediate value constructor. */
593 fs_reg::fs_reg(uint8_t vf0, uint8_t vf1, uint8_t vf2, uint8_t vf3)
594 {
595 init();
596 this->file = IMM;
597 this->type = BRW_REGISTER_TYPE_VF;
598 this->fixed_hw_reg.dw1.ud = (vf0 << 0) |
599 (vf1 << 8) |
600 (vf2 << 16) |
601 (vf3 << 24);
602 }
603
604 /** Fixed brw_reg. */
605 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
606 {
607 init();
608 this->file = HW_REG;
609 this->fixed_hw_reg = fixed_hw_reg;
610 this->type = fixed_hw_reg.type;
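   /* brw_reg stores width as a log2 encoding (e.g. BRW_WIDTH_8 == 3), so
    * decode it back into an element count.
    */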
611 this->width = 1 << fixed_hw_reg.width;
612 }
613
614 bool
615 fs_reg::equals(const fs_reg &r) const
616 {
617 return (file == r.file &&
618 reg == r.reg &&
619 reg_offset == r.reg_offset &&
620 subreg_offset == r.subreg_offset &&
621 type == r.type &&
622 negate == r.negate &&
623 abs == r.abs &&
624 !reladdr && !r.reladdr &&
625 memcmp(&fixed_hw_reg, &r.fixed_hw_reg, sizeof(fixed_hw_reg)) == 0 &&
626 width == r.width &&
627 stride == r.stride);
628 }
629
630 fs_reg &
631 fs_reg::set_smear(unsigned subreg)
632 {
633 assert(file != HW_REG && file != IMM);
634 subreg_offset = subreg * type_sz(type);
635 stride = 0;
636 return *this;
637 }
638
639 bool
640 fs_reg::is_contiguous() const
641 {
642 return stride == 1;
643 }
644
645 int
646 fs_visitor::type_size(const struct glsl_type *type)
647 {
648 unsigned int size, i;
649
650 switch (type->base_type) {
651 case GLSL_TYPE_UINT:
652 case GLSL_TYPE_INT:
653 case GLSL_TYPE_FLOAT:
654 case GLSL_TYPE_BOOL:
655 return type->components();
656 case GLSL_TYPE_ARRAY:
657 return type_size(type->fields.array) * type->length;
658 case GLSL_TYPE_STRUCT:
659 size = 0;
660 for (i = 0; i < type->length; i++) {
661 size += type_size(type->fields.structure[i].type);
662 }
663 return size;
664 case GLSL_TYPE_SAMPLER:
665 /* Samplers take up no register space, since they're baked in at
666 * link time.
667 */
668 return 0;
669 case GLSL_TYPE_ATOMIC_UINT:
670 return 0;
671 case GLSL_TYPE_IMAGE:
672 case GLSL_TYPE_VOID:
673 case GLSL_TYPE_ERROR:
674 case GLSL_TYPE_INTERFACE:
675 unreachable("not reached");
676 }
677
678 return 0;
679 }
680
681 fs_reg
682 fs_visitor::get_timestamp()
683 {
684 assert(brw->gen >= 7);
685
686 fs_reg ts = fs_reg(retype(brw_vec4_reg(BRW_ARCHITECTURE_REGISTER_FILE,
687 BRW_ARF_TIMESTAMP,
688 0),
689 BRW_REGISTER_TYPE_UD));
690
691 fs_reg dst = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 4);
692
693 fs_inst *mov = emit(MOV(dst, ts));
694 /* We want to read the 3 fields we care about even if it's not enabled in
695 * the dispatch.
696 */
697 mov->force_writemask_all = true;
698
699 /* The caller wants the low 32 bits of the timestamp. Since it's running
700 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
701 * which is plenty of time for our purposes. It is identical across the
702 * EUs, but since it's tracking GPU core speed it will increment at a
703 * varying rate as render P-states change.
704 *
705 * The caller could also check if render P-states have changed (or anything
706 * else that might disrupt timing) by setting smear to 2 and checking if
707 * that field is != 0.
708 */
709 dst.set_smear(0);
710
711 return dst;
712 }
713
714 void
715 fs_visitor::emit_shader_time_begin()
716 {
717 current_annotation = "shader time start";
718 shader_start_time = get_timestamp();
719 }
720
721 void
722 fs_visitor::emit_shader_time_end()
723 {
724 current_annotation = "shader time end";
725
726 enum shader_time_shader_type type, written_type, reset_type;
727 if (dispatch_width == 8) {
728 type = ST_FS8;
729 written_type = ST_FS8_WRITTEN;
730 reset_type = ST_FS8_RESET;
731 } else {
732 assert(dispatch_width == 16);
733 type = ST_FS16;
734 written_type = ST_FS16_WRITTEN;
735 reset_type = ST_FS16_RESET;
736 }
737
738 fs_reg shader_end_time = get_timestamp();
739
740 /* Check that there weren't any timestamp reset events (assuming these
741 * were the only two timestamp reads that happened).
742 */
743 fs_reg reset = shader_end_time;
744 reset.set_smear(2);
745 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
746 test->conditional_mod = BRW_CONDITIONAL_Z;
747 emit(IF(BRW_PREDICATE_NORMAL));
748
749 fs_reg start = shader_start_time;
750 start.negate = true;
751 fs_reg diff = fs_reg(GRF, virtual_grf_alloc(1), BRW_REGISTER_TYPE_UD, 1);
752 emit(ADD(diff, start, shader_end_time));
753
754 /* If there were no instructions between the two timestamp gets, the diff
755 * is 2 cycles. Remove that overhead, so I can forget about that when
756 * trying to determine the time taken for single instructions.
757 */
758 emit(ADD(diff, diff, fs_reg(-2u)));
759
760 emit_shader_time_write(type, diff);
761 emit_shader_time_write(written_type, fs_reg(1u));
762 emit(BRW_OPCODE_ELSE);
763 emit_shader_time_write(reset_type, fs_reg(1u));
764 emit(BRW_OPCODE_ENDIF);
765 }
766
767 void
768 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
769 fs_reg value)
770 {
771 int shader_time_index =
772 brw_get_shader_time_index(brw, shader_prog, prog, type);
773 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
774
775 fs_reg payload;
776 if (dispatch_width == 8)
777 payload = vgrf(glsl_type::uvec2_type);
778 else
779 payload = vgrf(glsl_type::uint_type);
780
781 emit(new(mem_ctx) fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
782 fs_reg(), payload, offset, value));
783 }
784
785 void
786 fs_visitor::vfail(const char *format, va_list va)
787 {
788 char *msg;
789
790 if (failed)
791 return;
792
793 failed = true;
794
795 msg = ralloc_vasprintf(mem_ctx, format, va);
796 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
797
798 this->fail_msg = msg;
799
800 if (INTEL_DEBUG & DEBUG_WM) {
801 fprintf(stderr, "%s", msg);
802 }
803 }
804
805 void
806 fs_visitor::fail(const char *format, ...)
807 {
808 va_list va;
809
810 va_start(va, format);
811 vfail(format, va);
812 va_end(va);
813 }
814
815 /**
816 * Mark this program as impossible to compile in SIMD16 mode.
817 *
818 * During the SIMD8 compile (which happens first), we can detect and flag
819 * things that are unsupported in SIMD16 mode, so the compiler can skip
820 * the SIMD16 compile altogether.
821 *
822 * During a SIMD16 compile (if one happens anyway), this just calls fail().
823 */
824 void
825 fs_visitor::no16(const char *format, ...)
826 {
827 va_list va;
828
829 va_start(va, format);
830
831 if (dispatch_width == 16) {
832 vfail(format, va);
833 } else {
834 simd16_unsupported = true;
835
836 if (brw->perf_debug) {
837 if (no16_msg)
838 ralloc_vasprintf_append(&no16_msg, format, va);
839 else
840 no16_msg = ralloc_vasprintf(mem_ctx, format, va);
841 }
842 }
843
844 va_end(va);
845 }
846
847 fs_inst *
848 fs_visitor::emit(enum opcode opcode)
849 {
850 return emit(new(mem_ctx) fs_inst(opcode, dispatch_width));
851 }
852
853 fs_inst *
854 fs_visitor::emit(enum opcode opcode, const fs_reg &dst)
855 {
856 return emit(new(mem_ctx) fs_inst(opcode, dst));
857 }
858
859 fs_inst *
860 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0)
861 {
862 return emit(new(mem_ctx) fs_inst(opcode, dst, src0));
863 }
864
865 fs_inst *
866 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
867 const fs_reg &src1)
868 {
869 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1));
870 }
871
872 fs_inst *
873 fs_visitor::emit(enum opcode opcode, const fs_reg &dst, const fs_reg &src0,
874 const fs_reg &src1, const fs_reg &src2)
875 {
876 return emit(new(mem_ctx) fs_inst(opcode, dst, src0, src1, src2));
877 }
878
879 fs_inst *
880 fs_visitor::emit(enum opcode opcode, const fs_reg &dst,
881 fs_reg src[], int sources)
882 {
883 return emit(new(mem_ctx) fs_inst(opcode, dst, src, sources));
884 }
885
886 /**
887 * Returns true if the instruction has a flag that means it won't
888 * update an entire destination register.
889 *
890 * For example, dead code elimination and live variable analysis want to know
891 * when a write to a variable screens off any preceding values that were in
892 * it.
893 */
894 bool
895 fs_inst::is_partial_write() const
896 {
897 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
898 (this->dst.width * type_sz(this->dst.type)) < 32 ||
899 !this->dst.is_contiguous());
900 }
901
902 int
903 fs_inst::regs_read(fs_visitor *v, int arg) const
904 {
905 if (is_tex() && arg == 0 && src[0].file == GRF) {
906 return mlen;
907 } else if (opcode == FS_OPCODE_FB_WRITE && arg == 0) {
908 return mlen;
909 } else if (opcode == SHADER_OPCODE_URB_WRITE_SIMD8 && arg == 0) {
910 return mlen;
911 } else if (opcode == SHADER_OPCODE_UNTYPED_ATOMIC && arg == 0) {
912 return mlen;
913 } else if (opcode == SHADER_OPCODE_UNTYPED_SURFACE_READ && arg == 0) {
914 return mlen;
915 } else if (opcode == FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET && arg == 0) {
916 return mlen;
917 }
918
919 switch (src[arg].file) {
920 case BAD_FILE:
921 case UNIFORM:
922 case IMM:
923 return 1;
924 case GRF:
925 case HW_REG:
926 if (src[arg].stride == 0) {
927 return 1;
928 } else {
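      /* E.g. a SIMD16 float source with stride 1 spans 16 * 1 * 4 = 64 bytes,
       * i.e. two registers.
       */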
929 int size = src[arg].width * src[arg].stride * type_sz(src[arg].type);
930 return (size + 31) / 32;
931 }
932 case MRF:
933 unreachable("MRF registers are not allowed as sources");
934 default:
935 unreachable("Invalid register file");
936 }
937 }
938
939 bool
940 fs_inst::reads_flag() const
941 {
942 return predicate;
943 }
944
945 bool
946 fs_inst::writes_flag() const
947 {
948 return (conditional_mod && (opcode != BRW_OPCODE_SEL &&
949 opcode != BRW_OPCODE_IF &&
950 opcode != BRW_OPCODE_WHILE)) ||
951 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
952 }
953
954 /**
955 * Returns how many MRFs an FS opcode will write over.
956 *
957 * Note that this is not the 0 or 1 implied writes in an actual gen
958 * instruction -- the FS opcodes often generate MOVs in addition.
959 */
960 int
961 fs_visitor::implied_mrf_writes(fs_inst *inst)
962 {
963 if (inst->mlen == 0)
964 return 0;
965
966 if (inst->base_mrf == -1)
967 return 0;
968
969 switch (inst->opcode) {
970 case SHADER_OPCODE_RCP:
971 case SHADER_OPCODE_RSQ:
972 case SHADER_OPCODE_SQRT:
973 case SHADER_OPCODE_EXP2:
974 case SHADER_OPCODE_LOG2:
975 case SHADER_OPCODE_SIN:
976 case SHADER_OPCODE_COS:
977 return 1 * dispatch_width / 8;
978 case SHADER_OPCODE_POW:
979 case SHADER_OPCODE_INT_QUOTIENT:
980 case SHADER_OPCODE_INT_REMAINDER:
981 return 2 * dispatch_width / 8;
982 case SHADER_OPCODE_TEX:
983 case FS_OPCODE_TXB:
984 case SHADER_OPCODE_TXD:
985 case SHADER_OPCODE_TXF:
986 case SHADER_OPCODE_TXF_CMS:
987 case SHADER_OPCODE_TXF_MCS:
988 case SHADER_OPCODE_TG4:
989 case SHADER_OPCODE_TG4_OFFSET:
990 case SHADER_OPCODE_TXL:
991 case SHADER_OPCODE_TXS:
992 case SHADER_OPCODE_LOD:
993 return 1;
994 case FS_OPCODE_FB_WRITE:
995 return 2;
996 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
997 case SHADER_OPCODE_GEN4_SCRATCH_READ:
998 return 1;
999 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
1000 return inst->mlen;
1001 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
1002 return 2;
1003 case SHADER_OPCODE_UNTYPED_ATOMIC:
1004 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
1005 case SHADER_OPCODE_URB_WRITE_SIMD8:
1006 case FS_OPCODE_INTERPOLATE_AT_CENTROID:
1007 case FS_OPCODE_INTERPOLATE_AT_SAMPLE:
1008 case FS_OPCODE_INTERPOLATE_AT_SHARED_OFFSET:
1009 case FS_OPCODE_INTERPOLATE_AT_PER_SLOT_OFFSET:
1010 return 0;
1011 default:
1012 unreachable("not reached");
1013 }
1014 }
1015
1016 int
1017 fs_visitor::virtual_grf_alloc(int size)
1018 {
1019 if (virtual_grf_array_size <= virtual_grf_count) {
1020 if (virtual_grf_array_size == 0)
1021 virtual_grf_array_size = 16;
1022 else
1023 virtual_grf_array_size *= 2;
1024 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
1025 virtual_grf_array_size);
1026 }
1027 virtual_grf_sizes[virtual_grf_count] = size;
1028 return virtual_grf_count++;
1029 }
1030
1031 fs_reg
1032 fs_visitor::vgrf(const glsl_type *const type)
1033 {
1034 int reg_width = dispatch_width / 8;
1035 return fs_reg(GRF, virtual_grf_alloc(type_size(type) * reg_width),
1036 brw_type_for_base_type(type), dispatch_width);
1037 }
1038
1039 fs_reg
1040 fs_visitor::vgrf(int num_components)
1041 {
1042 int reg_width = dispatch_width / 8;
1043 return fs_reg(GRF, virtual_grf_alloc(num_components * reg_width),
1044 BRW_REGISTER_TYPE_F, dispatch_width);
1045 }
1046
1047 /** Generic register constructor taking a file and register number. */
1048 fs_reg::fs_reg(enum register_file file, int reg)
1049 {
1050 init();
1051 this->file = file;
1052 this->reg = reg;
1053 this->type = BRW_REGISTER_TYPE_F;
1054
1055 switch (file) {
1056 case UNIFORM:
1057 this->width = 1;
1058 break;
1059 default:
1060 this->width = 8;
1061 }
1062 }
1063
1064 /** Generic register constructor taking a file, register number, and type. */
1065 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type)
1066 {
1067 init();
1068 this->file = file;
1069 this->reg = reg;
1070 this->type = type;
1071
1072 switch (file) {
1073 case UNIFORM:
1074 this->width = 1;
1075 break;
1076 default:
1077 this->width = 8;
1078 }
1079 }
1080
1081 /** Generic register constructor taking a file, register number, type, and width. */
1082 fs_reg::fs_reg(enum register_file file, int reg, enum brw_reg_type type,
1083 uint8_t width)
1084 {
1085 init();
1086 this->file = file;
1087 this->reg = reg;
1088 this->type = type;
1089 this->width = width;
1090 }
1091
1092 fs_reg *
1093 fs_visitor::variable_storage(ir_variable *var)
1094 {
1095 return (fs_reg *)hash_table_find(this->variable_ht, var);
1096 }
1097
1098 void
1099 import_uniforms_callback(const void *key,
1100 void *data,
1101 void *closure)
1102 {
1103 struct hash_table *dst_ht = (struct hash_table *)closure;
1104 const fs_reg *reg = (const fs_reg *)data;
1105
1106 if (reg->file != UNIFORM)
1107 return;
1108
1109 hash_table_insert(dst_ht, data, key);
1110 }
1111
1112 /* For SIMD16, we need to follow the uniform setup done for the SIMD8
1113 * dispatch. This brings in those uniform definitions.
1114 */
1115 void
1116 fs_visitor::import_uniforms(fs_visitor *v)
1117 {
1118 hash_table_call_foreach(v->variable_ht,
1119 import_uniforms_callback,
1120 variable_ht);
1121 this->push_constant_loc = v->push_constant_loc;
1122 this->pull_constant_loc = v->pull_constant_loc;
1123 this->uniforms = v->uniforms;
1124 this->param_size = v->param_size;
1125 }
1126
1127 /* Our support for uniforms is piggy-backed on the struct
1128 * gl_fragment_program, because that's where the values actually
1129 * get stored, rather than in some global gl_shader_program uniform
1130 * store.
1131 */
1132 void
1133 fs_visitor::setup_uniform_values(ir_variable *ir)
1134 {
1135 int namelen = strlen(ir->name);
1136
1137 /* The data for our (non-builtin) uniforms is stored in a series of
1138 * gl_uniform_driver_storage structs for each subcomponent that
1139 * glGetUniformLocation() could name. We know it's been set up in the same
1140 * order we'd walk the type, so walk the list of storage and find anything
1141 * with our name, or the prefix of a component that starts with our name.
1142 */
1143 unsigned params_before = uniforms;
1144 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
1145 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
1146
1147 if (strncmp(ir->name, storage->name, namelen) != 0 ||
1148 (storage->name[namelen] != 0 &&
1149 storage->name[namelen] != '.' &&
1150 storage->name[namelen] != '[')) {
1151 continue;
1152 }
1153
1154 unsigned slots = storage->type->component_slots();
1155 if (storage->array_elements)
1156 slots *= storage->array_elements;
1157
1158 for (unsigned i = 0; i < slots; i++) {
1159 stage_prog_data->param[uniforms++] = &storage->storage[i];
1160 }
1161 }
1162
1163 /* Make sure we actually initialized the right amount of stuff here. */
1164 assert(params_before + ir->type->component_slots() == uniforms);
1165 (void)params_before;
1166 }
1167
1168
1169 /* Our support for builtin uniforms is even scarier than non-builtin.
1170 * It sits on top of the PROG_STATE_VAR parameters that are
1171 * automatically updated from GL context state.
1172 */
1173 void
1174 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
1175 {
1176 const ir_state_slot *const slots = ir->get_state_slots();
1177 assert(slots != NULL);
1178
1179 for (unsigned int i = 0; i < ir->get_num_state_slots(); i++) {
1180 /* This state reference has already been setup by ir_to_mesa, but we'll
1181 * get the same index back here.
1182 */
1183 int index = _mesa_add_state_reference(this->prog->Parameters,
1184 (gl_state_index *)slots[i].tokens);
1185
1186 /* Add each of the unique swizzles of the element as a parameter.
1187 * This'll end up matching the expected layout of the
1188 * array/matrix/structure we're trying to fill in.
1189 */
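      /* For example, a vec4 state value swizzled XYZW adds four parameters,
       * while a scalar replicated as XXXX stops after the first because of
       * the repeated-swizzle check below.
       */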
1190 int last_swiz = -1;
1191 for (unsigned int j = 0; j < 4; j++) {
1192 int swiz = GET_SWZ(slots[i].swizzle, j);
1193 if (swiz == last_swiz)
1194 break;
1195 last_swiz = swiz;
1196
1197 stage_prog_data->param[uniforms++] =
1198 &prog->Parameters->ParameterValues[index][swiz];
1199 }
1200 }
1201 }
1202
1203 fs_reg *
1204 fs_visitor::emit_fragcoord_interpolation(bool pixel_center_integer,
1205 bool origin_upper_left)
1206 {
1207 assert(stage == MESA_SHADER_FRAGMENT);
1208 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1209 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec4_type));
1210 fs_reg wpos = *reg;
1211 bool flip = !origin_upper_left ^ key->render_to_fbo;
1212
1213 /* gl_FragCoord.x */
1214 if (pixel_center_integer) {
1215 emit(MOV(wpos, this->pixel_x));
1216 } else {
1217 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
1218 }
1219 wpos = offset(wpos, 1);
1220
1221 /* gl_FragCoord.y */
1222 if (!flip && pixel_center_integer) {
1223 emit(MOV(wpos, this->pixel_y));
1224 } else {
1225 fs_reg pixel_y = this->pixel_y;
1226 float offset = (pixel_center_integer ? 0.0 : 0.5);
1227
1228 if (flip) {
1229 pixel_y.negate = true;
1230 offset += key->drawable_height - 1.0;
1231 }
1232
1233 emit(ADD(wpos, pixel_y, fs_reg(offset)));
1234 }
1235 wpos = offset(wpos, 1);
1236
1237 /* gl_FragCoord.z */
1238 if (brw->gen >= 6) {
1239 emit(MOV(wpos, fs_reg(brw_vec8_grf(payload.source_depth_reg, 0))));
1240 } else {
1241 emit(FS_OPCODE_LINTERP, wpos,
1242 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1243 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
1244 interp_reg(VARYING_SLOT_POS, 2));
1245 }
1246 wpos = offset(wpos, 1);
1247
1248 /* gl_FragCoord.w: Already set up in emit_interpolation */
1249 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
1250
1251 return reg;
1252 }
1253
1254 fs_inst *
1255 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
1256 glsl_interp_qualifier interpolation_mode,
1257 bool is_centroid, bool is_sample)
1258 {
1259 brw_wm_barycentric_interp_mode barycoord_mode;
1260 if (brw->gen >= 6) {
1261 if (is_centroid) {
1262 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1263 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1264 else
1265 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1266 } else if (is_sample) {
1267 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1268 barycoord_mode = BRW_WM_PERSPECTIVE_SAMPLE_BARYCENTRIC;
1269 else
1270 barycoord_mode = BRW_WM_NONPERSPECTIVE_SAMPLE_BARYCENTRIC;
1271 } else {
1272 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1273 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1274 else
1275 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1276 }
1277 } else {
1278 /* On Ironlake and below, there is only one interpolation mode.
1279 * Centroid interpolation doesn't mean anything on this hardware --
1280 * there is no multisampling.
1281 */
1282 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1283 }
1284 return emit(FS_OPCODE_LINTERP, attr,
1285 this->delta_x[barycoord_mode],
1286 this->delta_y[barycoord_mode], interp);
1287 }
1288
1289 void
1290 fs_visitor::emit_general_interpolation(fs_reg attr, const char *name,
1291 const glsl_type *type,
1292 glsl_interp_qualifier interpolation_mode,
1293 int location, bool mod_centroid,
1294 bool mod_sample)
1295 {
1296 attr.type = brw_type_for_base_type(type->get_scalar_type());
1297
1298 assert(stage == MESA_SHADER_FRAGMENT);
1299 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1300 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1301
1302 unsigned int array_elements;
1303
1304 if (type->is_array()) {
1305 array_elements = type->length;
1306 if (array_elements == 0) {
1307 fail("dereferenced array '%s' has length 0\n", name);
1308 }
1309 type = type->fields.array;
1310 } else {
1311 array_elements = 1;
1312 }
1313
1314 if (interpolation_mode == INTERP_QUALIFIER_NONE) {
1315 bool is_gl_Color =
1316 location == VARYING_SLOT_COL0 || location == VARYING_SLOT_COL1;
1317 if (key->flat_shade && is_gl_Color) {
1318 interpolation_mode = INTERP_QUALIFIER_FLAT;
1319 } else {
1320 interpolation_mode = INTERP_QUALIFIER_SMOOTH;
1321 }
1322 }
1323
1324 for (unsigned int i = 0; i < array_elements; i++) {
1325 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1326 if (prog_data->urb_setup[location] == -1) {
1327 /* If there's no incoming setup data for this slot, don't
1328 * emit interpolation for it.
1329 */
1330 attr = offset(attr, type->vector_elements);
1331 location++;
1332 continue;
1333 }
1334
1335 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1336 /* Constant interpolation (flat shading) case. The SF has
1337 * handed us defined values in only the constant offset
1338 * field of the setup reg.
1339 */
1340 for (unsigned int k = 0; k < type->vector_elements; k++) {
1341 struct brw_reg interp = interp_reg(location, k);
1342 interp = suboffset(interp, 3);
1343 interp.type = attr.type;
1344 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1345 attr = offset(attr, 1);
1346 }
1347 } else {
1348 /* Smooth/noperspective interpolation case. */
1349 for (unsigned int k = 0; k < type->vector_elements; k++) {
1350 struct brw_reg interp = interp_reg(location, k);
1351 if (brw->needs_unlit_centroid_workaround && mod_centroid) {
1352 /* Get the pixel/sample mask into f0 so that we know
1353 * which pixels are lit. Then, for each channel that is
1354 * unlit, replace the centroid data with non-centroid
1355 * data.
1356 */
1357 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1358
1359 fs_inst *inst;
1360 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1361 false, false);
1362 inst->predicate = BRW_PREDICATE_NORMAL;
1363 inst->predicate_inverse = true;
1364 if (brw->has_pln)
1365 inst->no_dd_clear = true;
1366
1367 inst = emit_linterp(attr, fs_reg(interp), interpolation_mode,
1368 mod_centroid && !key->persample_shading,
1369 mod_sample || key->persample_shading);
1370 inst->predicate = BRW_PREDICATE_NORMAL;
1371 inst->predicate_inverse = false;
1372 if (brw->has_pln)
1373 inst->no_dd_check = true;
1374
1375 } else {
1376 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1377 mod_centroid && !key->persample_shading,
1378 mod_sample || key->persample_shading);
1379 }
1380 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1381 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1382 }
1383 attr = offset(attr, 1);
1384 }
1385
1386 }
1387 location++;
1388 }
1389 }
1390 }
1391
1392 fs_reg *
1393 fs_visitor::emit_frontfacing_interpolation()
1394 {
1395 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::bool_type));
1396
1397 if (brw->gen >= 6) {
1398 /* Bit 15 of g0.0 is 0 if the polygon is front facing. We want to create
1399 * a boolean result from this (~0/true or 0/false).
1400 *
1401 * We can use the fact that bit 15 is the MSB of g0.0:W to accomplish
1402 * this task in only one instruction:
1403 * - a negation source modifier will flip the bit; and
1404 * - a W -> D type conversion will sign extend the bit into the high
1405 * word of the destination.
1406 *
1407 * An ASR 15 fills the low word of the destination.
1408 */
1409 fs_reg g0 = fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_W));
1410 g0.negate = true;
1411
1412 emit(ASR(*reg, g0, fs_reg(15)));
1413 } else {
1414 /* Bit 31 of g1.6 is 0 if the polygon is front facing. We want to create
1415 * a boolean result from this (1/true or 0/false).
1416 *
1417 * Like in the above case, since the bit is the MSB of g1.6:UD we can use
1418 * the negation source modifier to flip it. Unfortunately the SHR
1419 * instruction only operates on UD (or D with an abs source modifier)
1420 * sources without negation.
1421 *
1422 * Instead, use ASR (which will give ~0/true or 0/false).
1423 */
1424 fs_reg g1_6 = fs_reg(retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_D));
1425 g1_6.negate = true;
1426
1427 emit(ASR(*reg, g1_6, fs_reg(31)));
1428 }
1429
1430 return reg;
1431 }
1432
1433 void
1434 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1435 {
1436 assert(stage == MESA_SHADER_FRAGMENT);
1437 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1438 assert(dst.type == BRW_REGISTER_TYPE_F);
1439
1440 if (key->compute_pos_offset) {
1441 /* Convert int_sample_pos to floating point */
1442 emit(MOV(dst, int_sample_pos));
1443 /* Scale to the range [0, 1] */
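      /* The payload gives each position in 1/16-pixel units (0..15), so the
       * 1/16 factor maps it into [0, 1).
       */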
1444 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1445 }
1446 else {
1447 /* From ARB_sample_shading specification:
1448 * "When rendering to a non-multisample buffer, or if multisample
1449 * rasterization is disabled, gl_SamplePosition will always be
1450 * (0.5, 0.5)."
1451 */
1452 emit(MOV(dst, fs_reg(0.5f)));
1453 }
1454 }
1455
1456 fs_reg *
1457 fs_visitor::emit_samplepos_setup()
1458 {
1459 assert(brw->gen >= 6);
1460
1461 this->current_annotation = "compute sample position";
1462 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::vec2_type));
1463 fs_reg pos = *reg;
1464 fs_reg int_sample_x = vgrf(glsl_type::int_type);
1465 fs_reg int_sample_y = vgrf(glsl_type::int_type);
1466
1467 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1468 * mode will be enabled.
1469 *
1470 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1471 * R31.1:0 Position Offset X/Y for Slot[3:0]
1472 * R31.3:2 Position Offset X/Y for Slot[7:4]
1473 * .....
1474 *
1475 * The X, Y sample positions come in as bytes in thread payload. So, read
1476 * the positions using vstride=16, width=8, hstride=2.
1477 */
1478 struct brw_reg sample_pos_reg =
1479 stride(retype(brw_vec1_grf(payload.sample_pos_reg, 0),
1480 BRW_REGISTER_TYPE_B), 16, 8, 2);
1481
1482 if (dispatch_width == 8) {
1483 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1484 } else {
1485 emit(MOV(half(int_sample_x, 0), fs_reg(sample_pos_reg)));
1486 emit(MOV(half(int_sample_x, 1), fs_reg(suboffset(sample_pos_reg, 16))))
1487 ->force_sechalf = true;
1488 }
1489 /* Compute gl_SamplePosition.x */
1490 compute_sample_position(pos, int_sample_x);
1491 pos = offset(pos, 1);
1492 if (dispatch_width == 8) {
1493 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1494 } else {
1495 emit(MOV(half(int_sample_y, 0),
1496 fs_reg(suboffset(sample_pos_reg, 1))));
1497 emit(MOV(half(int_sample_y, 1), fs_reg(suboffset(sample_pos_reg, 17))))
1498 ->force_sechalf = true;
1499 }
1500 /* Compute gl_SamplePosition.y */
1501 compute_sample_position(pos, int_sample_y);
1502 return reg;
1503 }
1504
1505 fs_reg *
1506 fs_visitor::emit_sampleid_setup()
1507 {
1508 assert(stage == MESA_SHADER_FRAGMENT);
1509 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1510 assert(brw->gen >= 6);
1511
1512 this->current_annotation = "compute sample id";
1513 fs_reg *reg = new(this->mem_ctx) fs_reg(vgrf(glsl_type::int_type));
1514
1515 if (key->compute_sample_id) {
1516 fs_reg t1 = vgrf(glsl_type::int_type);
1517 fs_reg t2 = vgrf(glsl_type::int_type);
1518 t2.type = BRW_REGISTER_TYPE_UW;
1519
1520 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1521 * 8x multisampling, subspan 0 will represent sample N (where N
1522 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1523 * 7. We can find the value of N by looking at R0.0 bits 7:6
1524 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1525 * (since samples are always delivered in pairs). That is, we
1526 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1527 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1528 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1529 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1530 * populating a temporary variable with the sequence (0, 1, 2, 3),
1531 * and then reading from it using vstride=1, width=4, hstride=0.
1532 * These computations hold good for 4x multisampling as well.
1533 *
1534 * For 2x MSAA and SIMD16, we want to use the sequence (0, 1, 0, 1):
1535 * the first four slots are sample 0 of subspan 0; the next four
1536 * are sample 1 of subspan 0; the third group is sample 0 of
1537 * subspan 1, and finally sample 1 of subspan 1.
1538 */
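      /* For instance, if R0.0 bits 7:6 read 0b10 (SSPI == 2), the AND below
       * yields 0x80, the SHR by 5 yields 4, and the SIMD8 channels end up
       * with sample ids 4, 4, 4, 4, 5, 5, 5, 5.
       */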
1539 fs_inst *inst;
1540 inst = emit(BRW_OPCODE_AND, t1,
1541 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_UD)),
1542 fs_reg(0xc0));
1543 inst->force_writemask_all = true;
1544 inst = emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1545 inst->force_writemask_all = true;
1546 /* This works for both SIMD8 and SIMD16 */
1547 inst = emit(MOV(t2, brw_imm_v(key->persample_2x ? 0x1010 : 0x3210)));
1548 inst->force_writemask_all = true;
1549 /* This special instruction takes care of setting vstride=1,
1550 * width=4, hstride=0 of t2 during an ADD instruction.
1551 */
1552 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1553 } else {
1554 /* As per GL_ARB_sample_shading specification:
1555 * "When rendering to a non-multisample buffer, or if multisample
1556 * rasterization is disabled, gl_SampleID will always be zero."
1557 */
1558 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1559 }
1560
1561 return reg;
1562 }
1563
1564 fs_reg
1565 fs_visitor::fix_math_operand(fs_reg src)
1566 {
1567 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1568 * might be able to do better by doing execsize = 1 math and then
1569 * expanding that result out, but we would need to be careful with
1570 * masking.
1571 *
1572 * The hardware ignores source modifiers (negate and abs) on math
1573 * instructions, so we also move to a temp to set those up.
1574 */
1575 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1576 !src.abs && !src.negate)
1577 return src;
1578
1579 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1580 * operands to math
1581 */
1582 if (brw->gen >= 7 && src.file != IMM)
1583 return src;
1584
1585 fs_reg expanded = vgrf(glsl_type::float_type);
1586 expanded.type = src.type;
1587 emit(BRW_OPCODE_MOV, expanded, src);
1588 return expanded;
1589 }
1590
1591 fs_inst *
1592 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1593 {
1594 switch (opcode) {
1595 case SHADER_OPCODE_RCP:
1596 case SHADER_OPCODE_RSQ:
1597 case SHADER_OPCODE_SQRT:
1598 case SHADER_OPCODE_EXP2:
1599 case SHADER_OPCODE_LOG2:
1600 case SHADER_OPCODE_SIN:
1601 case SHADER_OPCODE_COS:
1602 break;
1603 default:
1604 unreachable("not reached: bad math opcode");
1605 }
1606
1607 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1608 * might be able to do better by doing execsize = 1 math and then
1609 * expanding that result out, but we would need to be careful with
1610 * masking.
1611 *
1612 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1613 * instructions, so we also move to a temp to set those up.
1614 */
1615 if (brw->gen == 6 || brw->gen == 7)
1616 src = fix_math_operand(src);
1617
1618 fs_inst *inst = emit(opcode, dst, src);
1619
1620 if (brw->gen < 6) {
1621 inst->base_mrf = 2;
1622 inst->mlen = dispatch_width / 8;
1623 }
1624
1625 return inst;
1626 }
1627
1628 fs_inst *
1629 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1630 {
1631 int base_mrf = 2;
1632 fs_inst *inst;
1633
1634 if (brw->gen >= 8) {
1635 inst = emit(opcode, dst, src0, src1);
1636 } else if (brw->gen >= 6) {
1637 src0 = fix_math_operand(src0);
1638 src1 = fix_math_operand(src1);
1639
1640 inst = emit(opcode, dst, src0, src1);
1641 } else {
1642 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1643 * "Message Payload":
1644 *
1645 * "Operand0[7]. For the INT DIV functions, this operand is the
1646 * denominator."
1647 * ...
1648 * "Operand1[7]. For the INT DIV functions, this operand is the
1649 * numerator."
1650 */
1651 bool is_int_div = opcode != SHADER_OPCODE_POW;
1652 fs_reg &op0 = is_int_div ? src1 : src0;
1653 fs_reg &op1 = is_int_div ? src0 : src1;
1654
1655 emit(MOV(fs_reg(MRF, base_mrf + 1, op1.type, dispatch_width), op1));
1656 inst = emit(opcode, dst, op0, reg_null_f);
1657
1658 inst->base_mrf = base_mrf;
1659 inst->mlen = 2 * dispatch_width / 8;
1660 }
1661 return inst;
1662 }
1663
1664 void
1665 fs_visitor::assign_curb_setup()
1666 {
1667 if (dispatch_width == 8) {
1668 prog_data->dispatch_grf_start_reg = payload.num_regs;
1669 } else {
1670 assert(stage == MESA_SHADER_FRAGMENT);
1671 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1672 prog_data->dispatch_grf_start_reg_16 = payload.num_regs;
1673 }
1674
1675 prog_data->curb_read_length = ALIGN(stage_prog_data->nr_params, 8) / 8;
1676
1677 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1678 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1679 for (unsigned int i = 0; i < inst->sources; i++) {
1680 if (inst->src[i].file == UNIFORM) {
1681 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1682 int constant_nr;
1683 if (uniform_nr >= 0 && uniform_nr < (int) uniforms) {
1684 constant_nr = push_constant_loc[uniform_nr];
1685 } else {
1686 /* Section 5.11 of the OpenGL 4.1 spec says:
1687 * "Out-of-bounds reads return undefined values, which include
1688 * values from other variables of the active program or zero."
1689 * Just return the first push constant.
1690 */
1691 constant_nr = 0;
1692 }
1693
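            /* Eight push constants pack into each GRF, so e.g. slot 11 lands
             * at payload.num_regs + 1, channel 3.
             */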
1694 struct brw_reg brw_reg = brw_vec1_grf(payload.num_regs +
1695 constant_nr / 8,
1696 constant_nr % 8);
1697
1698 inst->src[i].file = HW_REG;
1699 inst->src[i].fixed_hw_reg = byte_offset(
1700 retype(brw_reg, inst->src[i].type),
1701 inst->src[i].subreg_offset);
1702 }
1703 }
1704 }
1705 }
1706
1707 void
1708 fs_visitor::calculate_urb_setup()
1709 {
1710 assert(stage == MESA_SHADER_FRAGMENT);
1711 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1712 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
1713
1714 memset(prog_data->urb_setup, -1,
1715 sizeof(prog_data->urb_setup[0]) * VARYING_SLOT_MAX);
1716
1717 int urb_next = 0;
1718 /* Figure out where each of the incoming setup attributes lands. */
1719 if (brw->gen >= 6) {
1720 if (_mesa_bitcount_64(prog->InputsRead &
1721 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1722 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1723 * first 16 varying inputs, so we can put them wherever we want.
1724 * Just put them in order.
1725 *
1726 * This is useful because it means that (a) inputs not used by the
1727 * fragment shader won't take up valuable register space, and (b) we
1728 * won't have to recompile the fragment shader if it gets paired with
1729 * a different vertex (or geometry) shader.
1730 */
1731 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1732 if (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1733 BITFIELD64_BIT(i)) {
1734 prog_data->urb_setup[i] = urb_next++;
1735 }
1736 }
1737 } else {
1738 /* We have enough input varyings that the SF/SBE pipeline stage can't
1739 * arbitrarily rearrange them to suit our whim; we have to put them
1740 * in an order that matches the output of the previous pipeline stage
1741 * (geometry or vertex shader).
1742 */
1743 struct brw_vue_map prev_stage_vue_map;
1744 brw_compute_vue_map(brw, &prev_stage_vue_map,
1745 key->input_slots_valid);
1746 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1747 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1748 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1749 slot++) {
1750 int varying = prev_stage_vue_map.slot_to_varying[slot];
1751 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1752 * unused.
1753 */
1754 if (varying != BRW_VARYING_SLOT_COUNT &&
1755 (prog->InputsRead & BRW_FS_VARYING_INPUT_MASK &
1756 BITFIELD64_BIT(varying))) {
1757 prog_data->urb_setup[varying] = slot - first_slot;
1758 }
1759 }
1760 urb_next = prev_stage_vue_map.num_slots - first_slot;
1761 }
1762 } else {
1763 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1764 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1765 /* Point size is packed into the header, not as a general attribute */
1766 if (i == VARYING_SLOT_PSIZ)
1767 continue;
1768
1769 if (key->input_slots_valid & BITFIELD64_BIT(i)) {
1770 /* The back color slot is skipped when the front color is
1771 * also written to. In addition, some slots can be
1772 * written in the vertex shader and not read in the
1773 * fragment shader. So the register number must always be
1774 * incremented, mapped or not.
1775 */
1776 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1777 prog_data->urb_setup[i] = urb_next;
1778 urb_next++;
1779 }
1780 }
1781
1782 /*
1783 * It's an FS-only attribute, and we did the interpolation for this
1784 * attribute in the SF thread. So count it here, too.
1785 *
1786 * See compile_sf_prog() for more info.
1787 */
1788 if (prog->InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1789 prog_data->urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1790 }
1791
1792 prog_data->num_varying_inputs = urb_next;
1793 }
1794
1795 void
1796 fs_visitor::assign_urb_setup()
1797 {
1798 assert(stage == MESA_SHADER_FRAGMENT);
1799 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
1800
1801 int urb_start = payload.num_regs + prog_data->base.curb_read_length;
1802
1803 /* Offset all the urb_setup[] index by the actual position of the
1804 * setup regs, now that the location of the constants has been chosen.
1805 */
1806 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1807 if (inst->opcode == FS_OPCODE_LINTERP) {
1808 assert(inst->src[2].file == HW_REG);
1809 inst->src[2].fixed_hw_reg.nr += urb_start;
1810 }
1811
1812 if (inst->opcode == FS_OPCODE_CINTERP) {
1813 assert(inst->src[0].file == HW_REG);
1814 inst->src[0].fixed_hw_reg.nr += urb_start;
1815 }
1816 }
1817
1818 /* Each attribute is 4 setup channels, each of which is half a reg. */
1819 this->first_non_payload_grf =
1820 urb_start + prog_data->num_varying_inputs * 2;
1821 }
1822
1823 void
1824 fs_visitor::assign_vs_urb_setup()
1825 {
1826 brw_vs_prog_data *vs_prog_data = (brw_vs_prog_data *) prog_data;
1827 int grf, count, slot, channel, attr;
1828
1829 assert(stage == MESA_SHADER_VERTEX);
1830 count = _mesa_bitcount_64(vs_prog_data->inputs_read);
1831 if (vs_prog_data->uses_vertexid || vs_prog_data->uses_instanceid)
1832 count++;
1833
1834 /* Each attribute is 4 regs. */
1835 this->first_non_payload_grf =
1836 payload.num_regs + prog_data->curb_read_length + count * 4;
1837
1838 unsigned vue_entries =
1839 MAX2(count, vs_prog_data->base.vue_map.num_slots);
1840
1841 vs_prog_data->base.urb_entry_size = ALIGN(vue_entries, 4) / 4;
1842 vs_prog_data->base.urb_read_length = (count + 1) / 2;
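   /* The read length is counted in pairs of vec4 slots, so e.g. five enabled
    * attributes round up to a read length of 3.
    */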
1843
1844 assert(vs_prog_data->base.urb_read_length <= 15);
1845
1846 /* Rewrite all ATTR file references to the hw grf that they land in. */
1847 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1848 for (int i = 0; i < inst->sources; i++) {
1849 if (inst->src[i].file == ATTR) {
1850
1851 if (inst->src[i].reg == VERT_ATTRIB_MAX) {
1852 slot = count - 1;
1853 } else {
1854 /* Attributes arrive in a contiguous block, ordered by their
1855 * gl_vert_attrib value. That means we can compute the slot
1856 * number for an attribute by masking out the enabled
1857 * attributes before it and counting the bits.
1858 */
1859 attr = inst->src[i].reg + inst->src[i].reg_offset / 4;
1860 slot = _mesa_bitcount_64(vs_prog_data->inputs_read &
1861 BITFIELD64_MASK(attr));
1862 }
1863
1864 channel = inst->src[i].reg_offset & 3;
1865
1866 grf = payload.num_regs +
1867 prog_data->curb_read_length +
1868 slot * 4 + channel;
1869
1870 inst->src[i].file = HW_REG;
1871 inst->src[i].fixed_hw_reg =
1872 retype(brw_vec8_grf(grf, 0), inst->src[i].type);
1873 }
1874 }
1875 }
1876 }
1877
1878 /**
1879 * Split large virtual GRFs into separate components if we can.
1880 *
1881 * This largely duplicates what brw_fs_vector_splitting does, but that
1882 * pass is really conservative because it's afraid of doing splits that
1883 * don't result in real progress after the rest of the optimization
1884 * phases, which would cause infinite looping in optimization. We can
1885 * do it once here, safely. This also has the opportunity to split
1886 * interpolated values, or maybe even uniforms, which we don't have at
1887 * the IR level.
1888 *
1889 * We want to split, because virtual GRFs are what we register
1890 * allocate and spill (due to contiguousness requirements for some
1891 * instructions), and they're what we naturally generate in the
1892 * codegen process, but most virtual GRFs don't actually need to be
1893 * contiguous sets of GRFs. If we split, we'll end up with reduced
1894 * live intervals and better dead code elimination and coalescing.
1895 */
1896 void
1897 fs_visitor::split_virtual_grfs()
1898 {
1899 int num_vars = this->virtual_grf_count;
1900
1901 /* Count the total number of registers */
1902 int reg_count = 0;
1903 int vgrf_to_reg[num_vars];
1904 for (int i = 0; i < num_vars; i++) {
1905 vgrf_to_reg[i] = reg_count;
1906 reg_count += virtual_grf_sizes[i];
1907 }
1908
1909 /* An array of "split points". For each register slot, this indicates
1910 * if this slot can be separated from the previous slot. Every time an
1911 * instruction uses multiple elements of a register (as a source or
1912 * destination), we mark the used slots as inseparable. Then we go
1913 * through and split the registers into the smallest pieces we can.
1914 */
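/* E.g., a size-4 VGRF whose first two registers are written together by
 * one SIMD16 instruction but whose registers are otherwise accessed one
 * at a time keeps split points only at slots 2 and 3, so it is split
 * into one 2-register VGRF and two 1-register VGRFs.
 */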
1915 bool split_points[reg_count];
1916 memset(split_points, 0, sizeof(split_points));
1917
1918 /* Mark all used registers as fully splittable */
1919 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1920 if (inst->dst.file == GRF) {
1921 int reg = vgrf_to_reg[inst->dst.reg];
1922 for (int j = 1; j < this->virtual_grf_sizes[inst->dst.reg]; j++)
1923 split_points[reg + j] = true;
1924 }
1925
1926 for (int i = 0; i < inst->sources; i++) {
1927 if (inst->src[i].file == GRF) {
1928 int reg = vgrf_to_reg[inst->src[i].reg];
1929 for (int j = 1; j < this->virtual_grf_sizes[inst->src[i].reg]; j++)
1930 split_points[reg + j] = true;
1931 }
1932 }
1933 }
1934
1935 if (brw->has_pln &&
1936 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1937 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1938 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1939 * Gen6, that was the only supported interpolation mode, and since Gen6,
1940 * delta_x and delta_y are in fixed hardware registers.
1941 */
1942 int vgrf = this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg;
1943 split_points[vgrf_to_reg[vgrf] + 1] = false;
1944 }
1945
1946 foreach_block_and_inst(block, fs_inst, inst, cfg) {
1947 if (inst->dst.file == GRF) {
1948 int reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
1949 for (int j = 1; j < inst->regs_written; j++)
1950 split_points[reg + j] = false;
1951 }
1952 for (int i = 0; i < inst->sources; i++) {
1953 if (inst->src[i].file == GRF) {
1954 int reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
1955 for (int j = 1; j < inst->regs_read(this, i); j++)
1956 split_points[reg + j] = false;
1957 }
1958 }
1959 }
1960
1961 int new_virtual_grf[reg_count];
1962 int new_reg_offset[reg_count];
1963
1964 int reg = 0;
1965 for (int i = 0; i < num_vars; i++) {
1966 /* The first one should always be 0 as a quick sanity check. */
1967 assert(split_points[reg] == false);
1968
1969 /* j = 0 case */
1970 new_reg_offset[reg] = 0;
1971 reg++;
1972 int offset = 1;
1973
1974 /* j > 0 case */
1975 for (int j = 1; j < virtual_grf_sizes[i]; j++) {
1976 /* If this is a split point, reset the offset to 0 and allocate a
1977 * new virtual GRF for the preceding `offset' registers.
1978 */
1979 if (split_points[reg]) {
1980 assert(offset <= MAX_VGRF_SIZE);
1981 int grf = virtual_grf_alloc(offset);
1982 for (int k = reg - offset; k < reg; k++)
1983 new_virtual_grf[k] = grf;
1984 offset = 0;
1985 }
1986 new_reg_offset[reg] = offset;
1987 offset++;
1988 reg++;
1989 }
1990
1991 /* The last one gets the original register number */
1992 assert(offset <= MAX_VGRF_SIZE);
1993 virtual_grf_sizes[i] = offset;
1994 for (int k = reg - offset; k < reg; k++)
1995 new_virtual_grf[k] = i;
1996 }
1997 assert(reg == reg_count);
1998
1999 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2000 if (inst->dst.file == GRF) {
2001 reg = vgrf_to_reg[inst->dst.reg] + inst->dst.reg_offset;
2002 inst->dst.reg = new_virtual_grf[reg];
2003 inst->dst.reg_offset = new_reg_offset[reg];
2004 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2005 }
2006 for (int i = 0; i < inst->sources; i++) {
2007 if (inst->src[i].file == GRF) {
2008 reg = vgrf_to_reg[inst->src[i].reg] + inst->src[i].reg_offset;
2009 inst->src[i].reg = new_virtual_grf[reg];
2010 inst->src[i].reg_offset = new_reg_offset[reg];
2011 assert(new_reg_offset[reg] < virtual_grf_sizes[new_virtual_grf[reg]]);
2012 }
2013 }
2014 }
2015 invalidate_live_intervals();
2016 }
2017
2018 /**
2019 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
2020 *
2021 * During code generation, we create tons of temporary variables, many of
2022 * which get immediately killed and are never used again. Yet, in later
2023 * optimization and analysis passes, such as compute_live_intervals, we need
2024 * to loop over all the virtual GRFs. Compacting them can save a lot of
2025 * overhead.
2026 */
2027 bool
2028 fs_visitor::compact_virtual_grfs()
2029 {
2030 bool progress = false;
2031 int remap_table[this->virtual_grf_count];
2032 memset(remap_table, -1, sizeof(remap_table));
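/* An entry of -1 means the corresponding VGRF is never referenced; used
 * VGRFs are marked with 0 below and then rewritten to their compacted
 * index.
 */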
2033
2034 /* Mark which virtual GRFs are used. */
2035 foreach_block_and_inst(block, const fs_inst, inst, cfg) {
2036 if (inst->dst.file == GRF)
2037 remap_table[inst->dst.reg] = 0;
2038
2039 for (int i = 0; i < inst->sources; i++) {
2040 if (inst->src[i].file == GRF)
2041 remap_table[inst->src[i].reg] = 0;
2042 }
2043 }
2044
2045 /* Compact the GRF arrays. */
2046 int new_index = 0;
2047 for (int i = 0; i < this->virtual_grf_count; i++) {
2048 if (remap_table[i] == -1) {
2049 /* We just found an unused register. This means that we are
2050 * actually going to compact something.
2051 */
2052 progress = true;
2053 } else {
2054 remap_table[i] = new_index;
2055 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
2056 invalidate_live_intervals();
2057 ++new_index;
2058 }
2059 }
2060
2061 this->virtual_grf_count = new_index;
2062
2063 /* Patch all the instructions to use the newly renumbered registers */
2064 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2065 if (inst->dst.file == GRF)
2066 inst->dst.reg = remap_table[inst->dst.reg];
2067
2068 for (int i = 0; i < inst->sources; i++) {
2069 if (inst->src[i].file == GRF)
2070 inst->src[i].reg = remap_table[inst->src[i].reg];
2071 }
2072 }
2073
2074 /* Patch all the references to delta_x/delta_y, since they're used in
2075 * register allocation. If they're unused, switch them to BAD_FILE so
2076 * we don't think some random VGRF is delta_x/delta_y.
2077 */
2078 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2079 if (delta_x[i].file == GRF) {
2080 if (remap_table[delta_x[i].reg] != -1) {
2081 delta_x[i].reg = remap_table[delta_x[i].reg];
2082 } else {
2083 delta_x[i].file = BAD_FILE;
2084 }
2085 }
2086 }
2087 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2088 if (delta_y[i].file == GRF) {
2089 if (remap_table[delta_y[i].reg] != -1) {
2090 delta_y[i].reg = remap_table[delta_y[i].reg];
2091 } else {
2092 delta_y[i].file = BAD_FILE;
2093 }
2094 }
2095 }
2096
2097 return progress;
2098 }
2099
2100 /*
2101 * Implements array access of uniforms by inserting a
2102 * PULL_CONSTANT_LOAD instruction.
2103 *
2104 * Unlike temporary GRF array access (which we don't support, due to
2105 * the difficulty of doing relative addressing on instruction
2106 * destinations), we could potentially do array access of uniforms
2107 * that were loaded in GRF space as push constants. In real-world
2108 * usage we've seen, though, the arrays being used are always larger
2109 * than we could load as push constants, so just always move all
2110 * uniform array access out to a pull constant buffer.
2111 */
2112 void
2113 fs_visitor::move_uniform_array_access_to_pull_constants()
2114 {
2115 if (dispatch_width != 8)
2116 return;
2117
2118 pull_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2119 memset(pull_constant_loc, -1, sizeof(pull_constant_loc[0]) * uniforms);
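/* -1 means the uniform has not been assigned a pull constant slot. */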
2120
2121 /* Walk through and find array access of uniforms. Put a copy of that
2122 * uniform in the pull constant buffer.
2123 *
2124 * Note that we don't move constant-indexed accesses to arrays. No
2125 * testing has been done of the performance impact of this choice.
2126 */
2127 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2128 for (int i = 0 ; i < inst->sources; i++) {
2129 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
2130 continue;
2131
2132 int uniform = inst->src[i].reg;
2133
2134 /* If this array isn't already present in the pull constant buffer,
2135 * add it.
2136 */
2137 if (pull_constant_loc[uniform] == -1) {
2138 const gl_constant_value **values = &stage_prog_data->param[uniform];
2139
2140 assert(param_size[uniform]);
2141
2142 for (int j = 0; j < param_size[uniform]; j++) {
2143 pull_constant_loc[uniform + j] = stage_prog_data->nr_pull_params;
2144
2145 stage_prog_data->pull_param[stage_prog_data->nr_pull_params++] =
2146 values[j];
2147 }
2148 }
2149 }
2150 }
2151 }
2152
2153 /**
2154 * Assign UNIFORM file registers to either push constants or pull constants.
2155 *
2156 * We allow a fragment shader to have more than the GL-specified minimum
2157 * maximum number of fragment shader uniform components (64). If there
2158 * are too many of these, they'd fill up all of the register space, so
2159 * this pass pushes some of them out to the pull constant buffer and
2160 * updates the program to load them from there.
2161 */
2162 void
2163 fs_visitor::assign_constant_locations()
2164 {
2165 /* Only the first compile (SIMD8 mode) gets to decide on locations. */
2166 if (dispatch_width != 8)
2167 return;
2168
2169 /* Find which UNIFORM registers are still in use. */
2170 bool is_live[uniforms];
2171 for (unsigned int i = 0; i < uniforms; i++) {
2172 is_live[i] = false;
2173 }
2174
2175 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2176 for (int i = 0; i < inst->sources; i++) {
2177 if (inst->src[i].file != UNIFORM)
2178 continue;
2179
2180 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
2181 if (constant_nr >= 0 && constant_nr < (int) uniforms)
2182 is_live[constant_nr] = true;
2183 }
2184 }
2185
2186 /* Only allow 16 registers (128 uniform components) as push constants.
2187 *
2188 * Just demote the end of the list. We could probably do better
2189 * here, demoting things that are rarely used in the program first.
2190 *
2191 * If changing this value, note the limitation about total_regs in
2192 * brw_curbe.c.
2193 */
2194 unsigned int max_push_components = 16 * 8;
2195 unsigned int num_push_constants = 0;
2196
2197 push_constant_loc = ralloc_array(mem_ctx, int, uniforms);
2198
2199 for (unsigned int i = 0; i < uniforms; i++) {
2200 if (!is_live[i] || pull_constant_loc[i] != -1) {
2201 /* This UNIFORM register is either dead, or has already been demoted
2202 * to a pull const. Mark it as no longer living in the param[] array.
2203 */
2204 push_constant_loc[i] = -1;
2205 continue;
2206 }
2207
2208 if (num_push_constants < max_push_components) {
2209 /* Retain as a push constant. Record the location in the params[]
2210 * array.
2211 */
2212 push_constant_loc[i] = num_push_constants++;
2213 } else {
2214 /* Demote to a pull constant. */
2215 push_constant_loc[i] = -1;
2216
2217 int pull_index = stage_prog_data->nr_pull_params++;
2218 stage_prog_data->pull_param[pull_index] = stage_prog_data->param[i];
2219 pull_constant_loc[i] = pull_index;
2220 }
2221 }
2222
2223 stage_prog_data->nr_params = num_push_constants;
2224
2225 /* Up until now, the param[] array has been indexed by reg + reg_offset
2226 * of UNIFORM registers. Condense it to only contain the uniforms we
2227 * chose to upload as push constants.
2228 */
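/* E.g., if uniforms 0 and 2 stayed as push constants and uniform 1 was
 * demoted, push_constant_loc is { 0, -1, 1 } and param[1] gets
 * overwritten with the old param[2].
 */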
2229 for (unsigned int i = 0; i < uniforms; i++) {
2230 int remapped = push_constant_loc[i];
2231
2232 if (remapped == -1)
2233 continue;
2234
2235 assert(remapped <= (int)i);
2236 stage_prog_data->param[remapped] = stage_prog_data->param[i];
2237 }
2238 }
2239
2240 /**
2241 * Replace UNIFORM register file access with either UNIFORM_PULL_CONSTANT_LOAD
2242 * or VARYING_PULL_CONSTANT_LOAD instructions which load values into VGRFs.
2243 */
2244 void
2245 fs_visitor::demote_pull_constants()
2246 {
2247 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2248 for (int i = 0; i < inst->sources; i++) {
2249 if (inst->src[i].file != UNIFORM)
2250 continue;
2251
2252 int pull_index = pull_constant_loc[inst->src[i].reg +
2253 inst->src[i].reg_offset];
2254 if (pull_index == -1)
2255 continue;
2256
2257 /* Set up the annotation tracking for newly generated instructions. */
2258 base_ir = inst->ir;
2259 current_annotation = inst->annotation;
2260
2261 fs_reg surf_index(stage_prog_data->binding_table.pull_constants_start);
2262 fs_reg dst = vgrf(glsl_type::float_type);
2263
2264 /* Generate a pull load into dst. */
2265 if (inst->src[i].reladdr) {
2266 exec_list list = VARYING_PULL_CONSTANT_LOAD(dst,
2267 surf_index,
2268 *inst->src[i].reladdr,
2269 pull_index);
2270 inst->insert_before(block, &list);
2271 inst->src[i].reladdr = NULL;
2272 } else {
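/* The uniform pull constant load fetches an aligned vec4 worth of
 * data, so round the byte offset down to a 16-byte boundary and use
 * set_smear() below to pick the desired component out of it.
 */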
2273 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
2274 fs_inst *pull =
2275 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD, 8,
2276 dst, surf_index, offset);
2277 inst->insert_before(block, pull);
2278 inst->src[i].set_smear(pull_index & 3);
2279 }
2280
2281 /* Rewrite the instruction to use the temporary VGRF. */
2282 inst->src[i].file = GRF;
2283 inst->src[i].reg = dst.reg;
2284 inst->src[i].reg_offset = 0;
2285 inst->src[i].width = dispatch_width;
2286 }
2287 }
2288 invalidate_live_intervals();
2289 }
2290
2291 bool
2292 fs_visitor::opt_algebraic()
2293 {
2294 bool progress = false;
2295
2296 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2297 switch (inst->opcode) {
2298 case BRW_OPCODE_MOV:
2299 if (inst->src[0].file != IMM)
2300 break;
2301
2302 if (inst->saturate) {
2303 if (inst->dst.type != inst->src[0].type)
2304 assert(!"unimplemented: saturate mixed types");
2305
2306 if (brw_saturate_immediate(inst->dst.type,
2307 &inst->src[0].fixed_hw_reg)) {
2308 inst->saturate = false;
2309 progress = true;
2310 }
2311 }
2312 break;
2313
2314 case BRW_OPCODE_MUL:
2315 if (inst->src[1].file != IMM)
2316 continue;
2317
2318 /* a * 1.0 = a */
2319 if (inst->src[1].is_one()) {
2320 inst->opcode = BRW_OPCODE_MOV;
2321 inst->src[1] = reg_undef;
2322 progress = true;
2323 break;
2324 }
2325
2326 /* a * 0.0 = 0.0 */
2327 if (inst->src[1].is_zero()) {
2328 inst->opcode = BRW_OPCODE_MOV;
2329 inst->src[0] = inst->src[1];
2330 inst->src[1] = reg_undef;
2331 progress = true;
2332 break;
2333 }
2334
2335 break;
2336 case BRW_OPCODE_ADD:
2337 if (inst->src[1].file != IMM)
2338 continue;
2339
2340 /* a + 0.0 = a */
2341 if (inst->src[1].is_zero()) {
2342 inst->opcode = BRW_OPCODE_MOV;
2343 inst->src[1] = reg_undef;
2344 progress = true;
2345 break;
2346 }
2347 break;
2348 case BRW_OPCODE_OR:
2349 if (inst->src[0].equals(inst->src[1])) {
2350 inst->opcode = BRW_OPCODE_MOV;
2351 inst->src[1] = reg_undef;
2352 progress = true;
2353 break;
2354 }
2355 break;
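/* lrp with both interpolation endpoints equal produces that value
 * regardless of the interpolation factor.
 */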
2356 case BRW_OPCODE_LRP:
2357 if (inst->src[1].equals(inst->src[2])) {
2358 inst->opcode = BRW_OPCODE_MOV;
2359 inst->src[0] = inst->src[1];
2360 inst->src[1] = reg_undef;
2361 inst->src[2] = reg_undef;
2362 progress = true;
2363 break;
2364 }
2365 break;
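/* cmp.ge -|x|, 0 can only pass when x == 0, so rewrite it as
 * cmp.z x, 0.
 */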
2366 case BRW_OPCODE_CMP:
2367 if (inst->conditional_mod == BRW_CONDITIONAL_GE &&
2368 inst->src[0].abs &&
2369 inst->src[0].negate &&
2370 inst->src[1].is_zero()) {
2371 inst->src[0].abs = false;
2372 inst->src[0].negate = false;
2373 inst->conditional_mod = BRW_CONDITIONAL_Z;
2374 progress = true;
2375 break;
2376 }
2377 break;
2378 case BRW_OPCODE_SEL:
2379 if (inst->src[0].equals(inst->src[1])) {
2380 inst->opcode = BRW_OPCODE_MOV;
2381 inst->src[1] = reg_undef;
2382 inst->predicate = BRW_PREDICATE_NONE;
2383 inst->predicate_inverse = false;
2384 progress = true;
2385 } else if (inst->saturate && inst->src[1].file == IMM) {
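/* With saturate, sel.l/le against an immediate >= 1.0 and sel.g/ge
 * against an immediate <= 0.0 both reduce to a saturated MOV of src0.
 */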
2386 switch (inst->conditional_mod) {
2387 case BRW_CONDITIONAL_LE:
2388 case BRW_CONDITIONAL_L:
2389 switch (inst->src[1].type) {
2390 case BRW_REGISTER_TYPE_F:
2391 if (inst->src[1].fixed_hw_reg.dw1.f >= 1.0f) {
2392 inst->opcode = BRW_OPCODE_MOV;
2393 inst->src[1] = reg_undef;
2394 progress = true;
2395 }
2396 break;
2397 default:
2398 break;
2399 }
2400 break;
2401 case BRW_CONDITIONAL_GE:
2402 case BRW_CONDITIONAL_G:
2403 switch (inst->src[1].type) {
2404 case BRW_REGISTER_TYPE_F:
2405 if (inst->src[1].fixed_hw_reg.dw1.f <= 0.0f) {
2406 inst->opcode = BRW_OPCODE_MOV;
2407 inst->src[1] = reg_undef;
2408 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2409 progress = true;
2410 }
2411 break;
2412 default:
2413 break;
2414 }
2415 default:
2416 break;
2417 }
2418 }
2419 break;
2420 case SHADER_OPCODE_RCP: {
2421 fs_inst *prev = (fs_inst *)inst->prev;
2422 if (prev->opcode == SHADER_OPCODE_SQRT) {
2423 if (inst->src[0].equals(prev->dst)) {
2424 inst->opcode = SHADER_OPCODE_RSQ;
2425 inst->src[0] = prev->src[0];
2426 progress = true;
2427 }
2428 }
2429 break;
2430 }
2431 default:
2432 break;
2433 }
2434 }
2435
2436 return progress;
2437 }
2438
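/**
 * Give each whole-register definition outside of control flow its own
 * VGRF number and rewrite later reads to match.  Breaking up reuse of a
 * VGRF for unrelated values shortens live intervals and can help later
 * passes such as copy propagation and register coalescing.
 */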
2439 bool
2440 fs_visitor::opt_register_renaming()
2441 {
2442 bool progress = false;
2443 int depth = 0;
2444
2445 int remap[virtual_grf_count];
2446 memset(remap, -1, sizeof(int) * virtual_grf_count);
2447
2448 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2449 if (inst->opcode == BRW_OPCODE_IF || inst->opcode == BRW_OPCODE_DO) {
2450 depth++;
2451 } else if (inst->opcode == BRW_OPCODE_ENDIF ||
2452 inst->opcode == BRW_OPCODE_WHILE) {
2453 depth--;
2454 }
2455
2456 /* Rewrite instruction sources. */
2457 for (int i = 0; i < inst->sources; i++) {
2458 if (inst->src[i].file == GRF &&
2459 remap[inst->src[i].reg] != -1 &&
2460 remap[inst->src[i].reg] != inst->src[i].reg) {
2461 inst->src[i].reg = remap[inst->src[i].reg];
2462 progress = true;
2463 }
2464 }
2465
2466 const int dst = inst->dst.reg;
2467
2468 if (depth == 0 &&
2469 inst->dst.file == GRF &&
2470 virtual_grf_sizes[inst->dst.reg] == inst->dst.width / 8 &&
2471 !inst->is_partial_write()) {
2472 if (remap[dst] == -1) {
2473 remap[dst] = dst;
2474 } else {
2475 remap[dst] = virtual_grf_alloc(inst->dst.width / 8);
2476 inst->dst.reg = remap[dst];
2477 progress = true;
2478 }
2479 } else if (inst->dst.file == GRF &&
2480 remap[dst] != -1 &&
2481 remap[dst] != dst) {
2482 inst->dst.reg = remap[dst];
2483 progress = true;
2484 }
2485 }
2486
2487 if (progress) {
2488 invalidate_live_intervals();
2489
2490 for (unsigned i = 0; i < ARRAY_SIZE(delta_x); i++) {
2491 if (delta_x[i].file == GRF && remap[delta_x[i].reg] != -1) {
2492 delta_x[i].reg = remap[delta_x[i].reg];
2493 }
2494 }
2495 for (unsigned i = 0; i < ARRAY_SIZE(delta_y); i++) {
2496 if (delta_y[i].file == GRF && remap[delta_y[i].reg] != -1) {
2497 delta_y[i].reg = remap[delta_y[i].reg];
2498 }
2499 }
2500 }
2501
2502 return progress;
2503 }
2504
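/**
 * Looks for MOVs of a GRF into an MRF and tries to rewrite the
 * instruction that computed the GRF so that it writes the MRF directly,
 * eliminating the intermediate MOV.  Only applies before Gen7, since
 * later hardware has no MRFs.
 */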
2505 bool
2506 fs_visitor::compute_to_mrf()
2507 {
2508 bool progress = false;
2509 int next_ip = 0;
2510
2511 /* No MRFs on Gen >= 7. */
2512 if (brw->gen >= 7)
2513 return false;
2514
2515 calculate_live_intervals();
2516
2517 foreach_block_and_inst_safe(block, fs_inst, inst, cfg) {
2518 int ip = next_ip;
2519 next_ip++;
2520
2521 if (inst->opcode != BRW_OPCODE_MOV ||
2522 inst->is_partial_write() ||
2523 inst->dst.file != MRF || inst->src[0].file != GRF ||
2524 inst->dst.type != inst->src[0].type ||
2525 inst->src[0].abs || inst->src[0].negate ||
2526 !inst->src[0].is_contiguous() ||
2527 inst->src[0].subreg_offset)
2528 continue;
2529
2530 /* Work out which hardware MRF registers are written by this
2531 * instruction.
2532 */
2533 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2534 int mrf_high;
2535 if (inst->dst.reg & BRW_MRF_COMPR4) {
2536 mrf_high = mrf_low + 4;
2537 } else if (inst->exec_size == 16) {
2538 mrf_high = mrf_low + 1;
2539 } else {
2540 mrf_high = mrf_low;
2541 }
2542
2543 /* Can't compute-to-MRF this GRF if someone else was going to
2544 * read it later.
2545 */
2546 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2547 continue;
2548
2549 /* Found a move of a GRF to a MRF. Let's see if we can go
2550 * rewrite the thing that made this GRF to write into the MRF.
2551 */
2552 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2553 if (scan_inst->dst.file == GRF &&
2554 scan_inst->dst.reg == inst->src[0].reg) {
2555 /* Found the last thing to write our reg we want to turn
2556 * into a compute-to-MRF.
2557 */
2558
2559 /* If this one instruction didn't populate all the
2560 * channels, bail. We might be able to rewrite everything
2561 * that writes that reg, but it would require smarter
2562 * tracking to delay the rewriting until complete success.
2563 */
2564 if (scan_inst->is_partial_write())
2565 break;
2566
2567 /* Handling instructions that write more than one register would
2568 * require us to coalesce out more than one MOV at a time.
2569 */
2570 if (scan_inst->regs_written > scan_inst->dst.width / 8)
2571 break;
2572
2573 /* SEND instructions can't have MRF as a destination. */
2574 if (scan_inst->mlen)
2575 break;
2576
2577 if (brw->gen == 6) {
2578 /* Gen6 math instructions must have a GRF destination,
2579 * so no compute-to-MRF for them.
2580 */
2581 if (scan_inst->is_math()) {
2582 break;
2583 }
2584 }
2585
2586 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2587 /* Found the creator of our MRF's source value. */
2588 scan_inst->dst.file = MRF;
2589 scan_inst->dst.reg = inst->dst.reg;
2590 scan_inst->saturate |= inst->saturate;
2591 inst->remove(block);
2592 progress = true;
2593 }
2594 break;
2595 }
2596
2597 /* We don't handle control flow here. Most computation of
2598 * values that end up in MRFs happens shortly before the MRF
2599 * write anyway.
2600 */
2601 if (block->start() == scan_inst)
2602 break;
2603
2604 /* You can't read from an MRF, so if someone else reads our
2605 * MRF's source GRF that we wanted to rewrite, that stops us.
2606 */
2607 bool interfered = false;
2608 for (int i = 0; i < scan_inst->sources; i++) {
2609 if (scan_inst->src[i].file == GRF &&
2610 scan_inst->src[i].reg == inst->src[0].reg &&
2611 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2612 interfered = true;
2613 }
2614 }
2615 if (interfered)
2616 break;
2617
2618 if (scan_inst->dst.file == MRF) {
2619 /* If somebody else writes our MRF here, we can't
2620 * compute-to-MRF before that.
2621 */
2622 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2623 int scan_mrf_high;
2624
2625 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2626 scan_mrf_high = scan_mrf_low + 4;
2627 } else if (scan_inst->exec_size == 16) {
2628 scan_mrf_high = scan_mrf_low + 1;
2629 } else {
2630 scan_mrf_high = scan_mrf_low;
2631 }
2632
2633 if (mrf_low == scan_mrf_low ||
2634 mrf_low == scan_mrf_high ||
2635 mrf_high == scan_mrf_low ||
2636 mrf_high == scan_mrf_high) {
2637 break;
2638 }
2639 }
2640
2641 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2642 /* Found a SEND instruction, which means that there are
2643 * live values in MRFs from base_mrf to base_mrf +
2644 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2645 * above it.
2646 */
2647 if (mrf_low >= scan_inst->base_mrf &&
2648 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2649 break;
2650 }
2651 if (mrf_high >= scan_inst->base_mrf &&
2652 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2653 break;
2654 }
2655 }
2656 }
2657 }
2658
2659 if (progress)
2660 invalidate_live_intervals();
2661
2662 return progress;
2663 }
2664
2665 /**
2666 * Once we've generated code, try to convert normal FS_OPCODE_FB_WRITE
2667 * instructions to FS_OPCODE_REP_FB_WRITE.
2668 */
2669 void
2670 fs_visitor::emit_repclear_shader()
2671 {
2672 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
2673 int base_mrf = 1;
2674 int color_mrf = base_mrf + 2;
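/* m1-m2 are reserved for the message header used when writing to
 * multiple render targets; the single-target fast path below sends just
 * the color payload from color_mrf with no header.
 */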
2675
2676 fs_inst *mov = emit(MOV(vec4(brw_message_reg(color_mrf)),
2677 fs_reg(UNIFORM, 0, BRW_REGISTER_TYPE_F)));
2678 mov->force_writemask_all = true;
2679
2680 fs_inst *write;
2681 if (key->nr_color_regions == 1) {
2682 write = emit(FS_OPCODE_REP_FB_WRITE);
2683 write->saturate = key->clamp_fragment_color;
2684 write->base_mrf = color_mrf;
2685 write->target = 0;
2686 write->header_present = false;
2687 write->mlen = 1;
2688 } else {
2689 assume(key->nr_color_regions > 0);
2690 for (int i = 0; i < key->nr_color_regions; ++i) {
2691 write = emit(FS_OPCODE_REP_FB_WRITE);
2692 write->saturate = key->clamp_fragment_color;
2693 write->base_mrf = base_mrf;
2694 write->target = i;
2695 write->header_present = true;
2696 write->mlen = 3;
2697 }
2698 }
2699 write->eot = true;
2700
2701 calculate_cfg();
2702
2703 assign_constant_locations();
2704 assign_curb_setup();
2705
2706 /* Now that we have the uniform assigned, go ahead and force it to a vec4. */
2707 assert(mov->src[0].file == HW_REG);
2708 mov->src[0] = brw_vec4_grf(mov->src[0].fixed_hw_reg.nr, 0);
2709 }
2710
2711 /**
2712 * Walks through basic blocks, looking for repeated MRF writes and
2713 * removing the later ones.
2714 */
2715 bool
2716 fs_visitor::remove_duplicate_mrf_writes()
2717 {
2718 fs_inst *last_mrf_move[16];
2719 bool progress = false;
2720
2721 /* We would need to update the MRF tracking for compressed instructions, so bail in SIMD16. */
2722 if (dispatch_width == 16)
2723 return false;
2724
2725 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2726
2727 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
2728 if (inst->is_control_flow()) {
2729 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2730 }
2731
2732 if (inst->opcode == BRW_OPCODE_MOV &&
2733 inst->dst.file == MRF) {
2734 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2735 if (prev_inst && inst->equals(prev_inst)) {
2736 inst->remove(block);
2737 progress = true;
2738 continue;
2739 }
2740 }
2741
2742 /* Clear out the last-write records for MRFs that were overwritten. */
2743 if (inst->dst.file == MRF) {
2744 last_mrf_move[inst->dst.reg] = NULL;
2745 }
2746
2747 if (inst->mlen > 0 && inst->base_mrf != -1) {
2748 /* Found a SEND instruction, which will include two or fewer
2749 * implied MRF writes. We could do better here.
2750 */
2751 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2752 last_mrf_move[inst->base_mrf + i] = NULL;
2753 }
2754 }
2755
2756 /* Clear out any MRF move records whose sources got overwritten. */
2757 if (inst->dst.file == GRF) {
2758 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2759 if (last_mrf_move[i] &&
2760 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2761 last_mrf_move[i] = NULL;
2762 }
2763 }
2764 }
2765
2766 if (inst->opcode == BRW_OPCODE_MOV &&
2767 inst->dst.file == MRF &&
2768 inst->src[0].file == GRF &&
2769 !inst->is_partial_write()) {
2770 last_mrf_move[inst->dst.reg] = inst;
2771 }
2772 }
2773
2774 if (progress)
2775 invalidate_live_intervals();
2776
2777 return progress;
2778 }
2779
2780 static void
2781 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2782 int first_grf, int grf_len)
2783 {
2784 /* Clear the flag for registers that actually got read (as expected). */
2785 for (int i = 0; i < inst->sources; i++) {
2786 int grf;
2787 if (inst->src[i].file == GRF) {
2788 grf = inst->src[i].reg;
2789 } else if (inst->src[i].file == HW_REG &&
2790 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2791 grf = inst->src[i].fixed_hw_reg.nr;
2792 } else {
2793 continue;
2794 }
2795
2796 if (grf >= first_grf &&
2797 grf < first_grf + grf_len) {
2798 deps[grf - first_grf] = false;
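/* A SIMD16 source spans two registers, so clear the flag for the
 * second register as well.
 */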
2799 if (inst->exec_size == 16)
2800 deps[grf - first_grf + 1] = false;
2801 }
2802 }
2803 }
2804
2805 /**
2806 * Implements this workaround for the original 965:
2807 *
2808 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2809 * check for post destination dependencies on this instruction, software
2810 * must ensure that there is no destination hazard for the case of ‘write
2811 * followed by a posted write’ shown in the following example.
2812 *
2813 * 1. mov r3 0
2814 * 2. send r3.xy <rest of send instruction>
2815 * 3. mov r2 r3
2816 *
2817 * Due to no post-destination dependency check on the ‘send’, the above
2818 * code sequence could have two instructions (1 and 2) in flight at the
2819 * same time that both consider ‘r3’ as the target of their final writes.
2820 */
2821 void
2822 fs_visitor::insert_gen4_pre_send_dependency_workarounds(bblock_t *block,
2823 fs_inst *inst)
2824 {
2825 int write_len = inst->regs_written;
2826 int first_write_grf = inst->dst.reg;
2827 bool needs_dep[BRW_MAX_MRF];
2828 assert(write_len < (int)sizeof(needs_dep) - 1);
2829
2830 memset(needs_dep, false, sizeof(needs_dep));
2831 memset(needs_dep, true, write_len);
2832
2833 clear_deps_for_inst_src(inst, dispatch_width,
2834 needs_dep, first_write_grf, write_len);
2835
2836 /* Walk backwards looking for writes to registers we're writing which
2837 * aren't read since being written. If we hit the start of the program,
2838 * we assume that there are no outstanding dependencies on entry to the
2839 * program.
2840 */
2841 foreach_inst_in_block_reverse_starting_from(fs_inst, scan_inst, inst, block) {
2842 /* If we hit control flow, assume that there *are* outstanding
2843 * dependencies, and force their cleanup before our instruction.
2844 */
2845 if (block->start() == scan_inst) {
2846 for (int i = 0; i < write_len; i++) {
2847 if (needs_dep[i]) {
2848 inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2849 }
2850 }
2851 return;
2852 }
2853
2854 /* We insert our reads as late as possible on the assumption that any
2855 * instruction but a MOV that might have left us an outstanding
2856 * dependency has more latency than a MOV.
2857 */
2858 if (scan_inst->dst.file == GRF) {
2859 for (int i = 0; i < scan_inst->regs_written; i++) {
2860 int reg = scan_inst->dst.reg + i;
2861
2862 if (reg >= first_write_grf &&
2863 reg < first_write_grf + write_len &&
2864 needs_dep[reg - first_write_grf]) {
2865 inst->insert_before(block, DEP_RESOLVE_MOV(reg));
2866 needs_dep[reg - first_write_grf] = false;
2867 if (scan_inst->exec_size == 16)
2868 needs_dep[reg - first_write_grf + 1] = false;
2869 }
2870 }
2871 }
2872
2873 /* Clear the flag for registers that actually got read (as expected). */
2874 clear_deps_for_inst_src(scan_inst, dispatch_width,
2875 needs_dep, first_write_grf, write_len);
2876
2877 /* Continue the loop only if we haven't resolved all the dependencies */
2878 int i;
2879 for (i = 0; i < write_len; i++) {
2880 if (needs_dep[i])
2881 break;
2882 }
2883 if (i == write_len)
2884 return;
2885 }
2886 }
2887
2888 /**
2889 * Implements this workaround for the original 965:
2890 *
2891 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2892 * used as a destination register until after it has been sourced by an
2893 * instruction with a different destination register.
2894 */
2895 void
2896 fs_visitor::insert_gen4_post_send_dependency_workarounds(bblock_t *block, fs_inst *inst)
2897 {
2898 int write_len = inst->regs_written;
2899 int first_write_grf = inst->dst.reg;
2900 bool needs_dep[BRW_MAX_MRF];
2901 assert(write_len < (int)sizeof(needs_dep) - 1);
2902
2903 memset(needs_dep, false, sizeof(needs_dep));
2904 memset(needs_dep, true, write_len);
2905 /* Walk forwards looking for writes to registers we're writing which aren't
2906 * read before being written.
2907 */
2908 foreach_inst_in_block_starting_from(fs_inst, scan_inst, inst, block) {
2909 /* If we hit control flow, force resolve all remaining dependencies. */
2910 if (block->end() == scan_inst) {
2911 for (int i = 0; i < write_len; i++) {
2912 if (needs_dep[i])
2913 scan_inst->insert_before(block,
2914 DEP_RESOLVE_MOV(first_write_grf + i));
2915 }
2916 return;
2917 }
2918
2919 /* Clear the flag for registers that actually got read (as expected). */
2920 clear_deps_for_inst_src(scan_inst, dispatch_width,
2921 needs_dep, first_write_grf, write_len);
2922
2923 /* We insert our reads as late as possible since they're reading the
2924 * result of a SEND, which has massive latency.
2925 */
2926 if (scan_inst->dst.file == GRF &&
2927 scan_inst->dst.reg >= first_write_grf &&
2928 scan_inst->dst.reg < first_write_grf + write_len &&
2929 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2930 scan_inst->insert_before(block, DEP_RESOLVE_MOV(scan_inst->dst.reg));
2931 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2932 }
2933
2934 /* Continue the loop only if we haven't resolved all the dependencies */
2935 int i;
2936 for (i = 0; i < write_len; i++) {
2937 if (needs_dep[i])
2938 break;
2939 }
2940 if (i == write_len)
2941 return;
2942 }
2943
2944 /* If we hit the end of the program, resolve all remaining dependencies out
2945 * of paranoia.
2946 */
2947 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2948 assert(last_inst->eot);
2949 for (int i = 0; i < write_len; i++) {
2950 if (needs_dep[i])
2951 last_inst->insert_before(block, DEP_RESOLVE_MOV(first_write_grf + i));
2952 }
2953 }
2954
2955 void
2956 fs_visitor::insert_gen4_send_dependency_workarounds()
2957 {
2958 if (brw->gen != 4 || brw->is_g4x)
2959 return;
2960
2961 bool progress = false;
2962
2963 /* Note that we're done with register allocation, so GRF fs_regs always
2964 * have a .reg_offset of 0.
2965 */
2966
2967 foreach_block_and_inst(block, fs_inst, inst, cfg) {
2968 if (inst->mlen != 0 && inst->dst.file == GRF) {
2969 insert_gen4_pre_send_dependency_workarounds(block, inst);
2970 insert_gen4_post_send_dependency_workarounds(block, inst);
2971 progress = true;
2972 }
2973 }
2974
2975 if (progress)
2976 invalidate_live_intervals();
2977 }
2978
2979 /**
2980 * Turns the generic expression-style uniform pull constant load instruction
2981 * into a hardware-specific series of instructions for loading a pull
2982 * constant.
2983 *
2984 * The expression style allows the CSE pass before this to optimize out
2985 * repeated loads from the same offset, and gives the pre-register-allocation
2986 * scheduling full flexibility, while the conversion to native instructions
2987 * allows the post-register-allocation scheduler the best information
2988 * possible.
2989 *
2990 * Note that execution masking for setting up pull constant loads is special:
2991 * the channels that need to be written are unrelated to the current execution
2992 * mask, since a later instruction will use one of the result channels as a
2993 * source operand for all 8 or 16 of its channels.
2994 */
2995 void
2996 fs_visitor::lower_uniform_pull_constant_loads()
2997 {
2998 foreach_block_and_inst (block, fs_inst, inst, cfg) {
2999 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
3000 continue;
3001
3002 if (brw->gen >= 7) {
3003 /* The offset arg before was a vec4-aligned byte offset. We need to
3004 * turn it into a dword offset.
3005 */
3006 fs_reg const_offset_reg = inst->src[1];
3007 assert(const_offset_reg.file == IMM &&
3008 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
3009 const_offset_reg.fixed_hw_reg.dw1.ud /= 4;
3010 fs_reg payload = vgrf(glsl_type::uint_type);
3011
3012 /* We have to use a message header on Skylake to get SIMD4x2 mode.
3013 * Reserve space for the register.
3014 */
3015 if (brw->gen >= 9) {
3016 payload.reg_offset++;
3017 virtual_grf_sizes[payload.reg] = 2;
3018 }
3019
3020 /* This is actually going to be a MOV, but since only the first dword
3021 * is accessed, we have a special opcode to do just that one. Note
3022 * that this needs to be an operation that will be considered a def
3023 * by live variable analysis, or register allocation will explode.
3024 */
3025 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
3026 8, payload, const_offset_reg);
3027 setup->force_writemask_all = true;
3028
3029 setup->ir = inst->ir;
3030 setup->annotation = inst->annotation;
3031 inst->insert_before(block, setup);
3032
3033 /* Similarly, this will only populate the first 4 channels of the
3034 * result register (since we only use smear values from 0-3), but we
3035 * don't tell the optimizer.
3036 */
3037 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
3038 inst->src[1] = payload;
3039
3040 invalidate_live_intervals();
3041 } else {
3042 /* Before register allocation, we didn't tell the scheduler about the
3043 * MRF we use. We know it's safe to use this MRF because nothing
3044 * else does except for register spill/unspill, which generates and
3045 * uses its MRF within a single IR instruction.
3046 */
3047 inst->base_mrf = 14;
3048 inst->mlen = 1;
3049 }
3050 }
3051 }
3052
3053 bool
3054 fs_visitor::lower_load_payload()
3055 {
3056 bool progress = false;
3057
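/* Map MRFs and VGRFs into one flat register index space (MRFs first,
 * then each VGRF starting at vgrf_to_reg[i]) so the per-register write
 * metadata below can be tracked uniformly.
 */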
3058 int vgrf_to_reg[virtual_grf_count];
3059 int reg_count = 16; /* Leave room for MRF */
3060 for (int i = 0; i < virtual_grf_count; ++i) {
3061 vgrf_to_reg[i] = reg_count;
3062 reg_count += virtual_grf_sizes[i];
3063 }
3064
3065 struct {
3066 bool written:1; /* Whether this register has ever been written */
3067 bool force_writemask_all:1;
3068 bool force_sechalf:1;
3069 } metadata[reg_count];
3070 memset(metadata, 0, sizeof(metadata));
3071
3072 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3073 int dst_reg;
3074 if (inst->dst.file == GRF) {
3075 dst_reg = vgrf_to_reg[inst->dst.reg];
3076 } else {
3077 /* MRF */
3078 dst_reg = inst->dst.reg;
3079 }
3080
3081 if (inst->dst.file == MRF || inst->dst.file == GRF) {
3082 bool force_sechalf = inst->force_sechalf;
3083 bool toggle_sechalf = inst->dst.width == 16 &&
3084 type_sz(inst->dst.type) == 4;
3085 for (int i = 0; i < inst->regs_written; ++i) {
3086 metadata[dst_reg + i].written = true;
3087 metadata[dst_reg + i].force_sechalf = force_sechalf;
3088 metadata[dst_reg + i].force_writemask_all = inst->force_writemask_all;
3089 force_sechalf = (toggle_sechalf != force_sechalf);
3090 }
3091 }
3092
3093 if (inst->opcode == SHADER_OPCODE_LOAD_PAYLOAD) {
3094 assert(inst->dst.file == MRF || inst->dst.file == GRF);
3095 fs_reg dst = inst->dst;
3096
3097 for (int i = 0; i < inst->sources; i++) {
3098 dst.width = inst->src[i].effective_width;
3099 dst.type = inst->src[i].type;
3100
3101 if (inst->src[i].file == BAD_FILE) {
3102 /* Do nothing but otherwise increment as normal */
3103 } else if (dst.file == MRF &&
3104 dst.width == 8 &&
3105 brw->has_compr4 &&
3106 i + 4 < inst->sources &&
3107 inst->src[i + 4].equals(horiz_offset(inst->src[i], 8))) {
3108 fs_reg compr4_dst = dst;
3109 compr4_dst.reg += BRW_MRF_COMPR4;
3110 compr4_dst.width = 16;
3111 fs_reg compr4_src = inst->src[i];
3112 compr4_src.width = 16;
3113 fs_inst *mov = MOV(compr4_dst, compr4_src);
3114 mov->force_writemask_all = true;
3115 inst->insert_before(block, mov);
3116 /* Mark i+4 as BAD_FILE so we don't emit a MOV for it */
3117 inst->src[i + 4].file = BAD_FILE;
3118 } else {
3119 fs_inst *mov = MOV(dst, inst->src[i]);
3120 if (inst->src[i].file == GRF) {
3121 int src_reg = vgrf_to_reg[inst->src[i].reg] +
3122 inst->src[i].reg_offset;
3123 mov->force_sechalf = metadata[src_reg].force_sechalf;
3124 mov->force_writemask_all = metadata[src_reg].force_writemask_all;
3125 metadata[dst_reg] = metadata[src_reg];
3126 if (dst.width * type_sz(dst.type) > 32) {
3127 assert((!metadata[src_reg].written ||
3128 !metadata[src_reg].force_sechalf) &&
3129 (!metadata[src_reg + 1].written ||
3130 metadata[src_reg + 1].force_sechalf));
3131 metadata[dst_reg + 1] = metadata[src_reg + 1];
3132 }
3133 } else {
3134 metadata[dst_reg].force_writemask_all = false;
3135 metadata[dst_reg].force_sechalf = false;
3136 if (dst.width == 16) {
3137 metadata[dst_reg + 1].force_writemask_all = false;
3138 metadata[dst_reg + 1].force_sechalf = true;
3139 }
3140 }
3141 inst->insert_before(block, mov);
3142 }
3143
3144 dst = offset(dst, 1);
3145 }
3146
3147 inst->remove(block);
3148 progress = true;
3149 }
3150 }
3151
3152 if (progress)
3153 invalidate_live_intervals();
3154
3155 return progress;
3156 }
3157
3158 void
3159 fs_visitor::dump_instructions()
3160 {
3161 dump_instructions(NULL);
3162 }
3163
3164 void
3165 fs_visitor::dump_instructions(const char *name)
3166 {
3167 calculate_register_pressure();
3168 FILE *file = stderr;
3169 if (name && geteuid() != 0) {
3170 file = fopen(name, "w");
3171 if (!file)
3172 file = stderr;
3173 }
3174
3175 int ip = 0, max_pressure = 0;
3176 foreach_block_and_inst(block, backend_instruction, inst, cfg) {
3177 max_pressure = MAX2(max_pressure, regs_live_at_ip[ip]);
3178 fprintf(file, "{%3d} %4d: ", regs_live_at_ip[ip], ip);
3179 dump_instruction(inst, file);
3180 ++ip;
3181 }
3182 fprintf(file, "Maximum %3d registers live at once.\n", max_pressure);
3183
3184 if (file != stderr) {
3185 fclose(file);
3186 }
3187 }
3188
3189 void
3190 fs_visitor::dump_instruction(backend_instruction *be_inst)
3191 {
3192 dump_instruction(be_inst, stderr);
3193 }
3194
3195 void
3196 fs_visitor::dump_instruction(backend_instruction *be_inst, FILE *file)
3197 {
3198 fs_inst *inst = (fs_inst *)be_inst;
3199
3200 if (inst->predicate) {
3201 fprintf(file, "(%cf0.%d) ",
3202 inst->predicate_inverse ? '-' : '+',
3203 inst->flag_subreg);
3204 }
3205
3206 fprintf(file, "%s", brw_instruction_name(inst->opcode));
3207 if (inst->saturate)
3208 fprintf(file, ".sat");
3209 if (inst->conditional_mod) {
3210 fprintf(file, "%s", conditional_modifier[inst->conditional_mod]);
3211 if (!inst->predicate &&
3212 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
3213 inst->opcode != BRW_OPCODE_IF &&
3214 inst->opcode != BRW_OPCODE_WHILE))) {
3215 fprintf(file, ".f0.%d", inst->flag_subreg);
3216 }
3217 }
3218 fprintf(file, "(%d) ", inst->exec_size);
3219
3220
3221 switch (inst->dst.file) {
3222 case GRF:
3223 fprintf(file, "vgrf%d", inst->dst.reg);
3224 if (inst->dst.width != dispatch_width)
3225 fprintf(file, "@%d", inst->dst.width);
3226 if (virtual_grf_sizes[inst->dst.reg] != inst->dst.width / 8 ||
3227 inst->dst.subreg_offset)
3228 fprintf(file, "+%d.%d",
3229 inst->dst.reg_offset, inst->dst.subreg_offset);
3230 break;
3231 case MRF:
3232 fprintf(file, "m%d", inst->dst.reg);
3233 break;
3234 case BAD_FILE:
3235 fprintf(file, "(null)");
3236 break;
3237 case UNIFORM:
3238 fprintf(file, "***u%d***", inst->dst.reg + inst->dst.reg_offset);
3239 break;
3240 case ATTR:
3241 fprintf(file, "***attr%d***", inst->dst.reg + inst->dst.reg_offset);
3242 break;
3243 case HW_REG:
3244 if (inst->dst.fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3245 switch (inst->dst.fixed_hw_reg.nr) {
3246 case BRW_ARF_NULL:
3247 fprintf(file, "null");
3248 break;
3249 case BRW_ARF_ADDRESS:
3250 fprintf(file, "a0.%d", inst->dst.fixed_hw_reg.subnr);
3251 break;
3252 case BRW_ARF_ACCUMULATOR:
3253 fprintf(file, "acc%d", inst->dst.fixed_hw_reg.subnr);
3254 break;
3255 case BRW_ARF_FLAG:
3256 fprintf(file, "f%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3257 inst->dst.fixed_hw_reg.subnr);
3258 break;
3259 default:
3260 fprintf(file, "arf%d.%d", inst->dst.fixed_hw_reg.nr & 0xf,
3261 inst->dst.fixed_hw_reg.subnr);
3262 break;
3263 }
3264 } else {
3265 fprintf(file, "hw_reg%d", inst->dst.fixed_hw_reg.nr);
3266 }
3267 if (inst->dst.fixed_hw_reg.subnr)
3268 fprintf(file, "+%d", inst->dst.fixed_hw_reg.subnr);
3269 break;
3270 default:
3271 fprintf(file, "???");
3272 break;
3273 }
3274 fprintf(file, ":%s, ", brw_reg_type_letters(inst->dst.type));
3275
3276 for (int i = 0; i < inst->sources; i++) {
3277 if (inst->src[i].negate)
3278 fprintf(file, "-");
3279 if (inst->src[i].abs)
3280 fprintf(file, "|");
3281 switch (inst->src[i].file) {
3282 case GRF:
3283 fprintf(file, "vgrf%d", inst->src[i].reg);
3284 if (inst->src[i].width != dispatch_width)
3285 fprintf(file, "@%d", inst->src[i].width);
3286 if (virtual_grf_sizes[inst->src[i].reg] != inst->src[i].width / 8 ||
3287 inst->src[i].subreg_offset)
3288 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3289 inst->src[i].subreg_offset);
3290 break;
3291 case MRF:
3292 fprintf(file, "***m%d***", inst->src[i].reg);
3293 break;
3294 case ATTR:
3295 fprintf(file, "attr%d", inst->src[i].reg + inst->src[i].reg_offset);
3296 break;
3297 case UNIFORM:
3298 fprintf(file, "u%d", inst->src[i].reg + inst->src[i].reg_offset);
3299 if (inst->src[i].reladdr) {
3300 fprintf(file, "+reladdr");
3301 } else if (inst->src[i].subreg_offset) {
3302 fprintf(file, "+%d.%d", inst->src[i].reg_offset,
3303 inst->src[i].subreg_offset);
3304 }
3305 break;
3306 case BAD_FILE:
3307 fprintf(file, "(null)");
3308 break;
3309 case IMM:
3310 switch (inst->src[i].type) {
3311 case BRW_REGISTER_TYPE_F:
3312 fprintf(file, "%ff", inst->src[i].fixed_hw_reg.dw1.f);
3313 break;
3314 case BRW_REGISTER_TYPE_D:
3315 fprintf(file, "%dd", inst->src[i].fixed_hw_reg.dw1.d);
3316 break;
3317 case BRW_REGISTER_TYPE_UD:
3318 fprintf(file, "%uu", inst->src[i].fixed_hw_reg.dw1.ud);
3319 break;
3320 case BRW_REGISTER_TYPE_VF:
3321 fprintf(file, "[%-gF, %-gF, %-gF, %-gF]",
3322 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 0) & 0xff),
3323 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 8) & 0xff),
3324 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 16) & 0xff),
3325 brw_vf_to_float((inst->src[i].fixed_hw_reg.dw1.ud >> 24) & 0xff));
3326 break;
3327 default:
3328 fprintf(file, "???");
3329 break;
3330 }
3331 break;
3332 case HW_REG:
3333 if (inst->src[i].fixed_hw_reg.negate)
3334 fprintf(file, "-");
3335 if (inst->src[i].fixed_hw_reg.abs)
3336 fprintf(file, "|");
3337 if (inst->src[i].fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE) {
3338 switch (inst->src[i].fixed_hw_reg.nr) {
3339 case BRW_ARF_NULL:
3340 fprintf(file, "null");
3341 break;
3342 case BRW_ARF_ADDRESS:
3343 fprintf(file, "a0.%d", inst->src[i].fixed_hw_reg.subnr);
3344 break;
3345 case BRW_ARF_ACCUMULATOR:
3346 fprintf(file, "acc%d", inst->src[i].fixed_hw_reg.subnr);
3347 break;
3348 case BRW_ARF_FLAG:
3349 fprintf(file, "f%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3350 inst->src[i].fixed_hw_reg.subnr);
3351 break;
3352 default:
3353 fprintf(file, "arf%d.%d", inst->src[i].fixed_hw_reg.nr & 0xf,
3354 inst->src[i].fixed_hw_reg.subnr);
3355 break;
3356 }
3357 } else {
3358 fprintf(file, "hw_reg%d", inst->src[i].fixed_hw_reg.nr);
3359 }
3360 if (inst->src[i].fixed_hw_reg.subnr)
3361 fprintf(file, "+%d", inst->src[i].fixed_hw_reg.subnr);
3362 if (inst->src[i].fixed_hw_reg.abs)
3363 fprintf(file, "|");
3364 break;
3365 default:
3366 fprintf(file, "???");
3367 break;
3368 }
3369 if (inst->src[i].abs)
3370 fprintf(file, "|");
3371
3372 if (inst->src[i].file != IMM) {
3373 fprintf(file, ":%s", brw_reg_type_letters(inst->src[i].type));
3374 }
3375
3376 if (i < inst->sources - 1 && inst->src[i + 1].file != BAD_FILE)
3377 fprintf(file, ", ");
3378 }
3379
3380 fprintf(file, " ");
3381
3382 if (dispatch_width == 16 && inst->exec_size == 8) {
3383 if (inst->force_sechalf)
3384 fprintf(file, "2ndhalf ");
3385 else
3386 fprintf(file, "1sthalf ");
3387 }
3388
3389 fprintf(file, "\n");
3390 }
3391
3392 /**
3393 * Possibly returns an instruction that set up @param reg.
3394 *
3395 * Sometimes we want to take the result of some expression/variable
3396 * dereference tree and rewrite the instruction generating the result
3397 * of the tree. When processing the tree, we know that the
3398 * instructions generated are all writing temporaries that are dead
3399 * outside of this tree. So, if we have some instructions that write
3400 * a temporary, we're free to point that temp write somewhere else.
3401 *
3402 * Note that this doesn't guarantee that the instruction generated
3403 * only reg -- it might be the size=4 destination of a texture instruction.
3404 */
3405 fs_inst *
3406 fs_visitor::get_instruction_generating_reg(fs_inst *start,
3407 fs_inst *end,
3408 const fs_reg &reg)
3409 {
3410 if (end == start ||
3411 end->is_partial_write() ||
3412 reg.reladdr ||
3413 !reg.equals(end->dst)) {
3414 return NULL;
3415 } else {
3416 return end;
3417 }
3418 }
3419
3420 void
3421 fs_visitor::setup_payload_gen6()
3422 {
3423 bool uses_depth =
3424 (prog->InputsRead & (1 << VARYING_SLOT_POS)) != 0;
3425 unsigned barycentric_interp_modes =
3426 (stage == MESA_SHADER_FRAGMENT) ?
3427 ((brw_wm_prog_data*) this->prog_data)->barycentric_interp_modes : 0;
3428
3429 assert(brw->gen >= 6);
3430
3431 /* R0-1: masks, pixel X/Y coordinates. */
3432 payload.num_regs = 2;
3433 /* R2: only for 32-pixel dispatch. */
3434
3435 /* R3-26: barycentric interpolation coordinates. These appear in the
3436 * same order that they appear in the brw_wm_barycentric_interp_mode
3437 * enum. Each set of coordinates occupies 2 registers if dispatch width
3438 * == 8 and 4 registers if dispatch width == 16. Coordinates only
3439 * appear if they were enabled using the "Barycentric Interpolation
3440 * Mode" bits in WM_STATE.
3441 */
3442 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
3443 if (barycentric_interp_modes & (1 << i)) {
3444 payload.barycentric_coord_reg[i] = payload.num_regs;
3445 payload.num_regs += 2;
3446 if (dispatch_width == 16) {
3447 payload.num_regs += 2;
3448 }
3449 }
3450 }
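/* E.g., a SIMD16 shader using perspective pixel and perspective
 * centroid barycentrics consumes 4 + 4 = 8 payload registers here.
 */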
3451
3452 /* R27: interpolated depth if uses source depth */
3453 if (uses_depth) {
3454 payload.source_depth_reg = payload.num_regs;
3455 payload.num_regs++;
3456 if (dispatch_width == 16) {
3457 /* R28: interpolated depth if not SIMD8. */
3458 payload.num_regs++;
3459 }
3460 }
3461 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3462 if (uses_depth) {
3463 payload.source_w_reg = payload.num_regs;
3464 payload.num_regs++;
3465 if (dispatch_width == 16) {
3466 /* R30: interpolated W if not SIMD8. */
3467 payload.num_regs++;
3468 }
3469 }
3470
3471 if (stage == MESA_SHADER_FRAGMENT) {
3472 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3473 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3474 prog_data->uses_pos_offset = key->compute_pos_offset;
3475 /* R31: MSAA position offsets. */
3476 if (prog_data->uses_pos_offset) {
3477 payload.sample_pos_reg = payload.num_regs;
3478 payload.num_regs++;
3479 }
3480 }
3481
3482 /* R32: MSAA input coverage mask */
3483 if (prog->SystemValuesRead & SYSTEM_BIT_SAMPLE_MASK_IN) {
3484 assert(brw->gen >= 7);
3485 payload.sample_mask_in_reg = payload.num_regs;
3486 payload.num_regs++;
3487 if (dispatch_width == 16) {
3488 /* R33: input coverage mask if not SIMD8. */
3489 payload.num_regs++;
3490 }
3491 }
3492
3493 /* R34-: bary for 32-pixel. */
3494 /* R58-59: interp W for 32-pixel. */
3495
3496 if (prog->OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3497 source_depth_to_render_target = true;
3498 }
3499 }
3500
3501 void
3502 fs_visitor::setup_vs_payload()
3503 {
3504 /* R0: thread header, R1: urb handles */
3505 payload.num_regs = 2;
3506 }
3507
3508 void
3509 fs_visitor::assign_binding_table_offsets()
3510 {
3511 assert(stage == MESA_SHADER_FRAGMENT);
3512 brw_wm_prog_data *prog_data = (brw_wm_prog_data*) this->prog_data;
3513 brw_wm_prog_key *key = (brw_wm_prog_key*) this->key;
3514 uint32_t next_binding_table_offset = 0;
3515
3516 /* If there are no color regions, we still perform an FB write to a null
3517 * renderbuffer, which we place at surface index 0.
3518 */
3519 prog_data->binding_table.render_target_start = next_binding_table_offset;
3520 next_binding_table_offset += MAX2(key->nr_color_regions, 1);
3521
3522 assign_common_binding_table_offsets(next_binding_table_offset);
3523 }
3524
3525 void
3526 fs_visitor::calculate_register_pressure()
3527 {
3528 invalidate_live_intervals();
3529 calculate_live_intervals();
3530
3531 unsigned num_instructions = 0;
3532 foreach_block(block, cfg)
3533 num_instructions += block->instructions.length();
3534
3535 regs_live_at_ip = rzalloc_array(mem_ctx, int, num_instructions);
3536
3537 for (int reg = 0; reg < virtual_grf_count; reg++) {
3538 for (int ip = virtual_grf_start[reg]; ip <= virtual_grf_end[reg]; ip++)
3539 regs_live_at_ip[ip] += virtual_grf_sizes[reg];
3540 }
3541 }
3542
3543 void
3544 fs_visitor::optimize()
3545 {
3546 const char *stage_name = stage == MESA_SHADER_VERTEX ? "vs" : "fs";
3547
3548 calculate_cfg();
3549
3550 split_virtual_grfs();
3551
3552 move_uniform_array_access_to_pull_constants();
3553 assign_constant_locations();
3554 demote_pull_constants();
3555
3556 #define OPT(pass, args...) ({ \
3557 pass_num++; \
3558 bool this_progress = pass(args); \
3559 \
3560 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER) && this_progress) { \
3561 char filename[64]; \
3562 snprintf(filename, 64, "%s%d-%04d-%02d-%02d-" #pass, \
3563 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0, iteration, pass_num); \
3564 \
3565 backend_visitor::dump_instructions(filename); \
3566 } \
3567 \
3568 progress = progress || this_progress; \
3569 this_progress; \
3570 })
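/* OPT() runs a pass, folds its result into the outer progress flag so
 * the loop below keeps iterating while anything changes, and, when
 * INTEL_DEBUG's optimizer flag is set, dumps the IR after every pass
 * that made progress.
 */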
3571
3572 if (unlikely(INTEL_DEBUG & DEBUG_OPTIMIZER)) {
3573 char filename[64];
3574 snprintf(filename, 64, "%s%d-%04d-00-start",
3575 stage_name, dispatch_width, shader_prog ? shader_prog->Name : 0);
3576
3577 backend_visitor::dump_instructions(filename);
3578 }
3579
3580 bool progress;
3581 int iteration = 0;
3582 int pass_num = 0;
3583 do {
3584 progress = false;
3585 pass_num = 0;
3586 iteration++;
3587
3588 OPT(remove_duplicate_mrf_writes);
3589
3590 OPT(opt_algebraic);
3591 OPT(opt_cse);
3592 OPT(opt_copy_propagate);
3593 OPT(opt_peephole_predicated_break);
3594 OPT(opt_cmod_propagation);
3595 OPT(dead_code_eliminate);
3596 OPT(opt_peephole_sel);
3597 OPT(dead_control_flow_eliminate, this);
3598 OPT(opt_register_renaming);
3599 OPT(opt_saturate_propagation);
3600 OPT(register_coalesce);
3601 OPT(compute_to_mrf);
3602
3603 OPT(compact_virtual_grfs);
3604 } while (progress);
3605
3606 pass_num = 0;
3607
3608 if (OPT(lower_load_payload)) {
3609 split_virtual_grfs();
3610 OPT(register_coalesce);
3611 OPT(compute_to_mrf);
3612 OPT(dead_code_eliminate);
3613 }
3614
3615 lower_uniform_pull_constant_loads();
3616 }
3617
3618 /**
3619 * Three-source instructions must have a GRF/MRF destination register.
3620 * ARF NULL is not allowed. Fix that up by allocating a temporary GRF.
3621 */
3622 void
3623 fs_visitor::fixup_3src_null_dest()
3624 {
3625 foreach_block_and_inst_safe (block, fs_inst, inst, cfg) {
3626 if (inst->is_3src() && inst->dst.is_null()) {
3627 inst->dst = fs_reg(GRF, virtual_grf_alloc(dispatch_width / 8),
3628 inst->dst.type);
3629 }
3630 }
3631 }
3632
3633 void
3634 fs_visitor::allocate_registers()
3635 {
3636 bool allocated_without_spills;
3637
3638 static const enum instruction_scheduler_mode pre_modes[] = {
3639 SCHEDULE_PRE,
3640 SCHEDULE_PRE_NON_LIFO,
3641 SCHEDULE_PRE_LIFO,
3642 };
3643
3644 /* Try each scheduling heuristic to see if it can successfully register
3645 * allocate without spilling. They should be ordered by decreasing
3646 * performance but increasing likelihood of allocating.
3647 */
3648 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3649 schedule_instructions(pre_modes[i]);
3650
3651 if (0) {
3652 assign_regs_trivial();
3653 allocated_without_spills = true;
3654 } else {
3655 allocated_without_spills = assign_regs(false);
3656 }
3657 if (allocated_without_spills)
3658 break;
3659 }
3660
3661 if (!allocated_without_spills) {
3662 const char *stage_name = stage == MESA_SHADER_VERTEX ?
3663 "Vertex" : "Fragment";
3664
3665 /* We assume that any spilling is worse than just dropping back to
3666 * SIMD8. There's probably actually some intermediate point where
3667 * SIMD16 with a couple of spills is still better.
3668 */
3669 if (dispatch_width == 16) {
3670 fail("Failure to register allocate. Reduce number of "
3671 "live scalar values to avoid this.");
3672 } else {
3673 perf_debug("%s shader triggered register spilling. "
3674 "Try reducing the number of live scalar values to "
3675 "improve performance.\n", stage_name);
3676 }
3677
3678 /* Since we're out of heuristics, just go spill registers until we
3679 * get an allocation.
3680 */
3681 while (!assign_regs(true)) {
3682 if (failed)
3683 break;
3684 }
3685 }
3686
3687 /* This must come after all optimization and register allocation, since
3688 * it inserts dead code that happens to have side effects, and it does
3689 * so based on the actual physical registers in use.
3690 */
3691 insert_gen4_send_dependency_workarounds();
3692
3693 if (failed)
3694 return;
3695
3696 if (!allocated_without_spills)
3697 schedule_instructions(SCHEDULE_POST);
3698
3699 if (last_scratch > 0)
3700 prog_data->total_scratch = brw_get_scratch_size(last_scratch);
3701 }
3702
3703 bool
3704 fs_visitor::run_vs()
3705 {
3706 assert(stage == MESA_SHADER_VERTEX);
3707
3708 assign_common_binding_table_offsets(0);
3709 setup_vs_payload();
3710
3711 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3712 emit_shader_time_begin();
3713
3714 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3715 base_ir = ir;
3716 this->result = reg_undef;
3717 ir->accept(this);
3718 }
3719 base_ir = NULL;
3720 if (failed)
3721 return false;
3722
3723 emit_urb_writes();
3724
3725 optimize();
3726
3727 assign_curb_setup();
3728 assign_vs_urb_setup();
3729
3730 fixup_3src_null_dest();
3731 allocate_registers();
3732
3733 return !failed;
3734 }
3735
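/**
 * Compile the fragment shader at the current dispatch width (SIMD8 or
 * SIMD16): set up the payload and interpolation, emit IR for main() (or
 * for the fixed-function fragment program), emit the framebuffer writes,
 * then optimize and allocate registers.
 */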
3736 bool
3737 fs_visitor::run_fs()
3738 {
3739 brw_wm_prog_data *wm_prog_data = (brw_wm_prog_data *) this->prog_data;
3740 brw_wm_prog_key *wm_key = (brw_wm_prog_key *) this->key;
3741
3742 assert(stage == MESA_SHADER_FRAGMENT);
3743
3744 sanity_param_count = prog->Parameters->NumParameters;
3745
3746 assign_binding_table_offsets();
3747
3748 if (brw->gen >= 6)
3749 setup_payload_gen6();
3750 else
3751 setup_payload_gen4();
3752
3753 if (0) {
3754 emit_dummy_fs();
3755 } else if (brw->use_rep_send && dispatch_width == 16) {
3756 emit_repclear_shader();
3757 } else {
3758 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3759 emit_shader_time_begin();
3760
3761 calculate_urb_setup();
3762 if (prog->InputsRead > 0) {
3763 if (brw->gen < 6)
3764 emit_interpolation_setup_gen4();
3765 else
3766 emit_interpolation_setup_gen6();
3767 }
3768
3769 /* We handle discards by keeping track of the still-live pixels in f0.1.
3770 * Initialize it with the dispatched pixels.
3771 */
3772 if (wm_prog_data->uses_kill) {
3773 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3774 discard_init->flag_subreg = 1;
3775 }
3776
3777 /* Generate FS IR for main(). (The visitor only descends into
3778 * functions called "main".)
3779 */
3780 if (shader) {
3781 if (getenv("INTEL_USE_NIR") != NULL) {
3782 emit_nir_code();
3783 } else {
3784 foreach_in_list(ir_instruction, ir, shader->base.ir) {
3785 base_ir = ir;
3786 this->result = reg_undef;
3787 ir->accept(this);
3788 }
3789 }
3790 } else {
3791 emit_fragment_program_code();
3792 }
3793 base_ir = NULL;
3794 if (failed)
3795 return false;
3796
3797 emit(FS_OPCODE_PLACEHOLDER_HALT);
3798
3799 if (wm_key->alpha_test_func)
3800 emit_alpha_test();
3801
3802 emit_fb_writes();
3803
3804 optimize();
3805
3806 assign_curb_setup();
3807 assign_urb_setup();
3808
3809 fixup_3src_null_dest();
3810 allocate_registers();
3811
3812 if (failed)
3813 return false;
3814 }
3815
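/* Record how many register blocks the program used at this dispatch width. */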
3816 if (dispatch_width == 8)
3817 wm_prog_data->reg_blocks = brw_register_blocks(grf_used);
3818 else
3819 wm_prog_data->reg_blocks_16 = brw_register_blocks(grf_used);
3820
3821 /* If any state parameters were appended, then ParameterValues could have
3822 * been realloced, in which case the driver uniform storage set up by
3823 * _mesa_associate_uniform_storage() would point to freed memory. Make
3824 * sure that didn't happen.
3825 */
3826 assert(sanity_param_count == prog->Parameters->NumParameters);
3827
3828 return !failed;
3829 }
3830
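/**
 * Compile a GLSL fragment shader or ARB fragment program to native code.
 *
 * A SIMD8 program is always compiled first; a SIMD16 variant is attempted
 * as well when the hardware, the debug flags, and the shader itself allow
 * it, and the generator emits code for whichever programs will be used.
 */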
3831 const unsigned *
3832 brw_wm_fs_emit(struct brw_context *brw,
3833 void *mem_ctx,
3834 const struct brw_wm_prog_key *key,
3835 struct brw_wm_prog_data *prog_data,
3836 struct gl_fragment_program *fp,
3837 struct gl_shader_program *prog,
3838 unsigned *final_assembly_size)
3839 {
3840 bool start_busy = false;
3841 double start_time = 0;
3842
3843 if (unlikely(brw->perf_debug)) {
3844 start_busy = (brw->batch.last_bo &&
3845 drm_intel_bo_busy(brw->batch.last_bo));
3846 start_time = get_time();
3847 }
3848
3849 struct brw_shader *shader = NULL;
3850 if (prog)
3851 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3852
3853 if (unlikely(INTEL_DEBUG & DEBUG_WM))
3854 brw_dump_ir("fragment", prog, &shader->base, &fp->Base);
3855
3856 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3857 */
3858 fs_visitor v(brw, mem_ctx, key, prog_data, prog, fp, 8);
3859 if (!v.run_fs()) {
3860 if (prog) {
3861 prog->LinkStatus = false;
3862 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3863 }
3864
3865 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3866 v.fail_msg);
3867
3868 return NULL;
3869 }
3870
3871 cfg_t *simd16_cfg = NULL;
3872 fs_visitor v2(brw, mem_ctx, key, prog_data, prog, fp, 16);
3873 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16) ||
3874 brw->use_rep_send)) {
3875 if (!v.simd16_unsupported) {
3876 /* Try a SIMD16 compile */
3877 v2.import_uniforms(&v);
3878 if (!v2.run_fs()) {
3879 perf_debug("SIMD16 shader failed to compile, falling back to "
3880 "SIMD8 at a 10-20%% performance cost: %s", v2.fail_msg);
3881 } else {
3882 simd16_cfg = v2.cfg;
3883 }
3884 } else {
3885 perf_debug("SIMD16 shader unsupported, falling back to "
3886 "SIMD8 at a 10-20%% performance cost: %s", v.no16_msg);
3887 }
3888 }
3889
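/* Decide which of the compiled programs to emit.  SIMD8 can only be
 * dropped when a SIMD16 program is available to replace it.
 */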
3890 cfg_t *simd8_cfg;
3891 int no_simd8 = (INTEL_DEBUG & DEBUG_NO8) || brw->no_simd8;
3892 if (no_simd8 && simd16_cfg) {
3893 simd8_cfg = NULL;
3894 prog_data->no_8 = true;
3895 } else {
3896 simd8_cfg = v.cfg;
3897 prog_data->no_8 = false;
3898 }
3899
3900 fs_generator g(brw, mem_ctx, (void *) key, &prog_data->base,
3901 &fp->Base, v.runtime_check_aads_emit, "FS");
3902
3903 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3904 char *name;
3905 if (prog)
3906 name = ralloc_asprintf(mem_ctx, "%s fragment shader %d",
3907 prog->Label ? prog->Label : "unnamed",
3908 prog->Name);
3909 else
3910 name = ralloc_asprintf(mem_ctx, "fragment program %d", fp->Base.Id);
3911
3912 g.enable_debug(name);
3913 }
3914
3915 if (simd8_cfg)
3916 g.generate_code(simd8_cfg, 8);
3917 if (simd16_cfg)
3918 prog_data->prog_offset_16 = g.generate_code(simd16_cfg, 16);
3919
3920 if (unlikely(brw->perf_debug) && shader) {
3921 if (shader->compiled_once)
3922 brw_wm_debug_recompile(brw, prog, key);
3923 shader->compiled_once = true;
3924
3925 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3926 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3927 (get_time() - start_time) * 1000);
3928 }
3929 }
3930
3931 return g.get_assembly(final_assembly_size);
3932 }
3933
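/**
 * Precompile the fragment program with a guessed program key, so that the
 * common case does not require a recompile at draw time.  The current WM
 * program state is saved and restored around the compile.
 */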
3934 extern "C" bool
3935 brw_fs_precompile(struct gl_context *ctx,
3936 struct gl_shader_program *shader_prog,
3937 struct gl_program *prog)
3938 {
3939 struct brw_context *brw = brw_context(ctx);
3940 struct brw_wm_prog_key key;
3941
3942 struct gl_fragment_program *fp = (struct gl_fragment_program *) prog;
3943 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3944 bool program_uses_dfdy = fp->UsesDFdy;
3945
3946 memset(&key, 0, sizeof(key));
3947
3948 if (brw->gen < 6) {
3949 if (fp->UsesKill)
3950 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3951
3952 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3953 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3954
3955 /* Just assume depth testing. */
3956 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3957 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3958 }
3959
3960 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3961 BRW_FS_VARYING_INPUT_MASK) > 16)
3962 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3963
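/* Guess the per-sampler texture swizzles.  Without shader channel select,
 * swizzling is baked into the compiled program, so shadow samplers get the
 * default DEPTH_TEXTURE_MODE swizzle and color samplers get no swizzle.
 */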
3964 const bool has_shader_channel_select = brw->is_haswell || brw->gen >= 8;
3965 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3966 for (unsigned i = 0; i < sampler_count; i++) {
3967 if (!has_shader_channel_select && (fp->Base.ShadowSamplers & (1 << i))) {
3968 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3969 key.tex.swizzles[i] =
3970 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3971 } else {
3972 /* Color sampler: assume no swizzling. */
3973 key.tex.swizzles[i] = SWIZZLE_XYZW;
3974 }
3975 }
3976
3977 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3978 key.drawable_height = ctx->DrawBuffer->Height;
3979 }
3980
3981 key.nr_color_regions = _mesa_bitcount_64(fp->Base.OutputsWritten &
3982 ~(BITFIELD64_BIT(FRAG_RESULT_DEPTH) |
3983 BITFIELD64_BIT(FRAG_RESULT_SAMPLE_MASK)));
3984
3985 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3986 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer) ||
3987 key.nr_color_regions > 1;
3988 }
3989
3990 key.program_string_id = bfp->id;
3991
3992 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3993 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3994
3995 bool success = do_wm_prog(brw, shader_prog, bfp, &key);
3996
3997 brw->wm.base.prog_offset = old_prog_offset;
3998 brw->wm.prog_data = old_prog_data;
3999
4000 return success;
4001 }