1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/uniforms.h"
39 #include "main/fbobject.h"
40 #include "program/prog_parameter.h"
41 #include "program/prog_print.h"
42 #include "program/register_allocate.h"
43 #include "program/sampler.h"
44 #include "program/hash_table.h"
45 #include "brw_context.h"
46 #include "brw_eu.h"
47 #include "brw_wm.h"
48 }
49 #include "brw_fs.h"
50 #include "glsl/glsl_types.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63
64 /* This will be the case for almost all instructions. */
65 this->regs_written = 1;
66 }
67
68 fs_inst::fs_inst()
69 {
70 init();
71 }
72
73 fs_inst::fs_inst(enum opcode opcode)
74 {
75 init();
76 this->opcode = opcode;
77 }
78
79 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
80 {
81 init();
82 this->opcode = opcode;
83 this->dst = dst;
84
85 if (dst.file == GRF)
86 assert(dst.reg_offset >= 0);
87 }
88
89 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
90 {
91 init();
92 this->opcode = opcode;
93 this->dst = dst;
94 this->src[0] = src0;
95
96 if (dst.file == GRF)
97 assert(dst.reg_offset >= 0);
98 if (src[0].file == GRF)
99 assert(src[0].reg_offset >= 0);
100 }
101
102 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
103 {
104 init();
105 this->opcode = opcode;
106 this->dst = dst;
107 this->src[0] = src0;
108 this->src[1] = src1;
109
110 if (dst.file == GRF)
111 assert(dst.reg_offset >= 0);
112 if (src[0].file == GRF)
113 assert(src[0].reg_offset >= 0);
114 if (src[1].file == GRF)
115 assert(src[1].reg_offset >= 0);
116 }
117
118 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
119 fs_reg src0, fs_reg src1, fs_reg src2)
120 {
121 init();
122 this->opcode = opcode;
123 this->dst = dst;
124 this->src[0] = src0;
125 this->src[1] = src1;
126 this->src[2] = src2;
127
128 if (dst.file == GRF)
129 assert(dst.reg_offset >= 0);
130 if (src[0].file == GRF)
131 assert(src[0].reg_offset >= 0);
132 if (src[1].file == GRF)
133 assert(src[1].reg_offset >= 0);
134 if (src[2].file == GRF)
135 assert(src[2].reg_offset >= 0);
136 }
137
138 #define ALU1(op) \
139 fs_inst * \
140 fs_visitor::op(fs_reg dst, fs_reg src0) \
141 { \
142 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
143 }
144
145 #define ALU2(op) \
146 fs_inst * \
147 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
148 { \
149 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
150 }
151
152 #define ALU3(op) \
153 fs_inst * \
154 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
155 { \
156 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
157 }
158
159 ALU1(NOT)
160 ALU1(MOV)
161 ALU1(FRC)
162 ALU1(RNDD)
163 ALU1(RNDE)
164 ALU1(RNDZ)
165 ALU2(ADD)
166 ALU2(MUL)
167 ALU2(MACH)
168 ALU2(AND)
169 ALU2(OR)
170 ALU2(XOR)
171 ALU2(SHL)
172 ALU2(SHR)
173 ALU2(ASR)
174 ALU3(LRP)
175 ALU1(BFREV)
176 ALU3(BFE)
177 ALU2(BFI1)
178 ALU3(BFI2)
179 ALU1(FBH)
180 ALU1(FBL)
181 ALU1(CBIT)
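/* As an illustration (not part of the original file): ALU2(ADD) above
 * expands to a builder that allocates the instruction out of mem_ctx:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 *
 * These helpers only construct the fs_inst; the caller still passes the
 * result to emit() to append it to the instruction stream.
 */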
182
183 /** Gen4 predicated IF. */
184 fs_inst *
185 fs_visitor::IF(uint32_t predicate)
186 {
187 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
188 inst->predicate = predicate;
189 return inst;
190 }
191
192 /** Gen6+ IF with embedded comparison. */
193 fs_inst *
194 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
195 {
196 assert(brw->gen >= 6);
197 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
198 reg_null_d, src0, src1);
199 inst->conditional_mod = condition;
200 return inst;
201 }
202
203 /**
204 * CMP: Sets the low bit of the destination channels with the result
205 * of the comparison, while the upper bits are undefined, and updates
206 * the flag register with the packed 16 bits of the result.
207 */
208 fs_inst *
209 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
210 {
211 fs_inst *inst;
212
213 /* Take the instruction:
214 *
215 * CMP null<d> src0<f> src1<f>
216 *
217 * Original gen4 does type conversion to the destination type before
218 * comparison, producing garbage results for floating point comparisons.
219 * gen5 does the comparison on the execution type (resolved source types),
220 * so dst type doesn't matter. gen6 does comparison and then uses the
221 * result as if it was the dst type with no conversion, which happens to
222 * mostly work out for float-interpreted-as-int since our comparisons are
223 * for >0, =0, <0.
224 */
225 if (brw->gen == 4) {
226 dst.type = src0.type;
227 if (dst.file == HW_REG)
228 dst.fixed_hw_reg.type = dst.type;
229 }
230
231 resolve_ud_negate(&src0);
232 resolve_ud_negate(&src1);
233
234 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
235 inst->conditional_mod = condition;
236
237 return inst;
238 }
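/* Typical usage, as a minimal sketch (x, a, b and dst are hypothetical
 * registers): emit a comparison purely for its flag side effect, then
 * predicate a following instruction on the result:
 *
 *    emit(CMP(reg_null_d, x, fs_reg(0.0f), BRW_CONDITIONAL_GE));
 *    fs_inst *sel = emit(BRW_OPCODE_SEL, dst, a, b);
 *    sel->predicate = BRW_PREDICATE_NORMAL;
 *
 * Writing to reg_null_d discards the per-channel low-bit results and
 * keeps only the packed flag-register update.
 */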
239
240 exec_list
241 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
242 fs_reg varying_offset,
243 uint32_t const_offset)
244 {
245 exec_list instructions;
246 fs_inst *inst;
247
248 /* We have our constant surface use a pitch of 4 bytes, so our index can
249 * be any component of a vector, and then we load 4 contiguous
250 * components starting from that.
251 *
252 * We break down the const_offset to a portion added to the variable
253 * offset and a portion done using reg_offset, which means that if you
254 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
255 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
256 * CSE can later notice that those loads are all the same and eliminate
257 * the redundant ones.
258 */
259 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
260 instructions.push_tail(ADD(vec4_offset,
261 varying_offset, const_offset & ~3));
262
263 int scale = 1;
264 if (brw->gen == 4 && dispatch_width == 8) {
265 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
266 * u, v, r) as parameters, or we can just use the SIMD16 message
267 * consisting of (header, u). We choose the second, at the cost of a
268 * longer return length.
269 */
270 scale = 2;
271 }
272
273 enum opcode op;
274 if (brw->gen >= 7)
275 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
276 else
277 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
278 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
279 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
280 inst->regs_written = 4 * scale;
281 instructions.push_tail(inst);
282
283 if (brw->gen < 7) {
284 inst->base_mrf = 13;
285 inst->header_present = true;
286 if (brw->gen == 4)
287 inst->mlen = 3;
288 else
289 inst->mlen = 1 + dispatch_width / 8;
290 }
291
292 vec4_result.reg_offset += (const_offset & 3) * scale;
293 instructions.push_tail(MOV(dst, vec4_result));
294
295 return instructions;
296 }
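/* Worked example (illustrative): with const_offset == 7 and scale == 1,
 * the vec4-aligned part (7 & ~3 == 4) is added to varying_offset, the
 * send fills vec4_result with four contiguous components, and the
 * remainder (7 & 3 == 3) bumps reg_offset to select the fourth component
 * for the final MOV into dst.
 */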
297
298 /**
299 * A helper that generates a MOV to work around broken hardware SEND
300 * dependency handling.
301 */
302 fs_inst *
303 fs_visitor::DEP_RESOLVE_MOV(int grf)
304 {
305 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
306
307 inst->ir = NULL;
308 inst->annotation = "send dependency resolve";
309
310 /* The caller always wants uncompressed to emit the minimal extra
311 * dependencies, and to avoid having to deal with aligning its regs to 2.
312 */
313 inst->force_uncompressed = true;
314
315 return inst;
316 }
317
318 bool
319 fs_inst::equals(fs_inst *inst)
320 {
321 return (opcode == inst->opcode &&
322 dst.equals(inst->dst) &&
323 src[0].equals(inst->src[0]) &&
324 src[1].equals(inst->src[1]) &&
325 src[2].equals(inst->src[2]) &&
326 saturate == inst->saturate &&
327 predicate == inst->predicate &&
328 conditional_mod == inst->conditional_mod &&
329 mlen == inst->mlen &&
330 base_mrf == inst->base_mrf &&
331 sampler == inst->sampler &&
332 target == inst->target &&
333 eot == inst->eot &&
334 header_present == inst->header_present &&
335 shadow_compare == inst->shadow_compare &&
336 offset == inst->offset);
337 }
338
339 bool
340 fs_inst::overwrites_reg(const fs_reg &reg)
341 {
342 return (reg.file == dst.file &&
343 reg.reg == dst.reg &&
344 reg.reg_offset >= dst.reg_offset &&
345 reg.reg_offset < dst.reg_offset + regs_written);
346 }
347
348 bool
349 fs_inst::is_send_from_grf()
350 {
351 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
352 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
353 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
354 src[1].file == GRF));
355 }
356
357 bool
358 fs_visitor::can_do_source_mods(fs_inst *inst)
359 {
360 if (brw->gen == 6 && inst->is_math())
361 return false;
362
363 if (inst->is_send_from_grf())
364 return false;
365
366 return true;
367 }
368
369 void
370 fs_reg::init()
371 {
372 memset(this, 0, sizeof(*this));
373 this->smear = -1;
374 }
375
376 /** Generic unset register constructor. */
377 fs_reg::fs_reg()
378 {
379 init();
380 this->file = BAD_FILE;
381 }
382
383 /** Immediate value constructor. */
384 fs_reg::fs_reg(float f)
385 {
386 init();
387 this->file = IMM;
388 this->type = BRW_REGISTER_TYPE_F;
389 this->imm.f = f;
390 }
391
392 /** Immediate value constructor. */
393 fs_reg::fs_reg(int32_t i)
394 {
395 init();
396 this->file = IMM;
397 this->type = BRW_REGISTER_TYPE_D;
398 this->imm.i = i;
399 }
400
401 /** Immediate value constructor. */
402 fs_reg::fs_reg(uint32_t u)
403 {
404 init();
405 this->file = IMM;
406 this->type = BRW_REGISTER_TYPE_UD;
407 this->imm.u = u;
408 }
409
410 /** Fixed brw_reg Immediate value constructor. */
411 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
412 {
413 init();
414 this->file = HW_REG;
415 this->fixed_hw_reg = fixed_hw_reg;
416 this->type = fixed_hw_reg.type;
417 }
418
419 bool
420 fs_reg::equals(const fs_reg &r) const
421 {
422 return (file == r.file &&
423 reg == r.reg &&
424 reg_offset == r.reg_offset &&
425 type == r.type &&
426 negate == r.negate &&
427 abs == r.abs &&
428 !reladdr && !r.reladdr &&
429 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
430 sizeof(fixed_hw_reg)) == 0 &&
431 smear == r.smear &&
432 imm.u == r.imm.u);
433 }
434
435 bool
436 fs_reg::is_zero() const
437 {
438 if (file != IMM)
439 return false;
440
441 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
442 }
443
444 bool
445 fs_reg::is_one() const
446 {
447 if (file != IMM)
448 return false;
449
450 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
451 }
452
453 bool
454 fs_reg::is_valid_3src() const
455 {
456 return file == GRF || file == UNIFORM;
457 }
458
459 int
460 fs_visitor::type_size(const struct glsl_type *type)
461 {
462 unsigned int size, i;
463
464 switch (type->base_type) {
465 case GLSL_TYPE_UINT:
466 case GLSL_TYPE_INT:
467 case GLSL_TYPE_FLOAT:
468 case GLSL_TYPE_BOOL:
469 return type->components();
470 case GLSL_TYPE_ARRAY:
471 return type_size(type->fields.array) * type->length;
472 case GLSL_TYPE_STRUCT:
473 size = 0;
474 for (i = 0; i < type->length; i++) {
475 size += type_size(type->fields.structure[i].type);
476 }
477 return size;
478 case GLSL_TYPE_SAMPLER:
479 /* Samplers take up no register space, since they're baked in at
480 * link time.
481 */
482 return 0;
483 case GLSL_TYPE_VOID:
484 case GLSL_TYPE_ERROR:
485 case GLSL_TYPE_INTERFACE:
486 assert(!"not reached");
487 break;
488 }
489
490 return 0;
491 }
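/* Example sizes (illustrative): float and bool are 1 slot, vec4 is 4,
 * mat3 is 9 (type->components()), "vec4 a[3]" is 12, and a struct is the
 * sum of its members. The count is in scalar components, which is what
 * the automatic fs_reg constructor below hands to virtual_grf_alloc().
 */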
492
493 fs_reg
494 fs_visitor::get_timestamp()
495 {
496 assert(brw->gen >= 7);
497
498 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
499 BRW_ARF_TIMESTAMP,
500 0),
501 BRW_REGISTER_TYPE_UD));
502
503 fs_reg dst = fs_reg(this, glsl_type::uint_type);
504
505 fs_inst *mov = emit(MOV(dst, ts));
506 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
507 * even if it's not enabled in the dispatch.
508 */
509 mov->force_writemask_all = true;
510 mov->force_uncompressed = true;
511
512 /* The caller wants the low 32 bits of the timestamp. Since it's running
513 * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
514 * which is plenty of time for our purposes. It is identical across the
515 * EUs, but since it's tracking GPU core speed it will increment at a
516 * varying rate as render P-states change.
517 *
518 * The caller could also check if render P-states have changed (or anything
519 * else that might disrupt timing) by setting smear to 2 and checking if
520 * that field is != 0.
521 */
522 dst.smear = 0;
523
524 return dst;
525 }
526
527 void
528 fs_visitor::emit_shader_time_begin()
529 {
530 current_annotation = "shader time start";
531 shader_start_time = get_timestamp();
532 }
533
534 void
535 fs_visitor::emit_shader_time_end()
536 {
537 current_annotation = "shader time end";
538
539 enum shader_time_shader_type type, written_type, reset_type;
540 if (dispatch_width == 8) {
541 type = ST_FS8;
542 written_type = ST_FS8_WRITTEN;
543 reset_type = ST_FS8_RESET;
544 } else {
545 assert(dispatch_width == 16);
546 type = ST_FS16;
547 written_type = ST_FS16_WRITTEN;
548 reset_type = ST_FS16_RESET;
549 }
550
551 fs_reg shader_end_time = get_timestamp();
552
553 /* Check that there weren't any timestamp reset events (assuming these
554 * were the only two timestamp reads that happened).
555 */
556 fs_reg reset = shader_end_time;
557 reset.smear = 2;
558 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
559 test->conditional_mod = BRW_CONDITIONAL_Z;
560 emit(IF(BRW_PREDICATE_NORMAL));
561
562 push_force_uncompressed();
563 fs_reg start = shader_start_time;
564 start.negate = true;
565 fs_reg diff = fs_reg(this, glsl_type::uint_type);
566 emit(ADD(diff, start, shader_end_time));
567
568 /* If there were no instructions between the two timestamp gets, the diff
569 * is 2 cycles. Remove that overhead, so I can forget about that when
570 * trying to determine the time taken for single instructions.
571 */
572 emit(ADD(diff, diff, fs_reg(-2u)));
573
574 emit_shader_time_write(type, diff);
575 emit_shader_time_write(written_type, fs_reg(1u));
576 emit(BRW_OPCODE_ELSE);
577 emit_shader_time_write(reset_type, fs_reg(1u));
578 emit(BRW_OPCODE_ENDIF);
579
580 pop_force_uncompressed();
581 }
582
583 void
584 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
585 fs_reg value)
586 {
587 int shader_time_index =
588 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
589 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
590
591 fs_reg payload;
592 if (dispatch_width == 8)
593 payload = fs_reg(this, glsl_type::uvec2_type);
594 else
595 payload = fs_reg(this, glsl_type::uint_type);
596
597 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
598 fs_reg(), payload, offset, value));
599 }
600
601 void
602 fs_visitor::fail(const char *format, ...)
603 {
604 va_list va;
605 char *msg;
606
607 if (failed)
608 return;
609
610 failed = true;
611
612 va_start(va, format);
613 msg = ralloc_vasprintf(mem_ctx, format, va);
614 va_end(va);
615 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
616
617 this->fail_msg = msg;
618
619 if (INTEL_DEBUG & DEBUG_WM) {
620 fprintf(stderr, "%s", msg);
621 }
622 }
623
624 fs_inst *
625 fs_visitor::emit(enum opcode opcode)
626 {
627 return emit(fs_inst(opcode));
628 }
629
630 fs_inst *
631 fs_visitor::emit(enum opcode opcode, fs_reg dst)
632 {
633 return emit(fs_inst(opcode, dst));
634 }
635
636 fs_inst *
637 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
638 {
639 return emit(fs_inst(opcode, dst, src0));
640 }
641
642 fs_inst *
643 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
644 {
645 return emit(fs_inst(opcode, dst, src0, src1));
646 }
647
648 fs_inst *
649 fs_visitor::emit(enum opcode opcode, fs_reg dst,
650 fs_reg src0, fs_reg src1, fs_reg src2)
651 {
652 return emit(fs_inst(opcode, dst, src0, src1, src2));
653 }
654
655 void
656 fs_visitor::push_force_uncompressed()
657 {
658 force_uncompressed_stack++;
659 }
660
661 void
662 fs_visitor::pop_force_uncompressed()
663 {
664 force_uncompressed_stack--;
665 assert(force_uncompressed_stack >= 0);
666 }
667
668 void
669 fs_visitor::push_force_sechalf()
670 {
671 force_sechalf_stack++;
672 }
673
674 void
675 fs_visitor::pop_force_sechalf()
676 {
677 force_sechalf_stack--;
678 assert(force_sechalf_stack >= 0);
679 }
680
681 /**
682 * Returns true if the instruction has a flag that means it won't
683 * update an entire destination register.
684 *
685 * For example, dead code elimination and live variable analysis want to know
686 * when a write to a variable screens off any preceding values that were in
687 * it.
688 */
689 bool
690 fs_inst::is_partial_write()
691 {
692 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
693 this->force_uncompressed ||
694 this->force_sechalf);
695 }
696
697 /**
698 * Returns how many MRFs an FS opcode will write over.
699 *
700 * Note that this is not the 0 or 1 implied writes in an actual gen
701 * instruction -- the FS opcodes often generate MOVs in addition.
702 */
703 int
704 fs_visitor::implied_mrf_writes(fs_inst *inst)
705 {
706 if (inst->mlen == 0)
707 return 0;
708
709 switch (inst->opcode) {
710 case SHADER_OPCODE_RCP:
711 case SHADER_OPCODE_RSQ:
712 case SHADER_OPCODE_SQRT:
713 case SHADER_OPCODE_EXP2:
714 case SHADER_OPCODE_LOG2:
715 case SHADER_OPCODE_SIN:
716 case SHADER_OPCODE_COS:
717 return 1 * dispatch_width / 8;
718 case SHADER_OPCODE_POW:
719 case SHADER_OPCODE_INT_QUOTIENT:
720 case SHADER_OPCODE_INT_REMAINDER:
721 return 2 * dispatch_width / 8;
722 case SHADER_OPCODE_TEX:
723 case FS_OPCODE_TXB:
724 case SHADER_OPCODE_TXD:
725 case SHADER_OPCODE_TXF:
726 case SHADER_OPCODE_TXF_MS:
727 case SHADER_OPCODE_TXL:
728 case SHADER_OPCODE_TXS:
729 case SHADER_OPCODE_LOD:
730 return 1;
731 case FS_OPCODE_FB_WRITE:
732 return 2;
733 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
734 case FS_OPCODE_UNSPILL:
735 return 1;
736 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
737 return inst->mlen;
738 case FS_OPCODE_SPILL:
739 return 2;
740 default:
741 assert(!"not reached");
742 return inst->mlen;
743 }
744 }
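/* For instance (illustrative): a SIMD16 POW reports 2 * 16/8 == 4 MRFs
 * written, while a SIMD8 RCP reports 1. Per the comment above, these
 * counts cover the MOVs that load the message registers, not the SEND
 * itself.
 */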
745
746 int
747 fs_visitor::virtual_grf_alloc(int size)
748 {
749 if (virtual_grf_array_size <= virtual_grf_count) {
750 if (virtual_grf_array_size == 0)
751 virtual_grf_array_size = 16;
752 else
753 virtual_grf_array_size *= 2;
754 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
755 virtual_grf_array_size);
756 }
757 virtual_grf_sizes[virtual_grf_count] = size;
758 return virtual_grf_count++;
759 }
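/* The sizes array grows by the usual doubling scheme (16, 32, 64, ...),
 * so repeated allocations are amortized O(1). The returned index is what
 * fs_reg::reg stores for GRF-file registers.
 */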
760
761 /** Fixed HW reg constructor. */
762 fs_reg::fs_reg(enum register_file file, int reg)
763 {
764 init();
765 this->file = file;
766 this->reg = reg;
767 this->type = BRW_REGISTER_TYPE_F;
768 }
769
770 /** Fixed HW reg constructor. */
771 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
772 {
773 init();
774 this->file = file;
775 this->reg = reg;
776 this->type = type;
777 }
778
779 /** Automatic reg constructor. */
780 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
781 {
782 init();
783
784 this->file = GRF;
785 this->reg = v->virtual_grf_alloc(v->type_size(type));
786 this->reg_offset = 0;
787 this->type = brw_type_for_base_type(type);
788 }
789
790 fs_reg *
791 fs_visitor::variable_storage(ir_variable *var)
792 {
793 return (fs_reg *)hash_table_find(this->variable_ht, var);
794 }
795
796 void
797 import_uniforms_callback(const void *key,
798 void *data,
799 void *closure)
800 {
801 struct hash_table *dst_ht = (struct hash_table *)closure;
802 const fs_reg *reg = (const fs_reg *)data;
803
804 if (reg->file != UNIFORM)
805 return;
806
807 hash_table_insert(dst_ht, data, key);
808 }
809
810 /* For 16-wide, we need to carry over the uniform setup from the 8-wide
811 * dispatch. This brings in those uniform definitions.
812 */
813 void
814 fs_visitor::import_uniforms(fs_visitor *v)
815 {
816 hash_table_call_foreach(v->variable_ht,
817 import_uniforms_callback,
818 variable_ht);
819 this->params_remap = v->params_remap;
820 this->nr_params_remap = v->nr_params_remap;
821 }
822
823 /* Our support for uniforms is piggy-backed on the struct
824 * gl_fragment_program, because that's where the values actually
825 * get stored, rather than in some global gl_shader_program uniform
826 * store.
827 */
828 void
829 fs_visitor::setup_uniform_values(ir_variable *ir)
830 {
831 int namelen = strlen(ir->name);
832
833 /* The data for our (non-builtin) uniforms is stored in a series of
834 * gl_uniform_driver_storage structs for each subcomponent that
835 * glGetUniformLocation() could name. We know it's been set up in the same
836 * order we'd walk the type, so walk the list of storage and find anything
837 * with our name, or the prefix of a component that starts with our name.
838 */
839 unsigned params_before = c->prog_data.nr_params;
840 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
841 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
842
843 if (strncmp(ir->name, storage->name, namelen) != 0 ||
844 (storage->name[namelen] != 0 &&
845 storage->name[namelen] != '.' &&
846 storage->name[namelen] != '[')) {
847 continue;
848 }
849
850 unsigned slots = storage->type->component_slots();
851 if (storage->array_elements)
852 slots *= storage->array_elements;
853
854 for (unsigned i = 0; i < slots; i++) {
855 c->prog_data.param[c->prog_data.nr_params++] =
856 &storage->storage[i].f;
857 }
858 }
859
860 /* Make sure we actually initialized the right amount of stuff here. */
861 assert(params_before + ir->type->component_slots() ==
862 c->prog_data.nr_params);
863 (void)params_before;
864 }
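/* Sketch (hypothetical declaration): for "uniform vec3 color[2]", the
 * name "color" prefix-matches its storage entry, component_slots() == 3
 * and array_elements == 2, so six pointers (&storage->storage[i].f) are
 * appended to c->prog_data.param in walk order.
 */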
865
866
867 /* Our support for builtin uniforms is even scarier than non-builtin.
868 * It sits on top of the PROG_STATE_VAR parameters that are
869 * automatically updated from GL context state.
870 */
871 void
872 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
873 {
874 const ir_state_slot *const slots = ir->state_slots;
875 assert(ir->state_slots != NULL);
876
877 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
878 /* This state reference has already been setup by ir_to_mesa, but we'll
879 * get the same index back here.
880 */
881 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
882 (gl_state_index *)slots[i].tokens);
883
884 /* Add each of the unique swizzles of the element as a parameter.
885 * This'll end up matching the expected layout of the
886 * array/matrix/structure we're trying to fill in.
887 */
888 int last_swiz = -1;
889 for (unsigned int j = 0; j < 4; j++) {
890 int swiz = GET_SWZ(slots[i].swizzle, j);
891 if (swiz == last_swiz)
892 break;
893 last_swiz = swiz;
894
895 c->prog_data.param[c->prog_data.nr_params++] =
896 &fp->Base.Parameters->ParameterValues[index][swiz].f;
897 }
898 }
899 }
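/* Illustrative example: a state slot whose swizzle is XXXX (a scalar
 * state value replicated across components) adds a single param, because
 * the loop breaks as soon as a swizzle component repeats; a full XYZW
 * slot adds four params, matching a vec4's layout.
 */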
900
901 fs_reg *
902 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
903 {
904 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
905 fs_reg wpos = *reg;
906 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
907
908 /* gl_FragCoord.x */
909 if (ir->pixel_center_integer) {
910 emit(MOV(wpos, this->pixel_x));
911 } else {
912 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
913 }
914 wpos.reg_offset++;
915
916 /* gl_FragCoord.y */
917 if (!flip && ir->pixel_center_integer) {
918 emit(MOV(wpos, this->pixel_y));
919 } else {
920 fs_reg pixel_y = this->pixel_y;
921 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
922
923 if (flip) {
924 pixel_y.negate = true;
925 offset += c->key.drawable_height - 1.0;
926 }
927
928 emit(ADD(wpos, pixel_y, fs_reg(offset)));
929 }
930 wpos.reg_offset++;
931
932 /* gl_FragCoord.z */
933 if (brw->gen >= 6) {
934 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
935 } else {
936 emit(FS_OPCODE_LINTERP, wpos,
937 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
938 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
939 interp_reg(VARYING_SLOT_POS, 2));
940 }
941 wpos.reg_offset++;
942
943 /* gl_FragCoord.w: Already set up in emit_interpolation */
944 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
945
946 return reg;
947 }
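/* The Y flip above, spelled out (illustrative arithmetic): with flip
 * set, negating pixel_y and adding drawable_height - 1.0 (plus 0.5 for
 * non-integer pixel centers) computes (height - 1) - pixel_y, converting
 * between GL's lower-left origin and the window system's upper-left one.
 */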
948
949 fs_inst *
950 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
951 glsl_interp_qualifier interpolation_mode,
952 bool is_centroid)
953 {
954 brw_wm_barycentric_interp_mode barycoord_mode;
955 if (brw->gen >= 6) {
956 if (is_centroid) {
957 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
958 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
959 else
960 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
961 } else {
962 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
963 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
964 else
965 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
966 }
967 } else {
968 /* On Ironlake and below, there is only one interpolation mode.
969 * Centroid interpolation doesn't mean anything on this hardware --
970 * there is no multisampling.
971 */
972 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
973 }
974 return emit(FS_OPCODE_LINTERP, attr,
975 this->delta_x[barycoord_mode],
976 this->delta_y[barycoord_mode], interp);
977 }
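/* FS_OPCODE_LINTERP evaluates a plane equation, roughly
 * attr = a0 + ax * delta_x + ay * delta_y, where delta_x/delta_y are the
 * barycentric deltas selected above and (a0, ax, ay) come from the
 * per-attribute setup data in interp. On hardware with PLN this lowers
 * to one instruction; otherwise to a LINE/MAC pair.
 */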
978
979 fs_reg *
980 fs_visitor::emit_general_interpolation(ir_variable *ir)
981 {
982 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
983 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
984 fs_reg attr = *reg;
985
986 unsigned int array_elements;
987 const glsl_type *type;
988
989 if (ir->type->is_array()) {
990 array_elements = ir->type->length;
991 if (array_elements == 0) {
992 fail("dereferenced array '%s' has length 0\n", ir->name);
993 }
994 type = ir->type->fields.array;
995 } else {
996 array_elements = 1;
997 type = ir->type;
998 }
999
1000 glsl_interp_qualifier interpolation_mode =
1001 ir->determine_interpolation_mode(c->key.flat_shade);
1002
1003 int location = ir->location;
1004 for (unsigned int i = 0; i < array_elements; i++) {
1005 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1006 if (urb_setup[location] == -1) {
1007 /* If there's no incoming setup data for this slot, don't
1008 * emit interpolation for it.
1009 */
1010 attr.reg_offset += type->vector_elements;
1011 location++;
1012 continue;
1013 }
1014
1015 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1016 /* Constant interpolation (flat shading) case. The SF has
1017 * handed us defined values in only the constant offset
1018 * field of the setup reg.
1019 */
1020 for (unsigned int k = 0; k < type->vector_elements; k++) {
1021 struct brw_reg interp = interp_reg(location, k);
1022 interp = suboffset(interp, 3);
1023 interp.type = reg->type;
1024 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1025 attr.reg_offset++;
1026 }
1027 } else {
1028 /* Smooth/noperspective interpolation case. */
1029 for (unsigned int k = 0; k < type->vector_elements; k++) {
1030 /* FINISHME: At some point we probably want to push
1031 * this farther by giving similar treatment to the
1032 * other potentially constant components of the
1033 * attribute, as well as making brw_vs_constval.c
1034 * handle varyings other than gl_TexCoord.
1035 */
1036 struct brw_reg interp = interp_reg(location, k);
1037 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1038 ir->centroid);
1039 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1040 /* Get the pixel/sample mask into f0 so that we know
1041 * which pixels are lit. Then, for each channel that is
1042 * unlit, replace the centroid data with non-centroid
1043 * data.
1044 */
1045 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1046 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1047 interpolation_mode, false);
1048 inst->predicate = BRW_PREDICATE_NORMAL;
1049 inst->predicate_inverse = true;
1050 }
1051 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1052 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1053 }
1054 attr.reg_offset++;
1055 }
1056
1057 }
1058 location++;
1059 }
1060 }
1061
1062 return reg;
1063 }
1064
1065 fs_reg *
1066 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1067 {
1068 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1069
1070 /* The frontfacing comes in as a bit in the thread payload. */
1071 if (brw->gen >= 6) {
1072 emit(BRW_OPCODE_ASR, *reg,
1073 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1074 fs_reg(15));
1075 emit(BRW_OPCODE_NOT, *reg, *reg);
1076 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1077 } else {
1078 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
1079 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
1080 * us front face
1081 */
1082 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1083 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1084 }
1085
1086 return reg;
1087 }
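/* Gen6 path, spelled out (illustrative): bit 15 of g0.0 is the
 * back-facing flag, so ASR by 15, NOT, then AND with 1 leaves 1 for
 * front-facing channels and 0 otherwise. The pre-gen6 path instead
 * tests the sign bit (bit 31) of g1.6, as the comment above notes.
 */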
1088
1089 fs_reg
1090 fs_visitor::fix_math_operand(fs_reg src)
1091 {
1092 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1093 * might be able to do better by doing execsize = 1 math and then
1094 * expanding that result out, but we would need to be careful with
1095 * masking.
1096 *
1097 * The hardware ignores source modifiers (negate and abs) on math
1098 * instructions, so we also move to a temp to set those up.
1099 */
1100 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1101 !src.abs && !src.negate)
1102 return src;
1103
1104 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1105 * operands to math
1106 */
1107 if (brw->gen >= 7 && src.file != IMM)
1108 return src;
1109
1110 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1111 expanded.type = src.type;
1112 emit(BRW_OPCODE_MOV, expanded, src);
1113 return expanded;
1114 }
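/* Example (illustrative; x and dst are hypothetical): on gen6,
 * emit_math(SHADER_OPCODE_POW, dst, x, fs_reg(2.0f)) cannot feed the
 * immediate straight to the math unit, so fix_math_operand() MOVs 2.0f
 * into a fresh GRF temp and the POW reads the temp instead.
 */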
1115
1116 fs_inst *
1117 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1118 {
1119 switch (opcode) {
1120 case SHADER_OPCODE_RCP:
1121 case SHADER_OPCODE_RSQ:
1122 case SHADER_OPCODE_SQRT:
1123 case SHADER_OPCODE_EXP2:
1124 case SHADER_OPCODE_LOG2:
1125 case SHADER_OPCODE_SIN:
1126 case SHADER_OPCODE_COS:
1127 break;
1128 default:
1129 assert(!"not reached: bad math opcode");
1130 return NULL;
1131 }
1132
1133 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1134 * might be able to do better by doing execsize = 1 math and then
1135 * expanding that result out, but we would need to be careful with
1136 * masking.
1137 *
1138 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1139 * instructions, so we also move to a temp to set those up.
1140 */
1141 if (brw->gen >= 6)
1142 src = fix_math_operand(src);
1143
1144 fs_inst *inst = emit(opcode, dst, src);
1145
1146 if (brw->gen < 6) {
1147 inst->base_mrf = 2;
1148 inst->mlen = dispatch_width / 8;
1149 }
1150
1151 return inst;
1152 }
1153
1154 fs_inst *
1155 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1156 {
1157 int base_mrf = 2;
1158 fs_inst *inst;
1159
1160 switch (opcode) {
1161 case SHADER_OPCODE_INT_QUOTIENT:
1162 case SHADER_OPCODE_INT_REMAINDER:
1163 if (brw->gen >= 7 && dispatch_width == 16)
1164 fail("16-wide INTDIV unsupported\n");
1165 break;
1166 case SHADER_OPCODE_POW:
1167 break;
1168 default:
1169 assert(!"not reached: unsupported binary math opcode.");
1170 return NULL;
1171 }
1172
1173 if (brw->gen >= 6) {
1174 src0 = fix_math_operand(src0);
1175 src1 = fix_math_operand(src1);
1176
1177 inst = emit(opcode, dst, src0, src1);
1178 } else {
1179 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1180 * "Message Payload":
1181 *
1182 * "Operand0[7]. For the INT DIV functions, this operand is the
1183 * denominator."
1184 * ...
1185 * "Operand1[7]. For the INT DIV functions, this operand is the
1186 * numerator."
1187 */
1188 bool is_int_div = opcode != SHADER_OPCODE_POW;
1189 fs_reg &op0 = is_int_div ? src1 : src0;
1190 fs_reg &op1 = is_int_div ? src0 : src1;
1191
1192 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1193 inst = emit(opcode, dst, op0, reg_null_f);
1194
1195 inst->base_mrf = base_mrf;
1196 inst->mlen = 2 * dispatch_width / 8;
1197 }
1198 return inst;
1199 }
1200
1201 void
1202 fs_visitor::assign_curb_setup()
1203 {
1204 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1205 if (dispatch_width == 8) {
1206 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1207 } else {
1208 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1209 }
1210
1211 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1212 foreach_list(node, &this->instructions) {
1213 fs_inst *inst = (fs_inst *)node;
1214
1215 for (unsigned int i = 0; i < 3; i++) {
1216 if (inst->src[i].file == UNIFORM) {
1217 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1218 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1219 constant_nr / 8,
1220 constant_nr % 8);
1221
1222 inst->src[i].file = HW_REG;
1223 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1224 }
1225 }
1226 }
1227 }
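/* Push-constant addressing, concretely (illustrative numbers): uniform
 * slot 11 with c->nr_payload_regs == 2 maps to brw_vec1_grf(2 + 11 / 8,
 * 11 % 8), i.e. subregister 3 of g3, since each GRF holds eight floats
 * of push constants.
 */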
1228
1229 void
1230 fs_visitor::calculate_urb_setup()
1231 {
1232 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1233 urb_setup[i] = -1;
1234 }
1235
1236 int urb_next = 0;
1237 /* Figure out where each of the incoming setup attributes lands. */
1238 if (brw->gen >= 6) {
1239 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1240 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1241 urb_setup[i] = urb_next++;
1242 }
1243 }
1244 } else {
1245 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1246 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1247 /* Point size is packed into the header, not as a general attribute */
1248 if (i == VARYING_SLOT_PSIZ)
1249 continue;
1250
1251 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1252 /* The back color slot is skipped when the front color is
1253 * also written to. In addition, some slots can be
1254 * written in the vertex shader and not read in the
1255 * fragment shader. So the register number must always be
1256 * incremented, mapped or not.
1257 */
1258 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1259 urb_setup[i] = urb_next;
1260 urb_next++;
1261 }
1262 }
1263
1264 /*
1265 * It's an FS-only attribute, and we did interpolation for this attribute
1266 * in the SF thread. So, count it here, too.
1267 *
1268 * See compile_sf_prog() for more info.
1269 */
1270 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1271 urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1272 }
1273
1274 /* Each attribute is 4 setup channels, each of which is half a reg. */
1275 c->prog_data.urb_read_length = urb_next * 2;
1276 }
1277
1278 void
1279 fs_visitor::assign_urb_setup()
1280 {
1281 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1282
1283 /* Offset all the urb_setup[] index by the actual position of the
1284 * setup regs, now that the location of the constants has been chosen.
1285 */
1286 foreach_list(node, &this->instructions) {
1287 fs_inst *inst = (fs_inst *)node;
1288
1289 if (inst->opcode == FS_OPCODE_LINTERP) {
1290 assert(inst->src[2].file == HW_REG);
1291 inst->src[2].fixed_hw_reg.nr += urb_start;
1292 }
1293
1294 if (inst->opcode == FS_OPCODE_CINTERP) {
1295 assert(inst->src[0].file == HW_REG);
1296 inst->src[0].fixed_hw_reg.nr += urb_start;
1297 }
1298 }
1299
1300 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1301 }
1302
1303 /**
1304 * Split large virtual GRFs into separate components if we can.
1305 *
1306 * This is mostly duplicated with what brw_fs_vector_splitting does,
1307 * but that's really conservative because it's afraid of doing
1308 * splitting that doesn't result in real progress after the rest of
1309 * the optimization phases, which would cause infinite looping in
1310 * optimization. We can do it once here, safely. This also has the
1311 * opportunity to split interpolated values, or maybe even uniforms,
1312 * which we don't have at the IR level.
1313 *
1314 * We want to split, because virtual GRFs are what we register
1315 * allocate and spill (due to contiguousness requirements for some
1316 * instructions), and they're what we naturally generate in the
1317 * codegen process, but most virtual GRFs don't actually need to be
1318 * contiguous sets of GRFs. If we split, we'll end up with reduced
1319 * live intervals and better dead code elimination and coalescing.
1320 */
1321 void
1322 fs_visitor::split_virtual_grfs()
1323 {
1324 int num_vars = this->virtual_grf_count;
1325 bool split_grf[num_vars];
1326 int new_virtual_grf[num_vars];
1327
1328 /* Try to split anything > 0 sized. */
1329 for (int i = 0; i < num_vars; i++) {
1330 if (this->virtual_grf_sizes[i] != 1)
1331 split_grf[i] = true;
1332 else
1333 split_grf[i] = false;
1334 }
1335
1336 if (brw->has_pln &&
1337 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1338 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1339 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1340 * Gen6, that was the only supported interpolation mode, and since Gen6,
1341 * delta_x and delta_y are in fixed hardware registers.
1342 */
1343 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1344 false;
1345 }
1346
1347 foreach_list(node, &this->instructions) {
1348 fs_inst *inst = (fs_inst *)node;
1349
1350 /* If there's a SEND message that requires contiguous destination
1351 * registers, no splitting is allowed.
1352 */
1353 if (inst->regs_written > 1) {
1354 split_grf[inst->dst.reg] = false;
1355 }
1356
1357 /* If we're sending from a GRF, don't split it, on the assumption that
1358 * the send is reading the whole thing.
1359 */
1360 if (inst->is_send_from_grf()) {
1361 split_grf[inst->src[0].reg] = false;
1362 }
1363 }
1364
1365 /* Allocate new space for split regs. Note that the virtual
1366 * numbers will be contiguous.
1367 */
1368 for (int i = 0; i < num_vars; i++) {
1369 if (split_grf[i]) {
1370 new_virtual_grf[i] = virtual_grf_alloc(1);
1371 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1372 int reg = virtual_grf_alloc(1);
1373 assert(reg == new_virtual_grf[i] + j - 1);
1374 (void) reg;
1375 }
1376 this->virtual_grf_sizes[i] = 1;
1377 }
1378 }
1379
1380 foreach_list(node, &this->instructions) {
1381 fs_inst *inst = (fs_inst *)node;
1382
1383 if (inst->dst.file == GRF &&
1384 split_grf[inst->dst.reg] &&
1385 inst->dst.reg_offset != 0) {
1386 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1387 inst->dst.reg_offset - 1);
1388 inst->dst.reg_offset = 0;
1389 }
1390 for (int i = 0; i < 3; i++) {
1391 if (inst->src[i].file == GRF &&
1392 split_grf[inst->src[i].reg] &&
1393 inst->src[i].reg_offset != 0) {
1394 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1395 inst->src[i].reg_offset - 1);
1396 inst->src[i].reg_offset = 0;
1397 }
1398 }
1399 }
1400 this->live_intervals_valid = false;
1401 }
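/* Sketch of the remapping (illustrative): a size-3 VGRF v becomes three
 * size-1 VGRFs. reg_offset 0 stays in v itself, while offsets 1 and 2
 * move to new_virtual_grf[v] + 0 and + 1 with reg_offset reset to 0, so
 * each piece gets an independent live interval.
 */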
1402
1403 /**
1404 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1405 *
1406 * During code generation, we create tons of temporary variables, many of
1407 * which get immediately killed and are never used again. Yet, in later
1408 * optimization and analysis passes, such as compute_live_intervals, we need
1409 * to loop over all the virtual GRFs. Compacting them can save a lot of
1410 * overhead.
1411 */
1412 void
1413 fs_visitor::compact_virtual_grfs()
1414 {
1415 /* Mark which virtual GRFs are used, and count how many. */
1416 int remap_table[this->virtual_grf_count];
1417 memset(remap_table, -1, sizeof(remap_table));
1418
1419 foreach_list(node, &this->instructions) {
1420 const fs_inst *inst = (const fs_inst *) node;
1421
1422 if (inst->dst.file == GRF)
1423 remap_table[inst->dst.reg] = 0;
1424
1425 for (int i = 0; i < 3; i++) {
1426 if (inst->src[i].file == GRF)
1427 remap_table[inst->src[i].reg] = 0;
1428 }
1429 }
1430
1431 /* In addition to registers used in instructions, fs_visitor keeps
1432 * direct references to certain special values which must be patched:
1433 */
1434 fs_reg *special[] = {
1435 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1436 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1437 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1438 &delta_x[0], &delta_x[1], &delta_x[2],
1439 &delta_x[3], &delta_x[4], &delta_x[5],
1440 &delta_y[0], &delta_y[1], &delta_y[2],
1441 &delta_y[3], &delta_y[4], &delta_y[5],
1442 };
1443 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1444 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1445
1446 /* Treat all special values as used, to be conservative */
1447 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1448 if (special[i]->file == GRF)
1449 remap_table[special[i]->reg] = 0;
1450 }
1451
1452 /* Compact the GRF arrays. */
1453 int new_index = 0;
1454 for (int i = 0; i < this->virtual_grf_count; i++) {
1455 if (remap_table[i] != -1) {
1456 remap_table[i] = new_index;
1457 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1458 if (live_intervals_valid) {
1459 virtual_grf_start[new_index] = virtual_grf_start[i];
1460 virtual_grf_end[new_index] = virtual_grf_end[i];
1461 }
1462 ++new_index;
1463 }
1464 }
1465
1466 this->virtual_grf_count = new_index;
1467
1468 /* Patch all the instructions to use the newly renumbered registers */
1469 foreach_list(node, &this->instructions) {
1470 fs_inst *inst = (fs_inst *) node;
1471
1472 if (inst->dst.file == GRF)
1473 inst->dst.reg = remap_table[inst->dst.reg];
1474
1475 for (int i = 0; i < 3; i++) {
1476 if (inst->src[i].file == GRF)
1477 inst->src[i].reg = remap_table[inst->src[i].reg];
1478 }
1479 }
1480
1481 /* Patch all the references to special values */
1482 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1483 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1484 special[i]->reg = remap_table[special[i]->reg];
1485 }
1486 }
1487
1488 bool
1489 fs_visitor::remove_dead_constants()
1490 {
1491 if (dispatch_width == 8) {
1492 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1493 this->nr_params_remap = c->prog_data.nr_params;
1494
1495 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1496 this->params_remap[i] = -1;
1497
1498 /* Find which params are still in use. */
1499 foreach_list(node, &this->instructions) {
1500 fs_inst *inst = (fs_inst *)node;
1501
1502 for (int i = 0; i < 3; i++) {
1503 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1504
1505 if (inst->src[i].file != UNIFORM)
1506 continue;
1507
1508 /* Section 5.11 of the OpenGL 4.3 spec says:
1509 *
1510 * "Out-of-bounds reads return undefined values, which include
1511 * values from other variables of the active program or zero."
1512 */
1513 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1514 constant_nr = 0;
1515 }
1516
1517 /* For now, set this to non-negative. We'll give it the
1518 * actual new number in a moment, in order to keep the
1519 * register numbers nicely ordered.
1520 */
1521 this->params_remap[constant_nr] = 0;
1522 }
1523 }
1524
1525 /* Figure out what the new numbers for the params will be. At some
1526 * point when we're doing uniform array access, we're going to want
1527 * to keep the distinction between .reg and .reg_offset, but for
1528 * now we don't care.
1529 */
1530 unsigned int new_nr_params = 0;
1531 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1532 if (this->params_remap[i] != -1) {
1533 this->params_remap[i] = new_nr_params++;
1534 }
1535 }
1536
1537 /* Update the list of params to be uploaded to match our new numbering. */
1538 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1539 int remapped = this->params_remap[i];
1540
1541 if (remapped == -1)
1542 continue;
1543
1544 c->prog_data.param[remapped] = c->prog_data.param[i];
1545 }
1546
1547 c->prog_data.nr_params = new_nr_params;
1548 } else {
1549 /* This should have been generated in the 8-wide pass already. */
1550 assert(this->params_remap);
1551 }
1552
1553 /* Now do the renumbering of the shader to remove unused params. */
1554 foreach_list(node, &this->instructions) {
1555 fs_inst *inst = (fs_inst *)node;
1556
1557 for (int i = 0; i < 3; i++) {
1558 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1559
1560 if (inst->src[i].file != UNIFORM)
1561 continue;
1562
1563 /* As above, alias out-of-bounds accesses to constant 0. */
1564 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1565 constant_nr = 0;
1566 }
1567 assert(this->params_remap[constant_nr] != -1);
1568 inst->src[i].reg = this->params_remap[constant_nr];
1569 inst->src[i].reg_offset = 0;
1570 }
1571 }
1572
1573 return true;
1574 }
1575
1576 /*
1577 * Implements array access of uniforms by inserting a
1578 * PULL_CONSTANT_LOAD instruction.
1579 *
1580 * Unlike temporary GRF array access (where we don't support it due to
1581 * the difficulty of doing relative addressing on instruction
1582 * destinations), we could potentially do array access of uniforms
1583 * that were loaded in GRF space as push constants. In real-world
1584 * usage we've seen, though, the arrays being used are always larger
1585 * than we could load as push constants, so just always move all
1586 * uniform array access out to a pull constant buffer.
1587 */
1588 void
1589 fs_visitor::move_uniform_array_access_to_pull_constants()
1590 {
1591 int pull_constant_loc[c->prog_data.nr_params];
1592
1593 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1594 pull_constant_loc[i] = -1;
1595 }
1596
1597 /* Walk through and find array access of uniforms. Put a copy of that
1598 * uniform in the pull constant buffer.
1599 *
1600 * Note that we don't move constant-indexed accesses to arrays. No
1601 * testing has been done of the performance impact of this choice.
1602 */
1603 foreach_list_safe(node, &this->instructions) {
1604 fs_inst *inst = (fs_inst *)node;
1605
1606 for (int i = 0 ; i < 3; i++) {
1607 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1608 continue;
1609
1610 int uniform = inst->src[i].reg;
1611
1612 /* If this array isn't already present in the pull constant buffer,
1613 * add it.
1614 */
1615 if (pull_constant_loc[uniform] == -1) {
1616 const float **values = &c->prog_data.param[uniform];
1617
1618 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1619
1620 assert(param_size[uniform]);
1621
1622 for (int j = 0; j < param_size[uniform]; j++) {
1623 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1624 values[j];
1625 }
1626 }
1627
1628 /* Set up the annotation tracking for new generated instructions. */
1629 base_ir = inst->ir;
1630 current_annotation = inst->annotation;
1631
1632 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1633 fs_reg temp = fs_reg(this, glsl_type::float_type);
1634 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1635 surf_index,
1636 *inst->src[i].reladdr,
1637 pull_constant_loc[uniform] +
1638 inst->src[i].reg_offset);
1639 inst->insert_before(&list);
1640
1641 inst->src[i].file = temp.file;
1642 inst->src[i].reg = temp.reg;
1643 inst->src[i].reg_offset = temp.reg_offset;
1644 inst->src[i].reladdr = NULL;
1645 }
1646 }
1647 }
1648
1649 /**
1650 * Choose accesses from the UNIFORM file to demote to using the pull
1651 * constant buffer.
1652 *
1653 * We allow a fragment shader to have more than the specified minimum
1654 * maximum number of fragment shader uniform components (64). If
1655 * there are too many of these, they'd fill up all of register space.
1656 * So, this will push some of them out to the pull constant buffer and
1657 * update the program to load them.
1658 */
1659 void
1660 fs_visitor::setup_pull_constants()
1661 {
1662 /* Only allow 16 registers (128 uniform components) as push constants. */
1663 unsigned int max_uniform_components = 16 * 8;
1664 if (c->prog_data.nr_params <= max_uniform_components)
1665 return;
1666
1667 if (dispatch_width == 16) {
1668 fail("Pull constants not supported in 16-wide\n");
1669 return;
1670 }
1671
1672 /* Just demote the end of the list. We could probably do better
1673 * here, demoting things that are rarely used in the program first.
1674 */
1675 unsigned int pull_uniform_base = max_uniform_components;
1676
1677 int pull_constant_loc[c->prog_data.nr_params];
1678 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1679 if (i < pull_uniform_base) {
1680 pull_constant_loc[i] = -1;
1681 } else {
1682 pull_constant_loc[i] = -1;
1683 /* If our constant is already being uploaded for reladdr purposes,
1684 * reuse it.
1685 */
1686 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1687 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1688 pull_constant_loc[i] = j;
1689 break;
1690 }
1691 }
1692 if (pull_constant_loc[i] == -1) {
1693 int pull_index = c->prog_data.nr_pull_params++;
1694 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1695 pull_constant_loc[i] = pull_index;
1696 }
1697 }
1698 }
1699 c->prog_data.nr_params = pull_uniform_base;
1700
1701 foreach_list(node, &this->instructions) {
1702 fs_inst *inst = (fs_inst *)node;
1703
1704 for (int i = 0; i < 3; i++) {
1705 if (inst->src[i].file != UNIFORM)
1706 continue;
1707
1708 int pull_index = pull_constant_loc[inst->src[i].reg +
1709 inst->src[i].reg_offset];
1710 if (pull_index == -1)
1711 continue;
1712
1713 assert(!inst->src[i].reladdr);
1714
1715 fs_reg dst = fs_reg(this, glsl_type::float_type);
1716 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1717 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1718 fs_inst *pull =
1719 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1720 dst, index, offset);
1721 pull->ir = inst->ir;
1722 pull->annotation = inst->annotation;
1723
1724 inst->insert_before(pull);
1725
1726 inst->src[i].file = GRF;
1727 inst->src[i].reg = dst.reg;
1728 inst->src[i].reg_offset = 0;
1729 inst->src[i].smear = pull_index & 3;
1730 }
1731 }
1732 }
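/* Addressing sketch (illustrative): pull_index 13 yields byte offset
 * (13 * 4) & ~15 == 48, the fourth 16-byte-aligned vec4 in the constant
 * buffer, and smear == 13 & 3 == 1 then picks the second component out
 * of the loaded vec4.
 */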
1733
1734 bool
1735 fs_visitor::opt_algebraic()
1736 {
1737 bool progress = false;
1738
1739 foreach_list(node, &this->instructions) {
1740 fs_inst *inst = (fs_inst *)node;
1741
1742 switch (inst->opcode) {
1743 case BRW_OPCODE_MUL:
1744 if (inst->src[1].file != IMM)
1745 continue;
1746
1747 /* a * 1.0 = a */
1748 if (inst->src[1].is_one()) {
1749 inst->opcode = BRW_OPCODE_MOV;
1750 inst->src[1] = reg_undef;
1751 progress = true;
1752 break;
1753 }
1754
1755 /* a * 0.0 = 0.0 */
1756 if (inst->src[1].is_zero()) {
1757 inst->opcode = BRW_OPCODE_MOV;
1758 inst->src[0] = inst->src[1];
1759 inst->src[1] = reg_undef;
1760 progress = true;
1761 break;
1762 }
1763
1764 break;
1765 case BRW_OPCODE_ADD:
1766 if (inst->src[1].file != IMM)
1767 continue;
1768
1769 /* a + 0.0 = a */
1770 if (inst->src[1].is_zero()) {
1771 inst->opcode = BRW_OPCODE_MOV;
1772 inst->src[1] = reg_undef;
1773 progress = true;
1774 break;
1775 }
1776 break;
1777 default:
1778 break;
1779 }
1780 }
1781
1782 return progress;
1783 }
1784
1785 /**
1786 * Removes any instructions writing a VGRF where that VGRF is not used by any
1787 * later instruction.
1788 */
1789 bool
1790 fs_visitor::dead_code_eliminate()
1791 {
1792 bool progress = false;
1793 int pc = 0;
1794
1795 calculate_live_intervals();
1796
1797 foreach_list_safe(node, &this->instructions) {
1798 fs_inst *inst = (fs_inst *)node;
1799
1800 if (inst->dst.file == GRF) {
1801 assert(this->virtual_grf_end[inst->dst.reg] >= pc);
1802 if (this->virtual_grf_end[inst->dst.reg] == pc) {
1803 inst->remove();
1804 progress = true;
1805 }
1806 }
1807
1808 pc++;
1809 }
1810
1811 if (progress)
1812 live_intervals_valid = false;
1813
1814 return progress;
1815 }
1816
1817 struct dead_code_hash_key
1818 {
1819 int vgrf;
1820 int reg_offset;
1821 };
1822
1823 static bool
1824 dead_code_hash_compare(const void *a, const void *b)
1825 {
1826 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
1827 }
1828
1829 static void
1830 clear_dead_code_hash(struct hash_table *ht)
1831 {
1832 struct hash_entry *entry;
1833
1834 hash_table_foreach(ht, entry) {
1835 _mesa_hash_table_remove(ht, entry);
1836 }
1837 }
1838
1839 static void
1840 insert_dead_code_hash(struct hash_table *ht,
1841 int vgrf, int reg_offset, fs_inst *inst)
1842 {
1843 /* We don't bother freeing keys, because they'll be GCed with the ht. */
1844 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
1845
1846 key->vgrf = vgrf;
1847 key->reg_offset = reg_offset;
1848
1849 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
1850 }
1851
1852 static struct hash_entry *
1853 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
1854 {
1855 struct dead_code_hash_key key;
1856
1857 key.vgrf = vgrf;
1858 key.reg_offset = reg_offset;
1859
1860 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
1861 }
1862
1863 static void
1864 remove_dead_code_hash(struct hash_table *ht,
1865 int vgrf, int reg_offset)
1866 {
1867 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
1868 if (!entry)
1869 return;
1870
1871 _mesa_hash_table_remove(ht, entry);
1872 }
1873
1874 /**
1875 * Walks basic blocks, removing any regs that are written but not read before
1876 * being redefined.
1877 *
1878 * The dead_code_eliminate() function implements a global dead code
1879 * elimination, but it only handles the removing the last write to a register
1880 * if it's never read. This one can handle intermediate writes, but only
1881 * within a basic block.
1882 */
1883 bool
1884 fs_visitor::dead_code_eliminate_local()
1885 {
1886 struct hash_table *ht;
1887 bool progress = false;
1888
1889 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
1890
1891 foreach_list_safe(node, &this->instructions) {
1892 fs_inst *inst = (fs_inst *)node;
1893
1894 /* At a basic block, empty the HT since we don't understand dataflow
1895 * here.
1896 */
1897 if (inst->is_control_flow()) {
1898 clear_dead_code_hash(ht);
1899 continue;
1900 }
1901
1902 /* Clear the HT of any instructions that got read. */
1903 for (int i = 0; i < 3; i++) {
1904 fs_reg src = inst->src[i];
1905 if (src.file != GRF)
1906 continue;
1907
1908 int read = 1;
1909 if (inst->is_send_from_grf())
1910 read = virtual_grf_sizes[src.reg] - src.reg_offset;
1911
1912 for (int reg_offset = src.reg_offset;
1913 reg_offset < src.reg_offset + read;
1914 reg_offset++) {
1915 remove_dead_code_hash(ht, src.reg, reg_offset);
1916 }
1917 }
1918
1919 /* Add any update of a GRF to the HT, removing a previous write if it
1920 * wasn't read.
1921 */
1922 if (inst->dst.file == GRF) {
1923 if (inst->regs_written > 1) {
1924 /* We don't know how to trim channels from an instruction's
1925 * writes, so we can't incrementally remove unread channels from
1926 * it. Just remove whatever it overwrites from the table
1927 */
1928 for (int i = 0; i < inst->regs_written; i++) {
1929 remove_dead_code_hash(ht,
1930 inst->dst.reg,
1931 inst->dst.reg_offset + i);
1932 }
1933 } else {
1934 struct hash_entry *entry =
1935 get_dead_code_hash_entry(ht, inst->dst.reg,
1936 inst->dst.reg_offset);
1937
1938 if (inst->is_partial_write()) {
1939 /* For a partial write, we can't remove any previous dead code
1940 * candidate, since we're just modifying its result, but we can
1941 * be dead code eliminated ourselves.
1942 */
1943 if (entry) {
1944 entry->data = inst;
1945 } else {
1946 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1947 inst);
1948 }
1949 } else {
1950 if (entry) {
1951 /* We're completely updating a channel, and there was a
1952 * previous write to the channel that wasn't read. Kill it!
1953 */
1954 fs_inst *inst = (fs_inst *)entry->data;
1955 inst->remove();
1956 progress = true;
1957 _mesa_hash_table_remove(ht, entry);
1958 }
1959
1960 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
1961 inst);
1962 }
1963 }
1964 }
1965 }
1966
1967 _mesa_hash_table_destroy(ht, NULL);
1968
1969 if (progress)
1970 live_intervals_valid = false;
1971
1972 return progress;
1973 }
1974
1975 /**
1976 * Implements a second type of register coalescing: This one checks if
1977 * the two regs involved in a raw move don't interfere, in which case
1978 * they can both be stored in the same place and the MOV removed.
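*
* A sketch (hypothetical vgrf numbers), assuming vgrf1's live range ends
* at the MOV:
*
*    add vgrf1, vgrf3, vgrf4        add vgrf2, vgrf3, vgrf4
*    mov vgrf2, vgrf1          ->   (MOV removed)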
1979 */
1980 bool
1981 fs_visitor::register_coalesce_2()
1982 {
1983 bool progress = false;
1984
1985 calculate_live_intervals();
1986
1987 foreach_list_safe(node, &this->instructions) {
1988 fs_inst *inst = (fs_inst *)node;
1989
1990 if (inst->opcode != BRW_OPCODE_MOV ||
1991 inst->is_partial_write() ||
1992 inst->saturate ||
1993 inst->src[0].file != GRF ||
1994 inst->src[0].negate ||
1995 inst->src[0].abs ||
1996 inst->src[0].smear != -1 ||
1997 inst->dst.file != GRF ||
1998 inst->dst.type != inst->src[0].type ||
1999 virtual_grf_sizes[inst->src[0].reg] != 1 ||
2000 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
2001 continue;
2002 }
2003
2004 int reg_from = inst->src[0].reg;
2005 assert(inst->src[0].reg_offset == 0);
2006 int reg_to = inst->dst.reg;
2007 int reg_to_offset = inst->dst.reg_offset;
2008
2009 foreach_list(node, &this->instructions) {
2010 fs_inst *scan_inst = (fs_inst *)node;
2011
2012 if (scan_inst->dst.file == GRF &&
2013 scan_inst->dst.reg == reg_from) {
2014 scan_inst->dst.reg = reg_to;
2015 scan_inst->dst.reg_offset = reg_to_offset;
2016 }
2017 for (int i = 0; i < 3; i++) {
2018 if (scan_inst->src[i].file == GRF &&
2019 scan_inst->src[i].reg == reg_from) {
2020 scan_inst->src[i].reg = reg_to;
2021 scan_inst->src[i].reg_offset = reg_to_offset;
2022 }
2023 }
2024 }
2025
2026 inst->remove();
2027
2028 /* We don't need to recalculate live intervals inside the loop despite
2029 * flagging live_intervals_valid because we only use live intervals for
2030 * the interferes test, and we must have had a situation where the
2031 * intervals were:
2032 *
2033 * from      to
2034 *  ^
2035 *  |
2036 *  v
2037 *            ^
2038 *            |
2039 *            v
2040 *
2041 * Some register R that might get coalesced with one of these two could
2042 * only be referencing "to", otherwise "from"'s range would have been
2043 * longer. R's range could also only start at the end of "to" or later,
2044 * otherwise it would conflict with "to" when we try to coalesce "to"
2045 * into R anyway.
2046 */
2047 live_intervals_valid = false;
2048
2049 progress = true;
2050 continue;
2051 }
2052
2053 return progress;
2054 }
2055
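/**
 * The original register coalescing pass: for a raw MOV out of a GRF or
 * uniform where neither register is overwritten again before the end of
 * the program, rewrites later reads of the destination to read the source
 * directly and removes the MOV. A sketch (hypothetical vgrf numbers):
 *
 *    mov vgrf2, vgrf1
 *    mul vgrf3, vgrf2, vgrf4   ->   mul vgrf3, vgrf1, vgrf4
 */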
2056 bool
2057 fs_visitor::register_coalesce()
2058 {
2059 bool progress = false;
2060 int if_depth = 0;
2061 int loop_depth = 0;
2062
2063 foreach_list_safe(node, &this->instructions) {
2064 fs_inst *inst = (fs_inst *)node;
2065
2066 /* Make sure that we dominate the instructions we're going to
2067 * scan for interfering with our coalescing, or we won't have
2068 * scanned enough to see whether anything interferes. We don't
2069 * dominate the following instructions if
2070 * we're in a loop or an if block.
2071 */
2072 switch (inst->opcode) {
2073 case BRW_OPCODE_DO:
2074 loop_depth++;
2075 break;
2076 case BRW_OPCODE_WHILE:
2077 loop_depth--;
2078 break;
2079 case BRW_OPCODE_IF:
2080 if_depth++;
2081 break;
2082 case BRW_OPCODE_ENDIF:
2083 if_depth--;
2084 break;
2085 default:
2086 break;
2087 }
2088 if (loop_depth || if_depth)
2089 continue;
2090
2091 if (inst->opcode != BRW_OPCODE_MOV ||
2092 inst->is_partial_write() ||
2093 inst->saturate ||
2094 inst->dst.file != GRF || (inst->src[0].file != GRF &&
2095 inst->src[0].file != UNIFORM) ||
2096 inst->dst.type != inst->src[0].type)
2097 continue;
2098
2099 bool has_source_modifiers = (inst->src[0].abs ||
2100 inst->src[0].negate ||
2101 inst->src[0].smear != -1 ||
2102 inst->src[0].file == UNIFORM);
2103
2104 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
2105 * them: check for no writes to either one until the exit of the
2106 * program.
2107 */
2108 bool interfered = false;
2109
2110 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2111 !scan_inst->is_tail_sentinel();
2112 scan_inst = (fs_inst *)scan_inst->next) {
2113 if (scan_inst->dst.file == GRF) {
2114 if (scan_inst->overwrites_reg(inst->dst) ||
2115 scan_inst->overwrites_reg(inst->src[0])) {
2116 interfered = true;
2117 break;
2118 }
2119 }
2120
2121 if (has_source_modifiers) {
2122 for (int i = 0; i < 3; i++) {
2123 if (scan_inst->src[i].file == GRF &&
2124 scan_inst->src[i].reg == inst->dst.reg &&
2125 scan_inst->src[i].reg_offset == inst->dst.reg_offset &&
2126 inst->dst.type != scan_inst->src[i].type)
2127 {
2128 interfered = true;
2129 break;
2130 }
2131 }
2132 }
2133
2134
2135 /* The gen6 MATH instruction can't handle source modifiers or
2136 * unusual register regions, so avoid coalescing those for
2137 * now. We should do something more specific.
2138 */
2139 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
2140 interfered = true;
2141 break;
2142 }
2143
2144 /* The accumulator result appears to get used for the
2145 * conditional modifier generation. When negating a UD
2146 * value, there is a 33rd bit generated for the sign in the
2147 * accumulator value, so now you can't check, for example,
2148 * equality with a 32-bit value. See piglit fs-op-neg-uint.
2149 */
2150 if (scan_inst->conditional_mod &&
2151 inst->src[0].negate &&
2152 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
2153 interfered = true;
2154 break;
2155 }
2156 }
2157 if (interfered) {
2158 continue;
2159 }
2160
2161 /* Rewrite the later usage to point at the source of the move to
2162 * be removed.
2163 */
2164 for (fs_inst *scan_inst = inst;
2165 !scan_inst->is_tail_sentinel();
2166 scan_inst = (fs_inst *)scan_inst->next) {
2167 for (int i = 0; i < 3; i++) {
2168 if (scan_inst->src[i].file == GRF &&
2169 scan_inst->src[i].reg == inst->dst.reg &&
2170 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
2171 fs_reg new_src = inst->src[0];
2172 if (scan_inst->src[i].abs) {
2173 new_src.negate = 0;
2174 new_src.abs = 1;
2175 }
2176 new_src.negate ^= scan_inst->src[i].negate;
2177 scan_inst->src[i] = new_src;
2178 }
2179 }
2180 }
2181
2182 inst->remove();
2183 progress = true;
2184 }
2185
2186 if (progress)
2187 live_intervals_valid = false;
2188
2189 return progress;
2190 }
2191
2192
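/**
 * Tries to rewrite the computation generating a GRF that is only MOVed
 * into an MRF so that it writes the MRF directly, eliminating the MOV.
 * A sketch (hypothetical register numbers):
 *
 *    add vgrf1, vgrf2, vgrf3        add m4, vgrf2, vgrf3
 *    mov m4, vgrf1             ->   (MOV removed)
 */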
2193 bool
2194 fs_visitor::compute_to_mrf()
2195 {
2196 bool progress = false;
2197 int next_ip = 0;
2198
2199 calculate_live_intervals();
2200
2201 foreach_list_safe(node, &this->instructions) {
2202 fs_inst *inst = (fs_inst *)node;
2203
2204 int ip = next_ip;
2205 next_ip++;
2206
2207 if (inst->opcode != BRW_OPCODE_MOV ||
2208 inst->is_partial_write() ||
2209 inst->dst.file != MRF || inst->src[0].file != GRF ||
2210 inst->dst.type != inst->src[0].type ||
2211 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2212 continue;
2213
2214 /* Work out which hardware MRF registers are written by this
2215 * instruction.
2216 */
2217 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2218 int mrf_high;
2219 if (inst->dst.reg & BRW_MRF_COMPR4) {
2220 mrf_high = mrf_low + 4;
2221 } else if (dispatch_width == 16 &&
2222 (!inst->force_uncompressed && !inst->force_sechalf)) {
2223 mrf_high = mrf_low + 1;
2224 } else {
2225 mrf_high = mrf_low;
2226 }
2227
2228 /* Can't compute-to-MRF this GRF if someone else was going to
2229 * read it later.
2230 */
2231 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2232 continue;
2233
2234 /* Found a move of a GRF to a MRF. Let's see if we can go
2235 * rewrite the thing that made this GRF to write into the MRF.
2236 */
2237 fs_inst *scan_inst;
2238 for (scan_inst = (fs_inst *)inst->prev;
2239 scan_inst->prev != NULL;
2240 scan_inst = (fs_inst *)scan_inst->prev) {
2241 if (scan_inst->dst.file == GRF &&
2242 scan_inst->dst.reg == inst->src[0].reg) {
2243 /* Found the last thing to write our reg we want to turn
2244 * into a compute-to-MRF.
2245 */
2246
2247 /* If this one instruction didn't populate all the
2248 * channels, bail. We might be able to rewrite everything
2249 * that writes that reg, but it would require smarter
2250 * tracking to delay the rewriting until complete success.
2251 */
2252 if (scan_inst->is_partial_write())
2253 break;
2254
2255 /* Things returning more than one register would need us to
2256 * understand coalescing out more than one MOV at a time.
2257 */
2258 if (scan_inst->regs_written > 1)
2259 break;
2260
2261 /* SEND instructions can't have MRF as a destination. */
2262 if (scan_inst->mlen)
2263 break;
2264
2265 if (brw->gen == 6) {
2266 /* gen6 math instructions must have the destination be
2267 * GRF, so no compute-to-MRF for them.
2268 */
2269 if (scan_inst->is_math()) {
2270 break;
2271 }
2272 }
2273
2274 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2275 /* Found the creator of our MRF's source value. */
2276 scan_inst->dst.file = MRF;
2277 scan_inst->dst.reg = inst->dst.reg;
2278 scan_inst->saturate |= inst->saturate;
2279 inst->remove();
2280 progress = true;
2281 }
2282 break;
2283 }
2284
2285 /* We don't handle control flow here. Most computation of
2286 * values that end up in MRFs happens shortly before the MRF
2287 * write anyway.
2288 */
2289 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2290 break;
2291
2292 /* You can't read from an MRF, so if someone else reads our
2293 * MRF's source GRF that we wanted to rewrite, that stops us.
2294 */
2295 bool interfered = false;
2296 for (int i = 0; i < 3; i++) {
2297 if (scan_inst->src[i].file == GRF &&
2298 scan_inst->src[i].reg == inst->src[0].reg &&
2299 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2300 interfered = true;
2301 }
2302 }
2303 if (interfered)
2304 break;
2305
2306 if (scan_inst->dst.file == MRF) {
2307 /* If somebody else writes our MRF here, we can't
2308 * compute-to-MRF before that.
2309 */
2310 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2311 int scan_mrf_high;
2312
2313 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2314 scan_mrf_high = scan_mrf_low + 4;
2315 } else if (dispatch_width == 16 &&
2316 (!scan_inst->force_uncompressed &&
2317 !scan_inst->force_sechalf)) {
2318 scan_mrf_high = scan_mrf_low + 1;
2319 } else {
2320 scan_mrf_high = scan_mrf_low;
2321 }
2322
2323 if (mrf_low == scan_mrf_low ||
2324 mrf_low == scan_mrf_high ||
2325 mrf_high == scan_mrf_low ||
2326 mrf_high == scan_mrf_high) {
2327 break;
2328 }
2329 }
2330
2331 if (scan_inst->mlen > 0) {
2332 /* Found a SEND instruction, which means that there are
2333 * live values in MRFs from base_mrf to base_mrf +
2334 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2335 * above it.
2336 */
2337 if (mrf_low >= scan_inst->base_mrf &&
2338 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2339 break;
2340 }
2341 if (mrf_high >= scan_inst->base_mrf &&
2342 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2343 break;
2344 }
2345 }
2346 }
2347 }
2348
2349 if (progress)
2350 live_intervals_valid = false;
2351
2352 return progress;
2353 }
2354
2355 /**
2356 * Walks through basic blocks, looking for repeated MRF writes and
2357 * removing the later ones.
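*
* For example (hypothetical registers), when nothing between the two MOVs
* touches m3 or vgrf5:
*
*    mov m3, vgrf5
*    ...
*    mov m3, vgrf5   <- removed as redundant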
2358 */
2359 bool
2360 fs_visitor::remove_duplicate_mrf_writes()
2361 {
2362 fs_inst *last_mrf_move[16];
2363 bool progress = false;
2364
2365 /* Need to update the MRF tracking for compressed instructions. */
2366 if (dispatch_width == 16)
2367 return false;
2368
2369 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2370
2371 foreach_list_safe(node, &this->instructions) {
2372 fs_inst *inst = (fs_inst *)node;
2373
2374 if (inst->is_control_flow()) {
2375 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2376 }
2377
2378 if (inst->opcode == BRW_OPCODE_MOV &&
2379 inst->dst.file == MRF) {
2380 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2381 if (prev_inst && inst->equals(prev_inst)) {
2382 inst->remove();
2383 progress = true;
2384 continue;
2385 }
2386 }
2387
2388 /* Clear out the last-write records for MRFs that were overwritten. */
2389 if (inst->dst.file == MRF) {
2390 last_mrf_move[inst->dst.reg] = NULL;
2391 }
2392
2393 if (inst->mlen > 0) {
2394 /* Found a SEND instruction, which will include two or fewer
2395 * implied MRF writes. We could do better here.
2396 */
2397 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2398 last_mrf_move[inst->base_mrf + i] = NULL;
2399 }
2400 }
2401
2402 /* Clear out any MRF move records whose sources got overwritten. */
2403 if (inst->dst.file == GRF) {
2404 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2405 if (last_mrf_move[i] &&
2406 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2407 last_mrf_move[i] = NULL;
2408 }
2409 }
2410 }
2411
2412 if (inst->opcode == BRW_OPCODE_MOV &&
2413 inst->dst.file == MRF &&
2414 inst->src[0].file == GRF &&
2415 !inst->is_partial_write()) {
2416 last_mrf_move[inst->dst.reg] = inst;
2417 }
2418 }
2419
2420 if (progress)
2421 live_intervals_valid = false;
2422
2423 return progress;
2424 }
2425
2426 static void
2427 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2428 int first_grf, int grf_len)
2429 {
2430 bool inst_16wide = (dispatch_width > 8 &&
2431 !inst->force_uncompressed &&
2432 !inst->force_sechalf);
2433
2434 /* Clear the flag for registers that actually got read (as expected). */
2435 for (int i = 0; i < 3; i++) {
2436 int grf;
2437 if (inst->src[i].file == GRF) {
2438 grf = inst->src[i].reg;
2439 } else if (inst->src[i].file == HW_REG &&
2440 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2441 grf = inst->src[i].fixed_hw_reg.nr;
2442 } else {
2443 continue;
2444 }
2445
2446 if (grf >= first_grf &&
2447 grf < first_grf + grf_len) {
2448 deps[grf - first_grf] = false;
2449 if (inst_16wide)
2450 deps[grf - first_grf + 1] = false;
2451 }
2452 }
2453 }
2454
2455 /**
2456 * Implements this workaround for the original 965:
2457 *
2458 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2459 * check for post destination dependencies on this instruction, software
2460 * must ensure that there is no destination hazard for the case of ‘write
2461 * followed by a posted write’ shown in the following example.
2462 *
2463 * 1. mov r3 0
2464 * 2. send r3.xy <rest of send instruction>
2465 * 3. mov r2 r3
2466 *
2467 * Due to no post-destination dependency check on the ‘send’, the above
2468 * code sequence could have two instructions (1 and 2) in flight at the
2469 * same time that both consider ‘r3’ as the target of their final writes."
2470 */
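/* The backwards walk below resolves the hazard by inserting a
 * dependency-resolving MOV (DEP_RESOLVE_MOV) touching each still-hazardous
 * GRF right before our instruction, so the earlier posted write must
 * retire first -- in the example above, a read of r3 would land between
 * instructions 1 and 2.
 */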
2471 void
2472 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2473 {
2474 int reg_size = dispatch_width / 8;
2475 int write_len = inst->regs_written * reg_size;
2476 int first_write_grf = inst->dst.reg;
2477 bool needs_dep[BRW_MAX_MRF];
2478 assert(write_len < (int)sizeof(needs_dep) - 1);
2479
2480 memset(needs_dep, false, sizeof(needs_dep));
2481 memset(needs_dep, true, write_len);
2482
2483 clear_deps_for_inst_src(inst, dispatch_width,
2484 needs_dep, first_write_grf, write_len);
2485
2486 /* Walk backwards looking for writes to registers we're writing which
2487 * aren't read since being written. If we hit the start of the program,
2488 * we assume that there are no outstanding dependencies on entry to the
2489 * program.
2490 */
2491 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2492 scan_inst != NULL;
2493 scan_inst = (fs_inst *)scan_inst->prev) {
2494
2495 /* If we hit control flow, assume that there *are* outstanding
2496 * dependencies, and force their cleanup before our instruction.
2497 */
2498 if (scan_inst->is_control_flow()) {
2499 for (int i = 0; i < write_len; i++) {
2500 if (needs_dep[i]) {
2501 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2502 }
2503 }
2504 return;
2505 }
2506
2507 bool scan_inst_16wide = (dispatch_width > 8 &&
2508 !scan_inst->force_uncompressed &&
2509 !scan_inst->force_sechalf);
2510
2511 /* We insert our reads as late as possible, on the assumption that any
2512 * instruction other than a MOV that might have left us an outstanding
2513 * dependency has more latency than a MOV.
2514 */
2515 if (scan_inst->dst.file == GRF) {
2516 for (int i = 0; i < scan_inst->regs_written; i++) {
2517 int reg = scan_inst->dst.reg + i * reg_size;
2518
2519 if (reg >= first_write_grf &&
2520 reg < first_write_grf + write_len &&
2521 needs_dep[reg - first_write_grf]) {
2522 inst->insert_before(DEP_RESOLVE_MOV(reg));
2523 needs_dep[reg - first_write_grf] = false;
2524 if (scan_inst_16wide)
2525 needs_dep[reg - first_write_grf + 1] = false;
2526 }
2527 }
2528 }
2529
2530 /* Clear the flag for registers that actually got read (as expected). */
2531 clear_deps_for_inst_src(scan_inst, dispatch_width,
2532 needs_dep, first_write_grf, write_len);
2533
2534 /* Continue the loop only if we haven't resolved all the dependencies */
2535 int i;
2536 for (i = 0; i < write_len; i++) {
2537 if (needs_dep[i])
2538 break;
2539 }
2540 if (i == write_len)
2541 return;
2542 }
2543 }
2544
2545 /**
2546 * Implements this workaround for the original 965:
2547 *
2548 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2549 * used as a destination register until after it has been sourced by an
2550 * instruction with a different destination register."
2551 */
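/* Here the walk goes forwards from the send: any later instruction that
 * would overwrite part of the send's destination without it having been
 * sourced first gets a resolving read (DEP_RESOLVE_MOV) inserted ahead of
 * it. E.g. (hypothetical register numbers): "send g8 ..." followed by
 * "mov g8 ..." gains a read of g8 in between.
 */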
2552 void
2553 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2554 {
2555 int write_len = inst->regs_written * dispatch_width / 8;
2556 int first_write_grf = inst->dst.reg;
2557 bool needs_dep[BRW_MAX_MRF];
2558 assert(write_len < (int)sizeof(needs_dep) - 1);
2559
2560 memset(needs_dep, false, sizeof(needs_dep));
2561 memset(needs_dep, true, write_len);
2562 /* Walk forwards looking for writes to registers we're writing which aren't
2563 * read before being written.
2564 */
2565 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2566 !scan_inst->is_tail_sentinel();
2567 scan_inst = (fs_inst *)scan_inst->next) {
2568 /* If we hit control flow, force resolve all remaining dependencies. */
2569 if (scan_inst->is_control_flow()) {
2570 for (int i = 0; i < write_len; i++) {
2571 if (needs_dep[i])
2572 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2573 }
2574 return;
2575 }
2576
2577 /* Clear the flag for registers that actually got read (as expected). */
2578 clear_deps_for_inst_src(scan_inst, dispatch_width,
2579 needs_dep, first_write_grf, write_len);
2580
2581 /* We insert our reads as late as possible since they're reading the
2582 * result of a SEND, which has massive latency.
2583 */
2584 if (scan_inst->dst.file == GRF &&
2585 scan_inst->dst.reg >= first_write_grf &&
2586 scan_inst->dst.reg < first_write_grf + write_len &&
2587 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2588 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2589 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2590 }
2591
2592 /* Continue the loop only if we haven't resolved all the dependencies */
2593 int i;
2594 for (i = 0; i < write_len; i++) {
2595 if (needs_dep[i])
2596 break;
2597 }
2598 if (i == write_len)
2599 return;
2600 }
2601
2602 /* If we hit the end of the program, resolve all remaining dependencies out
2603 * of paranoia.
2604 */
2605 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2606 assert(last_inst->eot);
2607 for (int i = 0; i < write_len; i++) {
2608 if (needs_dep[i])
2609 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2610 }
2611 }
2612
2613 void
2614 fs_visitor::insert_gen4_send_dependency_workarounds()
2615 {
2616 if (brw->gen != 4 || brw->is_g4x)
2617 return;
2618
2619 /* Note that we're done with register allocation, so GRF fs_regs always
2620 * have a .reg_offset of 0.
2621 */
2622
2623 foreach_list_safe(node, &this->instructions) {
2624 fs_inst *inst = (fs_inst *)node;
2625
2626 if (inst->mlen != 0 && inst->dst.file == GRF) {
2627 insert_gen4_pre_send_dependency_workarounds(inst);
2628 insert_gen4_post_send_dependency_workarounds(inst);
2629 }
2630 }
2631 }
2632
2633 /**
2634 * Turns the generic expression-style uniform pull constant load instruction
2635 * into a hardware-specific series of instructions for loading a pull
2636 * constant.
2637 *
2638 * The expression style allows the CSE pass before this to optimize out
2639 * repeated loads from the same offset, and gives the pre-register-allocation
2640 * scheduling full flexibility, while the conversion to native instructions
2641 * allows the post-register-allocation scheduler the best information
2642 * possible.
2643 *
2644 * Note that execution masking for setting up pull constant loads is special:
2645 * the channels that need to be written are unrelated to the current execution
2646 * mask, since a later instruction will use one of the result channels as a
2647 * source operand for all 8 or 16 of its channels.
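*
* For example (hypothetical offsets), on gen7 a load at vec4-aligned byte
* offset 32 becomes an FS_OPCODE_SET_SIMD4X2_OFFSET of dword offset 8
* feeding an FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7; on older gens the
* opcode is kept and just assigned an MRF for its message payload.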
2648 */
2649 void
2650 fs_visitor::lower_uniform_pull_constant_loads()
2651 {
2652 foreach_list(node, &this->instructions) {
2653 fs_inst *inst = (fs_inst *)node;
2654
2655 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2656 continue;
2657
2658 if (brw->gen >= 7) {
2659 /* Until now, the offset arg has been a vec4-aligned byte offset. We
2660 * need to turn it into a dword offset.
2661 */
2662 fs_reg const_offset_reg = inst->src[1];
2663 assert(const_offset_reg.file == IMM &&
2664 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2665 const_offset_reg.imm.u /= 4;
2666 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2667
2668 /* This is actually going to be a MOV, but since only the first dword
2669 * is accessed, we have a special opcode to do just that one. Note
2670 * that this needs to be an operation that will be considered a def
2671 * by live variable analysis, or register allocation will explode.
2672 */
2673 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2674 payload, const_offset_reg);
2675 setup->force_writemask_all = true;
2676
2677 setup->ir = inst->ir;
2678 setup->annotation = inst->annotation;
2679 inst->insert_before(setup);
2680
2681 /* Similarly, this will only populate the first 4 channels of the
2682 * result register (since we only use smear values from 0-3), but we
2683 * don't tell the optimizer.
2684 */
2685 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2686 inst->src[1] = payload;
2687
2688 this->live_intervals_valid = false;
2689 } else {
2690 /* Before register allocation, we didn't tell the scheduler about the
2691 * MRF we use. We know it's safe to use this MRF because nothing
2692 * else does except for register spill/unspill, which generates and
2693 * uses its MRF within a single IR instruction.
2694 */
2695 inst->base_mrf = 14;
2696 inst->mlen = 1;
2697 }
2698 }
2699 }
2700
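/* Prints a single IR instruction in a compact one-line form. A
 * representative line (hypothetical values):
 *
 *    (+f0.1) add.sat vgrf3+1, -vgrf1, |vgrf2|, (null)
 *
 * predicate first, then the opcode with .sat/.cmod decorations, then the
 * destination followed by the three sources.
 */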
2701 void
2702 fs_visitor::dump_instruction(backend_instruction *be_inst)
2703 {
2704 fs_inst *inst = (fs_inst *)be_inst;
2705
2706 if (inst->predicate) {
2707 printf("(%cf0.%d) ",
2708 inst->predicate_inverse ? '-' : '+',
2709 inst->flag_subreg);
2710 }
2711
2712 printf("%s", brw_instruction_name(inst->opcode));
2713 if (inst->saturate)
2714 printf(".sat");
2715 if (inst->conditional_mod) {
2716 printf(".cmod");
2717 if (!inst->predicate &&
2718 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2719 inst->opcode != BRW_OPCODE_IF &&
2720 inst->opcode != BRW_OPCODE_WHILE))) {
2721 printf(".f0.%d", inst->flag_subreg);
2722 }
2723 }
2724 printf(" ");
2725
2726
2727 switch (inst->dst.file) {
2728 case GRF:
2729 printf("vgrf%d", inst->dst.reg);
2730 if (inst->dst.reg_offset)
2731 printf("+%d", inst->dst.reg_offset);
2732 break;
2733 case MRF:
2734 printf("m%d", inst->dst.reg);
2735 break;
2736 case BAD_FILE:
2737 printf("(null)");
2738 break;
2739 case UNIFORM:
2740 printf("***u%d***", inst->dst.reg);
2741 break;
2742 case ARF:
2743 if (inst->dst.reg == BRW_ARF_NULL)
2744 printf("(null)");
2745 else
2746 printf("arf%d", inst->dst.reg);
2747 break;
2748 default:
2749 printf("???");
2750 break;
2751 }
2752 printf(", ");
2753
2754 for (int i = 0; i < 3; i++) {
2755 if (inst->src[i].negate)
2756 printf("-");
2757 if (inst->src[i].abs)
2758 printf("|");
2759 switch (inst->src[i].file) {
2760 case GRF:
2761 printf("vgrf%d", inst->src[i].reg);
2762 if (inst->src[i].reg_offset)
2763 printf("+%d", inst->src[i].reg_offset);
2764 break;
2765 case MRF:
2766 printf("***m%d***", inst->src[i].reg);
2767 break;
2768 case UNIFORM:
2769 printf("u%d", inst->src[i].reg);
2770 if (inst->src[i].reg_offset)
2771 printf(".%d", inst->src[i].reg_offset);
2772 break;
2773 case BAD_FILE:
2774 printf("(null)");
2775 break;
2776 case IMM:
2777 switch (inst->src[i].type) {
2778 case BRW_REGISTER_TYPE_F:
2779 printf("%ff", inst->src[i].imm.f);
2780 break;
2781 case BRW_REGISTER_TYPE_D:
2782 printf("%dd", inst->src[i].imm.i);
2783 break;
2784 case BRW_REGISTER_TYPE_UD:
2785 printf("%uu", inst->src[i].imm.u);
2786 break;
2787 default:
2788 printf("???");
2789 break;
2790 }
2791 break;
2792 default:
2793 printf("???");
2794 break;
2795 }
2796 if (inst->src[i].abs)
2797 printf("|");
2798
2799 if (i < 3)
2800 printf(", ");
2801 }
2802
2803 printf(" ");
2804
2805 if (inst->force_uncompressed)
2806 printf("1sthalf ");
2807
2808 if (inst->force_sechalf)
2809 printf("2ndhalf ");
2810
2811 printf("\n");
2812 }
2813
2814 /**
2815 * Possibly returns an instruction that set up @param reg.
2816 *
2817 * Sometimes we want to take the result of some expression/variable
2818 * dereference tree and rewrite the instruction generating the result
2819 * of the tree. When processing the tree, we know that the
2820 * instructions generated are all writing temporaries that are dead
2821 * outside of this tree. So, if we have some instructions that write
2822 * a temporary, we're free to point that temp write somewhere else.
2823 *
2824 * Note that this doesn't guarantee that the returned instruction wrote
2825 * only reg -- it might be the size=4 destination of a texture instruction.
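*
* A typical (hypothetical) use: after emitting "add temp, a, b" for an
* expression tree, passing temp here returns the ADD so that its
* destination can be repointed, saving a MOV.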
2826 */
2827 fs_inst *
2828 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2829 fs_inst *end,
2830 fs_reg reg)
2831 {
2832 if (end == start ||
2833 end->is_partial_write() ||
2834 reg.reladdr ||
2835 !reg.equals(end->dst)) {
2836 return NULL;
2837 } else {
2838 return end;
2839 }
2840 }
2841
2842 void
2843 fs_visitor::setup_payload_gen6()
2844 {
2845 bool uses_depth =
2846 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2847 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2848
2849 assert(brw->gen >= 6);
2850
2851 /* R0-1: masks, pixel X/Y coordinates. */
2852 c->nr_payload_regs = 2;
2853 /* R2: only for 32-pixel dispatch. */
2854
2855 /* R3-26: barycentric interpolation coordinates. These appear in the
2856 * same order that they appear in the brw_wm_barycentric_interp_mode
2857 * enum. Each set of coordinates occupies 2 registers if dispatch width
2858 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2859 * appear if they were enabled using the "Barycentric Interpolation
2860 * Mode" bits in WM_STATE.
2861 */
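/* A worked example (hypothetical): at dispatch_width == 16 with two
 * barycentric modes enabled, the first set occupies payload regs 2-5 and
 * the second regs 6-9, leaving nr_payload_regs == 10 before any source
 * depth/W registers are added.
 */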
2862 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2863 if (barycentric_interp_modes & (1 << i)) {
2864 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2865 c->nr_payload_regs += 2;
2866 if (dispatch_width == 16) {
2867 c->nr_payload_regs += 2;
2868 }
2869 }
2870 }
2871
2872 /* R27: interpolated depth if the shader uses source depth. */
2873 if (uses_depth) {
2874 c->source_depth_reg = c->nr_payload_regs;
2875 c->nr_payload_regs++;
2876 if (dispatch_width == 16) {
2877 /* R28: interpolated depth if not 8-wide. */
2878 c->nr_payload_regs++;
2879 }
2880 }
2881 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2882 if (uses_depth) {
2883 c->source_w_reg = c->nr_payload_regs;
2884 c->nr_payload_regs++;
2885 if (dispatch_width == 16) {
2886 /* R30: interpolated W if not 8-wide. */
2887 c->nr_payload_regs++;
2888 }
2889 }
2890 /* R31: MSAA position offsets. */
2891 /* R32-: bary for 32-pixel. */
2892 /* R58-59: interp W for 32-pixel. */
2893
2894 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2895 c->source_depth_to_render_target = true;
2896 }
2897 }
2898
2899 bool
2900 fs_visitor::run()
2901 {
2902 sanity_param_count = fp->Base.Parameters->NumParameters;
2903 uint32_t orig_nr_params = c->prog_data.nr_params;
2904
2905 if (brw->gen >= 6)
2906 setup_payload_gen6();
2907 else
2908 setup_payload_gen4();
2909
2910 if (0) {
2911 emit_dummy_fs();
2912 } else {
2913 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
2914 emit_shader_time_begin();
2915
2916 calculate_urb_setup();
2917 if (brw->gen < 6)
2918 emit_interpolation_setup_gen4();
2919 else
2920 emit_interpolation_setup_gen6();
2921
2922 /* We handle discards by keeping track of the still-live pixels in f0.1.
2923 * Initialize it with the dispatched pixels.
2924 */
2925 if (fp->UsesKill) {
2926 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
2927 discard_init->flag_subreg = 1;
2928 }
2929
2930 /* Generate FS IR for main(). (the visitor only descends into
2931 * functions called "main").
2932 */
2933 if (shader) {
2934 foreach_list(node, &*shader->ir) {
2935 ir_instruction *ir = (ir_instruction *)node;
2936 base_ir = ir;
2937 this->result = reg_undef;
2938 ir->accept(this);
2939 }
2940 } else {
2941 emit_fragment_program_code();
2942 }
2943 base_ir = NULL;
2944 if (failed)
2945 return false;
2946
2947 emit(FS_OPCODE_PLACEHOLDER_HALT);
2948
2949 emit_fb_writes();
2950
2951 split_virtual_grfs();
2952
2953 move_uniform_array_access_to_pull_constants();
2954 setup_pull_constants();
2955
2956 bool progress;
2957 do {
2958 progress = false;
2959
2960 compact_virtual_grfs();
2961
2962 progress = remove_duplicate_mrf_writes() || progress;
2963
2964 progress = opt_algebraic() || progress;
2965 progress = opt_cse() || progress;
2966 progress = opt_copy_propagate() || progress;
2967 progress = dead_code_eliminate() || progress;
2968 progress = dead_code_eliminate_local() || progress;
2969 progress = register_coalesce() || progress;
2970 progress = register_coalesce_2() || progress;
2971 progress = compute_to_mrf() || progress;
2972 } while (progress);
2973
2974 remove_dead_constants();
2975
2976 schedule_instructions(false);
2977
2978 lower_uniform_pull_constant_loads();
2979
2980 assign_curb_setup();
2981 assign_urb_setup();
2982
2983 if (0) {
2984 /* Debug of register spilling: Go spill everything. */
2985 for (int i = 0; i < virtual_grf_count; i++) {
2986 spill_reg(i);
2987 }
2988 }
2989
2990 if (0)
2991 assign_regs_trivial();
2992 else {
2993 while (!assign_regs()) {
2994 if (failed)
2995 break;
2996 }
2997 }
2998 }
2999 assert(force_uncompressed_stack == 0);
3000 assert(force_sechalf_stack == 0);
3001
3002 /* This must come after all optimization and register allocation, since
3003 * it inserts dead code that happens to have side effects, and it does
3004 * so based on the actual physical registers in use.
3005 */
3006 insert_gen4_send_dependency_workarounds();
3007
3008 if (failed)
3009 return false;
3010
3011 schedule_instructions(true);
3012
3013 if (dispatch_width == 8) {
3014 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3015 } else {
3016 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3017
3018 /* Make sure we didn't try to sneak in an extra uniform */
3019 assert(orig_nr_params == c->prog_data.nr_params);
3020 (void) orig_nr_params;
3021 }
3022
3023 /* If any state parameters were appended, then ParameterValues could have
3024 * been realloced, in which case the driver uniform storage set up by
3025 * _mesa_associate_uniform_storage() would point to freed memory. Make
3026 * sure that didn't happen.
3027 */
3028 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3029
3030 return !failed;
3031 }
3032
3033 const unsigned *
3034 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3035 struct gl_fragment_program *fp,
3036 struct gl_shader_program *prog,
3037 unsigned *final_assembly_size)
3038 {
3039 bool start_busy = false;
3040 float start_time = 0;
3041
3042 if (unlikely(brw->perf_debug)) {
3043 start_busy = (brw->batch.last_bo &&
3044 drm_intel_bo_busy(brw->batch.last_bo));
3045 start_time = get_time();
3046 }
3047
3048 struct brw_shader *shader = NULL;
3049 if (prog)
3050 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3051
3052 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3053 if (prog) {
3054 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3055 _mesa_print_ir(shader->ir, NULL);
3056 printf("\n\n");
3057 } else {
3058 printf("ARB_fragment_program %d ir for native fragment shader\n",
3059 fp->Base.Id);
3060 _mesa_print_program(&fp->Base);
3061 }
3062 }
3063
3064 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3065 */
3066 fs_visitor v(brw, c, prog, fp, 8);
3067 if (!v.run()) {
3068 if (prog) {
3069 prog->LinkStatus = false;
3070 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3071 }
3072
3073 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3074 v.fail_msg);
3075
3076 return NULL;
3077 }
3078
3079 exec_list *simd16_instructions = NULL;
3080 fs_visitor v2(brw, c, prog, fp, 16);
3081 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3082 if (c->prog_data.nr_pull_params == 0) {
3083 /* Try a 16-wide compile */
3084 v2.import_uniforms(&v);
3085 if (!v2.run()) {
3086 perf_debug("16-wide shader failed to compile, falling back to "
3087 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3088 } else {
3089 simd16_instructions = &v2.instructions;
3090 }
3091 } else {
3092 perf_debug("Skipping 16-wide due to pull parameters.\n");
3093 }
3094 }
3095
3096 c->prog_data.dispatch_width = 8;
3097
3098 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3099 const unsigned *generated = g.generate_assembly(&v.instructions,
3100 simd16_instructions,
3101 final_assembly_size);
3102
3103 if (unlikely(brw->perf_debug) && shader) {
3104 if (shader->compiled_once)
3105 brw_wm_debug_recompile(brw, prog, &c->key);
3106 shader->compiled_once = true;
3107
3108 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3109 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3110 (get_time() - start_time) * 1000);
3111 }
3112 }
3113
3114 return generated;
3115 }
3116
3117 bool
3118 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3119 {
3120 struct brw_context *brw = brw_context(ctx);
3121 struct brw_wm_prog_key key;
3122
3123 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3124 return true;
3125
3126 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3127 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3128 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3129 bool program_uses_dfdy = fp->UsesDFdy;
3130
3131 memset(&key, 0, sizeof(key));
3132
3133 if (brw->gen < 6) {
3134 if (fp->UsesKill)
3135 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3136
3137 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3138 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3139
3140 /* Just assume depth testing. */
3141 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3142 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3143 }
3144
3145 if (brw->gen < 6)
3146 key.input_slots_valid |= BITFIELD64_BIT(VARYING_SLOT_POS);
3147
3148 for (int i = 0; i < VARYING_SLOT_MAX; i++) {
3149 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
3150 continue;
3151
3152 if (brw->gen < 6) {
3153 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
3154 key.input_slots_valid |= BITFIELD64_BIT(i);
3155 }
3156 }
3157
3158 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3159
3160 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3161 for (unsigned i = 0; i < sampler_count; i++) {
3162 if (fp->Base.ShadowSamplers & (1 << i)) {
3163 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3164 key.tex.swizzles[i] =
3165 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3166 } else {
3167 /* Color sampler: assume no swizzling. */
3168 key.tex.swizzles[i] = SWIZZLE_XYZW;
3169 }
3170 }
3171
3172 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3173 key.drawable_height = ctx->DrawBuffer->Height;
3174 }
3175
3176 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3177 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3178 }
3179
3180 key.nr_color_regions = 1;
3181
3182 key.program_string_id = bfp->id;
3183
3184 uint32_t old_prog_offset = brw->wm.prog_offset;
3185 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3186
3187 bool success = do_wm_prog(brw, prog, bfp, &key);
3188
3189 brw->wm.prog_offset = old_prog_offset;
3190 brw->wm.prog_data = old_prog_data;
3191
3192 return success;
3193 }