i965: Don't print extra (null) arguments in dump_instruction().
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/hash_table.h"
36 #include "main/macros.h"
37 #include "main/shaderobj.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "brw_dead_control_flow.h"
50 #include "main/uniforms.h"
51 #include "brw_fs_live_variables.h"
52 #include "glsl/glsl_types.h"
53
54 void
55 fs_inst::init()
56 {
57 memset(this, 0, sizeof(*this));
58 this->opcode = BRW_OPCODE_NOP;
59 this->conditional_mod = BRW_CONDITIONAL_NONE;
60
61 this->dst = reg_undef;
62 this->src[0] = reg_undef;
63 this->src[1] = reg_undef;
64 this->src[2] = reg_undef;
65
66 /* This will be the case for almost all instructions. */
67 this->regs_written = 1;
68 }
69
70 fs_inst::fs_inst()
71 {
72 init();
73 }
74
75 fs_inst::fs_inst(enum opcode opcode)
76 {
77 init();
78 this->opcode = opcode;
79 }
80
81 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
82 {
83 init();
84 this->opcode = opcode;
85 this->dst = dst;
86
87 if (dst.file == GRF)
88 assert(dst.reg_offset >= 0);
89 }
90
91 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
92 {
93 init();
94 this->opcode = opcode;
95 this->dst = dst;
96 this->src[0] = src0;
97
98 if (dst.file == GRF)
99 assert(dst.reg_offset >= 0);
100 if (src[0].file == GRF)
101 assert(src[0].reg_offset >= 0);
102 }
103
104 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
105 {
106 init();
107 this->opcode = opcode;
108 this->dst = dst;
109 this->src[0] = src0;
110 this->src[1] = src1;
111
112 if (dst.file == GRF)
113 assert(dst.reg_offset >= 0);
114 if (src[0].file == GRF)
115 assert(src[0].reg_offset >= 0);
116 if (src[1].file == GRF)
117 assert(src[1].reg_offset >= 0);
118 }
119
120 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
121 fs_reg src0, fs_reg src1, fs_reg src2)
122 {
123 init();
124 this->opcode = opcode;
125 this->dst = dst;
126 this->src[0] = src0;
127 this->src[1] = src1;
128 this->src[2] = src2;
129
130 if (dst.file == GRF)
131 assert(dst.reg_offset >= 0);
132 if (src[0].file == GRF)
133 assert(src[0].reg_offset >= 0);
134 if (src[1].file == GRF)
135 assert(src[1].reg_offset >= 0);
136 if (src[2].file == GRF)
137 assert(src[2].reg_offset >= 0);
138 }
139
140 #define ALU1(op) \
141 fs_inst * \
142 fs_visitor::op(fs_reg dst, fs_reg src0) \
143 { \
144 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
145 }
146
147 #define ALU2(op) \
148 fs_inst * \
149 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
150 { \
151 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
152 }
153
154 #define ALU3(op) \
155 fs_inst * \
156 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1, fs_reg src2) \
157 { \
158 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1, src2);\
159 }
160
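/* Each ALUn(op) use below defines a small emit helper on fs_visitor; for
 * example, ALU2(ADD) expands to roughly:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */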
161 ALU1(NOT)
162 ALU1(MOV)
163 ALU1(FRC)
164 ALU1(RNDD)
165 ALU1(RNDE)
166 ALU1(RNDZ)
167 ALU2(ADD)
168 ALU2(MUL)
169 ALU2(MACH)
170 ALU2(AND)
171 ALU2(OR)
172 ALU2(XOR)
173 ALU2(SHL)
174 ALU2(SHR)
175 ALU2(ASR)
176 ALU3(LRP)
177 ALU1(BFREV)
178 ALU3(BFE)
179 ALU2(BFI1)
180 ALU3(BFI2)
181 ALU1(FBH)
182 ALU1(FBL)
183 ALU1(CBIT)
184 ALU3(MAD)
185 ALU2(ADDC)
186 ALU2(SUBB)
187
188 /** Gen4 predicated IF. */
189 fs_inst *
190 fs_visitor::IF(uint32_t predicate)
191 {
192 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
193 inst->predicate = predicate;
194 return inst;
195 }
196
197 /** Gen6 IF with embedded comparison. */
198 fs_inst *
199 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
200 {
201 assert(brw->gen == 6);
202 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
203 reg_null_d, src0, src1);
204 inst->conditional_mod = condition;
205 return inst;
206 }
207
208 /**
209 * CMP: Sets the low bit of the destination channels with the result
210 * of the comparison, while the upper bits are undefined, and updates
211 * the flag register with the packed 16 bits of the result.
212 */
213 fs_inst *
214 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
215 {
216 fs_inst *inst;
217
218 /* Take the instruction:
219 *
220 * CMP null<d> src0<f> src1<f>
221 *
222 * Original gen4 does type conversion to the destination type before
223 * comparison, producing garbage results for floating point comparisons.
224 * gen5 does the comparison on the execution type (resolved source types),
225 * so dst type doesn't matter. gen6 does comparison and then uses the
226 * result as if it was the dst type with no conversion, which happens to
227 * mostly work out for float-interpreted-as-int since our comparisons are
228 * for >0, =0, <0.
229 */
230 if (brw->gen == 4) {
231 dst.type = src0.type;
232 if (dst.file == HW_REG)
233 dst.fixed_hw_reg.type = dst.type;
234 }
235
236 resolve_ud_negate(&src0);
237 resolve_ud_negate(&src1);
238
239 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
240 inst->conditional_mod = condition;
241
242 return inst;
243 }
244
245 exec_list
246 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
247 fs_reg varying_offset,
248 uint32_t const_offset)
249 {
250 exec_list instructions;
251 fs_inst *inst;
252
253 /* We have our constant surface use a pitch of 4 bytes, so our index can
254 * be any component of a vector, and then we load 4 contiguous
255 * components starting from that.
256 *
257 * We break down the const_offset to a portion added to the variable
258 * offset and a portion done using reg_offset, which means that if you
259 * have GLSL using something like "uniform vec4 a[20]; gl_FragColor =
260 * a[i]", we'll temporarily generate 4 vec4 loads from offset i * 4, and
261 * CSE can later notice that those loads are all the same and eliminate
262 * the redundant ones.
263 */
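   /* For example, const_offset == 6 makes the load below fetch from
    * vec4_offset = varying_offset + 4, and the final MOV then reads
    * component (6 & 3) == 2 of the returned vec4 (times `scale` on the
    * gen4 SIMD16-message path).
    */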
264 fs_reg vec4_offset = fs_reg(this, glsl_type::int_type);
265 instructions.push_tail(ADD(vec4_offset,
266 varying_offset, const_offset & ~3));
267
268 int scale = 1;
269 if (brw->gen == 4 && dispatch_width == 8) {
270 /* Pre-gen5, we can either use a SIMD8 message that requires (header,
271 * u, v, r) as parameters, or we can just use the SIMD16 message
272 * consisting of (header, u). We choose the second, at the cost of a
273 * longer return length.
274 */
275 scale = 2;
276 }
277
278 enum opcode op;
279 if (brw->gen >= 7)
280 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
281 else
282 op = FS_OPCODE_VARYING_PULL_CONSTANT_LOAD;
283 fs_reg vec4_result = fs_reg(GRF, virtual_grf_alloc(4 * scale), dst.type);
284 inst = new(mem_ctx) fs_inst(op, vec4_result, surf_index, vec4_offset);
285 inst->regs_written = 4 * scale;
286 instructions.push_tail(inst);
287
288 if (brw->gen < 7) {
289 inst->base_mrf = 13;
290 inst->header_present = true;
291 if (brw->gen == 4)
292 inst->mlen = 3;
293 else
294 inst->mlen = 1 + dispatch_width / 8;
295 }
296
297 vec4_result.reg_offset += (const_offset & 3) * scale;
298 instructions.push_tail(MOV(dst, vec4_result));
299
300 return instructions;
301 }
302
303 /**
304 * A helper for MOV generation for fixing up broken hardware SEND dependency
305 * handling.
306 */
307 fs_inst *
308 fs_visitor::DEP_RESOLVE_MOV(int grf)
309 {
310 fs_inst *inst = MOV(brw_null_reg(), fs_reg(GRF, grf, BRW_REGISTER_TYPE_F));
311
312 inst->ir = NULL;
313 inst->annotation = "send dependency resolve";
314
315 /* The caller always wants uncompressed to emit the minimal extra
316 * dependencies, and to avoid having to deal with aligning its regs to 2.
317 */
318 inst->force_uncompressed = true;
319
320 return inst;
321 }
322
323 bool
324 fs_inst::equals(fs_inst *inst)
325 {
326 return (opcode == inst->opcode &&
327 dst.equals(inst->dst) &&
328 src[0].equals(inst->src[0]) &&
329 src[1].equals(inst->src[1]) &&
330 src[2].equals(inst->src[2]) &&
331 saturate == inst->saturate &&
332 predicate == inst->predicate &&
333 conditional_mod == inst->conditional_mod &&
334 mlen == inst->mlen &&
335 base_mrf == inst->base_mrf &&
336 sampler == inst->sampler &&
337 target == inst->target &&
338 eot == inst->eot &&
339 header_present == inst->header_present &&
340 shadow_compare == inst->shadow_compare &&
341 offset == inst->offset);
342 }
343
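/**
 * Returns true if this instruction's destination covers the given register,
 * i.e. the register falls within [dst.reg_offset, dst.reg_offset +
 * regs_written) of the same file and register number.
 */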
344 bool
345 fs_inst::overwrites_reg(const fs_reg &reg)
346 {
347 return (reg.file == dst.file &&
348 reg.reg == dst.reg &&
349 reg.reg_offset >= dst.reg_offset &&
350 reg.reg_offset < dst.reg_offset + regs_written);
351 }
352
353 bool
354 fs_inst::is_send_from_grf()
355 {
356 return (opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7 ||
357 opcode == SHADER_OPCODE_SHADER_TIME_ADD ||
358 (opcode == FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD &&
359 src[1].file == GRF) ||
360 (is_tex() && src[0].file == GRF));
361 }
362
363 bool
364 fs_visitor::can_do_source_mods(fs_inst *inst)
365 {
366 if (brw->gen == 6 && inst->is_math())
367 return false;
368
369 if (inst->is_send_from_grf())
370 return false;
371
372 if (!inst->can_do_source_mods())
373 return false;
374
375 return true;
376 }
377
378 void
379 fs_reg::init()
380 {
381 memset(this, 0, sizeof(*this));
382 this->smear = -1;
383 }
384
385 /** Generic unset register constructor. */
386 fs_reg::fs_reg()
387 {
388 init();
389 this->file = BAD_FILE;
390 }
391
392 /** Immediate value constructor. */
393 fs_reg::fs_reg(float f)
394 {
395 init();
396 this->file = IMM;
397 this->type = BRW_REGISTER_TYPE_F;
398 this->imm.f = f;
399 }
400
401 /** Immediate value constructor. */
402 fs_reg::fs_reg(int32_t i)
403 {
404 init();
405 this->file = IMM;
406 this->type = BRW_REGISTER_TYPE_D;
407 this->imm.i = i;
408 }
409
410 /** Immediate value constructor. */
411 fs_reg::fs_reg(uint32_t u)
412 {
413 init();
414 this->file = IMM;
415 this->type = BRW_REGISTER_TYPE_UD;
416 this->imm.u = u;
417 }
418
419 /** Fixed brw_reg. */
420 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
421 {
422 init();
423 this->file = HW_REG;
424 this->fixed_hw_reg = fixed_hw_reg;
425 this->type = fixed_hw_reg.type;
426 }
427
428 bool
429 fs_reg::equals(const fs_reg &r) const
430 {
431 return (file == r.file &&
432 reg == r.reg &&
433 reg_offset == r.reg_offset &&
434 type == r.type &&
435 negate == r.negate &&
436 abs == r.abs &&
437 !reladdr && !r.reladdr &&
438 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
439 sizeof(fixed_hw_reg)) == 0 &&
440 smear == r.smear &&
441 imm.u == r.imm.u);
442 }
443
444 fs_reg
445 fs_reg::retype(uint32_t type)
446 {
447 fs_reg result = *this;
448 result.type = type;
449 return result;
450 }
451
452 bool
453 fs_reg::is_zero() const
454 {
455 if (file != IMM)
456 return false;
457
458 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
459 }
460
461 bool
462 fs_reg::is_one() const
463 {
464 if (file != IMM)
465 return false;
466
467 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
468 }
469
470 bool
471 fs_reg::is_null() const
472 {
473 return file == HW_REG &&
474 fixed_hw_reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
475 fixed_hw_reg.nr == BRW_ARF_NULL;
476 }
477
478 bool
479 fs_reg::is_valid_3src() const
480 {
481 return file == GRF || file == UNIFORM;
482 }
483
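/**
 * Returns the size of a GLSL type in scalar components: e.g. a float is 1,
 * a vec4 is 4, a mat4 is 16 and a vec4[3] array is 12. Samplers and atomic
 * counters take no register space.
 */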
484 int
485 fs_visitor::type_size(const struct glsl_type *type)
486 {
487 unsigned int size, i;
488
489 switch (type->base_type) {
490 case GLSL_TYPE_UINT:
491 case GLSL_TYPE_INT:
492 case GLSL_TYPE_FLOAT:
493 case GLSL_TYPE_BOOL:
494 return type->components();
495 case GLSL_TYPE_ARRAY:
496 return type_size(type->fields.array) * type->length;
497 case GLSL_TYPE_STRUCT:
498 size = 0;
499 for (i = 0; i < type->length; i++) {
500 size += type_size(type->fields.structure[i].type);
501 }
502 return size;
503 case GLSL_TYPE_SAMPLER:
504 /* Samplers take up no register space, since they're baked in at
505 * link time.
506 */
507 return 0;
508 case GLSL_TYPE_ATOMIC_UINT:
509 return 0;
510 case GLSL_TYPE_VOID:
511 case GLSL_TYPE_ERROR:
512 case GLSL_TYPE_INTERFACE:
513 assert(!"not reached");
514 break;
515 }
516
517 return 0;
518 }
519
520 fs_reg
521 fs_visitor::get_timestamp()
522 {
523 assert(brw->gen >= 7);
524
525 fs_reg ts = fs_reg(retype(brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
526 BRW_ARF_TIMESTAMP,
527 0),
528 BRW_REGISTER_TYPE_UD));
529
530 fs_reg dst = fs_reg(this, glsl_type::uint_type);
531
532 fs_inst *mov = emit(MOV(dst, ts));
533 /* We want to read the 3 fields we care about (mostly field 0, but also 2)
534 * even if it's not enabled in the dispatch.
535 */
536 mov->force_writemask_all = true;
537 mov->force_uncompressed = true;
538
539 /* The caller wants the low 32 bits of the timestamp. Since it's running
 540     * at the GPU clock rate of ~1.2 GHz, it will roll over every ~3 seconds,
541 * which is plenty of time for our purposes. It is identical across the
542 * EUs, but since it's tracking GPU core speed it will increment at a
543 * varying rate as render P-states change.
544 *
545 * The caller could also check if render P-states have changed (or anything
546 * else that might disrupt timing) by setting smear to 2 and checking if
547 * that field is != 0.
548 */
549 dst.smear = 0;
550
551 return dst;
552 }
553
554 void
555 fs_visitor::emit_shader_time_begin()
556 {
557 current_annotation = "shader time start";
558 shader_start_time = get_timestamp();
559 }
560
561 void
562 fs_visitor::emit_shader_time_end()
563 {
564 current_annotation = "shader time end";
565
566 enum shader_time_shader_type type, written_type, reset_type;
567 if (dispatch_width == 8) {
568 type = ST_FS8;
569 written_type = ST_FS8_WRITTEN;
570 reset_type = ST_FS8_RESET;
571 } else {
572 assert(dispatch_width == 16);
573 type = ST_FS16;
574 written_type = ST_FS16_WRITTEN;
575 reset_type = ST_FS16_RESET;
576 }
577
578 fs_reg shader_end_time = get_timestamp();
579
580 /* Check that there weren't any timestamp reset events (assuming these
581 * were the only two timestamp reads that happened).
582 */
583 fs_reg reset = shader_end_time;
584 reset.smear = 2;
585 fs_inst *test = emit(AND(reg_null_d, reset, fs_reg(1u)));
586 test->conditional_mod = BRW_CONDITIONAL_Z;
587 emit(IF(BRW_PREDICATE_NORMAL));
588
589 push_force_uncompressed();
590 fs_reg start = shader_start_time;
591 start.negate = true;
592 fs_reg diff = fs_reg(this, glsl_type::uint_type);
593 emit(ADD(diff, start, shader_end_time));
594
595 /* If there were no instructions between the two timestamp gets, the diff
596 * is 2 cycles. Remove that overhead, so I can forget about that when
597 * trying to determine the time taken for single instructions.
598 */
599 emit(ADD(diff, diff, fs_reg(-2u)));
600
601 emit_shader_time_write(type, diff);
602 emit_shader_time_write(written_type, fs_reg(1u));
603 emit(BRW_OPCODE_ELSE);
604 emit_shader_time_write(reset_type, fs_reg(1u));
605 emit(BRW_OPCODE_ENDIF);
606
607 pop_force_uncompressed();
608 }
609
610 void
611 fs_visitor::emit_shader_time_write(enum shader_time_shader_type type,
612 fs_reg value)
613 {
614 int shader_time_index =
615 brw_get_shader_time_index(brw, shader_prog, &fp->Base, type);
616 fs_reg offset = fs_reg(shader_time_index * SHADER_TIME_STRIDE);
617
618 fs_reg payload;
619 if (dispatch_width == 8)
620 payload = fs_reg(this, glsl_type::uvec2_type);
621 else
622 payload = fs_reg(this, glsl_type::uint_type);
623
624 emit(fs_inst(SHADER_OPCODE_SHADER_TIME_ADD,
625 fs_reg(), payload, offset, value));
626 }
627
628 void
629 fs_visitor::fail(const char *format, ...)
630 {
631 va_list va;
632 char *msg;
633
634 if (failed)
635 return;
636
637 failed = true;
638
639 va_start(va, format);
640 msg = ralloc_vasprintf(mem_ctx, format, va);
641 va_end(va);
642 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
643
644 this->fail_msg = msg;
645
646 if (INTEL_DEBUG & DEBUG_WM) {
647 fprintf(stderr, "%s", msg);
648 }
649 }
650
651 fs_inst *
652 fs_visitor::emit(enum opcode opcode)
653 {
654 return emit(fs_inst(opcode));
655 }
656
657 fs_inst *
658 fs_visitor::emit(enum opcode opcode, fs_reg dst)
659 {
660 return emit(fs_inst(opcode, dst));
661 }
662
663 fs_inst *
664 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
665 {
666 return emit(fs_inst(opcode, dst, src0));
667 }
668
669 fs_inst *
670 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
671 {
672 return emit(fs_inst(opcode, dst, src0, src1));
673 }
674
675 fs_inst *
676 fs_visitor::emit(enum opcode opcode, fs_reg dst,
677 fs_reg src0, fs_reg src1, fs_reg src2)
678 {
679 return emit(fs_inst(opcode, dst, src0, src1, src2));
680 }
681
682 void
683 fs_visitor::push_force_uncompressed()
684 {
685 force_uncompressed_stack++;
686 }
687
688 void
689 fs_visitor::pop_force_uncompressed()
690 {
691 force_uncompressed_stack--;
692 assert(force_uncompressed_stack >= 0);
693 }
694
695 /**
696 * Returns true if the instruction has a flag that means it won't
697 * update an entire destination register.
698 *
699 * For example, dead code elimination and live variable analysis want to know
700 * when a write to a variable screens off any preceding values that were in
701 * it.
702 */
703 bool
704 fs_inst::is_partial_write()
705 {
706 return ((this->predicate && this->opcode != BRW_OPCODE_SEL) ||
707 this->force_uncompressed ||
708 this->force_sechalf);
709 }
710
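/**
 * Returns how many virtual GRFs source arg of this instruction reads.
 * Texture operations with a GRF-sourced payload read mlen registers
 * ((mlen + 1) / 2 in SIMD16, where virtual GRFs are two registers wide);
 * all other sources read a single register.
 */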
711 int
712 fs_inst::regs_read(fs_visitor *v, int arg)
713 {
714 if (is_tex() && arg == 0 && src[0].file == GRF) {
715 if (v->dispatch_width == 16)
716 return (mlen + 1) / 2;
717 else
718 return mlen;
719 }
720 return 1;
721 }
722
723 bool
724 fs_inst::reads_flag()
725 {
726 return predicate;
727 }
728
729 bool
730 fs_inst::writes_flag()
731 {
732 return (conditional_mod && opcode != BRW_OPCODE_SEL) ||
733 opcode == FS_OPCODE_MOV_DISPATCH_TO_FLAGS;
734 }
735
736 /**
737 * Returns how many MRFs an FS opcode will write over.
738 *
739 * Note that this is not the 0 or 1 implied writes in an actual gen
740 * instruction -- the FS opcodes often generate MOVs in addition.
741 */
742 int
743 fs_visitor::implied_mrf_writes(fs_inst *inst)
744 {
745 if (inst->mlen == 0)
746 return 0;
747
748 if (inst->base_mrf == -1)
749 return 0;
750
751 switch (inst->opcode) {
752 case SHADER_OPCODE_RCP:
753 case SHADER_OPCODE_RSQ:
754 case SHADER_OPCODE_SQRT:
755 case SHADER_OPCODE_EXP2:
756 case SHADER_OPCODE_LOG2:
757 case SHADER_OPCODE_SIN:
758 case SHADER_OPCODE_COS:
759 return 1 * dispatch_width / 8;
760 case SHADER_OPCODE_POW:
761 case SHADER_OPCODE_INT_QUOTIENT:
762 case SHADER_OPCODE_INT_REMAINDER:
763 return 2 * dispatch_width / 8;
764 case SHADER_OPCODE_TEX:
765 case FS_OPCODE_TXB:
766 case SHADER_OPCODE_TXD:
767 case SHADER_OPCODE_TXF:
768 case SHADER_OPCODE_TXF_MS:
769 case SHADER_OPCODE_TG4:
770 case SHADER_OPCODE_TG4_OFFSET:
771 case SHADER_OPCODE_TXL:
772 case SHADER_OPCODE_TXS:
773 case SHADER_OPCODE_LOD:
774 return 1;
775 case FS_OPCODE_FB_WRITE:
776 return 2;
777 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
778 case SHADER_OPCODE_GEN4_SCRATCH_READ:
779 return 1;
780 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
781 return inst->mlen;
782 case SHADER_OPCODE_GEN4_SCRATCH_WRITE:
783 return 2;
784 case SHADER_OPCODE_UNTYPED_ATOMIC:
785 case SHADER_OPCODE_UNTYPED_SURFACE_READ:
786 return 0;
787 default:
788 assert(!"not reached");
789 return inst->mlen;
790 }
791 }
792
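/**
 * Allocates a new virtual GRF of the given size (in registers) and returns
 * its index, growing the virtual_grf_sizes[] array as needed.
 */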
793 int
794 fs_visitor::virtual_grf_alloc(int size)
795 {
796 if (virtual_grf_array_size <= virtual_grf_count) {
797 if (virtual_grf_array_size == 0)
798 virtual_grf_array_size = 16;
799 else
800 virtual_grf_array_size *= 2;
801 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
802 virtual_grf_array_size);
803 }
804 virtual_grf_sizes[virtual_grf_count] = size;
805 return virtual_grf_count++;
806 }
807
 808 /** Register file/number constructor (type defaults to float). */
809 fs_reg::fs_reg(enum register_file file, int reg)
810 {
811 init();
812 this->file = file;
813 this->reg = reg;
814 this->type = BRW_REGISTER_TYPE_F;
815 }
816
 817 /** Register file/number constructor with explicit register type. */
818 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
819 {
820 init();
821 this->file = file;
822 this->reg = reg;
823 this->type = type;
824 }
825
826 /** Automatic reg constructor. */
827 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
828 {
829 init();
830
831 this->file = GRF;
832 this->reg = v->virtual_grf_alloc(v->type_size(type));
833 this->reg_offset = 0;
834 this->type = brw_type_for_base_type(type);
835 }
836
837 fs_reg *
838 fs_visitor::variable_storage(ir_variable *var)
839 {
840 return (fs_reg *)hash_table_find(this->variable_ht, var);
841 }
842
843 void
844 import_uniforms_callback(const void *key,
845 void *data,
846 void *closure)
847 {
848 struct hash_table *dst_ht = (struct hash_table *)closure;
849 const fs_reg *reg = (const fs_reg *)data;
850
851 if (reg->file != UNIFORM)
852 return;
853
854 hash_table_insert(dst_ht, data, key);
855 }
856
 857 /* For 16-wide, we need to follow the uniform setup of the 8-wide dispatch.
 858  * This brings in those uniform definitions.
859 */
860 void
861 fs_visitor::import_uniforms(fs_visitor *v)
862 {
863 hash_table_call_foreach(v->variable_ht,
864 import_uniforms_callback,
865 variable_ht);
866 this->params_remap = v->params_remap;
867 this->nr_params_remap = v->nr_params_remap;
868 }
869
870 /* Our support for uniforms is piggy-backed on the struct
871 * gl_fragment_program, because that's where the values actually
872 * get stored, rather than in some global gl_shader_program uniform
873 * store.
874 */
875 void
876 fs_visitor::setup_uniform_values(ir_variable *ir)
877 {
878 int namelen = strlen(ir->name);
879
880 /* The data for our (non-builtin) uniforms is stored in a series of
881 * gl_uniform_driver_storage structs for each subcomponent that
882 * glGetUniformLocation() could name. We know it's been set up in the same
883 * order we'd walk the type, so walk the list of storage and find anything
884 * with our name, or the prefix of a component that starts with our name.
885 */
886 unsigned params_before = c->prog_data.nr_params;
887 for (unsigned u = 0; u < shader_prog->NumUserUniformStorage; u++) {
888 struct gl_uniform_storage *storage = &shader_prog->UniformStorage[u];
889
890 if (strncmp(ir->name, storage->name, namelen) != 0 ||
891 (storage->name[namelen] != 0 &&
892 storage->name[namelen] != '.' &&
893 storage->name[namelen] != '[')) {
894 continue;
895 }
896
897 unsigned slots = storage->type->component_slots();
898 if (storage->array_elements)
899 slots *= storage->array_elements;
900
901 for (unsigned i = 0; i < slots; i++) {
902 c->prog_data.param[c->prog_data.nr_params++] =
903 &storage->storage[i].f;
904 }
905 }
906
907 /* Make sure we actually initialized the right amount of stuff here. */
908 assert(params_before + ir->type->component_slots() ==
909 c->prog_data.nr_params);
910 (void)params_before;
911 }
912
913
914 /* Our support for builtin uniforms is even scarier than non-builtin.
915 * It sits on top of the PROG_STATE_VAR parameters that are
916 * automatically updated from GL context state.
917 */
918 void
919 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
920 {
921 const ir_state_slot *const slots = ir->state_slots;
922 assert(ir->state_slots != NULL);
923
924 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
925 /* This state reference has already been setup by ir_to_mesa, but we'll
926 * get the same index back here.
927 */
928 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
929 (gl_state_index *)slots[i].tokens);
930
931 /* Add each of the unique swizzles of the element as a parameter.
932 * This'll end up matching the expected layout of the
933 * array/matrix/structure we're trying to fill in.
934 */
935 int last_swiz = -1;
936 for (unsigned int j = 0; j < 4; j++) {
937 int swiz = GET_SWZ(slots[i].swizzle, j);
938 if (swiz == last_swiz)
939 break;
940 last_swiz = swiz;
941
942 c->prog_data.param[c->prog_data.nr_params++] =
943 &fp->Base.Parameters->ParameterValues[index][swiz].f;
944 }
945 }
946 }
947
948 fs_reg *
949 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
950 {
951 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
952 fs_reg wpos = *reg;
953 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
954
955 /* gl_FragCoord.x */
956 if (ir->pixel_center_integer) {
957 emit(MOV(wpos, this->pixel_x));
958 } else {
959 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
960 }
961 wpos.reg_offset++;
962
963 /* gl_FragCoord.y */
964 if (!flip && ir->pixel_center_integer) {
965 emit(MOV(wpos, this->pixel_y));
966 } else {
967 fs_reg pixel_y = this->pixel_y;
968 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
969
970 if (flip) {
971 pixel_y.negate = true;
972 offset += c->key.drawable_height - 1.0;
973 }
974
975 emit(ADD(wpos, pixel_y, fs_reg(offset)));
976 }
977 wpos.reg_offset++;
978
979 /* gl_FragCoord.z */
980 if (brw->gen >= 6) {
981 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
982 } else {
983 emit(FS_OPCODE_LINTERP, wpos,
984 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
985 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
986 interp_reg(VARYING_SLOT_POS, 2));
987 }
988 wpos.reg_offset++;
989
990 /* gl_FragCoord.w: Already set up in emit_interpolation */
991 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
992
993 return reg;
994 }
995
996 fs_inst *
997 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
998 glsl_interp_qualifier interpolation_mode,
999 bool is_centroid)
1000 {
1001 brw_wm_barycentric_interp_mode barycoord_mode;
1002 if (brw->gen >= 6) {
1003 if (is_centroid) {
1004 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1005 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
1006 else
1007 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
1008 } else {
1009 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
1010 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1011 else
1012 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
1013 }
1014 } else {
1015 /* On Ironlake and below, there is only one interpolation mode.
1016 * Centroid interpolation doesn't mean anything on this hardware --
1017 * there is no multisampling.
1018 */
1019 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
1020 }
1021 return emit(FS_OPCODE_LINTERP, attr,
1022 this->delta_x[barycoord_mode],
1023 this->delta_y[barycoord_mode], interp);
1024 }
1025
1026 fs_reg *
1027 fs_visitor::emit_general_interpolation(ir_variable *ir)
1028 {
1029 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1030 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
1031 fs_reg attr = *reg;
1032
1033 unsigned int array_elements;
1034 const glsl_type *type;
1035
1036 if (ir->type->is_array()) {
1037 array_elements = ir->type->length;
1038 if (array_elements == 0) {
1039 fail("dereferenced array '%s' has length 0\n", ir->name);
1040 }
1041 type = ir->type->fields.array;
1042 } else {
1043 array_elements = 1;
1044 type = ir->type;
1045 }
1046
1047 glsl_interp_qualifier interpolation_mode =
1048 ir->determine_interpolation_mode(c->key.flat_shade);
1049
1050 int location = ir->location;
1051 for (unsigned int i = 0; i < array_elements; i++) {
1052 for (unsigned int j = 0; j < type->matrix_columns; j++) {
1053 if (c->prog_data.urb_setup[location] == -1) {
1054 /* If there's no incoming setup data for this slot, don't
1055 * emit interpolation for it.
1056 */
1057 attr.reg_offset += type->vector_elements;
1058 location++;
1059 continue;
1060 }
1061
1062 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
1063 /* Constant interpolation (flat shading) case. The SF has
1064 * handed us defined values in only the constant offset
1065 * field of the setup reg.
1066 */
1067 for (unsigned int k = 0; k < type->vector_elements; k++) {
1068 struct brw_reg interp = interp_reg(location, k);
1069 interp = suboffset(interp, 3);
1070 interp.type = reg->type;
1071 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
1072 attr.reg_offset++;
1073 }
1074 } else {
1075 /* Smooth/noperspective interpolation case. */
1076 for (unsigned int k = 0; k < type->vector_elements; k++) {
1077 /* FINISHME: At some point we probably want to push
1078 * this farther by giving similar treatment to the
1079 * other potentially constant components of the
1080 * attribute, as well as making brw_vs_constval.c
1081 * handle varyings other than gl_TexCoord.
1082 */
1083 struct brw_reg interp = interp_reg(location, k);
1084 emit_linterp(attr, fs_reg(interp), interpolation_mode,
1085 ir->centroid);
1086 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
1087 /* Get the pixel/sample mask into f0 so that we know
1088 * which pixels are lit. Then, for each channel that is
1089 * unlit, replace the centroid data with non-centroid
1090 * data.
1091 */
1092 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
1093 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
1094 interpolation_mode, false);
1095 inst->predicate = BRW_PREDICATE_NORMAL;
1096 inst->predicate_inverse = true;
1097 }
1098 if (brw->gen < 6 && interpolation_mode == INTERP_QUALIFIER_SMOOTH) {
1099 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
1100 }
1101 attr.reg_offset++;
1102 }
1103
1104 }
1105 location++;
1106 }
1107 }
1108
1109 return reg;
1110 }
1111
1112 fs_reg *
1113 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
1114 {
1115 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1116
1117 /* The frontfacing comes in as a bit in the thread payload. */
1118 if (brw->gen >= 6) {
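      /* The three instructions below shift bit 15 of g0.0 down to bit 0,
       * invert it and mask it to a single 0/1 bit, which becomes the
       * gl_FrontFacing value.
       */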
1119 emit(BRW_OPCODE_ASR, *reg,
1120 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1121 fs_reg(15));
1122 emit(BRW_OPCODE_NOT, *reg, *reg);
1123 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
1124 } else {
1125 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
 1126       /* Bit 31 is "primitive is back face", so checking < (1 << 31) gives
 1127        * us the front face.
1128 */
1129 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
1130 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
1131 }
1132
1133 return reg;
1134 }
1135
1136 void
1137 fs_visitor::compute_sample_position(fs_reg dst, fs_reg int_sample_pos)
1138 {
1139 assert(dst.type == BRW_REGISTER_TYPE_F);
1140
1141 if (c->key.compute_pos_offset) {
1142 /* Convert int_sample_pos to floating point */
1143 emit(MOV(dst, int_sample_pos));
1144 /* Scale to the range [0, 1] */
1145 emit(MUL(dst, dst, fs_reg(1 / 16.0f)));
1146 }
1147 else {
1148 /* From ARB_sample_shading specification:
1149 * "When rendering to a non-multisample buffer, or if multisample
1150 * rasterization is disabled, gl_SamplePosition will always be
 1151        *  (0.5, 0.5)."
1152 */
1153 emit(MOV(dst, fs_reg(0.5f)));
1154 }
1155 }
1156
1157 fs_reg *
1158 fs_visitor::emit_samplepos_setup(ir_variable *ir)
1159 {
1160 assert(brw->gen >= 6);
1161 assert(ir->type == glsl_type::vec2_type);
1162
1163 this->current_annotation = "compute sample position";
1164 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1165 fs_reg pos = *reg;
1166 fs_reg int_sample_x = fs_reg(this, glsl_type::int_type);
1167 fs_reg int_sample_y = fs_reg(this, glsl_type::int_type);
1168
1169 /* WM will be run in MSDISPMODE_PERSAMPLE. So, only one of SIMD8 or SIMD16
1170 * mode will be enabled.
1171 *
1172 * From the Ivy Bridge PRM, volume 2 part 1, page 344:
1173 * R31.1:0 Position Offset X/Y for Slot[3:0]
1174 * R31.3:2 Position Offset X/Y for Slot[7:4]
1175 * .....
1176 *
1177 * The X, Y sample positions come in as bytes in thread payload. So, read
1178 * the positions using vstride=16, width=8, hstride=2.
1179 */
1180 struct brw_reg sample_pos_reg =
1181 stride(retype(brw_vec1_grf(c->sample_pos_reg, 0),
1182 BRW_REGISTER_TYPE_B), 16, 8, 2);
1183
1184 emit(MOV(int_sample_x, fs_reg(sample_pos_reg)));
1185 if (dispatch_width == 16) {
1186 int_sample_x.sechalf = true;
1187 fs_inst *inst = emit(MOV(int_sample_x,
1188 fs_reg(suboffset(sample_pos_reg, 16))));
1189 inst->force_sechalf = true;
1190 int_sample_x.sechalf = false;
1191 }
1192 /* Compute gl_SamplePosition.x */
1193 compute_sample_position(pos, int_sample_x);
1194 pos.reg_offset++;
1195 emit(MOV(int_sample_y, fs_reg(suboffset(sample_pos_reg, 1))));
1196 if (dispatch_width == 16) {
1197 int_sample_y.sechalf = true;
1198 fs_inst *inst = emit(MOV(int_sample_y,
1199 fs_reg(suboffset(sample_pos_reg, 17))));
1200 inst->force_sechalf = true;
1201 int_sample_y.sechalf = false;
1202 }
1203 /* Compute gl_SamplePosition.y */
1204 compute_sample_position(pos, int_sample_y);
1205 return reg;
1206 }
1207
1208 fs_reg *
1209 fs_visitor::emit_sampleid_setup(ir_variable *ir)
1210 {
1211 assert(brw->gen >= 6);
1212
1213 this->current_annotation = "compute sample id";
1214 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
1215
1216 if (c->key.compute_sample_id) {
1217 fs_reg t1 = fs_reg(this, glsl_type::int_type);
1218 fs_reg t2 = fs_reg(this, glsl_type::int_type);
1219 t2.type = BRW_REGISTER_TYPE_UW;
1220
1221 /* The PS will be run in MSDISPMODE_PERSAMPLE. For example with
1222 * 8x multisampling, subspan 0 will represent sample N (where N
1223 * is 0, 2, 4 or 6), subspan 1 will represent sample 1, 3, 5 or
1224 * 7. We can find the value of N by looking at R0.0 bits 7:6
1225 * ("Starting Sample Pair Index (SSPI)") and multiplying by two
1226 * (since samples are always delivered in pairs). That is, we
1227 * compute 2*((R0.0 & 0xc0) >> 6) == (R0.0 & 0xc0) >> 5. Then
1228 * we need to add N to the sequence (0, 0, 0, 0, 1, 1, 1, 1) in
1229 * case of SIMD8 and sequence (0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2,
1230 * 2, 3, 3, 3, 3) in case of SIMD16. We compute this sequence by
1231 * populating a temporary variable with the sequence (0, 1, 2, 3),
1232 * and then reading from it using vstride=1, width=4, hstride=0.
1233 * These computations hold good for 4x multisampling as well.
1234 */
1235 emit(BRW_OPCODE_AND, t1,
1236 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
1237 fs_reg(brw_imm_d(0xc0)));
1238 emit(BRW_OPCODE_SHR, t1, t1, fs_reg(5));
1239 /* This works for both SIMD8 and SIMD16 */
1240 emit(MOV(t2, brw_imm_v(0x3210)));
1241 /* This special instruction takes care of setting vstride=1,
1242 * width=4, hstride=0 of t2 during an ADD instruction.
1243 */
1244 emit(FS_OPCODE_SET_SAMPLE_ID, *reg, t1, t2);
1245 } else {
1246 /* As per GL_ARB_sample_shading specification:
1247 * "When rendering to a non-multisample buffer, or if multisample
1248 * rasterization is disabled, gl_SampleID will always be zero."
1249 */
1250 emit(BRW_OPCODE_MOV, *reg, fs_reg(0));
1251 }
1252
1253 return reg;
1254 }
1255
1256 fs_reg
1257 fs_visitor::fix_math_operand(fs_reg src)
1258 {
1259 /* Can't do hstride == 0 args on gen6 math, so expand it out. We
1260 * might be able to do better by doing execsize = 1 math and then
1261 * expanding that result out, but we would need to be careful with
1262 * masking.
1263 *
1264 * The hardware ignores source modifiers (negate and abs) on math
1265 * instructions, so we also move to a temp to set those up.
1266 */
1267 if (brw->gen == 6 && src.file != UNIFORM && src.file != IMM &&
1268 !src.abs && !src.negate)
1269 return src;
1270
1271 /* Gen7 relaxes most of the above restrictions, but still can't use IMM
1272 * operands to math
1273 */
1274 if (brw->gen >= 7 && src.file != IMM)
1275 return src;
1276
1277 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1278 expanded.type = src.type;
1279 emit(BRW_OPCODE_MOV, expanded, src);
1280 return expanded;
1281 }
1282
1283 fs_inst *
1284 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
1285 {
1286 switch (opcode) {
1287 case SHADER_OPCODE_RCP:
1288 case SHADER_OPCODE_RSQ:
1289 case SHADER_OPCODE_SQRT:
1290 case SHADER_OPCODE_EXP2:
1291 case SHADER_OPCODE_LOG2:
1292 case SHADER_OPCODE_SIN:
1293 case SHADER_OPCODE_COS:
1294 break;
1295 default:
1296 assert(!"not reached: bad math opcode");
1297 return NULL;
1298 }
1299
1300 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
1301 * might be able to do better by doing execsize = 1 math and then
1302 * expanding that result out, but we would need to be careful with
1303 * masking.
1304 *
1305 * Gen 6 hardware ignores source modifiers (negate and abs) on math
1306 * instructions, so we also move to a temp to set those up.
1307 */
1308 if (brw->gen >= 6)
1309 src = fix_math_operand(src);
1310
1311 fs_inst *inst = emit(opcode, dst, src);
1312
1313 if (brw->gen < 6) {
1314 inst->base_mrf = 2;
1315 inst->mlen = dispatch_width / 8;
1316 }
1317
1318 return inst;
1319 }
1320
1321 fs_inst *
1322 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
1323 {
1324 int base_mrf = 2;
1325 fs_inst *inst;
1326
1327 switch (opcode) {
1328 case SHADER_OPCODE_INT_QUOTIENT:
1329 case SHADER_OPCODE_INT_REMAINDER:
1330 if (brw->gen >= 7 && dispatch_width == 16)
1331 fail("16-wide INTDIV unsupported\n");
1332 break;
1333 case SHADER_OPCODE_POW:
1334 break;
1335 default:
1336 assert(!"not reached: unsupported binary math opcode.");
1337 return NULL;
1338 }
1339
1340 if (brw->gen >= 6) {
1341 src0 = fix_math_operand(src0);
1342 src1 = fix_math_operand(src1);
1343
1344 inst = emit(opcode, dst, src0, src1);
1345 } else {
1346 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1347 * "Message Payload":
1348 *
1349 * "Operand0[7]. For the INT DIV functions, this operand is the
1350 * denominator."
1351 * ...
1352 * "Operand1[7]. For the INT DIV functions, this operand is the
1353 * numerator."
1354 */
1355 bool is_int_div = opcode != SHADER_OPCODE_POW;
1356 fs_reg &op0 = is_int_div ? src1 : src0;
1357 fs_reg &op1 = is_int_div ? src0 : src1;
1358
1359 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1360 inst = emit(opcode, dst, op0, reg_null_f);
1361
1362 inst->base_mrf = base_mrf;
1363 inst->mlen = 2 * dispatch_width / 8;
1364 }
1365 return inst;
1366 }
1367
1368 void
1369 fs_visitor::assign_curb_setup()
1370 {
1371 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1372 if (dispatch_width == 8) {
1373 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1374 } else {
1375 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1376 }
1377
1378 /* Map the offsets in the UNIFORM file to fixed HW regs. */
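   /* For example, with two payload registers, uniform slot 10 lands in the
    * fixed register g3.2 (2 + 10 / 8 = 3, subregister 10 % 8 = 2).
    */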
1379 foreach_list(node, &this->instructions) {
1380 fs_inst *inst = (fs_inst *)node;
1381
1382 for (unsigned int i = 0; i < 3; i++) {
1383 if (inst->src[i].file == UNIFORM) {
1384 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1385 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1386 constant_nr / 8,
1387 constant_nr % 8);
1388
1389 inst->src[i].file = HW_REG;
1390 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1391 }
1392 }
1393 }
1394 }
1395
1396 void
1397 fs_visitor::calculate_urb_setup()
1398 {
1399 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1400 c->prog_data.urb_setup[i] = -1;
1401 }
1402
1403 int urb_next = 0;
1404 /* Figure out where each of the incoming setup attributes lands. */
1405 if (brw->gen >= 6) {
1406 if (_mesa_bitcount_64(fp->Base.InputsRead &
1407 BRW_FS_VARYING_INPUT_MASK) <= 16) {
1408 /* The SF/SBE pipeline stage can do arbitrary rearrangement of the
1409 * first 16 varying inputs, so we can put them wherever we want.
1410 * Just put them in order.
1411 *
1412 * This is useful because it means that (a) inputs not used by the
1413 * fragment shader won't take up valuable register space, and (b) we
1414 * won't have to recompile the fragment shader if it gets paired with
1415 * a different vertex (or geometry) shader.
1416 */
1417 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1418 if (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1419 BITFIELD64_BIT(i)) {
1420 c->prog_data.urb_setup[i] = urb_next++;
1421 }
1422 }
1423 } else {
1424 /* We have enough input varyings that the SF/SBE pipeline stage can't
1425 * arbitrarily rearrange them to suit our whim; we have to put them
1426 * in an order that matches the output of the previous pipeline stage
1427 * (geometry or vertex shader).
1428 */
1429 struct brw_vue_map prev_stage_vue_map;
1430 brw_compute_vue_map(brw, &prev_stage_vue_map,
1431 c->key.input_slots_valid);
1432 int first_slot = 2 * BRW_SF_URB_ENTRY_READ_OFFSET;
1433 assert(prev_stage_vue_map.num_slots <= first_slot + 32);
1434 for (int slot = first_slot; slot < prev_stage_vue_map.num_slots;
1435 slot++) {
1436 int varying = prev_stage_vue_map.slot_to_varying[slot];
1437 /* Note that varying == BRW_VARYING_SLOT_COUNT when a slot is
1438 * unused.
1439 */
1440 if (varying != BRW_VARYING_SLOT_COUNT &&
1441 (fp->Base.InputsRead & BRW_FS_VARYING_INPUT_MASK &
1442 BITFIELD64_BIT(varying))) {
1443 c->prog_data.urb_setup[varying] = slot - first_slot;
1444 }
1445 }
1446 urb_next = prev_stage_vue_map.num_slots - first_slot;
1447 }
1448 } else {
1449 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1450 for (unsigned int i = 0; i < VARYING_SLOT_MAX; i++) {
1451 /* Point size is packed into the header, not as a general attribute */
1452 if (i == VARYING_SLOT_PSIZ)
1453 continue;
1454
1455 if (c->key.input_slots_valid & BITFIELD64_BIT(i)) {
1456 /* The back color slot is skipped when the front color is
1457 * also written to. In addition, some slots can be
1458 * written in the vertex shader and not read in the
1459 * fragment shader. So the register number must always be
1460 * incremented, mapped or not.
1461 */
1462 if (_mesa_varying_slot_in_fs((gl_varying_slot) i))
1463 c->prog_data.urb_setup[i] = urb_next;
1464 urb_next++;
1465 }
1466 }
1467
1468 /*
 1469     * It's an FS-only attribute, and we did interpolation for this attribute
 1470     * in the SF thread. So, count it here, too.
1471 *
1472 * See compile_sf_prog() for more info.
1473 */
1474 if (fp->Base.InputsRead & BITFIELD64_BIT(VARYING_SLOT_PNTC))
1475 c->prog_data.urb_setup[VARYING_SLOT_PNTC] = urb_next++;
1476 }
1477
1478 c->prog_data.num_varying_inputs = urb_next;
1479 }
1480
1481 void
1482 fs_visitor::assign_urb_setup()
1483 {
1484 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1485
1486 /* Offset all the urb_setup[] index by the actual position of the
1487 * setup regs, now that the location of the constants has been chosen.
1488 */
1489 foreach_list(node, &this->instructions) {
1490 fs_inst *inst = (fs_inst *)node;
1491
1492 if (inst->opcode == FS_OPCODE_LINTERP) {
1493 assert(inst->src[2].file == HW_REG);
1494 inst->src[2].fixed_hw_reg.nr += urb_start;
1495 }
1496
1497 if (inst->opcode == FS_OPCODE_CINTERP) {
1498 assert(inst->src[0].file == HW_REG);
1499 inst->src[0].fixed_hw_reg.nr += urb_start;
1500 }
1501 }
1502
1503 /* Each attribute is 4 setup channels, each of which is half a reg. */
1504 this->first_non_payload_grf =
1505 urb_start + c->prog_data.num_varying_inputs * 2;
1506 }
1507
1508 /**
1509 * Split large virtual GRFs into separate components if we can.
1510 *
1511 * This is mostly duplicated with what brw_fs_vector_splitting does,
1512 * but that's really conservative because it's afraid of doing
1513 * splitting that doesn't result in real progress after the rest of
1514 * the optimization phases, which would cause infinite looping in
1515 * optimization. We can do it once here, safely. This also has the
1516 * opportunity to split interpolated values, or maybe even uniforms,
1517 * which we don't have at the IR level.
1518 *
1519 * We want to split, because virtual GRFs are what we register
1520 * allocate and spill (due to contiguousness requirements for some
1521 * instructions), and they're what we naturally generate in the
1522 * codegen process, but most virtual GRFs don't actually need to be
1523 * contiguous sets of GRFs. If we split, we'll end up with reduced
1524 * live intervals and better dead code elimination and coalescing.
1525 */
1526 void
1527 fs_visitor::split_virtual_grfs()
1528 {
1529 int num_vars = this->virtual_grf_count;
1530 bool split_grf[num_vars];
1531 int new_virtual_grf[num_vars];
1532
1533 /* Try to split anything > 0 sized. */
1534 for (int i = 0; i < num_vars; i++) {
1535 if (this->virtual_grf_sizes[i] != 1)
1536 split_grf[i] = true;
1537 else
1538 split_grf[i] = false;
1539 }
1540
1541 if (brw->has_pln &&
1542 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1543 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1544 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1545 * Gen6, that was the only supported interpolation mode, and since Gen6,
1546 * delta_x and delta_y are in fixed hardware registers.
1547 */
1548 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1549 false;
1550 }
1551
1552 foreach_list(node, &this->instructions) {
1553 fs_inst *inst = (fs_inst *)node;
1554
1555 /* If there's a SEND message that requires contiguous destination
1556 * registers, no splitting is allowed.
1557 */
1558 if (inst->regs_written > 1) {
1559 split_grf[inst->dst.reg] = false;
1560 }
1561
1562 /* If we're sending from a GRF, don't split it, on the assumption that
1563 * the send is reading the whole thing.
1564 */
1565 if (inst->is_send_from_grf()) {
1566 for (int i = 0; i < 3; i++) {
1567 if (inst->src[i].file == GRF) {
1568 split_grf[inst->src[i].reg] = false;
1569 }
1570 }
1571 }
1572 }
1573
1574 /* Allocate new space for split regs. Note that the virtual
1575 * numbers will be contiguous.
1576 */
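   /* reg_offset 0 keeps the original (now size-1) register; offsets
    * 1..size-1 are redirected to the newly allocated registers in the
    * rewrite loop below.
    */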
1577 for (int i = 0; i < num_vars; i++) {
1578 if (split_grf[i]) {
1579 new_virtual_grf[i] = virtual_grf_alloc(1);
1580 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1581 int reg = virtual_grf_alloc(1);
1582 assert(reg == new_virtual_grf[i] + j - 1);
1583 (void) reg;
1584 }
1585 this->virtual_grf_sizes[i] = 1;
1586 }
1587 }
1588
1589 foreach_list(node, &this->instructions) {
1590 fs_inst *inst = (fs_inst *)node;
1591
1592 if (inst->dst.file == GRF &&
1593 split_grf[inst->dst.reg] &&
1594 inst->dst.reg_offset != 0) {
1595 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1596 inst->dst.reg_offset - 1);
1597 inst->dst.reg_offset = 0;
1598 }
1599 for (int i = 0; i < 3; i++) {
1600 if (inst->src[i].file == GRF &&
1601 split_grf[inst->src[i].reg] &&
1602 inst->src[i].reg_offset != 0) {
1603 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1604 inst->src[i].reg_offset - 1);
1605 inst->src[i].reg_offset = 0;
1606 }
1607 }
1608 }
1609 invalidate_live_intervals();
1610 }
1611
1612 /**
1613 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1614 *
1615 * During code generation, we create tons of temporary variables, many of
1616 * which get immediately killed and are never used again. Yet, in later
1617 * optimization and analysis passes, such as compute_live_intervals, we need
1618 * to loop over all the virtual GRFs. Compacting them can save a lot of
1619 * overhead.
1620 */
1621 void
1622 fs_visitor::compact_virtual_grfs()
1623 {
1624 /* Mark which virtual GRFs are used, and count how many. */
1625 int remap_table[this->virtual_grf_count];
1626 memset(remap_table, -1, sizeof(remap_table));
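   /* -1 means "unused"; used registers are first marked with 0 and later
    * overwritten with their compacted index.
    */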
1627
1628 foreach_list(node, &this->instructions) {
1629 const fs_inst *inst = (const fs_inst *) node;
1630
1631 if (inst->dst.file == GRF)
1632 remap_table[inst->dst.reg] = 0;
1633
1634 for (int i = 0; i < 3; i++) {
1635 if (inst->src[i].file == GRF)
1636 remap_table[inst->src[i].reg] = 0;
1637 }
1638 }
1639
1640 /* In addition to registers used in instructions, fs_visitor keeps
1641 * direct references to certain special values which must be patched:
1642 */
1643 fs_reg *special[] = {
1644 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1645 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1646 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1647 &delta_x[0], &delta_x[1], &delta_x[2],
1648 &delta_x[3], &delta_x[4], &delta_x[5],
1649 &delta_y[0], &delta_y[1], &delta_y[2],
1650 &delta_y[3], &delta_y[4], &delta_y[5],
1651 };
1652 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1653 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1654
1655 /* Treat all special values as used, to be conservative */
1656 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1657 if (special[i]->file == GRF)
1658 remap_table[special[i]->reg] = 0;
1659 }
1660
1661 /* Compact the GRF arrays. */
1662 int new_index = 0;
1663 for (int i = 0; i < this->virtual_grf_count; i++) {
1664 if (remap_table[i] != -1) {
1665 remap_table[i] = new_index;
1666 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1667 invalidate_live_intervals();
1668 ++new_index;
1669 }
1670 }
1671
1672 this->virtual_grf_count = new_index;
1673
1674 /* Patch all the instructions to use the newly renumbered registers */
1675 foreach_list(node, &this->instructions) {
1676 fs_inst *inst = (fs_inst *) node;
1677
1678 if (inst->dst.file == GRF)
1679 inst->dst.reg = remap_table[inst->dst.reg];
1680
1681 for (int i = 0; i < 3; i++) {
1682 if (inst->src[i].file == GRF)
1683 inst->src[i].reg = remap_table[inst->src[i].reg];
1684 }
1685 }
1686
1687 /* Patch all the references to special values */
1688 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1689 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1690 special[i]->reg = remap_table[special[i]->reg];
1691 }
1692 }
1693
1694 bool
1695 fs_visitor::remove_dead_constants()
1696 {
1697 if (dispatch_width == 8) {
1698 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1699 this->nr_params_remap = c->prog_data.nr_params;
1700
1701 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1702 this->params_remap[i] = -1;
1703
1704 /* Find which params are still in use. */
1705 foreach_list(node, &this->instructions) {
1706 fs_inst *inst = (fs_inst *)node;
1707
1708 for (int i = 0; i < 3; i++) {
1709 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1710
1711 if (inst->src[i].file != UNIFORM)
1712 continue;
1713
1714 /* Section 5.11 of the OpenGL 4.3 spec says:
1715 *
1716 * "Out-of-bounds reads return undefined values, which include
1717 * values from other variables of the active program or zero."
1718 */
1719 if (constant_nr < 0 || constant_nr >= (int)c->prog_data.nr_params) {
1720 constant_nr = 0;
1721 }
1722
1723 /* For now, set this to non-negative. We'll give it the
1724 * actual new number in a moment, in order to keep the
1725 * register numbers nicely ordered.
1726 */
1727 this->params_remap[constant_nr] = 0;
1728 }
1729 }
1730
1731 /* Figure out what the new numbers for the params will be. At some
1732 * point when we're doing uniform array access, we're going to want
1733 * to keep the distinction between .reg and .reg_offset, but for
1734 * now we don't care.
1735 */
1736 unsigned int new_nr_params = 0;
1737 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1738 if (this->params_remap[i] != -1) {
1739 this->params_remap[i] = new_nr_params++;
1740 }
1741 }
1742
1743 /* Update the list of params to be uploaded to match our new numbering. */
1744 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1745 int remapped = this->params_remap[i];
1746
1747 if (remapped == -1)
1748 continue;
1749
1750 c->prog_data.param[remapped] = c->prog_data.param[i];
1751 }
1752
1753 c->prog_data.nr_params = new_nr_params;
1754 } else {
1755 /* This should have been generated in the 8-wide pass already. */
1756 assert(this->params_remap);
1757 }
1758
1759 /* Now do the renumbering of the shader to remove unused params. */
1760 foreach_list(node, &this->instructions) {
1761 fs_inst *inst = (fs_inst *)node;
1762
1763 for (int i = 0; i < 3; i++) {
1764 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1765
1766 if (inst->src[i].file != UNIFORM)
1767 continue;
1768
1769 /* as above alias to 0 */
1770 if (constant_nr < 0 || constant_nr >= (int)this->nr_params_remap) {
1771 constant_nr = 0;
1772 }
1773 assert(this->params_remap[constant_nr] != -1);
1774 inst->src[i].reg = this->params_remap[constant_nr];
1775 inst->src[i].reg_offset = 0;
1776 }
1777 }
1778
1779 return true;
1780 }
1781
1782 /*
1783 * Implements array access of uniforms by inserting a
1784 * PULL_CONSTANT_LOAD instruction.
1785 *
1786 * Unlike temporary GRF array access (where we don't support it due to
1787 * the difficulty of doing relative addressing on instruction
1788 * destinations), we could potentially do array access of uniforms
1789 * that were loaded in GRF space as push constants. In real-world
1790 * usage we've seen, though, the arrays being used are always larger
1791 * than we could load as push constants, so just always move all
1792 * uniform array access out to a pull constant buffer.
1793 */
1794 void
1795 fs_visitor::move_uniform_array_access_to_pull_constants()
1796 {
1797 int pull_constant_loc[c->prog_data.nr_params];
1798
1799 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1800 pull_constant_loc[i] = -1;
1801 }
1802
1803 /* Walk through and find array access of uniforms. Put a copy of that
1804 * uniform in the pull constant buffer.
1805 *
1806 * Note that we don't move constant-indexed accesses to arrays. No
1807 * testing has been done of the performance impact of this choice.
1808 */
1809 foreach_list_safe(node, &this->instructions) {
1810 fs_inst *inst = (fs_inst *)node;
1811
1812 for (int i = 0 ; i < 3; i++) {
1813 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1814 continue;
1815
1816 int uniform = inst->src[i].reg;
1817
1818 /* If this array isn't already present in the pull constant buffer,
1819 * add it.
1820 */
1821 if (pull_constant_loc[uniform] == -1) {
1822 const float **values = &c->prog_data.param[uniform];
1823
1824 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1825
1826 assert(param_size[uniform]);
1827
1828 for (int j = 0; j < param_size[uniform]; j++) {
1829 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1830 values[j];
1831 }
1832 }
1833
1834 /* Set up the annotation tracking for new generated instructions. */
1835 base_ir = inst->ir;
1836 current_annotation = inst->annotation;
1837
1838 fs_reg surf_index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1839 fs_reg temp = fs_reg(this, glsl_type::float_type);
1840 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1841 surf_index,
1842 *inst->src[i].reladdr,
1843 pull_constant_loc[uniform] +
1844 inst->src[i].reg_offset);
1845 inst->insert_before(&list);
1846
1847 inst->src[i].file = temp.file;
1848 inst->src[i].reg = temp.reg;
1849 inst->src[i].reg_offset = temp.reg_offset;
1850 inst->src[i].reladdr = NULL;
1851 }
1852 }
1853 }
1854
1855 /**
1856 * Choose accesses from the UNIFORM file to demote to using the pull
1857 * constant buffer.
1858 *
1859 * We allow a fragment shader to have more than the specified minimum
1860 * maximum number of fragment shader uniform components (64). If
1861 * there are too many of these, they'd fill up all of register space.
1862 * So, this will push some of them out to the pull constant buffer and
1863 * update the program to load them.
1864 */
1865 void
1866 fs_visitor::setup_pull_constants()
1867 {
1868 /* Only allow 16 registers (128 uniform components) as push constants. */
1869 unsigned int max_uniform_components = 16 * 8;
1870 if (c->prog_data.nr_params <= max_uniform_components)
1871 return;
1872
1873 if (dispatch_width == 16) {
1874 fail("Pull constants not supported in 16-wide\n");
1875 return;
1876 }
1877
1878 /* Just demote the end of the list. We could probably do better
1879 * here, demoting things that are rarely used in the program first.
1880 */
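   /* For example, a program with 200 float uniform components keeps
    * components 0..127 as push constants and demotes 128..199 to the pull
    * constant buffer, to be fetched with FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD
    * below.
    */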
1881 unsigned int pull_uniform_base = max_uniform_components;
1882
1883 int pull_constant_loc[c->prog_data.nr_params];
1884 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1885 if (i < pull_uniform_base) {
1886 pull_constant_loc[i] = -1;
1887 } else {
1888 pull_constant_loc[i] = -1;
1889 /* If our constant is already being uploaded for reladdr purposes,
1890 * reuse it.
1891 */
1892 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1893 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1894 pull_constant_loc[i] = j;
1895 break;
1896 }
1897 }
1898 if (pull_constant_loc[i] == -1) {
1899 int pull_index = c->prog_data.nr_pull_params++;
1900 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1901             pull_constant_loc[i] = pull_index;
1902 }
1903 }
1904 }
1905 c->prog_data.nr_params = pull_uniform_base;
1906
1907 foreach_list(node, &this->instructions) {
1908 fs_inst *inst = (fs_inst *)node;
1909
1910 for (int i = 0; i < 3; i++) {
1911 if (inst->src[i].file != UNIFORM)
1912 continue;
1913
1914 int pull_index = pull_constant_loc[inst->src[i].reg +
1915 inst->src[i].reg_offset];
1916 if (pull_index == -1)
1917 continue;
1918
1919 assert(!inst->src[i].reladdr);
1920
1921 fs_reg dst = fs_reg(this, glsl_type::float_type);
1922 fs_reg index = fs_reg(c->prog_data.base.binding_table.pull_constants_start);
1923 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1924 fs_inst *pull =
1925 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1926 dst, index, offset);
1927 pull->ir = inst->ir;
1928 pull->annotation = inst->annotation;
1929
1930 inst->insert_before(pull);
1931
1932 inst->src[i].file = GRF;
1933 inst->src[i].reg = dst.reg;
1934 inst->src[i].reg_offset = 0;
1935 inst->src[i].smear = pull_index & 3;
1936 }
1937 }
1938 }
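
/* Illustrative example (assumed values, not from a real shader): with the
 * 128-component push limit above, a demoted uniform component that lands at
 * pull_index 13 is fetched as a whole 16-byte-aligned slot from byte offset
 * (13 * 4) & ~15 == 48, and the consuming instruction then reads dword
 * 13 & 3 == 1 of the loaded register via src[i].smear.
 */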
1939
1940 bool
1941 fs_visitor::opt_algebraic()
1942 {
1943 bool progress = false;
1944
1945 foreach_list(node, &this->instructions) {
1946 fs_inst *inst = (fs_inst *)node;
1947
1948 switch (inst->opcode) {
1949 case BRW_OPCODE_MUL:
1950 if (inst->src[1].file != IMM)
1951 continue;
1952
1953 /* a * 1.0 = a */
1954 if (inst->src[1].is_one()) {
1955 inst->opcode = BRW_OPCODE_MOV;
1956 inst->src[1] = reg_undef;
1957 progress = true;
1958 break;
1959 }
1960
1961 /* a * 0.0 = 0.0 */
1962 if (inst->src[1].is_zero()) {
1963 inst->opcode = BRW_OPCODE_MOV;
1964 inst->src[0] = inst->src[1];
1965 inst->src[1] = reg_undef;
1966 progress = true;
1967 break;
1968 }
1969
1970 break;
1971 case BRW_OPCODE_ADD:
1972 if (inst->src[1].file != IMM)
1973 continue;
1974
1975 /* a + 0.0 = a */
1976 if (inst->src[1].is_zero()) {
1977 inst->opcode = BRW_OPCODE_MOV;
1978 inst->src[1] = reg_undef;
1979 progress = true;
1980 break;
1981 }
1982 break;
1983 case BRW_OPCODE_OR:
1984 if (inst->src[0].equals(inst->src[1])) {
1985 inst->opcode = BRW_OPCODE_MOV;
1986 inst->src[1] = reg_undef;
1987 progress = true;
1988 break;
1989 }
1990 break;
1991 case BRW_OPCODE_SEL:
1992 if (inst->saturate && inst->src[1].file == IMM) {
1993 switch (inst->conditional_mod) {
1994 case BRW_CONDITIONAL_LE:
1995 case BRW_CONDITIONAL_L:
1996 switch (inst->src[1].type) {
1997 case BRW_REGISTER_TYPE_F:
1998 if (inst->src[1].imm.f >= 1.0f) {
1999 inst->opcode = BRW_OPCODE_MOV;
2000 inst->src[1] = reg_undef;
2001 progress = true;
2002 }
2003 break;
2004 default:
2005 break;
2006 }
2007 break;
2008 case BRW_CONDITIONAL_GE:
2009 case BRW_CONDITIONAL_G:
2010 switch (inst->src[1].type) {
2011 case BRW_REGISTER_TYPE_F:
2012 if (inst->src[1].imm.f <= 0.0f) {
2013 inst->opcode = BRW_OPCODE_MOV;
2014 inst->src[1] = reg_undef;
2015 inst->conditional_mod = BRW_CONDITIONAL_NONE;
2016 progress = true;
2017 }
2018 break;
2019 default:
2020 break;
2021 }
2022 default:
2023 break;
2024 }
2025 }
2026 break;
2027 default:
2028 break;
2029 }
2030 }
2031
2032 return progress;
2033 }
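
/* Illustrative sketch (register numbers are made up, in approximate
 * dump_instruction() notation) of the simplifications above:
 *
 *    mul vgrf5, vgrf3, 1.0f    ->  mov vgrf5, vgrf3      (a * 1.0 = a)
 *    mul vgrf5, vgrf3, 0.0f    ->  mov vgrf5, 0.0f       (a * 0.0 = 0.0)
 *    add vgrf5, vgrf3, 0.0f    ->  mov vgrf5, vgrf3      (a + 0.0 = a)
 *    or  vgrf5, vgrf3, vgrf3   ->  mov vgrf5, vgrf3      (a | a = a)
 */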
2034
2035 /**
2036 * Removes any instructions writing a VGRF where that VGRF is not used by any
2037 * later instruction.
2038 */
2039 bool
2040 fs_visitor::dead_code_eliminate()
2041 {
2042 bool progress = false;
2043 int pc = 0;
2044
2045 calculate_live_intervals();
2046
2047 foreach_list_safe(node, &this->instructions) {
2048 fs_inst *inst = (fs_inst *)node;
2049
2050 if (inst->dst.file == GRF && !inst->has_side_effects()) {
2051 bool dead = true;
2052
2053 for (int i = 0; i < inst->regs_written; i++) {
2054 int var = live_intervals->var_from_vgrf[inst->dst.reg];
2055 assert(live_intervals->end[var + inst->dst.reg_offset + i] >= pc);
2056 if (live_intervals->end[var + inst->dst.reg_offset + i] != pc) {
2057 dead = false;
2058 break;
2059 }
2060 }
2061
2062 if (dead) {
2063 /* Don't dead code eliminate instructions that write to the
2064 * accumulator as a side-effect. Instead just set the destination
2065 * to the null register to free it.
2066 */
2067 switch (inst->opcode) {
2068 case BRW_OPCODE_ADDC:
2069 case BRW_OPCODE_SUBB:
2070 case BRW_OPCODE_MACH:
2071 inst->dst = fs_reg(retype(brw_null_reg(), inst->dst.type));
2072 break;
2073 default:
2074 inst->remove();
2075 progress = true;
2076 break;
2077 }
2078 }
2079 }
2080
2081 pc++;
2082 }
2083
2084 if (progress)
2085 invalidate_live_intervals();
2086
2087 return progress;
2088 }
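
/* Illustrative sketch (register numbers are made up): a write whose
 * destination's live interval ends at the instruction itself is dead:
 *
 *    mov vgrf9, vgrf2            vgrf9 never read afterwards -> removed
 *
 * while an accumulator-writing instruction such as MACH is kept for its side
 * effect and only has its destination redirected to the hardware null
 * register, as in the switch above.
 */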
2089
2090 struct dead_code_hash_key
2091 {
2092 int vgrf;
2093 int reg_offset;
2094 };
2095
2096 static bool
2097 dead_code_hash_compare(const void *a, const void *b)
2098 {
2099 return memcmp(a, b, sizeof(struct dead_code_hash_key)) == 0;
2100 }
2101
2102 static void
2103 clear_dead_code_hash(struct hash_table *ht)
2104 {
2105 struct hash_entry *entry;
2106
2107 hash_table_foreach(ht, entry) {
2108 _mesa_hash_table_remove(ht, entry);
2109 }
2110 }
2111
2112 static void
2113 insert_dead_code_hash(struct hash_table *ht,
2114 int vgrf, int reg_offset, fs_inst *inst)
2115 {
2116 /* We don't bother freeing keys, because they'll be GCed with the ht. */
2117 struct dead_code_hash_key *key = ralloc(ht, struct dead_code_hash_key);
2118
2119 key->vgrf = vgrf;
2120 key->reg_offset = reg_offset;
2121
2122 _mesa_hash_table_insert(ht, _mesa_hash_data(key, sizeof(*key)), key, inst);
2123 }
2124
2125 static struct hash_entry *
2126 get_dead_code_hash_entry(struct hash_table *ht, int vgrf, int reg_offset)
2127 {
2128 struct dead_code_hash_key key;
2129
2130 key.vgrf = vgrf;
2131 key.reg_offset = reg_offset;
2132
2133 return _mesa_hash_table_search(ht, _mesa_hash_data(&key, sizeof(key)), &key);
2134 }
2135
2136 static void
2137 remove_dead_code_hash(struct hash_table *ht,
2138 int vgrf, int reg_offset)
2139 {
2140 struct hash_entry *entry = get_dead_code_hash_entry(ht, vgrf, reg_offset);
2141 if (!entry)
2142 return;
2143
2144 _mesa_hash_table_remove(ht, entry);
2145 }
2146
2147 /**
2148 * Walks basic blocks, removing any regs that are written but not read before
2149 * being redefined.
2150 *
2151 * The dead_code_eliminate() function implements a global dead code
2152  * elimination, but it only handles removing the last write to a register
2153 * if it's never read. This one can handle intermediate writes, but only
2154 * within a basic block.
2155 */
2156 bool
2157 fs_visitor::dead_code_eliminate_local()
2158 {
2159 struct hash_table *ht;
2160 bool progress = false;
2161
2162 ht = _mesa_hash_table_create(mem_ctx, dead_code_hash_compare);
2163
2164 foreach_list_safe(node, &this->instructions) {
2165 fs_inst *inst = (fs_inst *)node;
2166
2167       /* At a basic block boundary, empty the HT since we don't track
2168        * dataflow across blocks here.
2169 */
2170 if (inst->is_control_flow()) {
2171 clear_dead_code_hash(ht);
2172 continue;
2173 }
2174
2175 /* Clear the HT of any instructions that got read. */
2176 for (int i = 0; i < 3; i++) {
2177 fs_reg src = inst->src[i];
2178 if (src.file != GRF)
2179 continue;
2180
2181 int read = 1;
2182 if (inst->is_send_from_grf())
2183 read = virtual_grf_sizes[src.reg] - src.reg_offset;
2184
2185 for (int reg_offset = src.reg_offset;
2186 reg_offset < src.reg_offset + read;
2187 reg_offset++) {
2188 remove_dead_code_hash(ht, src.reg, reg_offset);
2189 }
2190 }
2191
2192 /* Add any update of a GRF to the HT, removing a previous write if it
2193 * wasn't read.
2194 */
2195 if (inst->dst.file == GRF) {
2196 if (inst->regs_written > 1) {
2197 /* We don't know how to trim channels from an instruction's
2198 * writes, so we can't incrementally remove unread channels from
2199             * it. Just remove whatever it overwrites from the table.
2200 */
2201 for (int i = 0; i < inst->regs_written; i++) {
2202 remove_dead_code_hash(ht,
2203 inst->dst.reg,
2204 inst->dst.reg_offset + i);
2205 }
2206 } else {
2207 struct hash_entry *entry =
2208 get_dead_code_hash_entry(ht, inst->dst.reg,
2209 inst->dst.reg_offset);
2210
2211 if (entry) {
2212 if (inst->is_partial_write()) {
2213 /* For a partial write, we can't remove any previous dead code
2214 * candidate, since we're just modifying their result.
2215 */
2216 } else {
2217 /* We're completely updating a channel, and there was a
2218 * previous write to the channel that wasn't read. Kill it!
2219 */
2220 fs_inst *inst = (fs_inst *)entry->data;
2221 inst->remove();
2222 progress = true;
2223 }
2224
2225 _mesa_hash_table_remove(ht, entry);
2226 }
2227
2228 if (!inst->has_side_effects())
2229 insert_dead_code_hash(ht, inst->dst.reg, inst->dst.reg_offset,
2230 inst);
2231 }
2232 }
2233 }
2234
2235 _mesa_hash_table_destroy(ht, NULL);
2236
2237 if (progress)
2238 invalidate_live_intervals();
2239
2240 return progress;
2241 }
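
/* Illustrative sketch (register numbers are made up): within a basic block,
 * an intermediate write that is completely overwritten before being read is
 * removed:
 *
 *    mov vgrf3, u0               never read before the next full write ->
 *    mov vgrf3, u1                  the first MOV is removed
 *    add vgrf4, vgrf3, vgrf2
 */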
2242
2243 /**
2244 * Implements register coalescing: Checks if the two registers involved in a
2245 * raw move don't interfere, in which case they can both be stored in the same
2246 * place and the MOV removed.
2247 */
2248 bool
2249 fs_visitor::register_coalesce()
2250 {
2251 bool progress = false;
2252
2253 calculate_live_intervals();
2254
2255 foreach_list_safe(node, &this->instructions) {
2256 fs_inst *inst = (fs_inst *)node;
2257
2258 if (inst->opcode != BRW_OPCODE_MOV ||
2259 inst->is_partial_write() ||
2260 inst->saturate ||
2261 inst->src[0].file != GRF ||
2262 inst->src[0].negate ||
2263 inst->src[0].abs ||
2264 inst->src[0].smear != -1 ||
2265 inst->dst.file != GRF ||
2266 inst->dst.type != inst->src[0].type ||
2267 virtual_grf_sizes[inst->src[0].reg] != 1) {
2268 continue;
2269 }
2270
2271 int var_from = live_intervals->var_from_reg(&inst->src[0]);
2272 int var_to = live_intervals->var_from_reg(&inst->dst);
2273
2274 if (live_intervals->vars_interfere(var_from, var_to) &&
2275 !inst->dst.equals(inst->src[0]))
2276 continue;
2277
2278 int reg_from = inst->src[0].reg;
2279 assert(inst->src[0].reg_offset == 0);
2280 int reg_to = inst->dst.reg;
2281 int reg_to_offset = inst->dst.reg_offset;
2282
2283 foreach_list(node, &this->instructions) {
2284 fs_inst *scan_inst = (fs_inst *)node;
2285
2286 if (scan_inst->dst.file == GRF &&
2287 scan_inst->dst.reg == reg_from) {
2288 scan_inst->dst.reg = reg_to;
2289 scan_inst->dst.reg_offset = reg_to_offset;
2290 }
2291 for (int i = 0; i < 3; i++) {
2292 if (scan_inst->src[i].file == GRF &&
2293 scan_inst->src[i].reg == reg_from) {
2294 scan_inst->src[i].reg = reg_to;
2295 scan_inst->src[i].reg_offset = reg_to_offset;
2296 }
2297 }
2298 }
2299
2300 inst->remove();
2301 progress = true;
2302 continue;
2303 }
2304
2305 if (progress)
2306 invalidate_live_intervals();
2307
2308 return progress;
2309 }
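
/* Illustrative sketch (register numbers are made up): given a raw MOV whose
 * source and destination don't interfere, every def and use of the source
 * VGRF is retargeted to the destination and the MOV is removed:
 *
 *    add vgrf4, vgrf1, vgrf2             add vgrf7+2, vgrf1, vgrf2
 *    mov vgrf7+2, vgrf4            ->
 */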
2310
2311 bool
2312 fs_visitor::compute_to_mrf()
2313 {
2314 bool progress = false;
2315 int next_ip = 0;
2316
2317 calculate_live_intervals();
2318
2319 foreach_list_safe(node, &this->instructions) {
2320 fs_inst *inst = (fs_inst *)node;
2321
2322 int ip = next_ip;
2323 next_ip++;
2324
2325 if (inst->opcode != BRW_OPCODE_MOV ||
2326 inst->is_partial_write() ||
2327 inst->dst.file != MRF || inst->src[0].file != GRF ||
2328 inst->dst.type != inst->src[0].type ||
2329 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
2330 continue;
2331
2332 /* Work out which hardware MRF registers are written by this
2333 * instruction.
2334 */
2335 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
2336 int mrf_high;
2337 if (inst->dst.reg & BRW_MRF_COMPR4) {
2338 mrf_high = mrf_low + 4;
2339 } else if (dispatch_width == 16 &&
2340 (!inst->force_uncompressed && !inst->force_sechalf)) {
2341 mrf_high = mrf_low + 1;
2342 } else {
2343 mrf_high = mrf_low;
2344 }
2345
2346 /* Can't compute-to-MRF this GRF if someone else was going to
2347 * read it later.
2348 */
2349 if (this->virtual_grf_end[inst->src[0].reg] > ip)
2350 continue;
2351
2352 /* Found a move of a GRF to a MRF. Let's see if we can go
2353 * rewrite the thing that made this GRF to write into the MRF.
2354 */
2355 fs_inst *scan_inst;
2356 for (scan_inst = (fs_inst *)inst->prev;
2357 scan_inst->prev != NULL;
2358 scan_inst = (fs_inst *)scan_inst->prev) {
2359 if (scan_inst->dst.file == GRF &&
2360 scan_inst->dst.reg == inst->src[0].reg) {
2361 /* Found the last thing to write our reg we want to turn
2362 * into a compute-to-MRF.
2363 */
2364
2365 /* If this one instruction didn't populate all the
2366 * channels, bail. We might be able to rewrite everything
2367 * that writes that reg, but it would require smarter
2368 * tracking to delay the rewriting until complete success.
2369 */
2370 if (scan_inst->is_partial_write())
2371 break;
2372
2373 /* Things returning more than one register would need us to
2374 * understand coalescing out more than one MOV at a time.
2375 */
2376 if (scan_inst->regs_written > 1)
2377 break;
2378
2379 /* SEND instructions can't have MRF as a destination. */
2380 if (scan_inst->mlen)
2381 break;
2382
2383 if (brw->gen == 6) {
2384 /* gen6 math instructions must have the destination be
2385 * GRF, so no compute-to-MRF for them.
2386 */
2387 if (scan_inst->is_math()) {
2388 break;
2389 }
2390 }
2391
2392 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
2393 /* Found the creator of our MRF's source value. */
2394 scan_inst->dst.file = MRF;
2395 scan_inst->dst.reg = inst->dst.reg;
2396 scan_inst->saturate |= inst->saturate;
2397 inst->remove();
2398 progress = true;
2399 }
2400 break;
2401 }
2402
2403       /* We don't handle control flow here. Most computation of
2404        * values that end up in MRFs happens shortly before the MRF
2405        * write anyway.
2406 */
2407 if (scan_inst->is_control_flow() && scan_inst->opcode != BRW_OPCODE_IF)
2408 break;
2409
2410 /* You can't read from an MRF, so if someone else reads our
2411 * MRF's source GRF that we wanted to rewrite, that stops us.
2412 */
2413 bool interfered = false;
2414 for (int i = 0; i < 3; i++) {
2415 if (scan_inst->src[i].file == GRF &&
2416 scan_inst->src[i].reg == inst->src[0].reg &&
2417 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
2418 interfered = true;
2419 }
2420 }
2421 if (interfered)
2422 break;
2423
2424 if (scan_inst->dst.file == MRF) {
2425 /* If somebody else writes our MRF here, we can't
2426 * compute-to-MRF before that.
2427 */
2428 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
2429 int scan_mrf_high;
2430
2431 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
2432 scan_mrf_high = scan_mrf_low + 4;
2433 } else if (dispatch_width == 16 &&
2434 (!scan_inst->force_uncompressed &&
2435 !scan_inst->force_sechalf)) {
2436 scan_mrf_high = scan_mrf_low + 1;
2437 } else {
2438 scan_mrf_high = scan_mrf_low;
2439 }
2440
2441 if (mrf_low == scan_mrf_low ||
2442 mrf_low == scan_mrf_high ||
2443 mrf_high == scan_mrf_low ||
2444 mrf_high == scan_mrf_high) {
2445 break;
2446 }
2447 }
2448
2449 if (scan_inst->mlen > 0 && scan_inst->base_mrf != -1) {
2450 /* Found a SEND instruction, which means that there are
2451 * live values in MRFs from base_mrf to base_mrf +
2452 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2453 * above it.
2454 */
2455 if (mrf_low >= scan_inst->base_mrf &&
2456 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2457 break;
2458 }
2459 if (mrf_high >= scan_inst->base_mrf &&
2460 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2461 break;
2462 }
2463 }
2464 }
2465 }
2466
2467 if (progress)
2468 invalidate_live_intervals();
2469
2470 return progress;
2471 }
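
/* Illustrative sketch (register numbers are made up): when the GRF written
 * by the MOV below isn't read later, the producing instruction is retargeted
 * to write the MRF directly and the MOV disappears:
 *
 *    add vgrf4, vgrf1, vgrf2             add m3, vgrf1, vgrf2
 *    mov m3, vgrf4                 ->
 */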
2472
2473 /**
2474 * Walks through basic blocks, looking for repeated MRF writes and
2475 * removing the later ones.
2476 */
2477 bool
2478 fs_visitor::remove_duplicate_mrf_writes()
2479 {
2480 fs_inst *last_mrf_move[16];
2481 bool progress = false;
2482
2483 /* Need to update the MRF tracking for compressed instructions. */
2484 if (dispatch_width == 16)
2485 return false;
2486
2487 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2488
2489 foreach_list_safe(node, &this->instructions) {
2490 fs_inst *inst = (fs_inst *)node;
2491
2492 if (inst->is_control_flow()) {
2493 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2494 }
2495
2496 if (inst->opcode == BRW_OPCODE_MOV &&
2497 inst->dst.file == MRF) {
2498 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2499 if (prev_inst && inst->equals(prev_inst)) {
2500 inst->remove();
2501 progress = true;
2502 continue;
2503 }
2504 }
2505
2506 /* Clear out the last-write records for MRFs that were overwritten. */
2507 if (inst->dst.file == MRF) {
2508 last_mrf_move[inst->dst.reg] = NULL;
2509 }
2510
2511 if (inst->mlen > 0 && inst->base_mrf != -1) {
2512 /* Found a SEND instruction, which will include two or fewer
2513 * implied MRF writes. We could do better here.
2514 */
2515 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2516 last_mrf_move[inst->base_mrf + i] = NULL;
2517 }
2518 }
2519
2520 /* Clear out any MRF move records whose sources got overwritten. */
2521 if (inst->dst.file == GRF) {
2522 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2523 if (last_mrf_move[i] &&
2524 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2525 last_mrf_move[i] = NULL;
2526 }
2527 }
2528 }
2529
2530 if (inst->opcode == BRW_OPCODE_MOV &&
2531 inst->dst.file == MRF &&
2532 inst->src[0].file == GRF &&
2533 !inst->is_partial_write()) {
2534 last_mrf_move[inst->dst.reg] = inst;
2535 }
2536 }
2537
2538 if (progress)
2539 invalidate_live_intervals();
2540
2541 return progress;
2542 }
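
/* Illustrative sketch (register numbers are made up): within a basic block,
 * a repeated, identical MRF setup is dropped as long as neither the MRF nor
 * its source GRF was written in between and no control flow intervenes:
 *
 *    mov m2, vgrf5
 *    ...                          (no write to m2 or vgrf5)
 *    mov m2, vgrf5                <- removed
 */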
2543
2544 static void
2545 clear_deps_for_inst_src(fs_inst *inst, int dispatch_width, bool *deps,
2546 int first_grf, int grf_len)
2547 {
2548 bool inst_16wide = (dispatch_width > 8 &&
2549 !inst->force_uncompressed &&
2550 !inst->force_sechalf);
2551
2552 /* Clear the flag for registers that actually got read (as expected). */
2553 for (int i = 0; i < 3; i++) {
2554 int grf;
2555 if (inst->src[i].file == GRF) {
2556 grf = inst->src[i].reg;
2557 } else if (inst->src[i].file == HW_REG &&
2558 inst->src[i].fixed_hw_reg.file == BRW_GENERAL_REGISTER_FILE) {
2559 grf = inst->src[i].fixed_hw_reg.nr;
2560 } else {
2561 continue;
2562 }
2563
2564 if (grf >= first_grf &&
2565 grf < first_grf + grf_len) {
2566 deps[grf - first_grf] = false;
2567 if (inst_16wide)
2568 deps[grf - first_grf + 1] = false;
2569 }
2570 }
2571 }
2572
2573 /**
2574 * Implements this workaround for the original 965:
2575 *
2576 * "[DevBW, DevCL] Implementation Restrictions: As the hardware does not
2577 * check for post destination dependencies on this instruction, software
2578 * must ensure that there is no destination hazard for the case of ‘write
2579 * followed by a posted write’ shown in the following example.
2580 *
2581 * 1. mov r3 0
2582 * 2. send r3.xy <rest of send instruction>
2583 * 3. mov r2 r3
2584 *
2585 * Due to no post-destination dependency check on the ‘send’, the above
2586 * code sequence could have two instructions (1 and 2) in flight at the
2587  *    same time that both consider ‘r3’ as the target of their final writes."
2588 */
2589 void
2590 fs_visitor::insert_gen4_pre_send_dependency_workarounds(fs_inst *inst)
2591 {
2592 int reg_size = dispatch_width / 8;
2593 int write_len = inst->regs_written * reg_size;
2594 int first_write_grf = inst->dst.reg;
2595 bool needs_dep[BRW_MAX_MRF];
2596 assert(write_len < (int)sizeof(needs_dep) - 1);
2597
2598 memset(needs_dep, false, sizeof(needs_dep));
2599 memset(needs_dep, true, write_len);
2600
2601 clear_deps_for_inst_src(inst, dispatch_width,
2602 needs_dep, first_write_grf, write_len);
2603
2604 /* Walk backwards looking for writes to registers we're writing which
2605 * aren't read since being written. If we hit the start of the program,
2606 * we assume that there are no outstanding dependencies on entry to the
2607 * program.
2608 */
2609 for (fs_inst *scan_inst = (fs_inst *)inst->prev;
2610 scan_inst != NULL;
2611 scan_inst = (fs_inst *)scan_inst->prev) {
2612
2613 /* If we hit control flow, assume that there *are* outstanding
2614 * dependencies, and force their cleanup before our instruction.
2615 */
2616 if (scan_inst->is_control_flow()) {
2617 for (int i = 0; i < write_len; i++) {
2618 if (needs_dep[i]) {
2619 inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2620 }
2621 }
2622 return;
2623 }
2624
2625 bool scan_inst_16wide = (dispatch_width > 8 &&
2626 !scan_inst->force_uncompressed &&
2627 !scan_inst->force_sechalf);
2628
2629 /* We insert our reads as late as possible on the assumption that any
2630 * instruction but a MOV that might have left us an outstanding
2631 * dependency has more latency than a MOV.
2632 */
2633 if (scan_inst->dst.file == GRF) {
2634 for (int i = 0; i < scan_inst->regs_written; i++) {
2635 int reg = scan_inst->dst.reg + i * reg_size;
2636
2637 if (reg >= first_write_grf &&
2638 reg < first_write_grf + write_len &&
2639 needs_dep[reg - first_write_grf]) {
2640 inst->insert_before(DEP_RESOLVE_MOV(reg));
2641 needs_dep[reg - first_write_grf] = false;
2642 if (scan_inst_16wide)
2643 needs_dep[reg - first_write_grf + 1] = false;
2644 }
2645 }
2646 }
2647
2648 /* Clear the flag for registers that actually got read (as expected). */
2649 clear_deps_for_inst_src(scan_inst, dispatch_width,
2650 needs_dep, first_write_grf, write_len);
2651
2652 /* Continue the loop only if we haven't resolved all the dependencies */
2653 int i;
2654 for (i = 0; i < write_len; i++) {
2655 if (needs_dep[i])
2656 break;
2657 }
2658 if (i == write_len)
2659 return;
2660 }
2661 }
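
/* Illustrative sketch (register numbers are made up), mirroring the erratum
 * example above: a sequence like
 *
 *    mov  vgrf3, vgrf0
 *    send vgrf3, ...              (posted write with no destination hazard
 *                                  check)
 *
 * gets a dependency-resolving MOV inserted before the send, roughly
 *
 *    mov  vgrf3, vgrf0
 *    DEP_RESOLVE_MOV(vgrf3)       sources vgrf3 so the first write completes
 *    send vgrf3, ...
 */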
2662
2663 /**
2664 * Implements this workaround for the original 965:
2665 *
2666 * "[DevBW, DevCL] Errata: A destination register from a send can not be
2667 * used as a destination register until after it has been sourced by an
2668  *      instruction with a different destination register."
2669 */
2670 void
2671 fs_visitor::insert_gen4_post_send_dependency_workarounds(fs_inst *inst)
2672 {
2673 int write_len = inst->regs_written * dispatch_width / 8;
2674 int first_write_grf = inst->dst.reg;
2675 bool needs_dep[BRW_MAX_MRF];
2676 assert(write_len < (int)sizeof(needs_dep) - 1);
2677
2678 memset(needs_dep, false, sizeof(needs_dep));
2679 memset(needs_dep, true, write_len);
2680 /* Walk forwards looking for writes to registers we're writing which aren't
2681 * read before being written.
2682 */
2683 for (fs_inst *scan_inst = (fs_inst *)inst->next;
2684 !scan_inst->is_tail_sentinel();
2685 scan_inst = (fs_inst *)scan_inst->next) {
2686 /* If we hit control flow, force resolve all remaining dependencies. */
2687 if (scan_inst->is_control_flow()) {
2688 for (int i = 0; i < write_len; i++) {
2689 if (needs_dep[i])
2690 scan_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2691 }
2692 return;
2693 }
2694
2695 /* Clear the flag for registers that actually got read (as expected). */
2696 clear_deps_for_inst_src(scan_inst, dispatch_width,
2697 needs_dep, first_write_grf, write_len);
2698
2699 /* We insert our reads as late as possible since they're reading the
2700 * result of a SEND, which has massive latency.
2701 */
2702 if (scan_inst->dst.file == GRF &&
2703 scan_inst->dst.reg >= first_write_grf &&
2704 scan_inst->dst.reg < first_write_grf + write_len &&
2705 needs_dep[scan_inst->dst.reg - first_write_grf]) {
2706 scan_inst->insert_before(DEP_RESOLVE_MOV(scan_inst->dst.reg));
2707 needs_dep[scan_inst->dst.reg - first_write_grf] = false;
2708 }
2709
2710 /* Continue the loop only if we haven't resolved all the dependencies */
2711 int i;
2712 for (i = 0; i < write_len; i++) {
2713 if (needs_dep[i])
2714 break;
2715 }
2716 if (i == write_len)
2717 return;
2718 }
2719
2720 /* If we hit the end of the program, resolve all remaining dependencies out
2721 * of paranoia.
2722 */
2723 fs_inst *last_inst = (fs_inst *)this->instructions.get_tail();
2724 assert(last_inst->eot);
2725 for (int i = 0; i < write_len; i++) {
2726 if (needs_dep[i])
2727 last_inst->insert_before(DEP_RESOLVE_MOV(first_write_grf + i));
2728 }
2729 }
2730
2731 void
2732 fs_visitor::insert_gen4_send_dependency_workarounds()
2733 {
2734 if (brw->gen != 4 || brw->is_g4x)
2735 return;
2736
2737 /* Note that we're done with register allocation, so GRF fs_regs always
2738 * have a .reg_offset of 0.
2739 */
2740
2741 foreach_list_safe(node, &this->instructions) {
2742 fs_inst *inst = (fs_inst *)node;
2743
2744 if (inst->mlen != 0 && inst->dst.file == GRF) {
2745 insert_gen4_pre_send_dependency_workarounds(inst);
2746 insert_gen4_post_send_dependency_workarounds(inst);
2747 }
2748 }
2749 }
2750
2751 /**
2752 * Turns the generic expression-style uniform pull constant load instruction
2753 * into a hardware-specific series of instructions for loading a pull
2754 * constant.
2755 *
2756 * The expression style allows the CSE pass before this to optimize out
2757 * repeated loads from the same offset, and gives the pre-register-allocation
2758 * scheduling full flexibility, while the conversion to native instructions
2759 * allows the post-register-allocation scheduler the best information
2760 * possible.
2761 *
2762 * Note that execution masking for setting up pull constant loads is special:
2763 * the channels that need to be written are unrelated to the current execution
2764 * mask, since a later instruction will use one of the result channels as a
2765 * source operand for all 8 or 16 of its channels.
2766 */
2767 void
2768 fs_visitor::lower_uniform_pull_constant_loads()
2769 {
2770 foreach_list(node, &this->instructions) {
2771 fs_inst *inst = (fs_inst *)node;
2772
2773 if (inst->opcode != FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD)
2774 continue;
2775
2776 if (brw->gen >= 7) {
2777 /* The offset arg before was a vec4-aligned byte offset. We need to
2778 * turn it into a dword offset.
2779 */
2780 fs_reg const_offset_reg = inst->src[1];
2781 assert(const_offset_reg.file == IMM &&
2782 const_offset_reg.type == BRW_REGISTER_TYPE_UD);
2783 const_offset_reg.imm.u /= 4;
2784 fs_reg payload = fs_reg(this, glsl_type::uint_type);
2785
2786 /* This is actually going to be a MOV, but since only the first dword
2787 * is accessed, we have a special opcode to do just that one. Note
2788 * that this needs to be an operation that will be considered a def
2789 * by live variable analysis, or register allocation will explode.
2790 */
2791 fs_inst *setup = new(mem_ctx) fs_inst(FS_OPCODE_SET_SIMD4X2_OFFSET,
2792 payload, const_offset_reg);
2793 setup->force_writemask_all = true;
2794
2795 setup->ir = inst->ir;
2796 setup->annotation = inst->annotation;
2797 inst->insert_before(setup);
2798
2799 /* Similarly, this will only populate the first 4 channels of the
2800 * result register (since we only use smear values from 0-3), but we
2801 * don't tell the optimizer.
2802 */
2803 inst->opcode = FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7;
2804 inst->src[1] = payload;
2805
2806 invalidate_live_intervals();
2807 } else {
2808 /* Before register allocation, we didn't tell the scheduler about the
2809 * MRF we use. We know it's safe to use this MRF because nothing
2810 * else does except for register spill/unspill, which generates and
2811 * uses its MRF within a single IR instruction.
2812 */
2813 inst->base_mrf = 14;
2814 inst->mlen = 1;
2815 }
2816 }
2817 }
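
/* Illustrative sketch (register numbers and offsets are made up): on gen7 a
 * load such as
 *
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD vgrf7, <pull surface>, 48u
 *
 * (48 being a vec4-aligned byte offset) becomes, roughly,
 *
 *    FS_OPCODE_SET_SIMD4X2_OFFSET vgrf8, 12u        (48 bytes -> dword 12)
 *    FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD_GEN7 vgrf7, <pull surface>, vgrf8
 *
 * while on earlier generations the original opcode is kept and just has
 * base_mrf/mlen filled in so the generator can build the message from MRF 14.
 */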
2818
2819 void
2820 fs_visitor::dump_instruction(backend_instruction *be_inst)
2821 {
2822 fs_inst *inst = (fs_inst *)be_inst;
2823
2824 if (inst->predicate) {
2825 printf("(%cf0.%d) ",
2826 inst->predicate_inverse ? '-' : '+',
2827 inst->flag_subreg);
2828 }
2829
2830 printf("%s", brw_instruction_name(inst->opcode));
2831 if (inst->saturate)
2832 printf(".sat");
2833 if (inst->conditional_mod) {
2834 printf(".cmod");
2835 if (!inst->predicate &&
2836 (brw->gen < 5 || (inst->opcode != BRW_OPCODE_SEL &&
2837 inst->opcode != BRW_OPCODE_IF &&
2838 inst->opcode != BRW_OPCODE_WHILE))) {
2839 printf(".f0.%d", inst->flag_subreg);
2840 }
2841 }
2842 printf(" ");
2843
2844
2845 switch (inst->dst.file) {
2846 case GRF:
2847 printf("vgrf%d", inst->dst.reg);
2848 if (inst->dst.reg_offset)
2849 printf("+%d", inst->dst.reg_offset);
2850 break;
2851 case MRF:
2852 printf("m%d", inst->dst.reg);
2853 break;
2854 case BAD_FILE:
2855 printf("(null)");
2856 break;
2857 case UNIFORM:
2858 printf("***u%d***", inst->dst.reg);
2859 break;
2860 case HW_REG:
2861 printf("hw_reg%d", inst->dst.fixed_hw_reg.nr);
2862 if (inst->dst.fixed_hw_reg.subnr)
2863 printf("+%d", inst->dst.fixed_hw_reg.subnr);
2864 break;
2865 default:
2866 printf("???");
2867 break;
2868 }
2869 printf(", ");
2870
2871 for (int i = 0; i < 3 && inst->src[i].file != BAD_FILE; i++) {
2872 if (inst->src[i].negate)
2873 printf("-");
2874 if (inst->src[i].abs)
2875 printf("|");
2876 switch (inst->src[i].file) {
2877 case GRF:
2878 printf("vgrf%d", inst->src[i].reg);
2879 if (inst->src[i].reg_offset)
2880 printf("+%d", inst->src[i].reg_offset);
2881 break;
2882 case MRF:
2883 printf("***m%d***", inst->src[i].reg);
2884 break;
2885 case UNIFORM:
2886 printf("u%d", inst->src[i].reg);
2887 if (inst->src[i].reg_offset)
2888 printf(".%d", inst->src[i].reg_offset);
2889 break;
2890 case BAD_FILE:
2891 printf("(null)");
2892 break;
2893 case IMM:
2894 switch (inst->src[i].type) {
2895 case BRW_REGISTER_TYPE_F:
2896 printf("%ff", inst->src[i].imm.f);
2897 break;
2898 case BRW_REGISTER_TYPE_D:
2899 printf("%dd", inst->src[i].imm.i);
2900 break;
2901 case BRW_REGISTER_TYPE_UD:
2902 printf("%uu", inst->src[i].imm.u);
2903 break;
2904 default:
2905 printf("???");
2906 break;
2907 }
2908 break;
2909 case HW_REG:
2910 if (inst->src[i].fixed_hw_reg.negate)
2911 printf("-");
2912 if (inst->src[i].fixed_hw_reg.abs)
2913 printf("|");
2914 printf("hw_reg%d", inst->src[i].fixed_hw_reg.nr);
2915 if (inst->src[i].fixed_hw_reg.subnr)
2916 printf("+%d", inst->src[i].fixed_hw_reg.subnr);
2917 if (inst->src[i].fixed_hw_reg.abs)
2918 printf("|");
2919 break;
2920 default:
2921 printf("???");
2922 break;
2923 }
2924 if (inst->src[i].abs)
2925 printf("|");
2926
2927 if (i < 2 && inst->src[i + 1].file != BAD_FILE)
2928 printf(", ");
2929 }
2930
2931 printf(" ");
2932
2933 if (inst->force_uncompressed)
2934 printf("1sthalf ");
2935
2936 if (inst->force_sechalf)
2937 printf("2ndhalf ");
2938
2939 printf("\n");
2940 }
2941
2942 /**
2943 * Possibly returns an instruction that set up @param reg.
2944 *
2945 * Sometimes we want to take the result of some expression/variable
2946 * dereference tree and rewrite the instruction generating the result
2947 * of the tree. When processing the tree, we know that the
2948 * instructions generated are all writing temporaries that are dead
2949 * outside of this tree. So, if we have some instructions that write
2950 * a temporary, we're free to point that temp write somewhere else.
2951 *
2952  * Note that this doesn't guarantee that the returned instruction wrote
2953  * only reg -- it might be the size=4 destination of a texture instruction.
2954 */
2955 fs_inst *
2956 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2957 fs_inst *end,
2958 fs_reg reg)
2959 {
2960 if (end == start ||
2961 end->is_partial_write() ||
2962 reg.reladdr ||
2963 !reg.equals(end->dst)) {
2964 return NULL;
2965 } else {
2966 return end;
2967 }
2968 }
2969
2970 void
2971 fs_visitor::setup_payload_gen6()
2972 {
2973 bool uses_depth =
2974 (fp->Base.InputsRead & (1 << VARYING_SLOT_POS)) != 0;
2975 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2976
2977 assert(brw->gen >= 6);
2978
2979 /* R0-1: masks, pixel X/Y coordinates. */
2980 c->nr_payload_regs = 2;
2981    /* R2: only for 32-pixel dispatch. */
2982
2983 /* R3-26: barycentric interpolation coordinates. These appear in the
2984 * same order that they appear in the brw_wm_barycentric_interp_mode
2985 * enum. Each set of coordinates occupies 2 registers if dispatch width
2986 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2987 * appear if they were enabled using the "Barycentric Interpolation
2988 * Mode" bits in WM_STATE.
2989 */
2990 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2991 if (barycentric_interp_modes & (1 << i)) {
2992 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2993 c->nr_payload_regs += 2;
2994 if (dispatch_width == 16) {
2995 c->nr_payload_regs += 2;
2996 }
2997 }
2998 }
2999
3000 /* R27: interpolated depth if uses source depth */
3001 if (uses_depth) {
3002 c->source_depth_reg = c->nr_payload_regs;
3003 c->nr_payload_regs++;
3004 if (dispatch_width == 16) {
3005 /* R28: interpolated depth if not 8-wide. */
3006 c->nr_payload_regs++;
3007 }
3008 }
3009 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
3010 if (uses_depth) {
3011 c->source_w_reg = c->nr_payload_regs;
3012 c->nr_payload_regs++;
3013 if (dispatch_width == 16) {
3014 /* R30: interpolated W if not 8-wide. */
3015 c->nr_payload_regs++;
3016 }
3017 }
3018
3019 c->prog_data.uses_pos_offset = c->key.compute_pos_offset;
3020 /* R31: MSAA position offsets. */
3021 if (c->prog_data.uses_pos_offset) {
3022 c->sample_pos_reg = c->nr_payload_regs;
3023 c->nr_payload_regs++;
3024 }
3025
3026 /* R32-: bary for 32-pixel. */
3027 /* R58-59: interp W for 32-pixel. */
3028
3029 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
3030 c->source_depth_to_render_target = true;
3031 }
3032 }
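
/* Illustrative example (assumed configuration, not a real shader): for an
 * 8-wide dispatch with a single barycentric interpolation mode enabled,
 * source depth in use and no position offsets, the payload set up above is
 *
 *    r0-r1   masks and pixel X/Y coordinates
 *    r2-r3   barycentric coordinates for the one enabled mode
 *    r4      interpolated source depth
 *    r5      interpolated source W
 *
 * for a total of nr_payload_regs == 6.
 */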
3033
3034 void
3035 fs_visitor::assign_binding_table_offsets()
3036 {
3037 uint32_t next_binding_table_offset = 0;
3038
3039 /* If there are no color regions, we still perform an FB write to a null
3040 * renderbuffer, which we place at surface index 0.
3041 */
3042 c->prog_data.binding_table.render_target_start = next_binding_table_offset;
3043 next_binding_table_offset += MAX2(c->key.nr_color_regions, 1);
3044
3045 assign_common_binding_table_offsets(next_binding_table_offset);
3046 }
3047
3048 bool
3049 fs_visitor::run()
3050 {
3051 sanity_param_count = fp->Base.Parameters->NumParameters;
3052 uint32_t orig_nr_params = c->prog_data.nr_params;
3053 bool allocated_without_spills;
3054
3055 assign_binding_table_offsets();
3056
3057 if (brw->gen >= 6)
3058 setup_payload_gen6();
3059 else
3060 setup_payload_gen4();
3061
3062 if (0) {
3063 emit_dummy_fs();
3064 } else {
3065 if (INTEL_DEBUG & DEBUG_SHADER_TIME)
3066 emit_shader_time_begin();
3067
3068 calculate_urb_setup();
3069 if (fp->Base.InputsRead > 0) {
3070 if (brw->gen < 6)
3071 emit_interpolation_setup_gen4();
3072 else
3073 emit_interpolation_setup_gen6();
3074 }
3075
3076 /* We handle discards by keeping track of the still-live pixels in f0.1.
3077 * Initialize it with the dispatched pixels.
3078 */
3079 if (fp->UsesKill || c->key.alpha_test_func) {
3080 fs_inst *discard_init = emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS);
3081 discard_init->flag_subreg = 1;
3082 }
3083
3084 /* Generate FS IR for main(). (the visitor only descends into
3085 * functions called "main").
3086 */
3087 if (shader) {
3088 foreach_list(node, &*shader->ir) {
3089 ir_instruction *ir = (ir_instruction *)node;
3090 base_ir = ir;
3091 this->result = reg_undef;
3092 ir->accept(this);
3093 }
3094 } else {
3095 emit_fragment_program_code();
3096 }
3097 base_ir = NULL;
3098 if (failed)
3099 return false;
3100
3101 emit(FS_OPCODE_PLACEHOLDER_HALT);
3102
3103 if (c->key.alpha_test_func)
3104 emit_alpha_test();
3105
3106 emit_fb_writes();
3107
3108 split_virtual_grfs();
3109
3110 move_uniform_array_access_to_pull_constants();
3111 remove_dead_constants();
3112 setup_pull_constants();
3113
3114 bool progress;
3115 do {
3116 progress = false;
3117
3118 compact_virtual_grfs();
3119
3120 progress = remove_duplicate_mrf_writes() || progress;
3121
3122 progress = opt_algebraic() || progress;
3123 progress = opt_cse() || progress;
3124 progress = opt_copy_propagate() || progress;
3125 progress = dead_code_eliminate() || progress;
3126 progress = dead_code_eliminate_local() || progress;
3127 progress = dead_control_flow_eliminate(this) || progress;
3128 progress = register_coalesce() || progress;
3129 progress = compute_to_mrf() || progress;
3130 } while (progress);
3131
3132 lower_uniform_pull_constant_loads();
3133
3134 assign_curb_setup();
3135 assign_urb_setup();
3136
3137 static enum instruction_scheduler_mode pre_modes[] = {
3138 SCHEDULE_PRE,
3139 SCHEDULE_PRE_NON_LIFO,
3140 SCHEDULE_PRE_LIFO,
3141 };
3142
3143 /* Try each scheduling heuristic to see if it can successfully register
3144 * allocate without spilling. They should be ordered by decreasing
3145 * performance but increasing likelihood of allocating.
3146 */
3147 for (unsigned i = 0; i < ARRAY_SIZE(pre_modes); i++) {
3148 schedule_instructions(pre_modes[i]);
3149
3150 if (0) {
3151 assign_regs_trivial();
3152 allocated_without_spills = true;
3153 } else {
3154 allocated_without_spills = assign_regs(false);
3155 }
3156 if (allocated_without_spills)
3157 break;
3158 }
3159
3160 if (!allocated_without_spills) {
3161 /* We assume that any spilling is worse than just dropping back to
3162          * SIMD8. There's probably some intermediate point where
3163 * SIMD16 with a couple of spills is still better.
3164 */
3165 if (dispatch_width == 16) {
3166 fail("Failure to register allocate. Reduce number of "
3167 "live scalar values to avoid this.");
3168 }
3169
3170 /* Since we're out of heuristics, just go spill registers until we
3171 * get an allocation.
3172 */
3173 while (!assign_regs(true)) {
3174 if (failed)
3175 break;
3176 }
3177 }
3178 }
3179 assert(force_uncompressed_stack == 0);
3180
3181 /* This must come after all optimization and register allocation, since
3182 * it inserts dead code that happens to have side effects, and it does
3183 * so based on the actual physical registers in use.
3184 */
3185 insert_gen4_send_dependency_workarounds();
3186
3187 if (failed)
3188 return false;
3189
3190 if (!allocated_without_spills)
3191 schedule_instructions(SCHEDULE_POST);
3192
3193 if (dispatch_width == 8) {
3194 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
3195 } else {
3196 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
3197
3198 /* Make sure we didn't try to sneak in an extra uniform */
3199 assert(orig_nr_params == c->prog_data.nr_params);
3200 (void) orig_nr_params;
3201 }
3202
3203 /* If any state parameters were appended, then ParameterValues could have
3204 * been realloced, in which case the driver uniform storage set up by
3205 * _mesa_associate_uniform_storage() would point to freed memory. Make
3206 * sure that didn't happen.
3207 */
3208 assert(sanity_param_count == fp->Base.Parameters->NumParameters);
3209
3210 return !failed;
3211 }
3212
3213 const unsigned *
3214 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
3215 struct gl_fragment_program *fp,
3216 struct gl_shader_program *prog,
3217 unsigned *final_assembly_size)
3218 {
3219 bool start_busy = false;
3220 float start_time = 0;
3221
3222 if (unlikely(brw->perf_debug)) {
3223 start_busy = (brw->batch.last_bo &&
3224 drm_intel_bo_busy(brw->batch.last_bo));
3225 start_time = get_time();
3226 }
3227
3228 struct brw_shader *shader = NULL;
3229 if (prog)
3230 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
3231
3232 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
3233 if (prog) {
3234 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
3235 _mesa_print_ir(shader->ir, NULL);
3236 printf("\n\n");
3237 } else {
3238 printf("ARB_fragment_program %d ir for native fragment shader\n",
3239 fp->Base.Id);
3240 _mesa_print_program(&fp->Base);
3241 }
3242 }
3243
3244 /* Now the main event: Visit the shader IR and generate our FS IR for it.
3245 */
3246 fs_visitor v(brw, c, prog, fp, 8);
3247 if (!v.run()) {
3248 if (prog) {
3249 prog->LinkStatus = false;
3250 ralloc_strcat(&prog->InfoLog, v.fail_msg);
3251 }
3252
3253 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
3254 v.fail_msg);
3255
3256 return NULL;
3257 }
3258
3259 exec_list *simd16_instructions = NULL;
3260 fs_visitor v2(brw, c, prog, fp, 16);
3261 if (brw->gen >= 5 && likely(!(INTEL_DEBUG & DEBUG_NO16))) {
3262 if (c->prog_data.nr_pull_params == 0) {
3263 /* Try a 16-wide compile */
3264 v2.import_uniforms(&v);
3265 if (!v2.run()) {
3266 perf_debug("16-wide shader failed to compile, falling back to "
3267 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
3268 } else {
3269 simd16_instructions = &v2.instructions;
3270 }
3271 } else {
3272 perf_debug("Skipping 16-wide due to pull parameters.\n");
3273 }
3274 }
3275
3276 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
3277 const unsigned *generated = g.generate_assembly(&v.instructions,
3278 simd16_instructions,
3279 final_assembly_size);
3280
3281 if (unlikely(brw->perf_debug) && shader) {
3282 if (shader->compiled_once)
3283 brw_wm_debug_recompile(brw, prog, &c->key);
3284 shader->compiled_once = true;
3285
3286 if (start_busy && !drm_intel_bo_busy(brw->batch.last_bo)) {
3287 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
3288 (get_time() - start_time) * 1000);
3289 }
3290 }
3291
3292 return generated;
3293 }
3294
3295 bool
3296 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
3297 {
3298 struct brw_context *brw = brw_context(ctx);
3299 struct brw_wm_prog_key key;
3300
3301 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
3302 return true;
3303
3304 struct gl_fragment_program *fp = (struct gl_fragment_program *)
3305 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
3306 struct brw_fragment_program *bfp = brw_fragment_program(fp);
3307 bool program_uses_dfdy = fp->UsesDFdy;
3308
3309 memset(&key, 0, sizeof(key));
3310
3311 if (brw->gen < 6) {
3312 if (fp->UsesKill)
3313 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
3314
3315 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
3316 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
3317
3318 /* Just assume depth testing. */
3319 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
3320 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
3321 }
3322
3323 if (brw->gen < 6 || _mesa_bitcount_64(fp->Base.InputsRead &
3324 BRW_FS_VARYING_INPUT_MASK) > 16)
3325 key.input_slots_valid = fp->Base.InputsRead | VARYING_BIT_POS;
3326
3327 key.clamp_fragment_color = ctx->API == API_OPENGL_COMPAT;
3328
3329 unsigned sampler_count = _mesa_fls(fp->Base.SamplersUsed);
3330 for (unsigned i = 0; i < sampler_count; i++) {
3331 if (fp->Base.ShadowSamplers & (1 << i)) {
3332 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
3333 key.tex.swizzles[i] =
3334 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
3335 } else {
3336 /* Color sampler: assume no swizzling. */
3337 key.tex.swizzles[i] = SWIZZLE_XYZW;
3338 }
3339 }
3340
3341 if (fp->Base.InputsRead & VARYING_BIT_POS) {
3342 key.drawable_height = ctx->DrawBuffer->Height;
3343 }
3344
3345 if ((fp->Base.InputsRead & VARYING_BIT_POS) || program_uses_dfdy) {
3346 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
3347 }
3348
3349 key.nr_color_regions = 1;
3350
3351 /* GL_FRAGMENT_SHADER_DERIVATIVE_HINT is almost always GL_DONT_CARE. The
3352 * quality of the derivatives is likely to be determined by the driconf
3353 * option.
3354 */
3355 key.high_quality_derivatives = brw->disable_derivative_optimization;
3356
3357 key.program_string_id = bfp->id;
3358
3359 uint32_t old_prog_offset = brw->wm.base.prog_offset;
3360 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
3361
3362 bool success = do_wm_prog(brw, prog, bfp, &key);
3363
3364 brw->wm.base.prog_offset = old_prog_offset;
3365 brw->wm.prog_data = old_prog_data;
3366
3367 return success;
3368 }