i965/fs: Restrict optimization that would fail for gen7's SENDs from GRFs
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
void
fs_inst::init()
{
   /* Zero every field, then set the few whose defaults are non-zero.
    * NOTE(review): memset on `this` assumes fs_inst stays trivially
    * copyable (no vtable, no non-trivial members).
    */
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}
64
/** Default constructor: a NOP with undefined destination and sources. */
fs_inst::fs_inst()
{
   init();
}
69
/** Opcode-only constructor; destination and sources stay undefined. */
fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}
75
/** Zero-source instruction constructor; sanity-checks GRF offsets. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}
85
/** One-source instruction constructor; sanity-checks GRF offsets. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}
98
/** Two-source instruction constructor; sanity-checks GRF offsets. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}
114
/** Three-source instruction constructor; sanity-checks GRF offsets. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}
134
/* Helpers that stamp out the trivial one- and two-source ALU builder
 * methods on fs_visitor (e.g. fs_visitor::ADD builds a BRW_OPCODE_ADD).
 * Each returns a new fs_inst allocated out of the visitor's mem_ctx;
 * the caller still has to emit() it into the instruction stream.
 */
#define ALU1(op)                                                        \
fs_inst *                                                               \
fs_visitor::op(fs_reg dst, fs_reg src0)                                 \
{                                                                       \
   return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);             \
}

#define ALU2(op)                                                        \
fs_inst *                                                               \
fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                    \
{                                                                       \
   return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);       \
}

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
164
/** Gen4 predicated IF: branches on an already-computed flag predicate. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}
173
/** Gen6+ IF with embedded comparison: compares src0/src1 and branches
 * in a single instruction (null destination, conditional mod set).
 */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}
184
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter. gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   /* Work around negate modifiers on unsigned sources before comparing;
    * see resolve_ud_negate() for the details.
    */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
221
222 bool
223 fs_inst::equals(fs_inst *inst)
224 {
225 return (opcode == inst->opcode &&
226 dst.equals(inst->dst) &&
227 src[0].equals(inst->src[0]) &&
228 src[1].equals(inst->src[1]) &&
229 src[2].equals(inst->src[2]) &&
230 saturate == inst->saturate &&
231 predicate == inst->predicate &&
232 conditional_mod == inst->conditional_mod &&
233 mlen == inst->mlen &&
234 base_mrf == inst->base_mrf &&
235 sampler == inst->sampler &&
236 target == inst->target &&
237 eot == inst->eot &&
238 header_present == inst->header_present &&
239 shadow_compare == inst->shadow_compare &&
240 offset == inst->offset);
241 }
242
243 int
244 fs_inst::regs_written()
245 {
246 if (is_tex())
247 return 4;
248
249 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
250 * but we don't currently use them...nor do we have an opcode for them.
251 */
252
253 return 1;
254 }
255
/**
 * Returns true if \p reg falls inside the range of registers written by
 * this instruction's destination (same file/number, offset within
 * [dst.reg_offset, dst.reg_offset + regs_written())).
 */
bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}
264
265 bool
266 fs_inst::is_tex()
267 {
268 return (opcode == SHADER_OPCODE_TEX ||
269 opcode == FS_OPCODE_TXB ||
270 opcode == SHADER_OPCODE_TXD ||
271 opcode == SHADER_OPCODE_TXF ||
272 opcode == SHADER_OPCODE_TXL ||
273 opcode == SHADER_OPCODE_TXS);
274 }
275
276 bool
277 fs_inst::is_math()
278 {
279 return (opcode == SHADER_OPCODE_RCP ||
280 opcode == SHADER_OPCODE_RSQ ||
281 opcode == SHADER_OPCODE_SQRT ||
282 opcode == SHADER_OPCODE_EXP2 ||
283 opcode == SHADER_OPCODE_LOG2 ||
284 opcode == SHADER_OPCODE_SIN ||
285 opcode == SHADER_OPCODE_COS ||
286 opcode == SHADER_OPCODE_INT_QUOTIENT ||
287 opcode == SHADER_OPCODE_INT_REMAINDER ||
288 opcode == SHADER_OPCODE_POW);
289 }
290
/**
 * Returns true for SEND messages whose payload is sourced directly from
 * the GRF (gen7+) rather than from MRFs.  Currently only the gen7
 * varying-index pull constant load works that way.
 */
bool
fs_inst::is_send_from_grf()
{
   return opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
}
296
/**
 * Returns whether \p inst may carry source modifiers (negate/abs) on its
 * operands.
 */
bool
fs_visitor::can_do_source_mods(fs_inst *inst)
{
   /* Gen6 math instructions ignore source modifiers. */
   if (intel->gen == 6 && inst->is_math())
      return false;

   /* SEND-from-GRF sources are raw message payload, so modifiers would
    * not be applied.
    */
   if (inst->is_send_from_grf())
      return false;

   return true;
}
308
void
fs_reg::init()
{
   /* Zero everything; smear == -1 means "no single channel selected". */
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}
315
/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}
322
/** Immediate value constructor (32-bit float). */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}
331
/** Immediate value constructor (signed 32-bit integer). */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}
340
/** Immediate value constructor (unsigned 32-bit integer). */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}
349
/** Fixed brw_reg constructor: wraps a hardware register description. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}
358
359 bool
360 fs_reg::equals(const fs_reg &r) const
361 {
362 return (file == r.file &&
363 reg == r.reg &&
364 reg_offset == r.reg_offset &&
365 type == r.type &&
366 negate == r.negate &&
367 abs == r.abs &&
368 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
369 sizeof(fixed_hw_reg)) == 0 &&
370 smear == r.smear &&
371 imm.u == r.imm.u);
372 }
373
374 bool
375 fs_reg::is_zero() const
376 {
377 if (file != IMM)
378 return false;
379
380 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
381 }
382
383 bool
384 fs_reg::is_one() const
385 {
386 if (file != IMM)
387 return false;
388
389 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
390 }
391
/**
 * Returns the storage size of a GLSL type, measured in scalar components
 * (arrays and structs are the sum of their elements/fields).
 */
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}
421
/**
 * Mark this compile as failed, recording a printf-style reason.
 *
 * Only the first failure is recorded; subsequent calls are no-ops.  The
 * message is ralloc'd off mem_ctx (freed with the context) and echoed to
 * stderr when DEBUG_WM is enabled.
 */
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}
444
/** Convenience wrapper: build a zero-operand fs_inst and emit it. */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}
450
/** Convenience wrapper: build a destination-only fs_inst and emit it. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}
456
/** Convenience wrapper: build a one-source fs_inst and emit it. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}
462
/** Convenience wrapper: build a two-source fs_inst and emit it. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}
468
/** Convenience wrapper: build a three-source fs_inst and emit it. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}
475
/* Nesting counter; while non-zero, emitted instructions presumably get
 * forced to uncompressed (8-wide) execution — the consumer of this
 * counter is not visible in this chunk.
 */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}
481
/* Unwind one level of push_force_uncompressed(); must stay balanced. */
void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}
488
/* Nesting counter; while non-zero, emitted instructions presumably target
 * the second half of a SIMD16 pair — consumer not visible in this chunk.
 */
void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}
494
/* Unwind one level of push_force_sechalf(); must stay balanced. */
void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}
501
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* One-operand math: one message reg per 8 channels. */
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Two-operand math: two message regs per 8 channels. */
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
      /* Only the optional message header occupies an MRF. */
      return inst->header_present;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
548
/**
 * Allocates a new virtual GRF of \p size registers and returns its
 * register number, growing the virtual_grf_sizes array (geometric
 * doubling) as needed.
 */
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}
563
/** Fixed HW reg constructor; the type defaults to float. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}
572
/** Fixed HW reg constructor with an explicit register type. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}
581
/** Automatic reg constructor: allocates a fresh virtual GRF sized for
 * the given GLSL type.
 */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}
592
/** Looks up the fs_reg backing an ir_variable, or NULL if none yet. */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}
598
/* hash_table_call_foreach callback for import_uniforms(): copies each
 * UNIFORM-file variable mapping from the source table into the table
 * passed as \p closure.  Non-uniform registers are skipped.
 */
void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}
612
/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions (and the params remap table)
 * from the 8-wide visitor \p v.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}
624
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 *
 * Recursively registers the parameter slots for a uniform starting at
 * location \p loc, filling param_index/param_offset for each scalar
 * component.  Returns the number of uniform locations consumed.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      /* Treat a matrix as matrix_columns consecutive column vectors. */
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      /* One param per vector element, all sharing the same location. */
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
682
683
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         /* A repeated swizzle marks the end of the unique components. */
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}
718
/**
 * Emits code computing the four gl_FragCoord components into a freshly
 * allocated register, one reg_offset per channel (x, y, z, w).
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   /* Y must be flipped when the shader's declared origin and the render
    * target orientation (window vs. FBO) disagree.
    */
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         /* Compute (height - 1) - y via negation plus constant offset. */
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z: gen6+ delivers source depth in the payload; earlier
    * gens interpolate it from the WPOS setup data.
    */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
766
767 fs_inst *
768 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
769 glsl_interp_qualifier interpolation_mode,
770 bool is_centroid)
771 {
772 brw_wm_barycentric_interp_mode barycoord_mode;
773 if (is_centroid) {
774 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
775 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
776 else
777 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
778 } else {
779 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
780 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
781 else
782 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
783 }
784 return emit(FS_OPCODE_LINTERP, attr,
785 this->delta_x[barycoord_mode],
786 this->delta_y[barycoord_mode], interp);
787 }
788
/**
 * Emits interpolation (or flat-shaded moves) for every element of a
 * fragment shader input, walking arrays/matrices and consuming one
 * urb_setup[] slot per vec4.  Slots with no incoming setup data are
 * skipped but still advance the register offset and location.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  /* Non-projected texcoord: the q component is constant 1. */
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit. Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     /* Pre-gen6 interpolation yields attr/w; multiply the
                      * pixel's w back in to undo the perspective divide.
                      */
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}
880
/**
 * Computes gl_FrontFacing into a freshly allocated register.
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      /* ASR by 15 replicates the sign-adjacent bit; NOT+AND reduces it to
       * a 0/1 front-facing value.
       */
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
904
/**
 * Emits a one-source math-unit operation, handling the per-generation
 * quirks (gen6 operand restrictions, pre-gen6 message setup).
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      /* Pre-gen6 math is a SEND message: one message reg per 8 channels. */
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}
947
/**
 * Emits a two-source math-unit operation (POW, INT_QUOTIENT,
 * INT_REMAINDER), handling the per-generation quirks: gen7 takes both
 * operands directly, gen6 needs problem operands expanded to temps, and
 * earlier gens send the second operand through an MRF message.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 7) {
      inst = emit(opcode, dst, src0, src1);
   } else if (intel->gen == 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src0.type;
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src1.type;
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       * ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* op1 goes in the second message register; op0 rides on the
       * instruction itself.
       */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
1009
/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 *
 * Only runs for the 8-wide compile; 16-wide reuses the 8-wide setup
 * (see import_uniforms()).
 */
void
fs_visitor::setup_paramvalues_refs()
{
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}
1028
/**
 * Records how many registers of push constants (CURB) the program reads
 * and rewrites all UNIFORM-file sources into the fixed payload GRFs
 * where those constants are delivered.
 */
void
fs_visitor::assign_curb_setup()
{
   /* Constants are pushed 8 floats per register. */
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}
1056
/**
 * Fills urb_setup[] with the slot each incoming fragment attribute lands
 * in (-1 for attributes with no setup data) and records the URB read
 * length.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      /* Gen6+: slots are assigned densely in InputsRead order. */
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}
1107
/**
 * Rewrites interpolation instructions' setup-register references to the
 * actual hardware registers, now that the payload/constant layout is
 * known, and records the first GRF free for allocation.
 */
void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}
1132
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
	 split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         /* reg_offset 0 keeps living in virtual GRF i itself; offsets
          * 1..size-1 move into fresh size-1 registers starting at
          * new_virtual_grf[i].  The assert below relies on
          * virtual_grf_alloc() handing out consecutive numbers.
          */
	 new_virtual_grf[i] = virtual_grf_alloc(1);
	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
	    int reg = virtual_grf_alloc(1);
	    assert(reg == new_virtual_grf[i] + j - 1);
	    (void) reg;
	 }
	 this->virtual_grf_sizes[i] = 1;
      }
   }

   /* Rewrite every reference with a non-zero reg_offset to the split-off
    * register now holding that component (offset 0 stays put).
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
	  split_grf[inst->dst.reg] &&
	  inst->dst.reg_offset != 0) {
	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
			  inst->dst.reg_offset - 1);
	 inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF &&
	     split_grf[inst->src[i].reg] &&
	     inst->src[i].reg_offset != 0) {
	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
				inst->src[i].reg_offset - 1);
	    inst->src[i].reg_offset = 0;
	 }
      }
   }
   /* Register numbering changed, so any computed intervals are stale. */
   this->live_intervals_valid = false;
}
1225
/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many.  An entry of
    * -1 means "unused"; 0 means "used, new index not yet assigned".
    */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   /* The literal list above must cover every delta_x/delta_y slot and
    * every color output; these asserts catch enum/limit size changes.
    */
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays: assign each live register the next dense
    * index, sliding its size (and intervals, when valid) down with it.
    * i >= new_index always holds, so these in-place moves are safe.
    */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}
1310
1311 bool
1312 fs_visitor::remove_dead_constants()
1313 {
1314 if (dispatch_width == 8) {
1315 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1316
1317 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1318 this->params_remap[i] = -1;
1319
1320 /* Find which params are still in use. */
1321 foreach_list(node, &this->instructions) {
1322 fs_inst *inst = (fs_inst *)node;
1323
1324 for (int i = 0; i < 3; i++) {
1325 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1326
1327 if (inst->src[i].file != UNIFORM)
1328 continue;
1329
1330 assert(constant_nr < (int)c->prog_data.nr_params);
1331
1332 /* For now, set this to non-negative. We'll give it the
1333 * actual new number in a moment, in order to keep the
1334 * register numbers nicely ordered.
1335 */
1336 this->params_remap[constant_nr] = 0;
1337 }
1338 }
1339
1340 /* Figure out what the new numbers for the params will be. At some
1341 * point when we're doing uniform array access, we're going to want
1342 * to keep the distinction between .reg and .reg_offset, but for
1343 * now we don't care.
1344 */
1345 unsigned int new_nr_params = 0;
1346 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1347 if (this->params_remap[i] != -1) {
1348 this->params_remap[i] = new_nr_params++;
1349 }
1350 }
1351
1352 /* Update the list of params to be uploaded to match our new numbering. */
1353 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1354 int remapped = this->params_remap[i];
1355
1356 if (remapped == -1)
1357 continue;
1358
1359 /* We've already done setup_paramvalues_refs() so no need to worry
1360 * about param_index and param_offset.
1361 */
1362 c->prog_data.param[remapped] = c->prog_data.param[i];
1363 }
1364
1365 c->prog_data.nr_params = new_nr_params;
1366 } else {
1367 /* This should have been generated in the 8-wide pass already. */
1368 assert(this->params_remap);
1369 }
1370
1371 /* Now do the renumbering of the shader to remove unused params. */
1372 foreach_list(node, &this->instructions) {
1373 fs_inst *inst = (fs_inst *)node;
1374
1375 for (int i = 0; i < 3; i++) {
1376 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1377
1378 if (inst->src[i].file != UNIFORM)
1379 continue;
1380
1381 assert(this->params_remap[constant_nr] != -1);
1382 inst->src[i].reg = this->params_remap[constant_nr];
1383 inst->src[i].reg_offset = 0;
1384 }
1385 }
1386
1387 return true;
1388 }
1389
/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   int pull_uniform_base = max_uniform_components;
   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (uniform_nr < pull_uniform_base)
            continue;

         /* Emit a constant-buffer load just before this instruction.
          * The byte offset is rounded down to a 16-byte boundary; the
          * word within that vec4 is selected via the smear below.
          */
         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(((uniform_nr -
                                             pull_uniform_base) * 4) & ~15));
         fs_inst *pull =
            new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
                                 dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         /* NOTE(review): base_mrf 14 is presumably an MRF kept free for
          * this message -- confirm against the message setup elsewhere.
          */
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         /* Rewrite this source to read the freshly pulled value. */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
      }
   }

   /* Move the demoted params over to the pull-param list. */
   for (int i = 0; i < pull_uniform_count; i++) {
      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
   }
   c->prog_data.nr_params -= pull_uniform_count;
   c->prog_data.nr_pull_params = pull_uniform_count;
}
1457
1458 bool
1459 fs_visitor::opt_algebraic()
1460 {
1461 bool progress = false;
1462
1463 foreach_list(node, &this->instructions) {
1464 fs_inst *inst = (fs_inst *)node;
1465
1466 switch (inst->opcode) {
1467 case BRW_OPCODE_MUL:
1468 if (inst->src[1].file != IMM)
1469 continue;
1470
1471 /* a * 1.0 = a */
1472 if (inst->src[1].is_one()) {
1473 inst->opcode = BRW_OPCODE_MOV;
1474 inst->src[1] = reg_undef;
1475 progress = true;
1476 break;
1477 }
1478
1479 /* a * 0.0 = 0.0 */
1480 if (inst->src[1].is_zero()) {
1481 inst->opcode = BRW_OPCODE_MOV;
1482 inst->src[0] = inst->src[1];
1483 inst->src[1] = reg_undef;
1484 progress = true;
1485 break;
1486 }
1487
1488 break;
1489 case BRW_OPCODE_ADD:
1490 if (inst->src[1].file != IMM)
1491 continue;
1492
1493 /* a + 0.0 = a */
1494 if (inst->src[1].is_zero()) {
1495 inst->opcode = BRW_OPCODE_MOV;
1496 inst->src[1] = reg_undef;
1497 progress = true;
1498 break;
1499 }
1500 break;
1501 default:
1502 break;
1503 }
1504 }
1505
1506 return progress;
1507 }
1508
1509 /**
1510 * Must be called after calculate_live_intervales() to remove unused
1511 * writes to registers -- register allocation will fail otherwise
1512 * because something deffed but not used won't be considered to
1513 * interfere with other regs.
1514 */
1515 bool
1516 fs_visitor::dead_code_eliminate()
1517 {
1518 bool progress = false;
1519 int pc = 0;
1520
1521 calculate_live_intervals();
1522
1523 foreach_list_safe(node, &this->instructions) {
1524 fs_inst *inst = (fs_inst *)node;
1525
1526 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1527 inst->remove();
1528 progress = true;
1529 }
1530
1531 pc++;
1532 }
1533
1534 if (progress)
1535 live_intervals_valid = false;
1536
1537 return progress;
1538 }
1539
/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Only a raw MOV between GRFs qualifies: no predication, saturate,
       * source modifiers, or type change, and the source must be a
       * single-register GRF whose live range doesn't overlap the
       * destination's.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicate ||
	  inst->saturate ||
	  inst->src[0].file != GRF ||
	  inst->src[0].negate ||
	  inst->src[0].abs ||
	  inst->src[0].smear != -1 ||
	  inst->dst.file != GRF ||
	  inst->dst.type != inst->src[0].type ||
	  virtual_grf_sizes[inst->src[0].reg] != 1 ||
	  virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
	 continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      /* Rewrite every reference to reg_from -- both writes and reads --
       * to use reg_to instead; since the ranges don't interfere the two
       * values can share storage.
       */
      foreach_list_safe(node, &this->instructions) {
	 fs_inst *scan_inst = (fs_inst *)node;

	 if (scan_inst->dst.file == GRF &&
	     scan_inst->dst.reg == reg_from) {
	    scan_inst->dst.reg = reg_to;
	    scan_inst->dst.reg_offset = reg_to_offset;
	 }
	 for (int i = 0; i < 3; i++) {
	    if (scan_inst->src[i].file == GRF &&
		scan_inst->src[i].reg == reg_from) {
	       scan_inst->src[i].reg = reg_to;
	       scan_inst->src[i].reg_offset = reg_to_offset;
	    }
	 }
      }

      /* The MOV itself is now a no-op (reg_to = reg_to); drop it.  The
       * intervals are stale from here on, but we keep scanning with the
       * conservative old data rather than recomputing per coalesce.
       */
      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}
1599
/**
 * Coalesces a raw MOV into its uses: later reads of the MOV's
 * destination are rewritten to read the MOV's source directly, and the
 * MOV removed, when no intervening write clobbers either register.
 */
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
	 loop_depth++;
	 break;
      case BRW_OPCODE_WHILE:
	 loop_depth--;
	 break;
      case BRW_OPCODE_IF:
	 if_depth++;
	 break;
      case BRW_OPCODE_ENDIF:
	 if_depth--;
	 break;
      default:
	 break;
      }
      if (loop_depth || if_depth)
	 continue;

      /* Candidate: an unpredicated, unsaturated MOV into a GRF from a
       * GRF or UNIFORM, with no type conversion.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
	  inst->predicate ||
	  inst->saturate ||
	  inst->dst.file != GRF || (inst->src[0].file != GRF &&
				    inst->src[0].file != UNIFORM)||
	  inst->dst.type != inst->src[0].type)
	 continue;

      /* A UNIFORM source counts here too, since it implies an unusual
       * register region that not every instruction can consume.
       */
      bool has_source_modifiers = (inst->src[0].abs ||
                                   inst->src[0].negate ||
                                   inst->src[0].file == UNIFORM);

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
	   !scan_inst->is_tail_sentinel();
	   scan_inst = (fs_inst *)scan_inst->next) {
	 if (scan_inst->dst.file == GRF) {
	    if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
	       interfered = true;
	       break;
	    }
	 }

	 /* The gen6 MATH instruction can't handle source modifiers or
	  * unusual register regions, so avoid coalescing those for
	  * now.  We should do something more specific.
	  */
	 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
	    interfered = true;
	    break;
	 }

	 /* The accumulator result appears to get used for the
	  * conditional modifier generation.  When negating a UD
	  * value, there is a 33rd bit generated for the sign in the
	  * accumulator value, so now you can't check, for example,
	  * equality with a 32-bit value.  See piglit fs-op-neg-uint.
	  */
	 if (scan_inst->conditional_mod &&
	     inst->src[0].negate &&
	     inst->src[0].type == BRW_REGISTER_TYPE_UD) {
	    interfered = true;
	    break;
	 }
      }
      if (interfered) {
	 continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
	   !scan_inst->is_tail_sentinel();
	   scan_inst = (fs_inst *)scan_inst->next) {
	 for (int i = 0; i < 3; i++) {
	    if (scan_inst->src[i].file == GRF &&
		scan_inst->src[i].reg == inst->dst.reg &&
		scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
	       fs_reg new_src = inst->src[0];
               /* Fold the use's own modifiers into the coalesced
                * source: abs overrides any negate on the source, and
                * negates compose by XOR.
                */
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
	       new_src.negate ^= scan_inst->src[i].negate;
	       scan_inst->src[i] = new_src;
	    }
	 }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1720
1721
1722 bool
1723 fs_visitor::compute_to_mrf()
1724 {
1725 bool progress = false;
1726 int next_ip = 0;
1727
1728 calculate_live_intervals();
1729
1730 foreach_list_safe(node, &this->instructions) {
1731 fs_inst *inst = (fs_inst *)node;
1732
1733 int ip = next_ip;
1734 next_ip++;
1735
1736 if (inst->opcode != BRW_OPCODE_MOV ||
1737 inst->predicate ||
1738 inst->dst.file != MRF || inst->src[0].file != GRF ||
1739 inst->dst.type != inst->src[0].type ||
1740 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1741 continue;
1742
1743 /* Work out which hardware MRF registers are written by this
1744 * instruction.
1745 */
1746 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1747 int mrf_high;
1748 if (inst->dst.reg & BRW_MRF_COMPR4) {
1749 mrf_high = mrf_low + 4;
1750 } else if (dispatch_width == 16 &&
1751 (!inst->force_uncompressed && !inst->force_sechalf)) {
1752 mrf_high = mrf_low + 1;
1753 } else {
1754 mrf_high = mrf_low;
1755 }
1756
1757 /* Can't compute-to-MRF this GRF if someone else was going to
1758 * read it later.
1759 */
1760 if (this->virtual_grf_use[inst->src[0].reg] > ip)
1761 continue;
1762
1763 /* Found a move of a GRF to a MRF. Let's see if we can go
1764 * rewrite the thing that made this GRF to write into the MRF.
1765 */
1766 fs_inst *scan_inst;
1767 for (scan_inst = (fs_inst *)inst->prev;
1768 scan_inst->prev != NULL;
1769 scan_inst = (fs_inst *)scan_inst->prev) {
1770 if (scan_inst->dst.file == GRF &&
1771 scan_inst->dst.reg == inst->src[0].reg) {
1772 /* Found the last thing to write our reg we want to turn
1773 * into a compute-to-MRF.
1774 */
1775
1776 /* SENDs can only write to GRFs, so no compute-to-MRF. */
1777 if (scan_inst->mlen) {
1778 break;
1779 }
1780
1781 /* If it's predicated, it (probably) didn't populate all
1782 * the channels. We might be able to rewrite everything
1783 * that writes that reg, but it would require smarter
1784 * tracking to delay the rewriting until complete success.
1785 */
1786 if (scan_inst->predicate)
1787 break;
1788
1789 /* If it's half of register setup and not the same half as
1790 * our MOV we're trying to remove, bail for now.
1791 */
1792 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1793 scan_inst->force_sechalf != inst->force_sechalf) {
1794 break;
1795 }
1796
1797 /* SEND instructions can't have MRF as a destination. */
1798 if (scan_inst->mlen)
1799 break;
1800
1801 if (intel->gen >= 6) {
1802 /* gen6 math instructions must have the destination be
1803 * GRF, so no compute-to-MRF for them.
1804 */
1805 if (scan_inst->is_math()) {
1806 break;
1807 }
1808 }
1809
1810 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1811 /* Found the creator of our MRF's source value. */
1812 scan_inst->dst.file = MRF;
1813 scan_inst->dst.reg = inst->dst.reg;
1814 scan_inst->saturate |= inst->saturate;
1815 inst->remove();
1816 progress = true;
1817 }
1818 break;
1819 }
1820
1821 /* We don't handle flow control here. Most computation of
1822 * values that end up in MRFs are shortly before the MRF
1823 * write anyway.
1824 */
1825 if (scan_inst->opcode == BRW_OPCODE_DO ||
1826 scan_inst->opcode == BRW_OPCODE_WHILE ||
1827 scan_inst->opcode == BRW_OPCODE_ELSE ||
1828 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1829 break;
1830 }
1831
1832 /* You can't read from an MRF, so if someone else reads our
1833 * MRF's source GRF that we wanted to rewrite, that stops us.
1834 */
1835 bool interfered = false;
1836 for (int i = 0; i < 3; i++) {
1837 if (scan_inst->src[i].file == GRF &&
1838 scan_inst->src[i].reg == inst->src[0].reg &&
1839 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1840 interfered = true;
1841 }
1842 }
1843 if (interfered)
1844 break;
1845
1846 if (scan_inst->dst.file == MRF) {
1847 /* If somebody else writes our MRF here, we can't
1848 * compute-to-MRF before that.
1849 */
1850 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1851 int scan_mrf_high;
1852
1853 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1854 scan_mrf_high = scan_mrf_low + 4;
1855 } else if (dispatch_width == 16 &&
1856 (!scan_inst->force_uncompressed &&
1857 !scan_inst->force_sechalf)) {
1858 scan_mrf_high = scan_mrf_low + 1;
1859 } else {
1860 scan_mrf_high = scan_mrf_low;
1861 }
1862
1863 if (mrf_low == scan_mrf_low ||
1864 mrf_low == scan_mrf_high ||
1865 mrf_high == scan_mrf_low ||
1866 mrf_high == scan_mrf_high) {
1867 break;
1868 }
1869 }
1870
1871 if (scan_inst->mlen > 0) {
1872 /* Found a SEND instruction, which means that there are
1873 * live values in MRFs from base_mrf to base_mrf +
1874 * scan_inst->mlen - 1. Don't go pushing our MRF write up
1875 * above it.
1876 */
1877 if (mrf_low >= scan_inst->base_mrf &&
1878 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1879 break;
1880 }
1881 if (mrf_high >= scan_inst->base_mrf &&
1882 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1883 break;
1884 }
1885 }
1886 }
1887 }
1888
1889 if (progress)
1890 live_intervals_valid = false;
1891
1892 return progress;
1893 }
1894
/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* Per-MRF record of the most recent GRF-to-MRF MOV still known to
    * describe that MRF's contents (NULL == unknown).
    */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Control flow ends the basic block: forget everything tracked. */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
	 memset(last_mrf_move, 0, sizeof(last_mrf_move));
	 continue;
      default:
	 break;
      }

      /* If this MOV writes exactly what the MRF already holds, it is
       * redundant and can be dropped.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
	  inst->dst.file == MRF) {
	 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
	 if (prev_inst && inst->equals(prev_inst)) {
	    inst->remove();
	    progress = true;
	    continue;
	 }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
	 last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
	 /* Found a SEND instruction, which will include two or fewer
	  * implied MRF writes.  We could do better here.
	  */
	 for (int i = 0; i < implied_mrf_writes(inst); i++) {
	    last_mrf_move[inst->base_mrf + i] = NULL;
	 }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
	 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
	    if (last_mrf_move[i] &&
		last_mrf_move[i]->src[0].reg == inst->dst.reg) {
	       last_mrf_move[i] = NULL;
	    }
	 }
      }

      /* Record this MOV as the current known contents of its MRF. */
      if (inst->opcode == BRW_OPCODE_MOV &&
	  inst->dst.file == MRF &&
	  inst->src[0].file == GRF &&
	  !inst->predicate) {
	 last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1973
1974 void
1975 fs_visitor::dump_instruction(fs_inst *inst)
1976 {
1977 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
1978 opcode_descs[inst->opcode].name) {
1979 printf("%s", opcode_descs[inst->opcode].name);
1980 } else {
1981 printf("op%d", inst->opcode);
1982 }
1983 if (inst->saturate)
1984 printf(".sat");
1985 printf(" ");
1986
1987 switch (inst->dst.file) {
1988 case GRF:
1989 printf("vgrf%d", inst->dst.reg);
1990 if (inst->dst.reg_offset)
1991 printf("+%d", inst->dst.reg_offset);
1992 break;
1993 case MRF:
1994 printf("m%d", inst->dst.reg);
1995 break;
1996 case BAD_FILE:
1997 printf("(null)");
1998 break;
1999 case UNIFORM:
2000 printf("***u%d***", inst->dst.reg);
2001 break;
2002 default:
2003 printf("???");
2004 break;
2005 }
2006 printf(", ");
2007
2008 for (int i = 0; i < 3; i++) {
2009 if (inst->src[i].negate)
2010 printf("-");
2011 if (inst->src[i].abs)
2012 printf("|");
2013 switch (inst->src[i].file) {
2014 case GRF:
2015 printf("vgrf%d", inst->src[i].reg);
2016 if (inst->src[i].reg_offset)
2017 printf("+%d", inst->src[i].reg_offset);
2018 break;
2019 case MRF:
2020 printf("***m%d***", inst->src[i].reg);
2021 break;
2022 case UNIFORM:
2023 printf("u%d", inst->src[i].reg);
2024 if (inst->src[i].reg_offset)
2025 printf(".%d", inst->src[i].reg_offset);
2026 break;
2027 case BAD_FILE:
2028 printf("(null)");
2029 break;
2030 default:
2031 printf("???");
2032 break;
2033 }
2034 if (inst->src[i].abs)
2035 printf("|");
2036
2037 if (i < 3)
2038 printf(", ");
2039 }
2040
2041 printf(" ");
2042
2043 if (inst->force_uncompressed)
2044 printf("1sthalf ");
2045
2046 if (inst->force_sechalf)
2047 printf("2ndhalf ");
2048
2049 printf("\n");
2050 }
2051
2052 void
2053 fs_visitor::dump_instructions()
2054 {
2055 int ip = 0;
2056 foreach_list(node, &this->instructions) {
2057 fs_inst *inst = (fs_inst *)node;
2058 printf("%d: ", ip++);
2059 dump_instruction(inst);
2060 }
2061 }
2062
2063 /**
2064 * Possibly returns an instruction that set up @param reg.
2065 *
2066 * Sometimes we want to take the result of some expression/variable
2067 * dereference tree and rewrite the instruction generating the result
2068 * of the tree. When processing the tree, we know that the
2069 * instructions generated are all writing temporaries that are dead
2070 * outside of this tree. So, if we have some instructions that write
2071 * a temporary, we're free to point that temp write somewhere else.
2072 *
2073 * Note that this doesn't guarantee that the instruction generated
2074 * only reg -- it might be the size=4 destination of a texture instruction.
2075 */
2076 fs_inst *
2077 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2078 fs_inst *end,
2079 fs_reg reg)
2080 {
2081 if (end == start ||
2082 end->predicate ||
2083 end->force_uncompressed ||
2084 end->force_sechalf ||
2085 !reg.equals(end->dst)) {
2086 return NULL;
2087 } else {
2088 return end;
2089 }
2090 }
2091
/**
 * Lays out the gen6+ thread payload: records which payload register
 * holds each input (barycentric coordinates, source depth, source W)
 * and accumulates the total in c->nr_payload_regs.
 */
void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch.*/

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            /* Two more registers for the second half of the coordinates. */
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}
2149
/**
 * Runs one complete compile at this visitor's dispatch width: payload
 * setup, IR translation, the optimization loop to a fixed point, and
 * register allocation.  Returns false (with fail_msg set via fail())
 * on failure.
 */
bool
fs_visitor::run()
{
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      /* Debug path: emit a trivial shader instead of the real program. */
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
	 emit_interpolation_setup_gen4();
      else
	 emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
	 foreach_list(node, &*shader->ir) {
	    ir_instruction *ir = (ir_instruction *)node;
	    base_ir = ir;
	    this->result = reg_undef;
	    ir->accept(this);
	 }
      } else {
         /* No GLSL shader: compile from the Mesa IR program instead. */
	 emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
	 return false;

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

      /* Iterate the optimization passes until none of them makes
       * further progress.
       */
      bool progress;
      do {
	 progress = false;

         compact_virtual_grfs();

	 progress = remove_duplicate_mrf_writes() || progress;

	 progress = opt_algebraic() || progress;
	 progress = opt_cse() || progress;
	 progress = opt_copy_propagate() || progress;
	 progress = dead_code_eliminate() || progress;
	 progress = register_coalesce() || progress;
	 progress = register_coalesce_2() || progress;
	 progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
	 /* Debug of register spilling: Go spill everything. */
	 for (int i = 0; i < virtual_grf_count; i++) {
	    spill_reg(i);
	 }
      }

      if (0)
	 assign_regs_trivial();
      else {
         /* assign_regs() retries (spilling as it goes) until it either
          * succeeds or signals an unrecoverable failure via failed.
          */
	 while (!assign_regs()) {
	    if (failed)
	       break;
	 }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   return !failed;
}
2251
/**
 * Compiles a fragment program (GLSL or ARB) to native code.
 *
 * Always compiles an 8-wide variant; additionally compiles a 16-wide
 * variant when supported, falling back silently (with a perf warning)
 * if that fails.  Returns the generated assembly and sets
 * *final_assembly_size, or returns NULL on failure.
 */
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   /* prog is NULL for ARB/fixed-function programs; shader stays NULL
    * then and we compile from the Mesa IR instead of GLSL IR.
    */
   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   /* The 16-wide compile is an optional optimization; failure here only
    * costs performance, never correctness.
    */
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   /* Perf debugging: report recompiles and compiles that stalled the GPU. */
   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}
2329
2330 bool
2331 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2332 {
2333 struct brw_context *brw = brw_context(ctx);
2334 struct intel_context *intel = &brw->intel;
2335 struct brw_wm_prog_key key;
2336
2337 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2338 return true;
2339
2340 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2341 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2342 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2343 bool program_uses_dfdy = fp->UsesDFdy;
2344
2345 memset(&key, 0, sizeof(key));
2346
2347 if (intel->gen < 6) {
2348 if (fp->UsesKill)
2349 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2350
2351 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2352 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2353
2354 /* Just assume depth testing. */
2355 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2356 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2357 }
2358
2359 if (prog->Name != 0)
2360 key.proj_attrib_mask = 0xffffffff;
2361
2362 if (intel->gen < 6)
2363 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2364
2365 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2366 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2367 continue;
2368
2369 if (prog->Name == 0)
2370 key.proj_attrib_mask |= 1 << i;
2371
2372 if (intel->gen < 6) {
2373 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2374
2375 if (vp_index >= 0)
2376 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2377 }
2378 }
2379
2380 key.clamp_fragment_color = true;
2381
2382 for (int i = 0; i < MAX_SAMPLERS; i++) {
2383 if (fp->Base.ShadowSamplers & (1 << i)) {
2384 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2385 key.tex.swizzles[i] =
2386 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2387 } else {
2388 /* Color sampler: assume no swizzling. */
2389 key.tex.swizzles[i] = SWIZZLE_XYZW;
2390 }
2391 }
2392
2393 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2394 key.drawable_height = ctx->DrawBuffer->Height;
2395 }
2396
2397 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2398 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2399 }
2400
2401 key.nr_color_regions = 1;
2402
2403 key.program_string_id = bfp->id;
2404
2405 uint32_t old_prog_offset = brw->wm.prog_offset;
2406 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2407
2408 bool success = do_wm_prog(brw, prog, bfp, &key);
2409
2410 brw->wm.prog_offset = old_prog_offset;
2411 brw->wm.prog_data = old_prog_data;
2412
2413 return success;
2414 }