i965: Include codegen time in the INTEL_DEBUG=perf stall detection.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
/** Default constructor: a NOP instruction with undefined operands. */
fs_inst::fs_inst()
{
   init();
}
69
/** Constructs an instruction with the given opcode and no operands set. */
fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}
75
/** Constructs an instruction with a destination and no sources. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   /* Virtual GRF operands must carry a valid (non-negative) reg_offset. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}
85
/** Constructs a single-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   /* Virtual GRF operands must carry a valid (non-negative) reg_offset. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}
98
/** Constructs a two-source instruction. */
fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   /* Virtual GRF operands must carry a valid (non-negative) reg_offset. */
   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
/* Convenience emitters: ALU1/ALU2 expand into fs_visitor methods that
 * ralloc a one- or two-source fs_inst with the matching BRW_OPCODE_*.
 * Note these only allocate the instruction; the caller emits it.
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
164
/** Gen4 predicated IF: branches on an already-computed flag value. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}
173
/** Gen6+ IF with an embedded comparison of src0 and src1. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   /* The compare-and-branch IF form only exists on gen6 and later. */
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}
184
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter. gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   /* UD sources with negate set need to be lowered before comparing. */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
221
222 bool
223 fs_inst::equals(fs_inst *inst)
224 {
225 return (opcode == inst->opcode &&
226 dst.equals(inst->dst) &&
227 src[0].equals(inst->src[0]) &&
228 src[1].equals(inst->src[1]) &&
229 src[2].equals(inst->src[2]) &&
230 saturate == inst->saturate &&
231 predicate == inst->predicate &&
232 conditional_mod == inst->conditional_mod &&
233 mlen == inst->mlen &&
234 base_mrf == inst->base_mrf &&
235 sampler == inst->sampler &&
236 target == inst->target &&
237 eot == inst->eot &&
238 header_present == inst->header_present &&
239 shadow_compare == inst->shadow_compare &&
240 offset == inst->offset);
241 }
242
243 int
244 fs_inst::regs_written()
245 {
246 if (is_tex())
247 return 4;
248
249 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
250 * but we don't currently use them...nor do we have an opcode for them.
251 */
252
253 return 1;
254 }
255
256 bool
257 fs_inst::overwrites_reg(const fs_reg &reg)
258 {
259 return (reg.file == dst.file &&
260 reg.reg == dst.reg &&
261 reg.reg_offset >= dst.reg_offset &&
262 reg.reg_offset < dst.reg_offset + regs_written());
263 }
264
265 bool
266 fs_inst::is_tex()
267 {
268 return (opcode == SHADER_OPCODE_TEX ||
269 opcode == FS_OPCODE_TXB ||
270 opcode == SHADER_OPCODE_TXD ||
271 opcode == SHADER_OPCODE_TXF ||
272 opcode == SHADER_OPCODE_TXL ||
273 opcode == SHADER_OPCODE_TXS);
274 }
275
276 bool
277 fs_inst::is_math()
278 {
279 return (opcode == SHADER_OPCODE_RCP ||
280 opcode == SHADER_OPCODE_RSQ ||
281 opcode == SHADER_OPCODE_SQRT ||
282 opcode == SHADER_OPCODE_EXP2 ||
283 opcode == SHADER_OPCODE_LOG2 ||
284 opcode == SHADER_OPCODE_SIN ||
285 opcode == SHADER_OPCODE_COS ||
286 opcode == SHADER_OPCODE_INT_QUOTIENT ||
287 opcode == SHADER_OPCODE_INT_REMAINDER ||
288 opcode == SHADER_OPCODE_POW);
289 }
290
/** Zero all fields; smear defaults to -1 (meaning "no smear"). */
void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}
297
/** Generic unset register constructor: file is BAD_FILE until assigned. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}
304
/** Immediate value constructor for a float constant. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}
313
/** Immediate value constructor for a signed integer constant. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}
322
/** Immediate value constructor for an unsigned integer constant. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}
331
/** Wraps a fixed brw_reg, inheriting its hardware register type. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}
340
341 bool
342 fs_reg::equals(const fs_reg &r) const
343 {
344 return (file == r.file &&
345 reg == r.reg &&
346 reg_offset == r.reg_offset &&
347 type == r.type &&
348 negate == r.negate &&
349 abs == r.abs &&
350 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
351 sizeof(fixed_hw_reg)) == 0 &&
352 smear == r.smear &&
353 imm.u == r.imm.u);
354 }
355
356 bool
357 fs_reg::is_zero() const
358 {
359 if (file != IMM)
360 return false;
361
362 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
363 }
364
365 bool
366 fs_reg::is_one() const
367 {
368 if (file != IMM)
369 return false;
370
371 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
372 }
373
374 int
375 fs_visitor::type_size(const struct glsl_type *type)
376 {
377 unsigned int size, i;
378
379 switch (type->base_type) {
380 case GLSL_TYPE_UINT:
381 case GLSL_TYPE_INT:
382 case GLSL_TYPE_FLOAT:
383 case GLSL_TYPE_BOOL:
384 return type->components();
385 case GLSL_TYPE_ARRAY:
386 return type_size(type->fields.array) * type->length;
387 case GLSL_TYPE_STRUCT:
388 size = 0;
389 for (i = 0; i < type->length; i++) {
390 size += type_size(type->fields.structure[i].type);
391 }
392 return size;
393 case GLSL_TYPE_SAMPLER:
394 /* Samplers take up no register space, since they're baked in at
395 * link time.
396 */
397 return 0;
398 default:
399 assert(!"not reached");
400 return 0;
401 }
402 }
403
/**
 * Marks the compile as failed, recording a printf-style reason.
 *
 * Only the first failure message is kept; subsequent calls are no-ops.
 * The message is also printed to stderr under INTEL_DEBUG=wm.
 */
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   /* Prefix the caller's message; both strings live on mem_ctx. */
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}
426
/** Emits a no-operand instruction with the given opcode. */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}
432
/** Emits a destination-only instruction with the given opcode. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}
438
/** Emits a single-source instruction with the given opcode. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}
444
/** Emits a two-source instruction with the given opcode. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}
450
/** Emits a three-source instruction with the given opcode. */
fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}
457
/** Enters a region where instructions are emitted 8-wide only. */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}
463
/** Leaves a force-uncompressed region; must balance a preceding push. */
void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}
470
/** Enters a region where instructions target the second half of a pair. */
void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}
476
/** Leaves a force-sechalf region; must balance a preceding push. */
void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}
483
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* Unary math: one operand payload, doubled in 16-wide dispatch. */
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Binary math: two operand payloads. */
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
528
/**
 * Allocates a new virtual GRF of \p size registers and returns its index.
 *
 * The sizes array grows geometrically (starting at 16 entries) as needed.
 */
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}
543
/** Constructs a reference into the given register file, typed float. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}
552
/** Constructs a reference into the given register file with explicit type. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}
561
/** Automatic reg constructor: allocates a fresh virtual GRF sized for \p type. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}
572
/** Looks up the fs_reg allocated for an ir_variable, or NULL if none. */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}
578
/**
 * Hash-table foreach callback: copies a (variable -> uniform fs_reg)
 * entry into the destination table passed via \p closure.  Non-uniform
 * registers are skipped.
 */
void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}
592
/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions (and the params remap table)
 * from the 8-wide visitor \p v.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}
604
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 *
 * Recursively walks \p type starting at parameter location \p loc,
 * recording a (param_index, param_offset) pair per scalar component,
 * and returns the number of parameter slots the type consumed.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      /* Handle a matrix as matrix_columns consecutive column vectors. */
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      /* One parameter slot; each vector component becomes its own param. */
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
662
663
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         /* A repeated swizzle marks the end of the unique components. */
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}
698
/**
 * Emits the instructions that compute gl_FragCoord (x, y, z, w) into a
 * freshly allocated virtual GRF vec4 and returns that register.
 *
 * Handles the pixel-center-integer and origin-upper-left layout
 * qualifiers, flipping y when rendering to a window system drawable.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         /* y' = height - 1 - y (plus the half-pixel center offset). */
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      /* Pre-gen6, interpolate z from the WPOS setup data. */
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
746
747 fs_inst *
748 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
749 glsl_interp_qualifier interpolation_mode,
750 bool is_centroid)
751 {
752 brw_wm_barycentric_interp_mode barycoord_mode;
753 if (is_centroid) {
754 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
755 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
756 else
757 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
758 } else {
759 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
760 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
761 else
762 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
763 }
764 return emit(FS_OPCODE_LINTERP, attr,
765 this->delta_x[barycoord_mode],
766 this->delta_y[barycoord_mode], interp);
767 }
768
/**
 * Emits interpolation (or flat-shaded moves) for a general varying input,
 * walking the variable's array elements, matrix columns, and vector
 * components, and returns the virtual GRF holding the result.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  /* Non-projected texcoords have an implicit w of 1.0. */
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit. Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     /* Pre-gen6 interpolation is not perspective-correct;
                      * multiply by pixel w to fix it up.
                      */
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}
860
/**
 * Emits the computation of gl_FrontFacing (0 or 1) into a fresh virtual
 * GRF and returns that register.
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      /* Arithmetic-shift the sign bit down, then invert and mask to 1. */
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
884
/**
 * Emits a unary extended-math instruction, applying the gen-specific
 * operand restrictions and (pre-gen6) message setup.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      /* Pre-gen6 math is a send to the math unit: needs an MRF payload. */
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}
927
/**
 * Emits a binary extended-math instruction (POW, INT_QUOTIENT,
 * INT_REMAINDER), applying the gen-specific operand restrictions.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 7) {
      /* Gen7 math takes both sources directly. */
      inst = emit(opcode, dst, src0, src1);
   } else if (intel->gen == 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src0.type;
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src1.type;
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* Second operand goes in the MRF payload; first rides the send. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
989
/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 *
 * Only runs for the 8-wide pass; 16-wide reuses the 8-wide results.
 */
void
fs_visitor::setup_paramvalues_refs()
{
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}
1008
/**
 * Computes the CURBE read length and rewrites every UNIFORM source to the
 * fixed hardware register where the push constants land in the payload.
 */
void
fs_visitor::assign_curb_setup()
{
   /* Push constants are read in units of 8 registers. */
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            /* Eight constants are packed per GRF. */
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}
1036
/**
 * Fills in urb_setup[]: for each fragment attribute, which incoming URB
 * slot holds its setup data (-1 if none), and records the URB read length.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      /* Gen6+: slots are simply assigned in InputsRead bit order. */
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}
1087
/**
 * Rebases the LINTERP/CINTERP setup-register references now that the
 * payload and CURBE sizes (and hence the URB data start) are known.
 */
void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}
1112
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   /* NOTE(review): these are variable-length arrays, a GCC extension
    * rather than standard C++ -- fine for this codebase's toolchain.
    */
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   /* Rewrite operands: offset 0 keeps the original register number;
    * offsets >= 1 move to the newly allocated single-size registers.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
1205
/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many.  An entry of 0
    * means "used"; -1 (every byte 0xff, from the memset) means "unused".
    */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   /* The table above hard-codes 6 barycentric slots and 8 output
    * registers; keep those assumptions checked at compile time.
    */
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays: turn each "used" marker into the register's
    * new (dense) index, moving size/liveness info down with it.
    */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}
1290
1291 bool
1292 fs_visitor::remove_dead_constants()
1293 {
1294 if (dispatch_width == 8) {
1295 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1296
1297 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1298 this->params_remap[i] = -1;
1299
1300 /* Find which params are still in use. */
1301 foreach_list(node, &this->instructions) {
1302 fs_inst *inst = (fs_inst *)node;
1303
1304 for (int i = 0; i < 3; i++) {
1305 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1306
1307 if (inst->src[i].file != UNIFORM)
1308 continue;
1309
1310 assert(constant_nr < (int)c->prog_data.nr_params);
1311
1312 /* For now, set this to non-negative. We'll give it the
1313 * actual new number in a moment, in order to keep the
1314 * register numbers nicely ordered.
1315 */
1316 this->params_remap[constant_nr] = 0;
1317 }
1318 }
1319
1320 /* Figure out what the new numbers for the params will be. At some
1321 * point when we're doing uniform array access, we're going to want
1322 * to keep the distinction between .reg and .reg_offset, but for
1323 * now we don't care.
1324 */
1325 unsigned int new_nr_params = 0;
1326 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1327 if (this->params_remap[i] != -1) {
1328 this->params_remap[i] = new_nr_params++;
1329 }
1330 }
1331
1332 /* Update the list of params to be uploaded to match our new numbering. */
1333 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1334 int remapped = this->params_remap[i];
1335
1336 if (remapped == -1)
1337 continue;
1338
1339 /* We've already done setup_paramvalues_refs() so no need to worry
1340 * about param_index and param_offset.
1341 */
1342 c->prog_data.param[remapped] = c->prog_data.param[i];
1343 }
1344
1345 c->prog_data.nr_params = new_nr_params;
1346 } else {
1347 /* This should have been generated in the 8-wide pass already. */
1348 assert(this->params_remap);
1349 }
1350
1351 /* Now do the renumbering of the shader to remove unused params. */
1352 foreach_list(node, &this->instructions) {
1353 fs_inst *inst = (fs_inst *)node;
1354
1355 for (int i = 0; i < 3; i++) {
1356 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1357
1358 if (inst->src[i].file != UNIFORM)
1359 continue;
1360
1361 assert(this->params_remap[constant_nr] != -1);
1362 inst->src[i].reg = this->params_remap[constant_nr];
1363 inst->src[i].reg_offset = 0;
1364 }
1365 }
1366
1367 return true;
1368 }
1369
/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64).  If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   int pull_uniform_base = max_uniform_components;
   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (uniform_nr < pull_uniform_base)
            continue;

         /* Emit a load of the demoted constant into a fresh temporary
          * right before the use.  The buffer offset is aligned down to
          * 16 bytes; the component within that block is picked with
          * .smear below.
          */
         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(((uniform_nr -
                                             pull_uniform_base) * 4) & ~15));
         fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
                                              dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         /* Rewrite the use to read the freshly loaded GRF instead. */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
      }
   }

   /* Move the demoted params off the push list onto the pull list. */
   for (int i = 0; i < pull_uniform_count; i++) {
      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
   }
   c->prog_data.nr_params -= pull_uniform_count;
   c->prog_data.nr_pull_params = pull_uniform_count;
}
1436
1437 bool
1438 fs_visitor::opt_algebraic()
1439 {
1440 bool progress = false;
1441
1442 foreach_list(node, &this->instructions) {
1443 fs_inst *inst = (fs_inst *)node;
1444
1445 switch (inst->opcode) {
1446 case BRW_OPCODE_MUL:
1447 if (inst->src[1].file != IMM)
1448 continue;
1449
1450 /* a * 1.0 = a */
1451 if (inst->src[1].is_one()) {
1452 inst->opcode = BRW_OPCODE_MOV;
1453 inst->src[1] = reg_undef;
1454 progress = true;
1455 break;
1456 }
1457
1458 /* a * 0.0 = 0.0 */
1459 if (inst->src[1].is_zero()) {
1460 inst->opcode = BRW_OPCODE_MOV;
1461 inst->src[0] = inst->src[1];
1462 inst->src[1] = reg_undef;
1463 progress = true;
1464 break;
1465 }
1466
1467 break;
1468 case BRW_OPCODE_ADD:
1469 if (inst->src[1].file != IMM)
1470 continue;
1471
1472 /* a + 0.0 = a */
1473 if (inst->src[1].is_zero()) {
1474 inst->opcode = BRW_OPCODE_MOV;
1475 inst->src[1] = reg_undef;
1476 progress = true;
1477 break;
1478 }
1479 break;
1480 default:
1481 break;
1482 }
1483 }
1484
1485 return progress;
1486 }
1487
1488 /**
1489 * Must be called after calculate_live_intervales() to remove unused
1490 * writes to registers -- register allocation will fail otherwise
1491 * because something deffed but not used won't be considered to
1492 * interfere with other regs.
1493 */
1494 bool
1495 fs_visitor::dead_code_eliminate()
1496 {
1497 bool progress = false;
1498 int pc = 0;
1499
1500 calculate_live_intervals();
1501
1502 foreach_list_safe(node, &this->instructions) {
1503 fs_inst *inst = (fs_inst *)node;
1504
1505 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1506 inst->remove();
1507 progress = true;
1508 }
1509
1510 pc++;
1511 }
1512
1513 if (progress)
1514 live_intervals_valid = false;
1515
1516 return progress;
1517 }
1518
/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Candidate: an unpredicated, unsaturated raw MOV between GRFs of
       * the same type, whose source is a single-register vgrf read with
       * no modifiers, and whose source and destination don't interfere.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      /* Rewrite every def and use of reg_from in the entire program to
       * reg_to.  Note the inner foreach declares its own "node",
       * shadowing the outer loop's iterator.
       */
      foreach_list_safe(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      /* The MOV is now a self-copy; drop it.  Renaming invalidated the
       * live intervals.
       */
      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}
1578
/**
 * Attempts to remove raw MOVs by rewriting later reads of the MOV's
 * destination to read its source directly.
 *
 * Only applied where the MOV dominates the rest of the program (not
 * inside a loop or if block), because the forward scan below assumes
 * every following instruction in list order executes after the MOV.
 */
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      /* Candidate: an unpredicated, unsaturated MOV from a GRF or
       * uniform into a GRF of the same type.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM)||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (intel->gen >= 6 &&
             scan_inst->is_math() &&
             (has_source_modifiers || inst->src[0].file == UNIFORM)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               /* abs on the use swallows any negate coming from the
                * MOV's source; otherwise the negates compose (XOR).
                */
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1699
1700
1701 bool
1702 fs_visitor::compute_to_mrf()
1703 {
1704 bool progress = false;
1705 int next_ip = 0;
1706
1707 calculate_live_intervals();
1708
1709 foreach_list_safe(node, &this->instructions) {
1710 fs_inst *inst = (fs_inst *)node;
1711
1712 int ip = next_ip;
1713 next_ip++;
1714
1715 if (inst->opcode != BRW_OPCODE_MOV ||
1716 inst->predicate ||
1717 inst->dst.file != MRF || inst->src[0].file != GRF ||
1718 inst->dst.type != inst->src[0].type ||
1719 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1720 continue;
1721
1722 /* Work out which hardware MRF registers are written by this
1723 * instruction.
1724 */
1725 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1726 int mrf_high;
1727 if (inst->dst.reg & BRW_MRF_COMPR4) {
1728 mrf_high = mrf_low + 4;
1729 } else if (dispatch_width == 16 &&
1730 (!inst->force_uncompressed && !inst->force_sechalf)) {
1731 mrf_high = mrf_low + 1;
1732 } else {
1733 mrf_high = mrf_low;
1734 }
1735
1736 /* Can't compute-to-MRF this GRF if someone else was going to
1737 * read it later.
1738 */
1739 if (this->virtual_grf_use[inst->src[0].reg] > ip)
1740 continue;
1741
1742 /* Found a move of a GRF to a MRF. Let's see if we can go
1743 * rewrite the thing that made this GRF to write into the MRF.
1744 */
1745 fs_inst *scan_inst;
1746 for (scan_inst = (fs_inst *)inst->prev;
1747 scan_inst->prev != NULL;
1748 scan_inst = (fs_inst *)scan_inst->prev) {
1749 if (scan_inst->dst.file == GRF &&
1750 scan_inst->dst.reg == inst->src[0].reg) {
1751 /* Found the last thing to write our reg we want to turn
1752 * into a compute-to-MRF.
1753 */
1754
1755 /* SENDs can only write to GRFs, so no compute-to-MRF. */
1756 if (scan_inst->mlen) {
1757 break;
1758 }
1759
1760 /* If it's predicated, it (probably) didn't populate all
1761 * the channels. We might be able to rewrite everything
1762 * that writes that reg, but it would require smarter
1763 * tracking to delay the rewriting until complete success.
1764 */
1765 if (scan_inst->predicate)
1766 break;
1767
1768 /* If it's half of register setup and not the same half as
1769 * our MOV we're trying to remove, bail for now.
1770 */
1771 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1772 scan_inst->force_sechalf != inst->force_sechalf) {
1773 break;
1774 }
1775
1776 /* SEND instructions can't have MRF as a destination. */
1777 if (scan_inst->mlen)
1778 break;
1779
1780 if (intel->gen >= 6) {
1781 /* gen6 math instructions must have the destination be
1782 * GRF, so no compute-to-MRF for them.
1783 */
1784 if (scan_inst->is_math()) {
1785 break;
1786 }
1787 }
1788
1789 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1790 /* Found the creator of our MRF's source value. */
1791 scan_inst->dst.file = MRF;
1792 scan_inst->dst.reg = inst->dst.reg;
1793 scan_inst->saturate |= inst->saturate;
1794 inst->remove();
1795 progress = true;
1796 }
1797 break;
1798 }
1799
1800 /* We don't handle flow control here. Most computation of
1801 * values that end up in MRFs are shortly before the MRF
1802 * write anyway.
1803 */
1804 if (scan_inst->opcode == BRW_OPCODE_DO ||
1805 scan_inst->opcode == BRW_OPCODE_WHILE ||
1806 scan_inst->opcode == BRW_OPCODE_ELSE ||
1807 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1808 break;
1809 }
1810
1811 /* You can't read from an MRF, so if someone else reads our
1812 * MRF's source GRF that we wanted to rewrite, that stops us.
1813 */
1814 bool interfered = false;
1815 for (int i = 0; i < 3; i++) {
1816 if (scan_inst->src[i].file == GRF &&
1817 scan_inst->src[i].reg == inst->src[0].reg &&
1818 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1819 interfered = true;
1820 }
1821 }
1822 if (interfered)
1823 break;
1824
1825 if (scan_inst->dst.file == MRF) {
1826 /* If somebody else writes our MRF here, we can't
1827 * compute-to-MRF before that.
1828 */
1829 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1830 int scan_mrf_high;
1831
1832 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1833 scan_mrf_high = scan_mrf_low + 4;
1834 } else if (dispatch_width == 16 &&
1835 (!scan_inst->force_uncompressed &&
1836 !scan_inst->force_sechalf)) {
1837 scan_mrf_high = scan_mrf_low + 1;
1838 } else {
1839 scan_mrf_high = scan_mrf_low;
1840 }
1841
1842 if (mrf_low == scan_mrf_low ||
1843 mrf_low == scan_mrf_high ||
1844 mrf_high == scan_mrf_low ||
1845 mrf_high == scan_mrf_high) {
1846 break;
1847 }
1848 }
1849
1850 if (scan_inst->mlen > 0) {
1851 /* Found a SEND instruction, which means that there are
1852 * live values in MRFs from base_mrf to base_mrf +
1853 * scan_inst->mlen - 1. Don't go pushing our MRF write up
1854 * above it.
1855 */
1856 if (mrf_low >= scan_inst->base_mrf &&
1857 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1858 break;
1859 }
1860 if (mrf_high >= scan_inst->base_mrf &&
1861 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1862 break;
1863 }
1864 }
1865 }
1866 }
1867
1868 if (progress)
1869 live_intervals_valid = false;
1870
1871 return progress;
1872 }
1873
/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* Per-MRF record of the last MOV known to have written it, or NULL
    * when its contents are unknown.
    */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         /* Control flow ends the basic block; forget everything we know. */
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      /* An MRF write identical to the one that already produced this
       * MRF's current value is redundant.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      /* Record this unpredicated GRF->MRF MOV as the MRF's known value. */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicate) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1952
1953 void
1954 fs_visitor::dump_instruction(fs_inst *inst)
1955 {
1956 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
1957 opcode_descs[inst->opcode].name) {
1958 printf("%s", opcode_descs[inst->opcode].name);
1959 } else {
1960 printf("op%d", inst->opcode);
1961 }
1962 if (inst->saturate)
1963 printf(".sat");
1964 printf(" ");
1965
1966 switch (inst->dst.file) {
1967 case GRF:
1968 printf("vgrf%d", inst->dst.reg);
1969 if (inst->dst.reg_offset)
1970 printf("+%d", inst->dst.reg_offset);
1971 break;
1972 case MRF:
1973 printf("m%d", inst->dst.reg);
1974 break;
1975 case BAD_FILE:
1976 printf("(null)");
1977 break;
1978 case UNIFORM:
1979 printf("***u%d***", inst->dst.reg);
1980 break;
1981 default:
1982 printf("???");
1983 break;
1984 }
1985 printf(", ");
1986
1987 for (int i = 0; i < 3; i++) {
1988 if (inst->src[i].negate)
1989 printf("-");
1990 if (inst->src[i].abs)
1991 printf("|");
1992 switch (inst->src[i].file) {
1993 case GRF:
1994 printf("vgrf%d", inst->src[i].reg);
1995 if (inst->src[i].reg_offset)
1996 printf("+%d", inst->src[i].reg_offset);
1997 break;
1998 case MRF:
1999 printf("***m%d***", inst->src[i].reg);
2000 break;
2001 case UNIFORM:
2002 printf("u%d", inst->src[i].reg);
2003 if (inst->src[i].reg_offset)
2004 printf(".%d", inst->src[i].reg_offset);
2005 break;
2006 case BAD_FILE:
2007 printf("(null)");
2008 break;
2009 default:
2010 printf("???");
2011 break;
2012 }
2013 if (inst->src[i].abs)
2014 printf("|");
2015
2016 if (i < 3)
2017 printf(", ");
2018 }
2019
2020 printf(" ");
2021
2022 if (inst->force_uncompressed)
2023 printf("1sthalf ");
2024
2025 if (inst->force_sechalf)
2026 printf("2ndhalf ");
2027
2028 printf("\n");
2029 }
2030
2031 void
2032 fs_visitor::dump_instructions()
2033 {
2034 int ip = 0;
2035 foreach_list(node, &this->instructions) {
2036 fs_inst *inst = (fs_inst *)node;
2037 printf("%d: ", ip++);
2038 dump_instruction(inst);
2039 }
2040 }
2041
2042 /**
2043 * Possibly returns an instruction that set up @param reg.
2044 *
2045 * Sometimes we want to take the result of some expression/variable
2046 * dereference tree and rewrite the instruction generating the result
2047 * of the tree. When processing the tree, we know that the
2048 * instructions generated are all writing temporaries that are dead
2049 * outside of this tree. So, if we have some instructions that write
2050 * a temporary, we're free to point that temp write somewhere else.
2051 *
2052 * Note that this doesn't guarantee that the instruction generated
2053 * only reg -- it might be the size=4 destination of a texture instruction.
2054 */
2055 fs_inst *
2056 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2057 fs_inst *end,
2058 fs_reg reg)
2059 {
2060 if (end == start ||
2061 end->predicate ||
2062 end->force_uncompressed ||
2063 end->force_sechalf ||
2064 !reg.equals(end->dst)) {
2065 return NULL;
2066 } else {
2067 return end;
2068 }
2069 }
2070
/**
 * Lays out the Gen6+ fragment shader thread payload, filling in
 * c->nr_payload_regs and the per-feature register indices.
 */
void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch.*/

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.
    * NOTE(review): gated on the same uses_depth flag as source depth
    * above -- presumably source W is delivered whenever source depth is;
    * confirm against the WM_STATE setup code.
    */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}
2128
/**
 * Top-level driver for one dispatch width of the FS compile: payload
 * setup, IR translation, the optimization loop, and register allocation.
 *
 * \return true on success, false if compilation failed (fail_msg set).
 */
bool
fs_visitor::run()
{
   /* Remember the push-constant count so the 16-wide compile can assert
    * below that it didn't add uniforms beyond the 8-wide compile's.
    */
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      /* Debug path: emit a trivial shader instead of the real one. */
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

      /* Iterate the optimization passes to a fixed point. */
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         /* Retry allocation until it succeeds or the compile is marked
          * failed.
          */
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   return !failed;
}
2230
/**
 * Compiles the fragment program: builds 8-wide (and, when possible,
 * 16-wide) FS IR and generates native code.
 *
 * For INTEL_DEBUG=perf, the GPU-busy state and start time are sampled
 * before any compilation so that the stall warning at the end accounts
 * for the full cost -- both compiles and codegen.
 *
 * \return the generated assembly, or NULL if the 8-wide compile failed.
 */
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

   /* Also try a 16-wide compile; failure here is non-fatal (we just ship
    * the 8-wide program alone).
    */
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      /* The GPU was busy at start but idle now: the application likely
       * stalled waiting on this compile.
       */
      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}
2308
2309 bool
2310 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2311 {
2312 struct brw_context *brw = brw_context(ctx);
2313 struct intel_context *intel = &brw->intel;
2314 struct brw_wm_prog_key key;
2315
2316 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2317 return true;
2318
2319 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2320 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2321 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2322 bool program_uses_dfdy = fp->UsesDFdy;
2323
2324 memset(&key, 0, sizeof(key));
2325
2326 if (intel->gen < 6) {
2327 if (fp->UsesKill)
2328 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2329
2330 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2331 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2332
2333 /* Just assume depth testing. */
2334 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2335 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2336 }
2337
2338 if (prog->Name != 0)
2339 key.proj_attrib_mask = 0xffffffff;
2340
2341 if (intel->gen < 6)
2342 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2343
2344 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2345 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2346 continue;
2347
2348 if (prog->Name == 0)
2349 key.proj_attrib_mask |= 1 << i;
2350
2351 if (intel->gen < 6) {
2352 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2353
2354 if (vp_index >= 0)
2355 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2356 }
2357 }
2358
2359 key.clamp_fragment_color = true;
2360
2361 for (int i = 0; i < MAX_SAMPLERS; i++) {
2362 if (fp->Base.ShadowSamplers & (1 << i)) {
2363 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2364 key.tex.swizzles[i] =
2365 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2366 } else {
2367 /* Color sampler: assume no swizzling. */
2368 key.tex.swizzles[i] = SWIZZLE_XYZW;
2369 }
2370 }
2371
2372 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2373 key.drawable_height = ctx->DrawBuffer->Height;
2374 }
2375
2376 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2377 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2378 }
2379
2380 key.nr_color_regions = 1;
2381
2382 key.program_string_id = bfp->id;
2383
2384 uint32_t old_prog_offset = brw->wm.prog_offset;
2385 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2386
2387 bool success = do_wm_prog(brw, prog, bfp, &key);
2388
2389 brw->wm.prog_offset = old_prog_offset;
2390 brw->wm.prog_data = old_prog_data;
2391
2392 return success;
2393 }