i965/fs: Add some minimal backend-IR dumping.
/*
 * Copyright © 2010 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

/** @file brw_fs.cpp
 *
 * This file drives the GLSL IR -> LIR translation, contains the
 * optimizations on the LIR, and drives the generation of native code
 * from the LIR.
 */

extern "C" {

#include <sys/types.h>

#include "main/macros.h"
#include "main/shaderobj.h"
#include "main/uniforms.h"
#include "main/fbobject.h"
#include "program/prog_parameter.h"
#include "program/prog_print.h"
#include "program/register_allocate.h"
#include "program/sampler.h"
#include "program/hash_table.h"
#include "brw_context.h"
#include "brw_eu.h"
#include "brw_wm.h"
}
#include "brw_fs.h"
#include "glsl/glsl_types.h"
#include "glsl/ir_print_visitor.h"

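/* Zero-initialize a new instruction and reset its operands to reg_undef;
 * every fs_inst constructor below funnels through this so that any field
 * a constructor doesn't set has a well-defined value.
 */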
void
fs_inst::init()
{
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}

fs_inst::fs_inst()
{
   init();
}

fs_inst::fs_inst(enum opcode opcode)
{
   init();
   this->opcode = opcode;
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
{
   init();
   this->opcode = opcode;
   this->dst = dst;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
}

fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   init();
   this->opcode = opcode;
   this->dst = dst;
   this->src[0] = src0;
   this->src[1] = src1;
   this->src[2] = src2;

   if (dst.file == GRF)
      assert(dst.reg_offset >= 0);
   if (src[0].file == GRF)
      assert(src[0].reg_offset >= 0);
   if (src[1].file == GRF)
      assert(src[1].reg_offset >= 0);
   if (src[2].file == GRF)
      assert(src[2].reg_offset >= 0);
}

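/* Stamp out trivial emit helpers for one- and two-source ALU opcodes.
 * For example, ALU2(ADD) expands to:
 *
 *    fs_inst *
 *    fs_visitor::ADD(fs_reg dst, fs_reg src0, fs_reg src1)
 *    {
 *       return new(mem_ctx) fs_inst(BRW_OPCODE_ADD, dst, src0, src1);
 *    }
 */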
#define ALU1(op)                                                \
   fs_inst *                                                    \
   fs_visitor::op(fs_reg dst, fs_reg src0)                      \
   {                                                            \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);  \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)

/** Gen4 predicated IF. */
fs_inst *
fs_visitor::IF(uint32_t predicate)
{
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
   inst->predicate = predicate;
   return inst;
}

/** Gen6+ IF with embedded comparison. */
fs_inst *
fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
{
   assert(intel->gen >= 6);
   fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
                                        reg_null_d, src0, src1);
   inst->conditional_mod = condition;
   return inst;
}

/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}

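/* Field-by-field comparison of two instructions, used by passes such as
 * remove_duplicate_mrf_writes() below to detect redundant moves.
 */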
bool
fs_inst::equals(fs_inst *inst)
{
   return (opcode == inst->opcode &&
           dst.equals(inst->dst) &&
           src[0].equals(inst->src[0]) &&
           src[1].equals(inst->src[1]) &&
           src[2].equals(inst->src[2]) &&
           saturate == inst->saturate &&
           predicate == inst->predicate &&
           conditional_mod == inst->conditional_mod &&
           mlen == inst->mlen &&
           base_mrf == inst->base_mrf &&
           sampler == inst->sampler &&
           target == inst->target &&
           eot == inst->eot &&
           header_present == inst->header_present &&
           shadow_compare == inst->shadow_compare &&
           offset == inst->offset);
}

int
fs_inst::regs_written()
{
   if (is_tex())
      return 4;

   /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
    * but we don't currently use them...nor do we have an opcode for them.
    */

   return 1;
}

bool
fs_inst::overwrites_reg(const fs_reg &reg)
{
   return (reg.file == dst.file &&
           reg.reg == dst.reg &&
           reg.reg_offset >= dst.reg_offset &&
           reg.reg_offset < dst.reg_offset + regs_written());
}

bool
fs_inst::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == FS_OPCODE_TXB ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

bool
fs_inst::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

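/* Reset a register to its zeroed state; smear == -1 means no single channel
 * has been selected for replication.  The constructors below then fill in
 * the relevant fields, e.g. fs_reg(1.0f) produces an IMM register of type F.
 */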
void
fs_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}

/** Generic unset register constructor. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}

/** Immediate value constructor. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

/** Fixed brw_reg Immediate value constructor. */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}

bool
fs_reg::equals(const fs_reg &r) const
{
   return (file == r.file &&
           reg == r.reg &&
           reg_offset == r.reg_offset &&
           type == r.type &&
           negate == r.negate &&
           abs == r.abs &&
           memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           smear == r.smear &&
           imm.u == r.imm.u);
}

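/**
 * Returns the number of scalar components (and thus register slots)
 * occupied by a GLSL type, recursing through arrays and structs.
 */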
int
fs_visitor::type_size(const struct glsl_type *type)
{
   unsigned int size, i;

   switch (type->base_type) {
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_BOOL:
      return type->components();
   case GLSL_TYPE_ARRAY:
      return type_size(type->fields.array) * type->length;
   case GLSL_TYPE_STRUCT:
      size = 0;
      for (i = 0; i < type->length; i++) {
         size += type_size(type->fields.structure[i].type);
      }
      return size;
   case GLSL_TYPE_SAMPLER:
      /* Samplers take up no register space, since they're baked in at
       * link time.
       */
      return 0;
   default:
      assert(!"not reached");
      return 0;
   }
}

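/**
 * Marks the compile as failed and records a printf-style message.
 * Only the first failure is kept; the message is also echoed to
 * stderr when INTEL_DEBUG=wm is set.
 */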
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}

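/* Convenience emit() overloads: each constructs an fs_inst in place and
 * hands it to the emit(fs_inst) overload declared in brw_fs.h, e.g.
 * emit(BRW_OPCODE_MOV, dst, src) is shorthand for
 * emit(fs_inst(BRW_OPCODE_MOV, dst, src)).
 */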
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}

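/* Instructions emitted while these counters are nonzero are restricted to
 * the first (force_uncompressed) or second (force_sechalf) half of a
 * 16-wide dispatch.
 */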
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}

/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

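/**
 * Allocates a new virtual GRF of the given size (in hardware registers),
 * doubling the backing size array as needed, and returns its index;
 * e.g. virtual_grf_alloc(2) reserves a contiguous 2-register variable.
 */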
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_count) {
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);
   }
   virtual_grf_sizes[virtual_grf_count] = size;
   return virtual_grf_count++;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}

/** Automatic reg constructor. */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}

fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}

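/* Hash table callback used by import_uniforms() below: copies each
 * UNIFORM-file register mapping from the 8-wide visitor's variable hash
 * table into the 16-wide visitor's.
 */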
void
import_uniforms_callback(const void *key,
                         void *data,
                         void *closure)
{
   struct hash_table *dst_ht = (struct hash_table *)closure;
   const fs_reg *reg = (const fs_reg *)data;

   if (reg->file != UNIFORM)
      return;

   hash_table_insert(dst_ht, data, key);
}

/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   hash_table_call_foreach(v->variable_ht,
                           import_uniforms_callback,
                           variable_ht);
   this->params_remap = v->params_remap;
}

/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}


/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been set up by ir_to_mesa, but
       * we'll get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}

fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}

fs_inst *
fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
                         glsl_interp_qualifier interpolation_mode,
                         bool is_centroid)
{
   brw_wm_barycentric_interp_mode barycoord_mode;
   if (is_centroid) {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
   } else {
      if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
         barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
      else
         barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
   }
   return emit(FS_OPCODE_LINTERP, attr,
               this->delta_x[barycoord_mode],
               this->delta_y[barycoord_mode], interp);
}

fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
            /* Constant interpolation (flat shading) case.  The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               interp.type = reg->type;
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Smooth/noperspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               /* FINISHME: At some point we probably want to push
                * this farther by giving similar treatment to the
                * other potentially constant components of the
                * attribute, as well as making brw_vs_constval.c
                * handle varyings other than gl_TexCoord.
                */
               if (location >= FRAG_ATTRIB_TEX0 &&
                   location <= FRAG_ATTRIB_TEX7 &&
                   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
                  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
               } else {
                  struct brw_reg interp = interp_reg(location, k);
                  emit_linterp(attr, fs_reg(interp), interpolation_mode,
                               ir->centroid);
                  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
                     /* Get the pixel/sample mask into f0 so that we know
                      * which pixels are lit.  Then, for each channel that is
                      * unlit, replace the centroid data with non-centroid
                      * data.
                      */
                     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
                     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
                                                  interpolation_mode, false);
                     inst->predicate = BRW_PREDICATE_NORMAL;
                     inst->predicate_inverse = true;
                  }
                  if (intel->gen < 6) {
                     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  }
               }
               attr.reg_offset++;
            }

         }
         location++;
      }
   }

   return reg;
}

fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* Bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us the front face.
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}

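/**
 * Emits a unary math instruction, working around per-generation
 * restrictions: gen6 math can't take uniform or source-modified
 * operands directly, and pre-gen6 math is a send that reads its
 * payload from the MRF.
 */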
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}

fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 7) {
      inst = emit(opcode, dst, src0, src1);
   } else if (intel->gen == 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src0.type;
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         expanded.type = src1.type;
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}

/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}

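/**
 * Assigns the push-constant (CURB) part of the payload: computes how many
 * registers of constants are read and rewrites every UNIFORM-file source
 * to the fixed payload GRF that will hold its value.
 */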
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}

void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         /* Point size is packed into the header, not as a general attribute */
         if (i == VERT_RESULT_PSIZ)
            continue;

         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

            /* The back color slot is skipped when the front color is
             * also written to.  In addition, some slots can be
             * written in the vertex shader and not read in the
             * fragment shader.  So the register number must always be
             * incremented, mapped or not.
             */
            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next;
            urb_next++;
         }
      }

      /*
       * This is an FS-only attribute, and we did the interpolation for it
       * in the SF thread, so count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
         urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}

void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] indices by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}

/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything larger than one register. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

/**
 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
 *
 * During code generation, we create tons of temporary variables, many of
 * which get immediately killed and are never used again.  Yet, in later
 * optimization and analysis passes, such as compute_live_intervals, we need
 * to loop over all the virtual GRFs.  Compacting them can save a lot of
 * overhead.
 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}

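/**
 * Removes push-constant slots that no instruction actually reads and
 * renumbers the survivors.  The remap table is built during the 8-wide
 * compile and reused by the 16-wide one so both agree on the layout.
 */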
bool
fs_visitor::remove_dead_constants()
{
   if (dispatch_width == 8) {
      this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);

      for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
         this->params_remap[i] = -1;

      /* Find which params are still in use. */
      foreach_list(node, &this->instructions) {
         fs_inst *inst = (fs_inst *)node;

         for (int i = 0; i < 3; i++) {
            int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

            if (inst->src[i].file != UNIFORM)
               continue;

            assert(constant_nr < (int)c->prog_data.nr_params);

            /* For now, set this to non-negative.  We'll give it the
             * actual new number in a moment, in order to keep the
             * register numbers nicely ordered.
             */
            this->params_remap[constant_nr] = 0;
         }
      }

      /* Figure out what the new numbers for the params will be.  At some
       * point when we're doing uniform array access, we're going to want
       * to keep the distinction between .reg and .reg_offset, but for
       * now we don't care.
       */
      unsigned int new_nr_params = 0;
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         if (this->params_remap[i] != -1) {
            this->params_remap[i] = new_nr_params++;
         }
      }

      /* Update the list of params to be uploaded to match our new numbering. */
      for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
         int remapped = this->params_remap[i];

         if (remapped == -1)
            continue;

         /* We've already done setup_paramvalues_refs() so no need to worry
          * about param_index and param_offset.
          */
         c->prog_data.param[remapped] = c->prog_data.param[i];
      }

      c->prog_data.nr_params = new_nr_params;
   } else {
      /* This should have been generated in the 8-wide pass already. */
      assert(this->params_remap);
   }

   /* Now do the renumbering of the shader to remove unused params. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;

         if (inst->src[i].file != UNIFORM)
            continue;

         assert(this->params_remap[constant_nr] != -1);
         inst->src[i].reg = this->params_remap[constant_nr];
         inst->src[i].reg_offset = 0;
      }
   }

   return true;
}

/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the GL-specified minimum
 * for the maximum number of fragment shader uniform components (64).
 * If there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   int pull_uniform_base = max_uniform_components;
   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (uniform_nr < pull_uniform_base)
            continue;

         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(((uniform_nr -
                                             pull_uniform_base) * 4) & ~15));
         fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
                                              dst, index, offset);
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
      }
   }

   for (int i = 0; i < pull_uniform_count; i++) {
      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
   }
   c->prog_data.nr_params -= pull_uniform_count;
   c->prog_data.nr_pull_params = pull_uniform_count;
}

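/**
 * Performs simple algebraic simplifications on the IR: currently folds
 * multiplication by 1.0 or 0.0 and addition of 0.0 into plain MOVs.
 */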
bool
fs_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_MUL:
         if (inst->src[1].file != IMM)
            continue;

         /* a * 1.0 = a */
         if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
             inst->src[1].imm.f == 1.0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         /* a * 0.0 = 0.0 */
         if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
             inst->src[1].imm.f == 0.0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[0] = fs_reg(0.0f);
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }

         break;
      case BRW_OPCODE_ADD:
         if (inst->src[1].file != IMM)
            continue;

         /* a + 0.0 = a */
         if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
             inst->src[1].imm.f == 0.0) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = reg_undef;
            progress = true;
            break;
         }
         break;
      default:
         break;
      }
   }

   return progress;
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something defined but not used won't be considered to
 * interfere with other regs.
 */
bool
fs_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Implements a second type of register coalescing: This one checks if
 * the two regs involved in a raw move don't interfere, in which case
 * they can both be stored in the same place and the MOV removed.
 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      foreach_list_safe(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}

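/**
 * The primary coalescing pass: removes a raw MOV by rewriting every later
 * use of its destination to read the source instead, provided nothing
 * overwrites either register in between.  Only runs outside of control
 * flow, where the MOV dominates all the instructions that get scanned.
 */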
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM) ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (intel->gen >= 6 &&
             scan_inst->is_math() &&
             (has_source_modifiers || inst->src[0].file == UNIFORM)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

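/**
 * Looks for GRF-to-MRF moves and tries to rewrite the instruction that
 * computed the GRF value to write straight into the MRF, eliminating
 * the intermediate MOV when the GRF has no later readers.
 */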
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.reg & BRW_MRF_COMPR4) {
         mrf_high = mrf_low + 4;
      } else if (dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      fs_inst *scan_inst;
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            /* SENDs can only write to GRFs, so no compute-to-MRF. */
            if (scan_inst->mlen) {
               break;
            }

            /* If it's predicated, it (probably) didn't populate all
             * the channels.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->predicate)
               break;

            /* If it's setting up only half of the register and it's
             * not the same half as the MOV we're trying to remove,
             * bail for now.
             */
            if (scan_inst->force_uncompressed != inst->force_uncompressed ||
                scan_inst->force_sechalf != inst->force_sechalf) {
               break;
            }

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (intel->gen >= 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value. */
               scan_inst->dst.file = MRF;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle control flow here.  Most computations of
          * values that end up in MRFs happen shortly before the MRF
          * write anyway.
          */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicate) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

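/**
 * Prints one backend instruction in a compact human-readable form:
 * opcode (plus ".sat" if saturating), destination, the three sources,
 * and any first/second-half dispatch flags.  Illustrative output might
 * look like "add vgrf7, vgrf3, u1, (null)".
 */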
void
fs_visitor::dump_instruction(fs_inst *inst)
{
   if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
       opcode_descs[inst->opcode].name) {
      printf("%s", opcode_descs[inst->opcode].name);
   } else {
      printf("op%d", inst->opcode);
   }
   if (inst->saturate)
      printf(".sat");
   printf(" ");

   switch (inst->dst.file) {
   case GRF:
      printf("vgrf%d", inst->dst.reg);
      if (inst->dst.reg_offset)
         printf("+%d", inst->dst.reg_offset);
      break;
   case MRF:
      printf("m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   case UNIFORM:
      printf("***u%d***", inst->dst.reg);
      break;
   default:
      printf("???");
      break;
   }
   printf(", ");

   for (int i = 0; i < 3; i++) {
      if (inst->src[i].negate)
         printf("-");
      if (inst->src[i].abs)
         printf("|");
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf("+%d", inst->src[i].reg_offset);
         break;
      case MRF:
         printf("***m%d***", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         if (inst->src[i].reg_offset)
            printf(".%d", inst->src[i].reg_offset);
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      default:
         printf("???");
         break;
      }
      if (inst->src[i].abs)
         printf("|");

      /* Only print a separator between sources, not after the last one. */
      if (i < 2)
         printf(", ");
   }

   printf(" ");

   if (inst->force_uncompressed)
      printf("1sthalf ");

   if (inst->force_sechalf)
      printf("2ndhalf ");

   printf("\n");
}

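/** Dumps the entire instruction list with instruction-pointer indices. */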
void
fs_visitor::dump_instructions()
{
   int ip = 0;
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;
      printf("%d: ", ip++);
      dump_instruction(inst);
   }
}

/**
 * Possibly returns the instruction that set up @p reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the returned instruction wrote
 * only @p reg -- it might be the size=4 destination of a texture
 * instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           fs_reg reg)
{
   if (end == start ||
       end->predicate ||
       end->force_uncompressed ||
       end->force_sechalf ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}
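
/* Typical use (a sketch; "start", "end", and "new_dst" are hypothetical
 * names, not code from this file): redirect the producer of a temporary
 * so its result lands directly where it is needed.
 *
 *    fs_inst *modify = get_instruction_generating_reg(start, end, reg);
 *    if (modify)
 *       modify->dst = new_dst;
 */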

void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
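   /* Worked example (illustrative): if exactly one barycentric mode is
    * enabled and dispatch_width == 16, the loop below assigns
    * barycentric_coord_reg[i] = 2 for that mode and leaves
    * nr_payload_regs at 6.
    */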
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}

bool
fs_visitor::run()
{
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      if (failed)
         return false;

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

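      /* Run the optimization passes to a fixed point: each pass can
       * expose new opportunities for the others, so keep looping until
       * a full round makes no progress.
       */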
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

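      /* Register allocation.  On failure, assign_regs() spills a
       * register and we retry, until allocation either succeeds or
       * flags the compile as failed.
       */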
      if (0) {
         assign_regs_trivial();
      } else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   return !failed;
}

const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

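   /* Also try a 16-wide compile.  If it fails, the 8-wide program built
    * above is still valid and we simply fall back to it.
    */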
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   return g.generate_assembly(&v.instructions, simd16_instructions,
                              final_assembly_size);
}

bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

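   /* Construct a plausible guess at the program key: the real key is
    * built from GL state at draw time, so precompiling can only pick
    * likely defaults for each field.
    */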
   memset(&key, 0, sizeof(key));

   if (intel->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (prog->Name != 0)
      key.proj_attrib_mask = 0xffffffff;

   if (intel->gen < 6)
      key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);

   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
         continue;

      if (prog->Name == 0)
         key.proj_attrib_mask |= 1 << i;

      if (intel->gen < 6) {
         int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

         if (vp_index >= 0)
            key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
      }
   }

   key.clamp_fragment_color = true;

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}