i965/fs: Allow source mods on gen7+ math.
mesa.git: src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 ALU1(NOT)
150 ALU1(MOV)
151 ALU1(FRC)
152 ALU1(RNDD)
153 ALU1(RNDE)
154 ALU1(RNDZ)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(SHL)
162 ALU2(SHR)
163 ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
185 /**
186 * CMP: Sets the low bit of the destination channels with the result
187 * of the comparison, while the upper bits are undefined, and updates
188 * the flag register with the packed 16 bits of the result.
189 */
190 fs_inst *
191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
192 {
193 fs_inst *inst;
194
195 /* Take the instruction:
196 *
197 * CMP null<d> src0<f> src1<f>
198 *
199 * Original gen4 does type conversion to the destination type before
200 * comparison, producing garbage results for floating point comparisons.
201 * gen5 does the comparison on the execution type (resolved source types),
202 * so dst type doesn't matter. gen6 does comparison and then uses the
203 * result as if it was the dst type with no conversion, which happens to
204 * mostly work out for float-interpreted-as-int since our comparisons are
205 * for >0, =0, <0.
206 */
207 if (intel->gen == 4) {
208 dst.type = src0.type;
209 if (dst.file == FIXED_HW_REG)
210 dst.fixed_hw_reg.type = dst.type;
211 }
212
213 resolve_ud_negate(&src0);
214 resolve_ud_negate(&src1);
215
216 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
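
/* A sketch of the gen4 fixup above (illustrative only, not generated code):
 * for a float comparison like
 *
 *    CMP.ge null:F  x:F  0.0f
 *
 * gen4 would convert the operands to the destination type first, so the
 * destination type is forced to match src0 here; on gen5+ the comparison
 * happens on the resolved execution type and the caller's dst type is
 * left alone.
 */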
221
222 bool
223 fs_inst::equals(fs_inst *inst)
224 {
225 return (opcode == inst->opcode &&
226 dst.equals(inst->dst) &&
227 src[0].equals(inst->src[0]) &&
228 src[1].equals(inst->src[1]) &&
229 src[2].equals(inst->src[2]) &&
230 saturate == inst->saturate &&
231 predicate == inst->predicate &&
232 conditional_mod == inst->conditional_mod &&
233 mlen == inst->mlen &&
234 base_mrf == inst->base_mrf &&
235 sampler == inst->sampler &&
236 target == inst->target &&
237 eot == inst->eot &&
238 header_present == inst->header_present &&
239 shadow_compare == inst->shadow_compare &&
240 offset == inst->offset);
241 }
242
243 int
244 fs_inst::regs_written()
245 {
246 if (is_tex())
247 return 4;
248
249 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions write two
250 * registers, but we don't currently use them, nor do we have an opcode for them.
251 */
252
253 return 1;
254 }
255
256 bool
257 fs_inst::overwrites_reg(const fs_reg &reg)
258 {
259 return (reg.file == dst.file &&
260 reg.reg == dst.reg &&
261 reg.reg_offset >= dst.reg_offset &&
262 reg.reg_offset < dst.reg_offset + regs_written());
263 }
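
/* Worked example (illustrative): a texture instruction has regs_written()
 * == 4, so with dst == vgrf5 it overwrites vgrf5+0 through vgrf5+3;
 * overwrites_reg() returns true for those offsets and false for vgrf5+4
 * or for any other register.
 */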
264
265 bool
266 fs_inst::is_tex()
267 {
268 return (opcode == SHADER_OPCODE_TEX ||
269 opcode == FS_OPCODE_TXB ||
270 opcode == SHADER_OPCODE_TXD ||
271 opcode == SHADER_OPCODE_TXF ||
272 opcode == SHADER_OPCODE_TXL ||
273 opcode == SHADER_OPCODE_TXS);
274 }
275
276 bool
277 fs_inst::is_math()
278 {
279 return (opcode == SHADER_OPCODE_RCP ||
280 opcode == SHADER_OPCODE_RSQ ||
281 opcode == SHADER_OPCODE_SQRT ||
282 opcode == SHADER_OPCODE_EXP2 ||
283 opcode == SHADER_OPCODE_LOG2 ||
284 opcode == SHADER_OPCODE_SIN ||
285 opcode == SHADER_OPCODE_COS ||
286 opcode == SHADER_OPCODE_INT_QUOTIENT ||
287 opcode == SHADER_OPCODE_INT_REMAINDER ||
288 opcode == SHADER_OPCODE_POW);
289 }
290
291 void
292 fs_reg::init()
293 {
294 memset(this, 0, sizeof(*this));
295 this->smear = -1;
296 }
297
298 /** Generic unset register constructor. */
299 fs_reg::fs_reg()
300 {
301 init();
302 this->file = BAD_FILE;
303 }
304
305 /** Immediate value constructor. */
306 fs_reg::fs_reg(float f)
307 {
308 init();
309 this->file = IMM;
310 this->type = BRW_REGISTER_TYPE_F;
311 this->imm.f = f;
312 }
313
314 /** Immediate value constructor. */
315 fs_reg::fs_reg(int32_t i)
316 {
317 init();
318 this->file = IMM;
319 this->type = BRW_REGISTER_TYPE_D;
320 this->imm.i = i;
321 }
322
323 /** Immediate value constructor. */
324 fs_reg::fs_reg(uint32_t u)
325 {
326 init();
327 this->file = IMM;
328 this->type = BRW_REGISTER_TYPE_UD;
329 this->imm.u = u;
330 }
331
332 /** Fixed brw_reg constructor. */
333 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
334 {
335 init();
336 this->file = FIXED_HW_REG;
337 this->fixed_hw_reg = fixed_hw_reg;
338 this->type = fixed_hw_reg.type;
339 }
340
341 bool
342 fs_reg::equals(const fs_reg &r) const
343 {
344 return (file == r.file &&
345 reg == r.reg &&
346 reg_offset == r.reg_offset &&
347 type == r.type &&
348 negate == r.negate &&
349 abs == r.abs &&
350 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
351 sizeof(fixed_hw_reg)) == 0 &&
352 smear == r.smear &&
353 imm.u == r.imm.u);
354 }
355
356 bool
357 fs_reg::is_zero() const
358 {
359 if (file != IMM)
360 return false;
361
362 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
363 }
364
365 bool
366 fs_reg::is_one() const
367 {
368 if (file != IMM)
369 return false;
370
371 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
372 }
373
374 int
375 fs_visitor::type_size(const struct glsl_type *type)
376 {
377 unsigned int size, i;
378
379 switch (type->base_type) {
380 case GLSL_TYPE_UINT:
381 case GLSL_TYPE_INT:
382 case GLSL_TYPE_FLOAT:
383 case GLSL_TYPE_BOOL:
384 return type->components();
385 case GLSL_TYPE_ARRAY:
386 return type_size(type->fields.array) * type->length;
387 case GLSL_TYPE_STRUCT:
388 size = 0;
389 for (i = 0; i < type->length; i++) {
390 size += type_size(type->fields.structure[i].type);
391 }
392 return size;
393 case GLSL_TYPE_SAMPLER:
394 /* Samplers take up no register space, since they're baked in at
395 * link time.
396 */
397 return 0;
398 default:
399 assert(!"not reached");
400 return 0;
401 }
402 }
403
404 void
405 fs_visitor::fail(const char *format, ...)
406 {
407 va_list va;
408 char *msg;
409
410 if (failed)
411 return;
412
413 failed = true;
414
415 va_start(va, format);
416 msg = ralloc_vasprintf(mem_ctx, format, va);
417 va_end(va);
418 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
419
420 this->fail_msg = msg;
421
422 if (INTEL_DEBUG & DEBUG_WM) {
423 fprintf(stderr, "%s", msg);
424 }
425 }
426
427 fs_inst *
428 fs_visitor::emit(enum opcode opcode)
429 {
430 return emit(fs_inst(opcode));
431 }
432
433 fs_inst *
434 fs_visitor::emit(enum opcode opcode, fs_reg dst)
435 {
436 return emit(fs_inst(opcode, dst));
437 }
438
439 fs_inst *
440 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
441 {
442 return emit(fs_inst(opcode, dst, src0));
443 }
444
445 fs_inst *
446 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
447 {
448 return emit(fs_inst(opcode, dst, src0, src1));
449 }
450
451 fs_inst *
452 fs_visitor::emit(enum opcode opcode, fs_reg dst,
453 fs_reg src0, fs_reg src1, fs_reg src2)
454 {
455 return emit(fs_inst(opcode, dst, src0, src1, src2));
456 }
457
458 void
459 fs_visitor::push_force_uncompressed()
460 {
461 force_uncompressed_stack++;
462 }
463
464 void
465 fs_visitor::pop_force_uncompressed()
466 {
467 force_uncompressed_stack--;
468 assert(force_uncompressed_stack >= 0);
469 }
470
471 void
472 fs_visitor::push_force_sechalf()
473 {
474 force_sechalf_stack++;
475 }
476
477 void
478 fs_visitor::pop_force_sechalf()
479 {
480 force_sechalf_stack--;
481 assert(force_sechalf_stack >= 0);
482 }
483
484 /**
485 * Returns how many MRFs an FS opcode will write over.
486 *
487 * Note that this is not the 0 or 1 implied writes in an actual gen
488 * instruction -- the FS opcodes often generate MOVs in addition.
489 */
490 int
491 fs_visitor::implied_mrf_writes(fs_inst *inst)
492 {
493 if (inst->mlen == 0)
494 return 0;
495
496 switch (inst->opcode) {
497 case SHADER_OPCODE_RCP:
498 case SHADER_OPCODE_RSQ:
499 case SHADER_OPCODE_SQRT:
500 case SHADER_OPCODE_EXP2:
501 case SHADER_OPCODE_LOG2:
502 case SHADER_OPCODE_SIN:
503 case SHADER_OPCODE_COS:
504 return 1 * dispatch_width / 8;
505 case SHADER_OPCODE_POW:
506 case SHADER_OPCODE_INT_QUOTIENT:
507 case SHADER_OPCODE_INT_REMAINDER:
508 return 2 * dispatch_width / 8;
509 case SHADER_OPCODE_TEX:
510 case FS_OPCODE_TXB:
511 case SHADER_OPCODE_TXD:
512 case SHADER_OPCODE_TXF:
513 case SHADER_OPCODE_TXL:
514 case SHADER_OPCODE_TXS:
515 return 1;
516 case FS_OPCODE_FB_WRITE:
517 return 2;
518 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
519 case FS_OPCODE_UNSPILL:
520 return 1;
521 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
522 return inst->header_present;
523 case FS_OPCODE_SPILL:
524 return 2;
525 default:
526 assert(!"not reached");
527 return inst->mlen;
528 }
529 }
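
/* Worked example (illustrative): a SIMD16 POW takes two float operands,
 * each spanning two MRFs at that dispatch width, so the function returns
 * 2 * 16 / 8 = 4; the same POW in SIMD8 returns 2 * 8 / 8 = 2.
 */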
530
531 int
532 fs_visitor::virtual_grf_alloc(int size)
533 {
534 if (virtual_grf_array_size <= virtual_grf_count) {
535 if (virtual_grf_array_size == 0)
536 virtual_grf_array_size = 16;
537 else
538 virtual_grf_array_size *= 2;
539 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
540 virtual_grf_array_size);
541 }
542 virtual_grf_sizes[virtual_grf_count] = size;
543 return virtual_grf_count++;
544 }
545
546 /** Register file/number constructor. */
547 fs_reg::fs_reg(enum register_file file, int reg)
548 {
549 init();
550 this->file = file;
551 this->reg = reg;
552 this->type = BRW_REGISTER_TYPE_F;
553 }
554
555 /** Register file/number constructor, with type. */
556 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
557 {
558 init();
559 this->file = file;
560 this->reg = reg;
561 this->type = type;
562 }
563
564 /** Automatic reg constructor. */
565 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
566 {
567 init();
568
569 this->file = GRF;
570 this->reg = v->virtual_grf_alloc(v->type_size(type));
571 this->reg_offset = 0;
572 this->type = brw_type_for_base_type(type);
573 }
574
575 fs_reg *
576 fs_visitor::variable_storage(ir_variable *var)
577 {
578 return (fs_reg *)hash_table_find(this->variable_ht, var);
579 }
580
581 void
582 import_uniforms_callback(const void *key,
583 void *data,
584 void *closure)
585 {
586 struct hash_table *dst_ht = (struct hash_table *)closure;
587 const fs_reg *reg = (const fs_reg *)data;
588
589 if (reg->file != UNIFORM)
590 return;
591
592 hash_table_insert(dst_ht, data, key);
593 }
594
595 /* For 16-wide, we need to follow the uniform setup from the 8-wide dispatch.
596 * This brings in those uniform definitions.
597 */
598 void
599 fs_visitor::import_uniforms(fs_visitor *v)
600 {
601 hash_table_call_foreach(v->variable_ht,
602 import_uniforms_callback,
603 variable_ht);
604 this->params_remap = v->params_remap;
605 }
606
607 /* Our support for uniforms is piggy-backed on the struct
608 * gl_fragment_program, because that's where the values actually
609 * get stored, rather than in some global gl_shader_program uniform
610 * store.
611 */
612 int
613 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
614 {
615 unsigned int offset = 0;
616
617 if (type->is_matrix()) {
618 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
619 type->vector_elements,
620 1);
621
622 for (unsigned int i = 0; i < type->matrix_columns; i++) {
623 offset += setup_uniform_values(loc + offset, column);
624 }
625
626 return offset;
627 }
628
629 switch (type->base_type) {
630 case GLSL_TYPE_FLOAT:
631 case GLSL_TYPE_UINT:
632 case GLSL_TYPE_INT:
633 case GLSL_TYPE_BOOL:
634 for (unsigned int i = 0; i < type->vector_elements; i++) {
635 unsigned int param = c->prog_data.nr_params++;
636
637 this->param_index[param] = loc;
638 this->param_offset[param] = i;
639 }
640 return 1;
641
642 case GLSL_TYPE_STRUCT:
643 for (unsigned int i = 0; i < type->length; i++) {
644 offset += setup_uniform_values(loc + offset,
645 type->fields.structure[i].type);
646 }
647 return offset;
648
649 case GLSL_TYPE_ARRAY:
650 for (unsigned int i = 0; i < type->length; i++) {
651 offset += setup_uniform_values(loc + offset, type->fields.array);
652 }
653 return offset;
654
655 case GLSL_TYPE_SAMPLER:
656 /* The sampler takes up a slot, but we don't use any values from it. */
657 return 1;
658
659 default:
660 assert(!"not reached");
661 return 0;
662 }
663 }
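
/* Worked example (illustrative): a mat2 uniform at location loc recurses
 * once per column. Each vec2 column registers two params -- param_index
 * pointing at its ParameterValues slot and param_offset 0 or 1 -- and
 * returns 1, so the matrix as a whole occupies slots loc and loc+1 and
 * returns 2.
 */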
664
665
666 /* Our support for builtin uniforms is even scarier than non-builtin.
667 * It sits on top of the PROG_STATE_VAR parameters that are
668 * automatically updated from GL context state.
669 */
670 void
671 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
672 {
673 const ir_state_slot *const slots = ir->state_slots;
674 assert(ir->state_slots != NULL);
675
676 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
677 /* This state reference has already been set up by ir_to_mesa, but we'll
678 * get the same index back here.
679 */
680 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
681 (gl_state_index *)slots[i].tokens);
682
683 /* Add each of the unique swizzles of the element as a parameter.
684 * This'll end up matching the expected layout of the
685 * array/matrix/structure we're trying to fill in.
686 */
687 int last_swiz = -1;
688 for (unsigned int j = 0; j < 4; j++) {
689 int swiz = GET_SWZ(slots[i].swizzle, j);
690 if (swiz == last_swiz)
691 break;
692 last_swiz = swiz;
693
694 this->param_index[c->prog_data.nr_params] = index;
695 this->param_offset[c->prog_data.nr_params] = swiz;
696 c->prog_data.nr_params++;
697 }
698 }
699 }
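
/* Worked example (illustrative): a state slot swizzled XYZW adds four
 * params with offsets 0..3, while a scalar state value replicated as
 * WWWW adds only one, because the loop stops at the first repeated
 * swizzle component.
 */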
700
701 fs_reg *
702 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
703 {
704 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
705 fs_reg wpos = *reg;
706 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
707
708 /* gl_FragCoord.x */
709 if (ir->pixel_center_integer) {
710 emit(MOV(wpos, this->pixel_x));
711 } else {
712 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
713 }
714 wpos.reg_offset++;
715
716 /* gl_FragCoord.y */
717 if (!flip && ir->pixel_center_integer) {
718 emit(MOV(wpos, this->pixel_y));
719 } else {
720 fs_reg pixel_y = this->pixel_y;
721 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
722
723 if (flip) {
724 pixel_y.negate = true;
725 offset += c->key.drawable_height - 1.0;
726 }
727
728 emit(ADD(wpos, pixel_y, fs_reg(offset)));
729 }
730 wpos.reg_offset++;
731
732 /* gl_FragCoord.z */
733 if (intel->gen >= 6) {
734 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
735 } else {
736 emit(FS_OPCODE_LINTERP, wpos,
737 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
738 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
739 interp_reg(FRAG_ATTRIB_WPOS, 2));
740 }
741 wpos.reg_offset++;
742
743 /* gl_FragCoord.w: Already set up in emit_interpolation */
744 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
745
746 return reg;
747 }
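
/* A sketch of the y-flip above (illustrative): drawing to the window
 * system (flip == true) with half-integer pixel centers and a
 * drawable_height of 480, gl_FragCoord.y is computed as
 * -pixel_y + (0.5 + 480 - 1.0), so the offset relocates row 0 to the
 * bottom of the drawable.
 */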
748
749 fs_inst *
750 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
751 glsl_interp_qualifier interpolation_mode,
752 bool is_centroid)
753 {
754 brw_wm_barycentric_interp_mode barycoord_mode;
755 if (is_centroid) {
756 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
757 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
758 else
759 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
760 } else {
761 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
762 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
763 else
764 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
765 }
766 return emit(FS_OPCODE_LINTERP, attr,
767 this->delta_x[barycoord_mode],
768 this->delta_y[barycoord_mode], interp);
769 }
770
771 fs_reg *
772 fs_visitor::emit_general_interpolation(ir_variable *ir)
773 {
774 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
775 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
776 fs_reg attr = *reg;
777
778 unsigned int array_elements;
779 const glsl_type *type;
780
781 if (ir->type->is_array()) {
782 array_elements = ir->type->length;
783 if (array_elements == 0) {
784 fail("dereferenced array '%s' has length 0\n", ir->name);
785 }
786 type = ir->type->fields.array;
787 } else {
788 array_elements = 1;
789 type = ir->type;
790 }
791
792 glsl_interp_qualifier interpolation_mode =
793 ir->determine_interpolation_mode(c->key.flat_shade);
794
795 int location = ir->location;
796 for (unsigned int i = 0; i < array_elements; i++) {
797 for (unsigned int j = 0; j < type->matrix_columns; j++) {
798 if (urb_setup[location] == -1) {
799 /* If there's no incoming setup data for this slot, don't
800 * emit interpolation for it.
801 */
802 attr.reg_offset += type->vector_elements;
803 location++;
804 continue;
805 }
806
807 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
808 /* Constant interpolation (flat shading) case. The SF has
809 * handed us defined values in only the constant offset
810 * field of the setup reg.
811 */
812 for (unsigned int k = 0; k < type->vector_elements; k++) {
813 struct brw_reg interp = interp_reg(location, k);
814 interp = suboffset(interp, 3);
815 interp.type = reg->type;
816 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
817 attr.reg_offset++;
818 }
819 } else {
820 /* Smooth/noperspective interpolation case. */
821 for (unsigned int k = 0; k < type->vector_elements; k++) {
822 /* FINISHME: At some point we probably want to push
823 * this farther by giving similar treatment to the
824 * other potentially constant components of the
825 * attribute, as well as making brw_vs_constval.c
826 * handle varyings other than gl_TexCoord.
827 */
828 if (location >= FRAG_ATTRIB_TEX0 &&
829 location <= FRAG_ATTRIB_TEX7 &&
830 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
831 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
832 } else {
833 struct brw_reg interp = interp_reg(location, k);
834 emit_linterp(attr, fs_reg(interp), interpolation_mode,
835 ir->centroid);
836 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
837 /* Get the pixel/sample mask into f0 so that we know
838 * which pixels are lit. Then, for each channel that is
839 * unlit, replace the centroid data with non-centroid
840 * data.
841 */
842 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
843 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
844 interpolation_mode, false);
845 inst->predicate = BRW_PREDICATE_NORMAL;
846 inst->predicate_inverse = true;
847 }
848 if (intel->gen < 6) {
849 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
850 }
851 }
852 attr.reg_offset++;
853 }
854
855 }
856 location++;
857 }
858 }
859
860 return reg;
861 }
862
863 fs_reg *
864 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
865 {
866 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
867
868 /* The frontfacing comes in as a bit in the thread payload. */
869 if (intel->gen >= 6) {
870 emit(BRW_OPCODE_ASR, *reg,
871 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
872 fs_reg(15));
873 emit(BRW_OPCODE_NOT, *reg, *reg);
874 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
875 } else {
876 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
877 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
878 * us front face
879 */
880 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
881 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
882 }
883
884 return reg;
885 }
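
/* A sketch of the gen6+ path above (illustrative): bit 15 of g0.0:D
 * appears to hold "primitive is back-facing", so the three instructions
 * compute (~(g0.0:D >> 15)) & 1, which is 1 for front-facing and 0 for
 * back-facing primitives.
 */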
886
887 fs_inst *
888 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
889 {
890 switch (opcode) {
891 case SHADER_OPCODE_RCP:
892 case SHADER_OPCODE_RSQ:
893 case SHADER_OPCODE_SQRT:
894 case SHADER_OPCODE_EXP2:
895 case SHADER_OPCODE_LOG2:
896 case SHADER_OPCODE_SIN:
897 case SHADER_OPCODE_COS:
898 break;
899 default:
900 assert(!"not reached: bad math opcode");
901 return NULL;
902 }
903
904 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
905 * might be able to do better by doing execsize = 1 math and then
906 * expanding that result out, but we would need to be careful with
907 * masking.
908 *
909 * Gen 6 hardware ignores source modifiers (negate and abs) on math
910 * instructions, so we also move to a temp to set those up.
911 */
912 if (intel->gen == 6 && (src.file == UNIFORM ||
913 src.abs ||
914 src.negate)) {
915 fs_reg expanded = fs_reg(this, glsl_type::float_type);
916 emit(BRW_OPCODE_MOV, expanded, src);
917 src = expanded;
918 }
919
920 fs_inst *inst = emit(opcode, dst, src);
921
922 if (intel->gen < 6) {
923 inst->base_mrf = 2;
924 inst->mlen = dispatch_width / 8;
925 }
926
927 return inst;
928 }
929
930 fs_inst *
931 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
932 {
933 int base_mrf = 2;
934 fs_inst *inst;
935
936 switch (opcode) {
937 case SHADER_OPCODE_POW:
938 case SHADER_OPCODE_INT_QUOTIENT:
939 case SHADER_OPCODE_INT_REMAINDER:
940 break;
941 default:
942 assert(!"not reached: unsupported binary math opcode.");
943 return NULL;
944 }
945
946 if (intel->gen >= 7) {
947 inst = emit(opcode, dst, src0, src1);
948 } else if (intel->gen == 6) {
949 /* Can't do hstride == 0 args to gen6 math, so expand it out.
950 *
951 * The hardware ignores source modifiers (negate and abs) on math
952 * instructions, so we also move to a temp to set those up.
953 */
954 if (src0.file == UNIFORM || src0.abs || src0.negate) {
955 fs_reg expanded = fs_reg(this, glsl_type::float_type);
956 expanded.type = src0.type;
957 emit(BRW_OPCODE_MOV, expanded, src0);
958 src0 = expanded;
959 }
960
961 if (src1.file == UNIFORM || src1.abs || src1.negate) {
962 fs_reg expanded = fs_reg(this, glsl_type::float_type);
963 expanded.type = src1.type;
964 emit(BRW_OPCODE_MOV, expanded, src1);
965 src1 = expanded;
966 }
967
968 inst = emit(opcode, dst, src0, src1);
969 } else {
970 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
971 * "Message Payload":
972 *
973 * "Operand0[7]. For the INT DIV functions, this operand is the
974 * denominator."
975 * ...
976 * "Operand1[7]. For the INT DIV functions, this operand is the
977 * numerator."
978 */
979 bool is_int_div = opcode != SHADER_OPCODE_POW;
980 fs_reg &op0 = is_int_div ? src1 : src0;
981 fs_reg &op1 = is_int_div ? src0 : src1;
982
983 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
984 inst = emit(opcode, dst, op0, reg_null_f);
985
986 inst->base_mrf = base_mrf;
987 inst->mlen = 2 * dispatch_width / 8;
988 }
989 return inst;
990 }
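
/* Worked example of the pre-gen6 path (illustrative): for INT_QUOTIENT
 * the operands are swapped per the PRM quote above, so the numerator
 * (src0) is MOVed into m3 (base_mrf + 1) and the denominator (src1)
 * becomes the MATH instruction's source; for POW no swap happens and
 * src1 is the one staged in m3.
 */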
991
992 /**
993 * To be called after the last _mesa_add_state_reference() call, to
994 * set up prog_data.param[] for assign_curb_setup() and
995 * setup_pull_constants().
996 */
997 void
998 fs_visitor::setup_paramvalues_refs()
999 {
1000 if (dispatch_width != 8)
1001 return;
1002
1003 /* Set up the pointers to ParamValues now that the array is finalized. */
1004 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1005 c->prog_data.param[i] =
1006 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
1007 this->param_offset[i];
1008 }
1009 }
1010
1011 void
1012 fs_visitor::assign_curb_setup()
1013 {
1014 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1015 if (dispatch_width == 8) {
1016 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1017 } else {
1018 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1019 }
1020
1021 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1022 foreach_list(node, &this->instructions) {
1023 fs_inst *inst = (fs_inst *)node;
1024
1025 for (unsigned int i = 0; i < 3; i++) {
1026 if (inst->src[i].file == UNIFORM) {
1027 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1028 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1029 constant_nr / 8,
1030 constant_nr % 8);
1031
1032 inst->src[i].file = FIXED_HW_REG;
1033 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1034 }
1035 }
1036 }
1037 }
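
/* Worked example (illustrative): with 2 payload registers and
 * constant_nr == 10, the uniform maps to brw_vec1_grf(2 + 10 / 8, 10 % 8),
 * i.e. subregister 2 of g3, since eight float constants pack into each
 * CURB-loaded GRF.
 */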
1038
1039 void
1040 fs_visitor::calculate_urb_setup()
1041 {
1042 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1043 urb_setup[i] = -1;
1044 }
1045
1046 int urb_next = 0;
1047 /* Figure out where each of the incoming setup attributes lands. */
1048 if (intel->gen >= 6) {
1049 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1050 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1051 urb_setup[i] = urb_next++;
1052 }
1053 }
1054 } else {
1055 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1056 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1057 /* Point size is packed into the header, not as a general attribute */
1058 if (i == VERT_RESULT_PSIZ)
1059 continue;
1060
1061 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1062 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1063
1064 /* The back color slot is skipped when the front color is
1065 * also written to. In addition, some slots can be
1066 * written in the vertex shader and not read in the
1067 * fragment shader. So the register number must always be
1068 * incremented, mapped or not.
1069 */
1070 if (fp_index >= 0)
1071 urb_setup[fp_index] = urb_next;
1072 urb_next++;
1073 }
1074 }
1075
1076 /*
1077 * It's an FS-only attribute, and we did the interpolation for this
1078 * attribute in the SF thread. So count it here, too.
1079 *
1080 * See compile_sf_prog() for more info.
1081 */
1082 if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1083 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1084 }
1085
1086 /* Each attribute is 4 setup channels, each of which is half a reg. */
1087 c->prog_data.urb_read_length = urb_next * 2;
1088 }
1089
1090 void
1091 fs_visitor::assign_urb_setup()
1092 {
1093 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1094
1095 /* Offset all the urb_setup[] indices by the actual position of the
1096 * setup regs, now that the location of the constants has been chosen.
1097 */
1098 foreach_list(node, &this->instructions) {
1099 fs_inst *inst = (fs_inst *)node;
1100
1101 if (inst->opcode == FS_OPCODE_LINTERP) {
1102 assert(inst->src[2].file == FIXED_HW_REG);
1103 inst->src[2].fixed_hw_reg.nr += urb_start;
1104 }
1105
1106 if (inst->opcode == FS_OPCODE_CINTERP) {
1107 assert(inst->src[0].file == FIXED_HW_REG);
1108 inst->src[0].fixed_hw_reg.nr += urb_start;
1109 }
1110 }
1111
1112 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1113 }
1114
1115 /**
1116 * Split large virtual GRFs into separate components if we can.
1117 *
1118 * This mostly duplicates what brw_fs_vector_splitting does, but that
1119 * pass is really conservative because it's afraid of doing
1120 * splitting that doesn't result in real progress after the rest of
1121 * the optimization phases, which would cause infinite looping in
1122 * optimization. We can do it once here, safely. This also has the
1123 * opportunity to split interpolated values, or maybe even uniforms,
1124 * which we don't have at the IR level.
1125 *
1126 * We want to split, because virtual GRFs are what we register
1127 * allocate and spill (due to contiguousness requirements for some
1128 * instructions), and they're what we naturally generate in the
1129 * codegen process, but most virtual GRFs don't actually need to be
1130 * contiguous sets of GRFs. If we split, we'll end up with reduced
1131 * live intervals and better dead code elimination and coalescing.
1132 */
1133 void
1134 fs_visitor::split_virtual_grfs()
1135 {
1136 int num_vars = this->virtual_grf_count;
1137 bool split_grf[num_vars];
1138 int new_virtual_grf[num_vars];
1139
1140 /* Try to split anything larger than one register. */
1141 for (int i = 0; i < num_vars; i++) {
1142 if (this->virtual_grf_sizes[i] != 1)
1143 split_grf[i] = true;
1144 else
1145 split_grf[i] = false;
1146 }
1147
1148 if (brw->has_pln &&
1149 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1150 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1151 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1152 * Gen6, that was the only supported interpolation mode, and since Gen6,
1153 * delta_x and delta_y are in fixed hardware registers.
1154 */
1155 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1156 false;
1157 }
1158
1159 foreach_list(node, &this->instructions) {
1160 fs_inst *inst = (fs_inst *)node;
1161
1162 /* If there's a SEND message that requires contiguous destination
1163 * registers, no splitting is allowed.
1164 */
1165 if (inst->regs_written() > 1) {
1166 split_grf[inst->dst.reg] = false;
1167 }
1168 }
1169
1170 /* Allocate new space for split regs. Note that the virtual
1171 * numbers will be contiguous.
1172 */
1173 for (int i = 0; i < num_vars; i++) {
1174 if (split_grf[i]) {
1175 new_virtual_grf[i] = virtual_grf_alloc(1);
1176 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1177 int reg = virtual_grf_alloc(1);
1178 assert(reg == new_virtual_grf[i] + j - 1);
1179 (void) reg;
1180 }
1181 this->virtual_grf_sizes[i] = 1;
1182 }
1183 }
1184
1185 foreach_list(node, &this->instructions) {
1186 fs_inst *inst = (fs_inst *)node;
1187
1188 if (inst->dst.file == GRF &&
1189 split_grf[inst->dst.reg] &&
1190 inst->dst.reg_offset != 0) {
1191 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1192 inst->dst.reg_offset - 1);
1193 inst->dst.reg_offset = 0;
1194 }
1195 for (int i = 0; i < 3; i++) {
1196 if (inst->src[i].file == GRF &&
1197 split_grf[inst->src[i].reg] &&
1198 inst->src[i].reg_offset != 0) {
1199 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1200 inst->src[i].reg_offset - 1);
1201 inst->src[i].reg_offset = 0;
1202 }
1203 }
1204 }
1205 this->live_intervals_valid = false;
1206 }
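
/* Worked example (illustrative): a size-3 vgrf8 keeps offset 0 in vgrf8
 * itself (its size shrinks to 1) and two fresh size-1 registers, say
 * vgrf20 and vgrf21, cover offsets 1 and 2; a use of vgrf8+2 is rewritten
 * to vgrf21, i.e. new_virtual_grf[8] + 2 - 1.
 */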
1207
1208 /**
1209 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1210 *
1211 * During code generation, we create tons of temporary variables, many of
1212 * which get immediately killed and are never used again. Yet, in later
1213 * optimization and analysis passes, such as compute_live_intervals, we need
1214 * to loop over all the virtual GRFs. Compacting them can save a lot of
1215 * overhead.
1216 */
1217 void
1218 fs_visitor::compact_virtual_grfs()
1219 {
1220 /* Mark which virtual GRFs are used, and count how many. */
1221 int remap_table[this->virtual_grf_count];
1222 memset(remap_table, -1, sizeof(remap_table));
1223
1224 foreach_list(node, &this->instructions) {
1225 const fs_inst *inst = (const fs_inst *) node;
1226
1227 if (inst->dst.file == GRF)
1228 remap_table[inst->dst.reg] = 0;
1229
1230 for (int i = 0; i < 3; i++) {
1231 if (inst->src[i].file == GRF)
1232 remap_table[inst->src[i].reg] = 0;
1233 }
1234 }
1235
1236 /* In addition to registers used in instructions, fs_visitor keeps
1237 * direct references to certain special values which must be patched:
1238 */
1239 fs_reg *special[] = {
1240 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1241 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1242 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1243 &delta_x[0], &delta_x[1], &delta_x[2],
1244 &delta_x[3], &delta_x[4], &delta_x[5],
1245 &delta_y[0], &delta_y[1], &delta_y[2],
1246 &delta_y[3], &delta_y[4], &delta_y[5],
1247 };
1248 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1249 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1250
1251 /* Treat all special values as used, to be conservative */
1252 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1253 if (special[i]->file == GRF)
1254 remap_table[special[i]->reg] = 0;
1255 }
1256
1257 /* Compact the GRF arrays. */
1258 int new_index = 0;
1259 for (int i = 0; i < this->virtual_grf_count; i++) {
1260 if (remap_table[i] != -1) {
1261 remap_table[i] = new_index;
1262 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1263 if (live_intervals_valid) {
1264 virtual_grf_use[new_index] = virtual_grf_use[i];
1265 virtual_grf_def[new_index] = virtual_grf_def[i];
1266 }
1267 ++new_index;
1268 }
1269 }
1270
1271 this->virtual_grf_count = new_index;
1272
1273 /* Patch all the instructions to use the newly renumbered registers */
1274 foreach_list(node, &this->instructions) {
1275 fs_inst *inst = (fs_inst *) node;
1276
1277 if (inst->dst.file == GRF)
1278 inst->dst.reg = remap_table[inst->dst.reg];
1279
1280 for (int i = 0; i < 3; i++) {
1281 if (inst->src[i].file == GRF)
1282 inst->src[i].reg = remap_table[inst->src[i].reg];
1283 }
1284 }
1285
1286 /* Patch all the references to special values */
1287 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1288 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1289 special[i]->reg = remap_table[special[i]->reg];
1290 }
1291 }
1292
1293 bool
1294 fs_visitor::remove_dead_constants()
1295 {
1296 if (dispatch_width == 8) {
1297 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1298
1299 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1300 this->params_remap[i] = -1;
1301
1302 /* Find which params are still in use. */
1303 foreach_list(node, &this->instructions) {
1304 fs_inst *inst = (fs_inst *)node;
1305
1306 for (int i = 0; i < 3; i++) {
1307 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1308
1309 if (inst->src[i].file != UNIFORM)
1310 continue;
1311
1312 assert(constant_nr < (int)c->prog_data.nr_params);
1313
1314 /* For now, set this to non-negative. We'll give it the
1315 * actual new number in a moment, in order to keep the
1316 * register numbers nicely ordered.
1317 */
1318 this->params_remap[constant_nr] = 0;
1319 }
1320 }
1321
1322 /* Figure out what the new numbers for the params will be. At some
1323 * point when we're doing uniform array access, we're going to want
1324 * to keep the distinction between .reg and .reg_offset, but for
1325 * now we don't care.
1326 */
1327 unsigned int new_nr_params = 0;
1328 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1329 if (this->params_remap[i] != -1) {
1330 this->params_remap[i] = new_nr_params++;
1331 }
1332 }
1333
1334 /* Update the list of params to be uploaded to match our new numbering. */
1335 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1336 int remapped = this->params_remap[i];
1337
1338 if (remapped == -1)
1339 continue;
1340
1341 /* We've already done setup_paramvalues_refs() so no need to worry
1342 * about param_index and param_offset.
1343 */
1344 c->prog_data.param[remapped] = c->prog_data.param[i];
1345 }
1346
1347 c->prog_data.nr_params = new_nr_params;
1348 } else {
1349 /* This should have been generated in the 8-wide pass already. */
1350 assert(this->params_remap);
1351 }
1352
1353 /* Now do the renumbering of the shader to remove unused params. */
1354 foreach_list(node, &this->instructions) {
1355 fs_inst *inst = (fs_inst *)node;
1356
1357 for (int i = 0; i < 3; i++) {
1358 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1359
1360 if (inst->src[i].file != UNIFORM)
1361 continue;
1362
1363 assert(this->params_remap[constant_nr] != -1);
1364 inst->src[i].reg = this->params_remap[constant_nr];
1365 inst->src[i].reg_offset = 0;
1366 }
1367 }
1368
1369 return true;
1370 }
1371
1372 /**
1373 * Choose accesses from the UNIFORM file to demote to using the pull
1374 * constant buffer.
1375 *
1376 * We allow a fragment shader to have more than the GL-specified minimum
1377 * value of the maximum number of fragment shader uniform components (64).
1378 * If there are too many of these, they'd fill up all of the register space.
1379 * So, this will push some of them out to the pull constant buffer and
1380 * update the program to load them.
1381 */
1382 void
1383 fs_visitor::setup_pull_constants()
1384 {
1385 /* Only allow 16 registers (128 uniform components) as push constants. */
1386 unsigned int max_uniform_components = 16 * 8;
1387 if (c->prog_data.nr_params <= max_uniform_components)
1388 return;
1389
1390 if (dispatch_width == 16) {
1391 fail("Pull constants not supported in 16-wide\n");
1392 return;
1393 }
1394
1395 /* Just demote the end of the list. We could probably do better
1396 * here, demoting things that are rarely used in the program first.
1397 */
1398 int pull_uniform_base = max_uniform_components;
1399 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
1400
1401 foreach_list(node, &this->instructions) {
1402 fs_inst *inst = (fs_inst *)node;
1403
1404 for (int i = 0; i < 3; i++) {
1405 if (inst->src[i].file != UNIFORM)
1406 continue;
1407
1408 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1409 if (uniform_nr < pull_uniform_base)
1410 continue;
1411
1412 fs_reg dst = fs_reg(this, glsl_type::float_type);
1413 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1414 fs_reg offset = fs_reg((unsigned)(((uniform_nr -
1415 pull_uniform_base) * 4) & ~15));
1416 fs_inst *pull =
1417 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1418 dst, index, offset);
1419 pull->ir = inst->ir;
1420 pull->annotation = inst->annotation;
1421 pull->base_mrf = 14;
1422 pull->mlen = 1;
1423
1424 inst->insert_before(pull);
1425
1426 inst->src[i].file = GRF;
1427 inst->src[i].reg = dst.reg;
1428 inst->src[i].reg_offset = 0;
1429 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
1430 }
1431 }
1432
1433 for (int i = 0; i < pull_uniform_count; i++) {
1434 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
1435 }
1436 c->prog_data.nr_params -= pull_uniform_count;
1437 c->prog_data.nr_pull_params = pull_uniform_count;
1438 }
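
/* Worked example (illustrative): with 130 float params, params 128 and
 * 129 are demoted. For param 129 the load offset is ((129 - 128) * 4)
 * & ~15 == 0 -- the 16-byte-aligned block holding it -- and smear ==
 * (129 - 128) & 3 == 1 picks the second component out of the fetched
 * vec4.
 */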
1439
1440 bool
1441 fs_visitor::opt_algebraic()
1442 {
1443 bool progress = false;
1444
1445 foreach_list(node, &this->instructions) {
1446 fs_inst *inst = (fs_inst *)node;
1447
1448 switch (inst->opcode) {
1449 case BRW_OPCODE_MUL:
1450 if (inst->src[1].file != IMM)
1451 continue;
1452
1453 /* a * 1.0 = a */
1454 if (inst->src[1].is_one()) {
1455 inst->opcode = BRW_OPCODE_MOV;
1456 inst->src[1] = reg_undef;
1457 progress = true;
1458 break;
1459 }
1460
1461 /* a * 0.0 = 0.0 */
1462 if (inst->src[1].is_zero()) {
1463 inst->opcode = BRW_OPCODE_MOV;
1464 inst->src[0] = inst->src[1];
1465 inst->src[1] = reg_undef;
1466 progress = true;
1467 break;
1468 }
1469
1470 break;
1471 case BRW_OPCODE_ADD:
1472 if (inst->src[1].file != IMM)
1473 continue;
1474
1475 /* a + 0.0 = a */
1476 if (inst->src[1].is_zero()) {
1477 inst->opcode = BRW_OPCODE_MOV;
1478 inst->src[1] = reg_undef;
1479 progress = true;
1480 break;
1481 }
1482 break;
1483 default:
1484 break;
1485 }
1486 }
1487
1488 return progress;
1489 }
1490
1491 /**
1492 * Must be called after calculate_live_intervals() to remove unused
1493 * writes to registers -- register allocation will fail otherwise
1494 * because something def'd but not used won't be considered to
1495 * interfere with other regs.
1496 */
1497 bool
1498 fs_visitor::dead_code_eliminate()
1499 {
1500 bool progress = false;
1501 int pc = 0;
1502
1503 calculate_live_intervals();
1504
1505 foreach_list_safe(node, &this->instructions) {
1506 fs_inst *inst = (fs_inst *)node;
1507
1508 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1509 inst->remove();
1510 progress = true;
1511 }
1512
1513 pc++;
1514 }
1515
1516 if (progress)
1517 live_intervals_valid = false;
1518
1519 return progress;
1520 }
1521
1522 /**
1523 * Implements a second type of register coalescing: This one checks if
1524 * the two regs involved in a raw move don't interfere, in which case
1525 * they can both be stored in the same place and the MOV removed.
1526 */
1527 bool
1528 fs_visitor::register_coalesce_2()
1529 {
1530 bool progress = false;
1531
1532 calculate_live_intervals();
1533
1534 foreach_list_safe(node, &this->instructions) {
1535 fs_inst *inst = (fs_inst *)node;
1536
1537 if (inst->opcode != BRW_OPCODE_MOV ||
1538 inst->predicate ||
1539 inst->saturate ||
1540 inst->src[0].file != GRF ||
1541 inst->src[0].negate ||
1542 inst->src[0].abs ||
1543 inst->src[0].smear != -1 ||
1544 inst->dst.file != GRF ||
1545 inst->dst.type != inst->src[0].type ||
1546 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1547 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1548 continue;
1549 }
1550
1551 int reg_from = inst->src[0].reg;
1552 assert(inst->src[0].reg_offset == 0);
1553 int reg_to = inst->dst.reg;
1554 int reg_to_offset = inst->dst.reg_offset;
1555
1556 foreach_list_safe(node, &this->instructions) {
1557 fs_inst *scan_inst = (fs_inst *)node;
1558
1559 if (scan_inst->dst.file == GRF &&
1560 scan_inst->dst.reg == reg_from) {
1561 scan_inst->dst.reg = reg_to;
1562 scan_inst->dst.reg_offset = reg_to_offset;
1563 }
1564 for (int i = 0; i < 3; i++) {
1565 if (scan_inst->src[i].file == GRF &&
1566 scan_inst->src[i].reg == reg_from) {
1567 scan_inst->src[i].reg = reg_to;
1568 scan_inst->src[i].reg_offset = reg_to_offset;
1569 }
1570 }
1571 }
1572
1573 inst->remove();
1574 live_intervals_valid = false;
1575 progress = true;
1576 continue;
1577 }
1578
1579 return progress;
1580 }
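
/* Worked example (illustrative): for a raw MOV like vgrf7 = vgrf3 where
 * the two registers never interfere, every def and use of vgrf3 above is
 * renamed to vgrf7 (taking vgrf7's reg_offset) and the MOV itself is
 * deleted.
 */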
1581
1582 bool
1583 fs_visitor::register_coalesce()
1584 {
1585 bool progress = false;
1586 int if_depth = 0;
1587 int loop_depth = 0;
1588
1589 foreach_list_safe(node, &this->instructions) {
1590 fs_inst *inst = (fs_inst *)node;
1591
1592 /* Make sure that we dominate the instructions we're going to
1593 * scan for interference with our coalescing; otherwise we won't have
1594 * scanned enough to see whether anything interferes with our
1595 * coalescing. We don't dominate the following instructions if
1596 * we're in a loop or an if block.
1597 */
1598 switch (inst->opcode) {
1599 case BRW_OPCODE_DO:
1600 loop_depth++;
1601 break;
1602 case BRW_OPCODE_WHILE:
1603 loop_depth--;
1604 break;
1605 case BRW_OPCODE_IF:
1606 if_depth++;
1607 break;
1608 case BRW_OPCODE_ENDIF:
1609 if_depth--;
1610 break;
1611 default:
1612 break;
1613 }
1614 if (loop_depth || if_depth)
1615 continue;
1616
1617 if (inst->opcode != BRW_OPCODE_MOV ||
1618 inst->predicate ||
1619 inst->saturate ||
1620 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1621 inst->src[0].file != UNIFORM)||
1622 inst->dst.type != inst->src[0].type)
1623 continue;
1624
1625 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
1626
1627 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1628 * them: check for no writes to either one until the exit of the
1629 * program.
1630 */
1631 bool interfered = false;
1632
1633 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1634 !scan_inst->is_tail_sentinel();
1635 scan_inst = (fs_inst *)scan_inst->next) {
1636 if (scan_inst->dst.file == GRF) {
1637 if (scan_inst->overwrites_reg(inst->dst) ||
1638 scan_inst->overwrites_reg(inst->src[0])) {
1639 interfered = true;
1640 break;
1641 }
1642 }
1643
1644 /* The gen6 MATH instruction can't handle source modifiers or
1645 * unusual register regions, so avoid coalescing those for
1646 * now. We should do something more specific.
1647 */
1648 if (intel->gen == 6 &&
1649 scan_inst->is_math() &&
1650 (has_source_modifiers || inst->src[0].file == UNIFORM)) {
1651 interfered = true;
1652 break;
1653 }
1654
1655 /* The accumulator result appears to get used for the
1656 * conditional modifier generation. When negating a UD
1657 * value, there is a 33rd bit generated for the sign in the
1658 * accumulator value, so now you can't check, for example,
1659 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1660 */
1661 if (scan_inst->conditional_mod &&
1662 inst->src[0].negate &&
1663 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1664 interfered = true;
1665 break;
1666 }
1667 }
1668 if (interfered) {
1669 continue;
1670 }
1671
1672 /* Rewrite the later usage to point at the source of the move to
1673 * be removed.
1674 */
1675 for (fs_inst *scan_inst = inst;
1676 !scan_inst->is_tail_sentinel();
1677 scan_inst = (fs_inst *)scan_inst->next) {
1678 for (int i = 0; i < 3; i++) {
1679 if (scan_inst->src[i].file == GRF &&
1680 scan_inst->src[i].reg == inst->dst.reg &&
1681 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1682 fs_reg new_src = inst->src[0];
1683 if (scan_inst->src[i].abs) {
1684 new_src.negate = 0;
1685 new_src.abs = 1;
1686 }
1687 new_src.negate ^= scan_inst->src[i].negate;
1688 scan_inst->src[i] = new_src;
1689 }
1690 }
1691 }
1692
1693 inst->remove();
1694 progress = true;
1695 }
1696
1697 if (progress)
1698 live_intervals_valid = false;
1699
1700 return progress;
1701 }
1702
1703
1704 bool
1705 fs_visitor::compute_to_mrf()
1706 {
1707 bool progress = false;
1708 int next_ip = 0;
1709
1710 calculate_live_intervals();
1711
1712 foreach_list_safe(node, &this->instructions) {
1713 fs_inst *inst = (fs_inst *)node;
1714
1715 int ip = next_ip;
1716 next_ip++;
1717
1718 if (inst->opcode != BRW_OPCODE_MOV ||
1719 inst->predicate ||
1720 inst->dst.file != MRF || inst->src[0].file != GRF ||
1721 inst->dst.type != inst->src[0].type ||
1722 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1723 continue;
1724
1725 /* Work out which hardware MRF registers are written by this
1726 * instruction.
1727 */
1728 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1729 int mrf_high;
1730 if (inst->dst.reg & BRW_MRF_COMPR4) {
1731 mrf_high = mrf_low + 4;
1732 } else if (dispatch_width == 16 &&
1733 (!inst->force_uncompressed && !inst->force_sechalf)) {
1734 mrf_high = mrf_low + 1;
1735 } else {
1736 mrf_high = mrf_low;
1737 }
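
/* Illustrative: a SIMD16 write to m2 with BRW_MRF_COMPR4 set lands its
 * low half in m2 and its high half in m6, hence mrf_high = mrf_low + 4;
 * an ordinary compressed SIMD16 write to m2 covers m2..m3 instead.
 */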
1738
1739 /* Can't compute-to-MRF this GRF if someone else was going to
1740 * read it later.
1741 */
1742 if (this->virtual_grf_use[inst->src[0].reg] > ip)
1743 continue;
1744
1745 /* Found a move of a GRF to a MRF. Let's see if we can go
1746 * rewrite the thing that made this GRF to write into the MRF.
1747 */
1748 fs_inst *scan_inst;
1749 for (scan_inst = (fs_inst *)inst->prev;
1750 scan_inst->prev != NULL;
1751 scan_inst = (fs_inst *)scan_inst->prev) {
1752 if (scan_inst->dst.file == GRF &&
1753 scan_inst->dst.reg == inst->src[0].reg) {
1754 /* Found the last write to the reg we want to turn
1755 * into a compute-to-MRF.
1756 */
1757
1758 /* SENDs can only write to GRFs, so no compute-to-MRF. */
1759 if (scan_inst->mlen) {
1760 break;
1761 }
1762
1763 /* If it's predicated, it (probably) didn't populate all
1764 * the channels. We might be able to rewrite everything
1765 * that writes that reg, but it would require smarter
1766 * tracking to delay the rewriting until complete success.
1767 */
1768 if (scan_inst->predicate)
1769 break;
1770
1771 /* If it writes only half of the register and it's not the same
1772 * half as the MOV we're trying to remove, bail for now.
1773 */
1774 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1775 scan_inst->force_sechalf != inst->force_sechalf) {
1776 break;
1777 }
1778
1779 /* SEND instructions can't have MRF as a destination. */
1780 if (scan_inst->mlen)
1781 break;
1782
1783 if (intel->gen >= 6) {
1784 /* gen6 math instructions must have the destination be
1785 * GRF, so no compute-to-MRF for them.
1786 */
1787 if (scan_inst->is_math()) {
1788 break;
1789 }
1790 }
1791
1792 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1793 /* Found the creator of our MRF's source value. */
1794 scan_inst->dst.file = MRF;
1795 scan_inst->dst.reg = inst->dst.reg;
1796 scan_inst->saturate |= inst->saturate;
1797 inst->remove();
1798 progress = true;
1799 }
1800 break;
1801 }
1802
1803 /* We don't handle flow control here. Most computation of
1804 * values that end up in MRFs happens shortly before the MRF
1805 * write anyway.
1806 */
1807 if (scan_inst->opcode == BRW_OPCODE_DO ||
1808 scan_inst->opcode == BRW_OPCODE_WHILE ||
1809 scan_inst->opcode == BRW_OPCODE_ELSE ||
1810 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1811 break;
1812 }
1813
1814 /* You can't read from an MRF, so if someone else reads our
1815 * MRF's source GRF that we wanted to rewrite, that stops us.
1816 */
1817 bool interfered = false;
1818 for (int i = 0; i < 3; i++) {
1819 if (scan_inst->src[i].file == GRF &&
1820 scan_inst->src[i].reg == inst->src[0].reg &&
1821 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1822 interfered = true;
1823 }
1824 }
1825 if (interfered)
1826 break;
1827
1828 if (scan_inst->dst.file == MRF) {
1829 /* If somebody else writes our MRF here, we can't
1830 * compute-to-MRF before that.
1831 */
1832 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1833 int scan_mrf_high;
1834
1835 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1836 scan_mrf_high = scan_mrf_low + 4;
1837 } else if (dispatch_width == 16 &&
1838 (!scan_inst->force_uncompressed &&
1839 !scan_inst->force_sechalf)) {
1840 scan_mrf_high = scan_mrf_low + 1;
1841 } else {
1842 scan_mrf_high = scan_mrf_low;
1843 }
1844
1845 if (mrf_low == scan_mrf_low ||
1846 mrf_low == scan_mrf_high ||
1847 mrf_high == scan_mrf_low ||
1848 mrf_high == scan_mrf_high) {
1849 break;
1850 }
1851 }
1852
1853 if (scan_inst->mlen > 0) {
1854 /* Found a SEND instruction, which means that there are
1855 * live values in MRFs from base_mrf to base_mrf +
1856 * scan_inst->mlen - 1. Don't go pushing our MRF write up
1857 * above it.
1858 */
1859 if (mrf_low >= scan_inst->base_mrf &&
1860 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1861 break;
1862 }
1863 if (mrf_high >= scan_inst->base_mrf &&
1864 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1865 break;
1866 }
1867 }
1868 }
1869 }
1870
1871 if (progress)
1872 live_intervals_valid = false;
1873
1874 return progress;
1875 }
1876
1877 /**
1878 * Walks through basic blocks, looking for repeated MRF writes and
1879 * removing the later ones.
1880 */
1881 bool
1882 fs_visitor::remove_duplicate_mrf_writes()
1883 {
1884 fs_inst *last_mrf_move[16];
1885 bool progress = false;
1886
1887 /* Need to update the MRF tracking for compressed instructions. */
1888 if (dispatch_width == 16)
1889 return false;
1890
1891 memset(last_mrf_move, 0, sizeof(last_mrf_move));
1892
1893 foreach_list_safe(node, &this->instructions) {
1894 fs_inst *inst = (fs_inst *)node;
1895
1896 switch (inst->opcode) {
1897 case BRW_OPCODE_DO:
1898 case BRW_OPCODE_WHILE:
1899 case BRW_OPCODE_IF:
1900 case BRW_OPCODE_ELSE:
1901 case BRW_OPCODE_ENDIF:
1902 memset(last_mrf_move, 0, sizeof(last_mrf_move));
1903 continue;
1904 default:
1905 break;
1906 }
1907
1908 if (inst->opcode == BRW_OPCODE_MOV &&
1909 inst->dst.file == MRF) {
1910 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
1911 if (prev_inst && inst->equals(prev_inst)) {
1912 inst->remove();
1913 progress = true;
1914 continue;
1915 }
1916 }
1917
1918 /* Clear out the last-write records for MRFs that were overwritten. */
1919 if (inst->dst.file == MRF) {
1920 last_mrf_move[inst->dst.reg] = NULL;
1921 }
1922
1923 if (inst->mlen > 0) {
1924 /* Found a SEND instruction, which will include two or fewer
1925 * implied MRF writes. We could do better here.
1926 */
1927 for (int i = 0; i < implied_mrf_writes(inst); i++) {
1928 last_mrf_move[inst->base_mrf + i] = NULL;
1929 }
1930 }
1931
1932 /* Clear out any MRF move records whose sources got overwritten. */
1933 if (inst->dst.file == GRF) {
1934 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
1935 if (last_mrf_move[i] &&
1936 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
1937 last_mrf_move[i] = NULL;
1938 }
1939 }
1940 }
1941
1942 if (inst->opcode == BRW_OPCODE_MOV &&
1943 inst->dst.file == MRF &&
1944 inst->src[0].file == GRF &&
1945 !inst->predicate) {
1946 last_mrf_move[inst->dst.reg] = inst;
1947 }
1948 }
1949
1950 if (progress)
1951 live_intervals_valid = false;
1952
1953 return progress;
1954 }
1955
1956 void
1957 fs_visitor::dump_instruction(fs_inst *inst)
1958 {
1959 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
1960 opcode_descs[inst->opcode].name) {
1961 printf("%s", opcode_descs[inst->opcode].name);
1962 } else {
1963 printf("op%d", inst->opcode);
1964 }
1965 if (inst->saturate)
1966 printf(".sat");
1967 printf(" ");
1968
1969 switch (inst->dst.file) {
1970 case GRF:
1971 printf("vgrf%d", inst->dst.reg);
1972 if (inst->dst.reg_offset)
1973 printf("+%d", inst->dst.reg_offset);
1974 break;
1975 case MRF:
1976 printf("m%d", inst->dst.reg);
1977 break;
1978 case BAD_FILE:
1979 printf("(null)");
1980 break;
1981 case UNIFORM:
1982 printf("***u%d***", inst->dst.reg);
1983 break;
1984 default:
1985 printf("???");
1986 break;
1987 }
1988 printf(", ");
1989
1990 for (int i = 0; i < 3; i++) {
1991 if (inst->src[i].negate)
1992 printf("-");
1993 if (inst->src[i].abs)
1994 printf("|");
1995 switch (inst->src[i].file) {
1996 case GRF:
1997 printf("vgrf%d", inst->src[i].reg);
1998 if (inst->src[i].reg_offset)
1999 printf("+%d", inst->src[i].reg_offset);
2000 break;
2001 case MRF:
2002 printf("***m%d***", inst->src[i].reg);
2003 break;
2004 case UNIFORM:
2005 printf("u%d", inst->src[i].reg);
2006 if (inst->src[i].reg_offset)
2007 printf(".%d", inst->src[i].reg_offset);
2008 break;
2009 case BAD_FILE:
2010 printf("(null)");
2011 break;
2012 default:
2013 printf("???");
2014 break;
2015 }
2016 if (inst->src[i].abs)
2017 printf("|");
2018
2019 if (i < 2)
2020 printf(", ");
2021 }
2022
2023 printf(" ");
2024
2025 if (inst->force_uncompressed)
2026 printf("1sthalf ");
2027
2028 if (inst->force_sechalf)
2029 printf("2ndhalf ");
2030
2031 printf("\n");
2032 }
2033
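/** Prints the whole instruction list, one instruction per line,
 * prefixed with its IP (instruction position).
 */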
2034 void
2035 fs_visitor::dump_instructions()
2036 {
2037 int ip = 0;
2038 foreach_list(node, &this->instructions) {
2039 fs_inst *inst = (fs_inst *)node;
2040 printf("%d: ", ip++);
2041 dump_instruction(inst);
2042 }
2043 }
2044
/**
 * Returns the instruction at @param end if it is the one that wrote
 * @param reg and is safe to rewrite; otherwise returns NULL.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree, so if we have some instruction that writes a
 * temporary, we're free to point that temporary write somewhere else.
 *
 * Note that this doesn't guarantee that the returned instruction wrote
 * only to reg -- it might be the size=4 destination of a texture
 * instruction.
 */
2058 fs_inst *
2059 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2060 fs_inst *end,
2061 fs_reg reg)
2062 {
2063 if (end == start ||
2064 end->predicate ||
2065 end->force_uncompressed ||
2066 end->force_sechalf ||
2067 !reg.equals(end->dst)) {
2068 return NULL;
2069 } else {
2070 return end;
2071 }
2072 }
2073
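/**
 * Assigns payload register numbers for the gen6+ thread payload layout.
 *
 * The register comments below give the fixed positions in the full
 * 32-pixel layout; the indices actually assigned are packed, counting
 * only the registers present at this dispatch width.
 */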
2074 void
2075 fs_visitor::setup_payload_gen6()
2076 {
2077 struct intel_context *intel = &brw->intel;
2078 bool uses_depth =
2079 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2080 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2081
2082 assert(intel->gen >= 6);
2083
2084 /* R0-1: masks, pixel X/Y coordinates. */
2085 c->nr_payload_regs = 2;
/* R2: only for 32-pixel dispatch. */
2087
2088 /* R3-26: barycentric interpolation coordinates. These appear in the
2089 * same order that they appear in the brw_wm_barycentric_interp_mode
2090 * enum. Each set of coordinates occupies 2 registers if dispatch width
2091 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2092 * appear if they were enabled using the "Barycentric Interpolation
2093 * Mode" bits in WM_STATE.
2094 */
2095 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2096 if (barycentric_interp_modes & (1 << i)) {
2097 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2098 c->nr_payload_regs += 2;
2099 if (dispatch_width == 16) {
2100 c->nr_payload_regs += 2;
2101 }
2102 }
2103 }
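/* For example (illustrative): with two barycentric modes enabled at
 * dispatch_width == 16, the first set lands at payload registers 2-5
 * and the second at 6-9, leaving c->nr_payload_regs at 10.
 */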
2104
2105 /* R27: interpolated depth if uses source depth */
2106 if (uses_depth) {
2107 c->source_depth_reg = c->nr_payload_regs;
2108 c->nr_payload_regs++;
2109 if (dispatch_width == 16) {
2110 /* R28: interpolated depth if not 8-wide. */
2111 c->nr_payload_regs++;
2112 }
2113 }
/* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.  Reading
 * gl_FragCoord needs both interpolated depth and W, so the same
 * uses_depth flag gates source W here.
 */
2115 if (uses_depth) {
2116 c->source_w_reg = c->nr_payload_regs;
2117 c->nr_payload_regs++;
2118 if (dispatch_width == 16) {
2119 /* R30: interpolated W if not 8-wide. */
2120 c->nr_payload_regs++;
2121 }
2122 }
2123 /* R31: MSAA position offsets. */
2124 /* R32-: bary for 32-pixel. */
2125 /* R58-59: interp W for 32-pixel. */
2126
2127 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2128 c->source_depth_to_render_target = true;
2129 }
2130 }
2131
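/**
 * Runs the complete compile for this visitor's dispatch width: payload
 * setup, IR translation, optimization, and register allocation.
 * Returns false (with fail_msg set) if compilation failed.
 */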
2132 bool
2133 fs_visitor::run()
2134 {
2135 uint32_t orig_nr_params = c->prog_data.nr_params;
2136
2137 if (intel->gen >= 6)
2138 setup_payload_gen6();
2139 else
2140 setup_payload_gen4();
2141
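/* Compile-time debug switch: flip to emit a trivial dummy shader
 * instead of the real program.
 */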
2142 if (0) {
2143 emit_dummy_fs();
2144 } else {
2145 calculate_urb_setup();
2146 if (intel->gen < 6)
2147 emit_interpolation_setup_gen4();
2148 else
2149 emit_interpolation_setup_gen6();
2150
/* Generate FS IR for main().  (The visitor only descends into
 * functions called "main".)
 */
2154 if (shader) {
2155 foreach_list(node, &*shader->ir) {
2156 ir_instruction *ir = (ir_instruction *)node;
2157 base_ir = ir;
2158 this->result = reg_undef;
2159 ir->accept(this);
2160 }
2161 } else {
2162 emit_fragment_program_code();
2163 }
2164 base_ir = NULL;
2165 if (failed)
2166 return false;
2167
2168 emit_fb_writes();
2169
2170 split_virtual_grfs();
2171
2172 setup_paramvalues_refs();
2173 setup_pull_constants();
2174
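/* Run the optimization passes to a fixed point: each pass can expose
 * new opportunities for the others (copy propagation creates dead
 * code, dead code elimination frees up coalescing opportunities, and
 * so on), so keep looping until a full round makes no progress.
 */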
2175 bool progress;
2176 do {
2177 progress = false;
2178
2179 compact_virtual_grfs();
2180
2181 progress = remove_duplicate_mrf_writes() || progress;
2182
2183 progress = opt_algebraic() || progress;
2184 progress = opt_cse() || progress;
2185 progress = opt_copy_propagate() || progress;
2186 progress = dead_code_eliminate() || progress;
2187 progress = register_coalesce() || progress;
2188 progress = register_coalesce_2() || progress;
2189 progress = compute_to_mrf() || progress;
2190 } while (progress);
2191
2192 remove_dead_constants();
2193
2194 schedule_instructions();
2195
2196 assign_curb_setup();
2197 assign_urb_setup();
2198
2199 if (0) {
2200 /* Debug of register spilling: Go spill everything. */
2201 for (int i = 0; i < virtual_grf_count; i++) {
2202 spill_reg(i);
2203 }
2204 }
2205
2206 if (0)
2207 assign_regs_trivial();
2208 else {
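/* When allocation fails, assign_regs() picks a virtual GRF to spill
 * and returns false, so keep retrying until allocation succeeds or
 * spilling itself fails.
 */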
2209 while (!assign_regs()) {
2210 if (failed)
2211 break;
2212 }
2213 }
2214 }
2215 assert(force_uncompressed_stack == 0);
2216 assert(force_sechalf_stack == 0);
2217
2218 if (failed)
2219 return false;
2220
2221 if (dispatch_width == 8) {
2222 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2223 } else {
2224 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2225
/* Make sure we didn't try to sneak in an extra uniform. */
2227 assert(orig_nr_params == c->prog_data.nr_params);
2228 (void) orig_nr_params;
2229 }
2230
2231 return !failed;
2232 }
2233
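/**
 * Top-level entry point for FS compilation: always compiles an 8-wide
 * program, opportunistically compiles a 16-wide one as well, and hands
 * both instruction lists to the generator for assembly.
 */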
2234 const unsigned *
2235 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2236 struct gl_fragment_program *fp,
2237 struct gl_shader_program *prog,
2238 unsigned *final_assembly_size)
2239 {
2240 struct intel_context *intel = &brw->intel;
2241 bool start_busy = false;
2242 float start_time = 0;
2243
2244 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2245 start_busy = (intel->batch.last_bo &&
2246 drm_intel_bo_busy(intel->batch.last_bo));
2247 start_time = get_time();
2248 }
2249
2250 struct brw_shader *shader = NULL;
2251 if (prog)
2252 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2253
2254 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2255 if (shader) {
2256 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2257 _mesa_print_ir(shader->ir, NULL);
2258 printf("\n\n");
2259 } else {
printf("ARB_fragment_program %d IR for native fragment shader\n",
2261 fp->Base.Id);
2262 _mesa_print_program(&fp->Base);
2263 }
2264 }
2265
2266 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2267 */
2268 fs_visitor v(brw, c, prog, fp, 8);
2269 if (!v.run()) {
2270 prog->LinkStatus = false;
2271 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2272
2273 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2274 v.fail_msg);
2275
2276 return NULL;
2277 }
2278
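/* Try an additional 16-wide compile, reusing the 8-wide uniform
 * layout.  It is skipped when pull constants are in use, presumably
 * because this path doesn't handle them, and on gen4, where 16-wide
 * dispatch has extra restrictions.
 */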
2279 exec_list *simd16_instructions = NULL;
2280 fs_visitor v2(brw, c, prog, fp, 16);
2281 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2282 v2.import_uniforms(&v);
2283 if (!v2.run()) {
2284 perf_debug("16-wide shader failed to compile, falling back to "
2285 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2286 } else {
2287 simd16_instructions = &v2.instructions;
2288 }
2289 }
2290
2291 c->prog_data.dispatch_width = 8;
2292
2293 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2294 const unsigned *generated = g.generate_assembly(&v.instructions,
2295 simd16_instructions,
2296 final_assembly_size);
2297
2298 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2299 if (shader->compiled_once)
2300 brw_wm_debug_recompile(brw, prog, &c->key);
2301 shader->compiled_once = true;
2302
2303 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2304 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2305 (get_time() - start_time) * 1000);
2306 }
2307 }
2308
2309 return generated;
2310 }
2311
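/**
 * Precompiles the fragment shader at link time with a guessed state
 * key, so that a matching program is already in the cache before the
 * first draw.  If the guesses turn out wrong, the normal state-based
 * recompile path still produces the right program later.
 */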
2312 bool
2313 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2314 {
2315 struct brw_context *brw = brw_context(ctx);
2316 struct intel_context *intel = &brw->intel;
2317 struct brw_wm_prog_key key;
2318
2319 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2320 return true;
2321
2322 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2323 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2324 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2325 bool program_uses_dfdy = fp->UsesDFdy;
2326
2327 memset(&key, 0, sizeof(key));
2328
2329 if (intel->gen < 6) {
2330 if (fp->UsesKill)
2331 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2332
2333 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2334 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2335
2336 /* Just assume depth testing. */
2337 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2338 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2339 }
2340
2341 if (prog->Name != 0)
2342 key.proj_attrib_mask = 0xffffffff;
2343
2344 if (intel->gen < 6)
2345 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2346
2347 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2348 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2349 continue;
2350
2351 if (prog->Name == 0)
2352 key.proj_attrib_mask |= 1 << i;
2353
2354 if (intel->gen < 6) {
2355 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2356
2357 if (vp_index >= 0)
2358 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2359 }
2360 }
2361
2362 key.clamp_fragment_color = true;
2363
2364 for (int i = 0; i < MAX_SAMPLERS; i++) {
2365 if (fp->Base.ShadowSamplers & (1 << i)) {
2366 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2367 key.tex.swizzles[i] =
2368 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2369 } else {
2370 /* Color sampler: assume no swizzling. */
2371 key.tex.swizzles[i] = SWIZZLE_XYZW;
2372 }
2373 }
2374
2375 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2376 key.drawable_height = ctx->DrawBuffer->Height;
2377 }
2378
2379 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2380 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2381 }
2382
2383 key.nr_color_regions = 1;
2384
2385 key.program_string_id = bfp->id;
2386
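/* do_wm_prog() updates brw->wm.prog_offset and prog_data as a side
 * effect; save and restore them so precompiling doesn't disturb the
 * currently bound program state.
 */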
2387 uint32_t old_prog_offset = brw->wm.prog_offset;
2388 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2389
2390 bool success = do_wm_prog(brw, prog, bfp, &key);
2391
2392 brw->wm.prog_offset = old_prog_offset;
2393 brw->wm.prog_data = old_prog_data;
2394
2395 return success;
2396 }