i965/fs: Move brw_wm_compile::fp to fs_visitor.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
void
fs_inst::init()
{
   /* Zero the whole instruction, then set the few fields whose default
    * is not zero.  All other members rely on the memset for their
    * initial state.
    */
   memset(this, 0, sizeof(*this));
   this->opcode = BRW_OPCODE_NOP;
   this->conditional_mod = BRW_CONDITIONAL_NONE;

   /* Mark the destination and all three sources as unused. */
   this->dst = reg_undef;
   this->src[0] = reg_undef;
   this->src[1] = reg_undef;
   this->src[2] = reg_undef;
}
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
/* Helpers that define fs_visitor emitter methods named after their BRW
 * opcode.  ALU1 expands to a one-source emitter, ALU2 to a two-source
 * emitter; each allocates the new fs_inst out of mem_ctx (the caller is
 * responsible for adding it to the instruction stream).
 */
#define ALU1(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0)                              \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0);          \
   }

#define ALU2(op)                                                        \
   fs_inst *                                                            \
   fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1)                 \
   {                                                                    \
      return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1);    \
   }

ALU1(NOT)
ALU1(MOV)
ALU1(FRC)
ALU1(RNDD)
ALU1(RNDE)
ALU1(RNDZ)
ALU2(ADD)
ALU2(MUL)
ALU2(MACH)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHL)
ALU2(SHR)
ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
/**
 * CMP: Sets the low bit of the destination channels with the result
 * of the comparison, while the upper bits are undefined, and updates
 * the flag register with the packed 16 bits of the result.
 */
fs_inst *
fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
{
   fs_inst *inst;

   /* Take the instruction:
    *
    * CMP null<d> src0<f> src1<f>
    *
    * Original gen4 does type conversion to the destination type before
    * comparison, producing garbage results for floating point comparisons.
    * gen5 does the comparison on the execution type (resolved source types),
    * so dst type doesn't matter.  gen6 does comparison and then uses the
    * result as if it was the dst type with no conversion, which happens to
    * mostly work out for float-interpreted-as-int since our comparisons are
    * for >0, =0, <0.
    */
   if (intel->gen == 4) {
      dst.type = src0.type;
      if (dst.file == FIXED_HW_REG)
         dst.fixed_hw_reg.type = dst.type;
   }

   /* Rewrite negated unsigned sources into a form the comparison can
    * consume; see resolve_ud_negate().
    */
   resolve_ud_negate(&src0);
   resolve_ud_negate(&src1);

   inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
   inst->conditional_mod = condition;

   return inst;
}
221
222 bool
223 fs_inst::equals(fs_inst *inst)
224 {
225 return (opcode == inst->opcode &&
226 dst.equals(inst->dst) &&
227 src[0].equals(inst->src[0]) &&
228 src[1].equals(inst->src[1]) &&
229 src[2].equals(inst->src[2]) &&
230 saturate == inst->saturate &&
231 predicate == inst->predicate &&
232 conditional_mod == inst->conditional_mod &&
233 mlen == inst->mlen &&
234 base_mrf == inst->base_mrf &&
235 sampler == inst->sampler &&
236 target == inst->target &&
237 eot == inst->eot &&
238 header_present == inst->header_present &&
239 shadow_compare == inst->shadow_compare &&
240 offset == inst->offset);
241 }
242
243 int
244 fs_inst::regs_written()
245 {
246 if (is_tex())
247 return 4;
248
249 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
250 * but we don't currently use them...nor do we have an opcode for them.
251 */
252
253 return 1;
254 }
255
256 bool
257 fs_inst::overwrites_reg(const fs_reg &reg)
258 {
259 return (reg.file == dst.file &&
260 reg.reg == dst.reg &&
261 reg.reg_offset >= dst.reg_offset &&
262 reg.reg_offset < dst.reg_offset + regs_written());
263 }
264
265 bool
266 fs_inst::is_tex()
267 {
268 return (opcode == SHADER_OPCODE_TEX ||
269 opcode == FS_OPCODE_TXB ||
270 opcode == SHADER_OPCODE_TXD ||
271 opcode == SHADER_OPCODE_TXF ||
272 opcode == SHADER_OPCODE_TXL ||
273 opcode == SHADER_OPCODE_TXS);
274 }
275
276 bool
277 fs_inst::is_math()
278 {
279 return (opcode == SHADER_OPCODE_RCP ||
280 opcode == SHADER_OPCODE_RSQ ||
281 opcode == SHADER_OPCODE_SQRT ||
282 opcode == SHADER_OPCODE_EXP2 ||
283 opcode == SHADER_OPCODE_LOG2 ||
284 opcode == SHADER_OPCODE_SIN ||
285 opcode == SHADER_OPCODE_COS ||
286 opcode == SHADER_OPCODE_INT_QUOTIENT ||
287 opcode == SHADER_OPCODE_INT_REMAINDER ||
288 opcode == SHADER_OPCODE_POW);
289 }
290
void
fs_reg::init()
{
   /* Zero every field, then set the one member whose default isn't 0:
    * smear is -1 when no single channel has been selected.
    */
   memset(this, 0, sizeof(*this));
   this->smear = -1;
}
297
/** Generic unset register constructor: the register refers to nothing. */
fs_reg::fs_reg()
{
   init();
   this->file = BAD_FILE;
}
304
/** Immediate value constructor: float, stored as BRW_REGISTER_TYPE_F. */
fs_reg::fs_reg(float f)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

/** Immediate value constructor: signed int, stored as BRW_REGISTER_TYPE_D. */
fs_reg::fs_reg(int32_t i)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

/** Immediate value constructor: unsigned int, stored as BRW_REGISTER_TYPE_UD. */
fs_reg::fs_reg(uint32_t u)
{
   init();
   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}
331
/** Fixed brw_reg Immediate value constructor.
 *
 * Wraps an already-assigned hardware register, inheriting its type.
 */
fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
{
   init();
   this->file = FIXED_HW_REG;
   this->fixed_hw_reg = fixed_hw_reg;
   this->type = fixed_hw_reg.type;
}
340
341 bool
342 fs_reg::equals(const fs_reg &r) const
343 {
344 return (file == r.file &&
345 reg == r.reg &&
346 reg_offset == r.reg_offset &&
347 type == r.type &&
348 negate == r.negate &&
349 abs == r.abs &&
350 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
351 sizeof(fixed_hw_reg)) == 0 &&
352 smear == r.smear &&
353 imm.u == r.imm.u);
354 }
355
356 int
357 fs_visitor::type_size(const struct glsl_type *type)
358 {
359 unsigned int size, i;
360
361 switch (type->base_type) {
362 case GLSL_TYPE_UINT:
363 case GLSL_TYPE_INT:
364 case GLSL_TYPE_FLOAT:
365 case GLSL_TYPE_BOOL:
366 return type->components();
367 case GLSL_TYPE_ARRAY:
368 return type_size(type->fields.array) * type->length;
369 case GLSL_TYPE_STRUCT:
370 size = 0;
371 for (i = 0; i < type->length; i++) {
372 size += type_size(type->fields.structure[i].type);
373 }
374 return size;
375 case GLSL_TYPE_SAMPLER:
376 /* Samplers take up no register space, since they're baked in at
377 * link time.
378 */
379 return 0;
380 default:
381 assert(!"not reached");
382 return 0;
383 }
384 }
385
/**
 * Mark the compile as failed, recording a printf-formatted reason.
 *
 * Only the first failure's message is kept in fail_msg; later calls
 * return immediately.  The message is also echoed to stderr when the
 * DEBUG_WM debug flag is set.
 */
void
fs_visitor::fail(const char *format, ...)
{
   va_list va;
   char *msg;

   if (failed)
      return;

   failed = true;

   va_start(va, format);
   msg = ralloc_vasprintf(mem_ctx, format, va);
   va_end(va);
   msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);

   this->fail_msg = msg;

   if (INTEL_DEBUG & DEBUG_WM) {
      fprintf(stderr, "%s", msg);
   }
}
408
/* Convenience emitters: each constructs an fs_inst with the given
 * operands and appends it to the instruction stream via the emit(fs_inst)
 * overload, returning the emitted instruction.
 */
fs_inst *
fs_visitor::emit(enum opcode opcode)
{
   return emit(fs_inst(opcode));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst)
{
   return emit(fs_inst(opcode, dst));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
{
   return emit(fs_inst(opcode, dst, src0));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   return emit(fs_inst(opcode, dst, src0, src1));
}

fs_inst *
fs_visitor::emit(enum opcode opcode, fs_reg dst,
                 fs_reg src0, fs_reg src1, fs_reg src2)
{
   return emit(fs_inst(opcode, dst, src0, src1, src2));
}
439
/* Depth counters for nested force-uncompressed / force-sechalf regions.
 * Pushes and pops must balance; the asserts catch an unmatched pop.
 */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}

void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   assert(force_uncompressed_stack >= 0);
}

void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}

void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   assert(force_sechalf_stack >= 0);
}
465
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   /* No message payload means no MRFs touched. */
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      /* Unary math: one operand register, doubled for 16-wide dispatch. */
      return 1 * dispatch_width / 8;
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      /* Binary math: two operand registers, doubled for 16-wide dispatch. */
      return 2 * dispatch_width / 8;
   case SHADER_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case SHADER_OPCODE_TXD:
   case SHADER_OPCODE_TXF:
   case SHADER_OPCODE_TXL:
   case SHADER_OPCODE_TXS:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
510
511 int
512 fs_visitor::virtual_grf_alloc(int size)
513 {
514 if (virtual_grf_array_size <= virtual_grf_count) {
515 if (virtual_grf_array_size == 0)
516 virtual_grf_array_size = 16;
517 else
518 virtual_grf_array_size *= 2;
519 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
520 virtual_grf_array_size);
521 }
522 virtual_grf_sizes[virtual_grf_count] = size;
523 return virtual_grf_count++;
524 }
525
/** Fixed HW reg constructor; the type defaults to 32-bit float. */
fs_reg::fs_reg(enum register_file file, int reg)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = BRW_REGISTER_TYPE_F;
}

/** Fixed HW reg constructor with an explicit register type. */
fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
{
   init();
   this->file = file;
   this->reg = reg;
   this->type = type;
}
543
/** Automatic reg constructor.
 *
 * Allocates a fresh virtual GRF big enough to hold @type and derives
 * the register type from the GLSL base type.
 */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}
554
/** Returns the fs_reg assigned to @var, or NULL if it has no entry. */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}
560
561 void
562 import_uniforms_callback(const void *key,
563 void *data,
564 void *closure)
565 {
566 struct hash_table *dst_ht = (struct hash_table *)closure;
567 const fs_reg *reg = (const fs_reg *)data;
568
569 if (reg->file != UNIFORM)
570 return;
571
572 hash_table_insert(dst_ht, data, key);
573 }
574
/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions.
 */
void
fs_visitor::import_uniforms(fs_visitor *v)
{
   /* Copy every UNIFORM-file entry of v's variable hash table into ours
    * (see import_uniforms_callback) and share its params remap table.
    */
   hash_table_call_foreach(v->variable_ht,
			   import_uniforms_callback,
			   variable_ht);
   this->params_remap = v->params_remap;
}
586
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 *
 * Recursively walks @type, recording one (param_index, param_offset)
 * pair per scalar component starting at parameter location @loc.
 * Returns the number of parameter locations consumed.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   /* A matrix is laid out as matrix_columns consecutive column vectors. */
   if (type->is_matrix()) {
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
							type->vector_elements,
							1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
	 offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      /* A scalar/vector occupies one location; each component becomes
       * one prog_data param slot pointing at (loc, component).
       */
      for (unsigned int i = 0; i < type->vector_elements; i++) {
	 unsigned int param = c->prog_data.nr_params++;

	 this->param_index[param] = loc;
	 this->param_offset[param] = i;
      }
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset,
					type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
	 offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
644
645
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
					    (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
	 int swiz = GET_SWZ(slots[i].swizzle, j);
	 /* A channel repeating the previous one ends the scan — emitting
	  * it again would produce a redundant parameter.
	  */
	 if (swiz == last_swiz)
	    break;
	 last_swiz = swiz;

	 this->param_index[c->prog_data.nr_params] = index;
	 this->param_offset[c->prog_data.nr_params] = swiz;
	 c->prog_data.nr_params++;
      }
   }
}
680
/**
 * Sets up and returns the register holding gl_FragCoord.
 *
 * X and Y come from the computed pixel_x/pixel_y (offset by 0.5 unless
 * pixel_center_integer is set; Y is additionally flipped when rendering
 * to the window system rather than an FBO).  Z is the source depth — a
 * payload register on Gen6+, interpolated from WPOS setup data earlier.
 * W reuses wpos_w, which was computed during emit_interpolation.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_x));
   } else {
      emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(MOV(wpos, this->pixel_y));
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      /* Flipping: y' = (height - 1) - y, folded into negate + offset. */
      if (flip) {
	 pixel_y.negate = true;
	 offset += c->key.drawable_height - 1.0;
      }

      emit(ADD(wpos, pixel_y, fs_reg(offset)));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
   } else {
      emit(FS_OPCODE_LINTERP, wpos,
           this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
728
729 fs_inst *
730 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
731 glsl_interp_qualifier interpolation_mode,
732 bool is_centroid)
733 {
734 brw_wm_barycentric_interp_mode barycoord_mode;
735 if (is_centroid) {
736 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
737 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
738 else
739 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
740 } else {
741 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
742 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
743 else
744 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
745 }
746 return emit(FS_OPCODE_LINTERP, attr,
747 this->delta_x[barycoord_mode],
748 this->delta_y[barycoord_mode], interp);
749 }
750
/**
 * Emits interpolation code for a general fragment shader input.
 *
 * Walks each array element and matrix column of @ir, and for every live
 * URB slot emits either constant (flat-shaded) moves from the setup
 * data or per-component linear interpolation.  Returns the register
 * holding the interpolated value.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
	 fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   glsl_interp_qualifier interpolation_mode =
      ir->determine_interpolation_mode(c->key.flat_shade);

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
	 if (urb_setup[location] == -1) {
	    /* If there's no incoming setup data for this slot, don't
	     * emit interpolation for it.
	     */
	    attr.reg_offset += type->vector_elements;
	    location++;
	    continue;
	 }

	 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
	    /* Constant interpolation (flat shading) case. The SF has
	     * handed us defined values in only the constant offset
	     * field of the setup reg.
	     */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       struct brw_reg interp = interp_reg(location, k);
	       interp = suboffset(interp, 3);
	       interp.type = reg->type;
	       emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
	       attr.reg_offset++;
	    }
	 } else {
	    /* Smooth/noperspective interpolation case. */
	    for (unsigned int k = 0; k < type->vector_elements; k++) {
	       /* FINISHME: At some point we probably want to push
		* this farther by giving similar treatment to the
		* other potentially constant components of the
		* attribute, as well as making brw_vs_constval.c
		* handle varyings other than gl_TexCoord.
		*/
	       if (location >= FRAG_ATTRIB_TEX0 &&
		   location <= FRAG_ATTRIB_TEX7 &&
		   k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
		  /* Non-projected texcoords have a constant 1.0 in .w. */
		  emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
	       } else {
		  struct brw_reg interp = interp_reg(location, k);
		  emit_linterp(attr, fs_reg(interp), interpolation_mode,
			       ir->centroid);
		  if (brw->needs_unlit_centroid_workaround && ir->centroid) {
		     /* Get the pixel/sample mask into f0 so that we know
		      * which pixels are lit.  Then, for each channel that is
		      * unlit, replace the centroid data with non-centroid
		      * data.
		      */
		     emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
		     fs_inst *inst = emit_linterp(attr, fs_reg(interp),
						  interpolation_mode, false);
		     inst->predicate = BRW_PREDICATE_NORMAL;
		     inst->predicate_inverse = true;
		  }
		  if (intel->gen < 6) {
		     /* Multiply by pixel_w — presumably undoing the
		      * perspective divide done for the setup data;
		      * confirm against the interpolation setup code.
		      */
		     emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
		  }
	       }
	       attr.reg_offset++;
	    }

	 }
	 location++;
      }
   }

   return reg;
}
842
/**
 * Sets up and returns a register holding gl_FrontFacing as 0/1.
 *
 * The front-facing information arrives as a bit in the thread payload:
 * Gen6+ derives it from g0 (arithmetic shift, invert, mask to one bit);
 * earlier parts compare bit 31 of g1.6, which is "primitive is back
 * face".
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      emit(BRW_OPCODE_ASR, *reg,
	   fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
	   fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
866
/**
 * Emits a one-source math instruction, handling generation quirks.
 *
 * Asserts the opcode is a supported unary math function.  On Gen6,
 * operands that the math unit can't consume directly (uniforms or
 * sources with negate/abs modifiers) are first copied to a temporary;
 * before Gen6, the operand is passed through an MRF message payload
 * (base_mrf/mlen set here).
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out.  We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * Gen 6 hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen == 6 && (src.file == UNIFORM ||
			   src.abs ||
			   src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   /* Pre-Gen6 math needs an MRF message payload. */
   if (intel->gen < 6) {
      inst->base_mrf = 2;
      inst->mlen = dispatch_width / 8;
   }

   return inst;
}
909
/**
 * Emits a two-source math instruction (POW / INT_QUOTIENT /
 * INT_REMAINDER), lowering it appropriately per generation.
 *
 * Gen7+ can emit it directly; Gen6 first copies awkward operands
 * (uniforms or modified sources) to temporaries; earlier parts pass the
 * second operand through an MRF payload, swapping operand order for the
 * integer division functions per the Ironlake PRM.
 */
fs_inst *
fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   switch (opcode) {
   case SHADER_OPCODE_POW:
   case SHADER_OPCODE_INT_QUOTIENT:
   case SHADER_OPCODE_INT_REMAINDER:
      break;
   default:
      assert(!"not reached: unsupported binary math opcode.");
      return NULL;
   }

   if (intel->gen >= 7) {
      inst = emit(opcode, dst, src0, src1);
   } else if (intel->gen == 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 expanded.type = src0.type;
	 emit(BRW_OPCODE_MOV, expanded, src0);
	 src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
	 fs_reg expanded = fs_reg(this, glsl_type::float_type);
	 expanded.type = src1.type;
	 emit(BRW_OPCODE_MOV, expanded, src1);
	 src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
       * "Message Payload":
       *
       * "Operand0[7].  For the INT DIV functions, this operand is the
       *  denominator."
       *  ...
       * "Operand1[7].  For the INT DIV functions, this operand is the
       *  numerator."
       */
      bool is_int_div = opcode != SHADER_OPCODE_POW;
      fs_reg &op0 = is_int_div ? src1 : src0;
      fs_reg &op1 = is_int_div ? src0 : src1;

      /* The second operand travels in the message payload MRF. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
      inst = emit(opcode, dst, op0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * dispatch_width / 8;
   }
   return inst;
}
971
/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 */
void
fs_visitor::setup_paramvalues_refs()
{
   /* Only the 8-wide compile fills in param[]; the 16-wide compile
    * follows the 8-wide uniform setup (see import_uniforms).
    */
   if (dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
	 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
	 this->param_offset[i];
   }
}
990
/**
 * Assigns push-constant (CURB) locations.
 *
 * Records how many registers of constants the payload carries and which
 * GRF the constants start at for this dispatch width, then rewrites
 * every UNIFORM-file source to the fixed hardware register it lands in
 * (eight float constants per GRF).
 */
void
fs_visitor::assign_curb_setup()
{
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (unsigned int i = 0; i < 3; i++) {
	 if (inst->src[i].file == UNIFORM) {
	    int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
	    struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
						  constant_nr / 8,
						  constant_nr % 8);

	    inst->src[i].file = FIXED_HW_REG;
	    inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
	 }
      }
   }
}
1018
/**
 * Decides which URB setup slot each fragment shader input reads from,
 * filling in urb_setup[] (-1 means no incoming setup data).
 *
 * On Gen6+ slots simply follow InputsRead order; on older parts the
 * layout has to match what the SF produced from the VS outputs.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
	 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
	    urb_setup[i] = urb_next++;
	 }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
	 /* Point size is packed into the header, not as a general attribute */
	 if (i == VERT_RESULT_PSIZ)
	    continue;

	 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
	    int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

	    /* The back color slot is skipped when the front color is
	     * also written to.  In addition, some slots can be
	     * written in the vertex shader and not read in the
	     * fragment shader.  So the register number must always be
	     * incremented, mapped or not.
	     */
	    if (fp_index >= 0)
	       urb_setup[fp_index] = urb_next;
	    urb_next++;
	 }
      }

      /*
       * It's a FS only attribute, and we did interpolation for this attribute
       * in SF thread. So, count it here, too.
       *
       * See compile_sf_prog() for more info.
       */
      if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
	 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}
1069
/**
 * Rebases interpolation setup register numbers now that the size of the
 * push-constant section is known.
 *
 * LINTERP's third source and CINTERP's first source hold per-attribute
 * setup registers relative to the start of the URB data; add urb_start
 * to make them real GRF numbers, and record the first GRF free for
 * register allocation.
 */
void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      if (inst->opcode == FS_OPCODE_LINTERP) {
	 assert(inst->src[2].file == FIXED_HW_REG);
	 inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
	 assert(inst->src[0].file == FIXED_HW_REG);
	 inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}
1094
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization.  We can do it once here, safely.  This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs.  If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
	 split_grf[i] = true;
      else
	 split_grf[i] = false;
   }

   if (brw->has_pln &&
       this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
      /* PLN opcodes rely on the delta_xy being contiguous.  We only have to
       * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
       * Gen6, that was the only supported interpolation mode, and since Gen6,
       * delta_x and delta_y are in fixed hardware registers.
       */
      split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
         false;
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* If there's a SEND message that requires contiguous destination
       * registers, no splitting is allowed.
       */
      if (inst->regs_written() > 1) {
	 split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
	 /* The original GRF keeps reg_offset 0; size-1 new single-reg
	  * GRFs cover offsets 1..size-1.  virtual_grf_alloc returns
	  * consecutive numbers, which the assert verifies.
	  */
	 new_virtual_grf[i] = virtual_grf_alloc(1);
	 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
	    int reg = virtual_grf_alloc(1);
	    assert(reg == new_virtual_grf[i] + j - 1);
	    (void) reg;
	 }
	 this->virtual_grf_sizes[i] = 1;
      }
   }

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Point each dst/src at the single-reg GRF now holding its
       * reg_offset; offset 0 stays in the original GRF.
       */
      if (inst->dst.file == GRF &&
	  split_grf[inst->dst.reg] &&
	  inst->dst.reg_offset != 0) {
	 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
			  inst->dst.reg_offset - 1);
	 inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
	 if (inst->src[i].file == GRF &&
	     split_grf[inst->src[i].reg] &&
	     inst->src[i].reg_offset != 0) {
	    inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
				inst->src[i].reg_offset - 1);
	    inst->src[i].reg_offset = 0;
	 }
      }
   }
   /* Register numbering changed, so cached liveness data is stale. */
   this->live_intervals_valid = false;
}
1187
1188 /**
1189 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1190 *
1191 * During code generation, we create tons of temporary variables, many of
1192 * which get immediately killed and are never used again. Yet, in later
1193 * optimization and analysis passes, such as compute_live_intervals, we need
1194 * to loop over all the virtual GRFs. Compacting them can save a lot of
1195 * overhead.
1196 */
void
fs_visitor::compact_virtual_grfs()
{
   /* Mark which virtual GRFs are used, and count how many. */
   int remap_table[this->virtual_grf_count];
   /* -1 is all-ones in every byte, so memset yields "unused" (-1) for each
    * entry; entries later set to 0 mean "used, new index not yet assigned".
    */
   memset(remap_table, -1, sizeof(remap_table));

   foreach_list(node, &this->instructions) {
      const fs_inst *inst = (const fs_inst *) node;

      if (inst->dst.file == GRF)
         remap_table[inst->dst.reg] = 0;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            remap_table[inst->src[i].reg] = 0;
      }
   }

   /* In addition to registers used in instructions, fs_visitor keeps
    * direct references to certain special values which must be patched:
    */
   fs_reg *special[] = {
      &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
      &outputs[0], &outputs[1], &outputs[2], &outputs[3],
      &outputs[4], &outputs[5], &outputs[6], &outputs[7],
      &delta_x[0], &delta_x[1], &delta_x[2],
      &delta_x[3], &delta_x[4], &delta_x[5],
      &delta_y[0], &delta_y[1], &delta_y[2],
      &delta_y[3], &delta_y[4], &delta_y[5],
   };
   /* Guard the hand-unrolled lists above against the array sizes changing. */
   STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
   STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);

   /* Treat all special values as used, to be conservative */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF)
         remap_table[special[i]->reg] = 0;
   }

   /* Compact the GRF arrays. */
   int new_index = 0;
   for (int i = 0; i < this->virtual_grf_count; i++) {
      if (remap_table[i] != -1) {
         remap_table[i] = new_index;
         /* new_index <= i always, so this in-place shift is safe. */
         virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
         if (live_intervals_valid) {
            virtual_grf_use[new_index] = virtual_grf_use[i];
            virtual_grf_def[new_index] = virtual_grf_def[i];
         }
         ++new_index;
      }
   }

   this->virtual_grf_count = new_index;

   /* Patch all the instructions to use the newly renumbered registers */
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *) node;

      if (inst->dst.file == GRF)
         inst->dst.reg = remap_table[inst->dst.reg];

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF)
            inst->src[i].reg = remap_table[inst->src[i].reg];
      }
   }

   /* Patch all the references to special values */
   for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
      if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
         special[i]->reg = remap_table[special[i]->reg];
   }
}
1272
1273 bool
1274 fs_visitor::remove_dead_constants()
1275 {
1276 if (dispatch_width == 8) {
1277 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1278
1279 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1280 this->params_remap[i] = -1;
1281
1282 /* Find which params are still in use. */
1283 foreach_list(node, &this->instructions) {
1284 fs_inst *inst = (fs_inst *)node;
1285
1286 for (int i = 0; i < 3; i++) {
1287 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1288
1289 if (inst->src[i].file != UNIFORM)
1290 continue;
1291
1292 assert(constant_nr < (int)c->prog_data.nr_params);
1293
1294 /* For now, set this to non-negative. We'll give it the
1295 * actual new number in a moment, in order to keep the
1296 * register numbers nicely ordered.
1297 */
1298 this->params_remap[constant_nr] = 0;
1299 }
1300 }
1301
1302 /* Figure out what the new numbers for the params will be. At some
1303 * point when we're doing uniform array access, we're going to want
1304 * to keep the distinction between .reg and .reg_offset, but for
1305 * now we don't care.
1306 */
1307 unsigned int new_nr_params = 0;
1308 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1309 if (this->params_remap[i] != -1) {
1310 this->params_remap[i] = new_nr_params++;
1311 }
1312 }
1313
1314 /* Update the list of params to be uploaded to match our new numbering. */
1315 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1316 int remapped = this->params_remap[i];
1317
1318 if (remapped == -1)
1319 continue;
1320
1321 /* We've already done setup_paramvalues_refs() so no need to worry
1322 * about param_index and param_offset.
1323 */
1324 c->prog_data.param[remapped] = c->prog_data.param[i];
1325 }
1326
1327 c->prog_data.nr_params = new_nr_params;
1328 } else {
1329 /* This should have been generated in the 8-wide pass already. */
1330 assert(this->params_remap);
1331 }
1332
1333 /* Now do the renumbering of the shader to remove unused params. */
1334 foreach_list(node, &this->instructions) {
1335 fs_inst *inst = (fs_inst *)node;
1336
1337 for (int i = 0; i < 3; i++) {
1338 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1339
1340 if (inst->src[i].file != UNIFORM)
1341 continue;
1342
1343 assert(this->params_remap[constant_nr] != -1);
1344 inst->src[i].reg = this->params_remap[constant_nr];
1345 inst->src[i].reg_offset = 0;
1346 }
1347 }
1348
1349 return true;
1350 }
1351
1352 /**
1353 * Choose accesses from the UNIFORM file to demote to using the pull
1354 * constant buffer.
1355 *
1356 * We allow a fragment shader to have more than the specified minimum
1357 * maximum number of fragment shader uniform components (64). If
1358 * there are too many of these, they'd fill up all of register space.
1359 * So, this will push some of them out to the pull constant buffer and
1360 * update the program to load them.
1361 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list.  We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   int pull_uniform_base = max_uniform_components;
   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;

   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
         if (uniform_nr < pull_uniform_base)
            continue;

         /* Emit a constant-buffer load of the 16-byte-aligned slot holding
          * this uniform, right before the instruction that reads it.
          */
         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
         fs_reg offset = fs_reg((unsigned)(((uniform_nr -
                                             pull_uniform_base) * 4) & ~15));
         fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
                                              dst, index, offset);
         /* Keep debug annotations pointing at the original IR. */
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         /* NOTE(review): base_mrf 14 appears to be a scratch MRF reserved
          * for this message -- confirm against the generator's MRF usage.
          */
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         /* Rewrite the source to read the loaded value; smear selects
          * which of the four loaded components holds our uniform.
          */
         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
      }
   }

   /* Move the demoted params from the push list to the pull list. */
   for (int i = 0; i < pull_uniform_count; i++) {
      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
   }
   c->prog_data.nr_params -= pull_uniform_count;
   c->prog_data.nr_pull_params = pull_uniform_count;
}
1418
1419 bool
1420 fs_visitor::opt_algebraic()
1421 {
1422 bool progress = false;
1423
1424 foreach_list(node, &this->instructions) {
1425 fs_inst *inst = (fs_inst *)node;
1426
1427 switch (inst->opcode) {
1428 case BRW_OPCODE_MUL:
1429 if (inst->src[1].file != IMM)
1430 continue;
1431
1432 /* a * 1.0 = a */
1433 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
1434 inst->src[1].imm.f == 1.0) {
1435 inst->opcode = BRW_OPCODE_MOV;
1436 inst->src[1] = reg_undef;
1437 progress = true;
1438 break;
1439 }
1440
1441 /* a * 0.0 = 0.0 */
1442 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
1443 inst->src[1].imm.f == 0.0) {
1444 inst->opcode = BRW_OPCODE_MOV;
1445 inst->src[0] = fs_reg(0.0f);
1446 inst->src[1] = reg_undef;
1447 progress = true;
1448 break;
1449 }
1450
1451 break;
1452 case BRW_OPCODE_ADD:
1453 if (inst->src[1].file != IMM)
1454 continue;
1455
1456 /* a + 0.0 = a */
1457 if (inst->src[1].type == BRW_REGISTER_TYPE_F &&
1458 inst->src[1].imm.f == 0.0) {
1459 inst->opcode = BRW_OPCODE_MOV;
1460 inst->src[1] = reg_undef;
1461 progress = true;
1462 break;
1463 }
1464 break;
1465 default:
1466 break;
1467 }
1468 }
1469
1470 return progress;
1471 }
1472
1473 /**
 * Must be called after calculate_live_intervals() to remove unused
1475 * writes to registers -- register allocation will fail otherwise
1476 * because something deffed but not used won't be considered to
1477 * interfere with other regs.
1478 */
1479 bool
1480 fs_visitor::dead_code_eliminate()
1481 {
1482 bool progress = false;
1483 int pc = 0;
1484
1485 calculate_live_intervals();
1486
1487 foreach_list_safe(node, &this->instructions) {
1488 fs_inst *inst = (fs_inst *)node;
1489
1490 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1491 inst->remove();
1492 progress = true;
1493 }
1494
1495 pc++;
1496 }
1497
1498 if (progress)
1499 live_intervals_valid = false;
1500
1501 return progress;
1502 }
1503
1504 /**
1505 * Implements a second type of register coalescing: This one checks if
1506 * the two regs involved in a raw move don't interfere, in which case
1507 * they can both by stored in the same place and the MOV removed.
1508 */
bool
fs_visitor::register_coalesce_2()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Only raw, unmodified, size-1 GRF-to-GRF MOVs whose source and
       * destination live ranges don't overlap are candidates.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->src[0].file != GRF ||
          inst->src[0].negate ||
          inst->src[0].abs ||
          inst->src[0].smear != -1 ||
          inst->dst.file != GRF ||
          inst->dst.type != inst->src[0].type ||
          virtual_grf_sizes[inst->src[0].reg] != 1 ||
          virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
         continue;
      }

      int reg_from = inst->src[0].reg;
      assert(inst->src[0].reg_offset == 0);
      int reg_to = inst->dst.reg;
      int reg_to_offset = inst->dst.reg_offset;

      /* Rewrite every def and use of reg_from (across the whole program)
       * to reg_to.  Note: this inner loop's `node` shadows the outer one.
       */
      foreach_list_safe(node, &this->instructions) {
         fs_inst *scan_inst = (fs_inst *)node;

         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == reg_from) {
            scan_inst->dst.reg = reg_to;
            scan_inst->dst.reg_offset = reg_to_offset;
         }
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == reg_from) {
               scan_inst->src[i].reg = reg_to;
               scan_inst->src[i].reg_offset = reg_to_offset;
            }
         }
      }

      /* The MOV itself became reg_to = reg_to; drop it. */
      inst->remove();
      live_intervals_valid = false;
      progress = true;
      continue;
   }

   return progress;
}
1563
/**
 * Forward-propagates raw MOVs: uses of the MOV's destination are rewritten
 * to read its source, and the MOV is removed, when no later instruction
 * overwrites either register.  Only runs at the top control-flow level.
 */
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing.  We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      default:
         break;
      }
      if (loop_depth || if_depth)
         continue;

      /* Candidate: unpredicated, unsaturated MOV from a GRF or UNIFORM
       * into a GRF, with matching types.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicate ||
          inst->saturate ||
          inst->dst.file != GRF || (inst->src[0].file != GRF &&
                                    inst->src[0].file != UNIFORM)||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;

      /* Found a move of a GRF to a GRF.  Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;

      for (fs_inst *scan_inst = (fs_inst *)inst->next;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         if (scan_inst->dst.file == GRF) {
            if (scan_inst->overwrites_reg(inst->dst) ||
                scan_inst->overwrites_reg(inst->src[0])) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers or
          * unusual register regions, so avoid coalescing those for
          * now.  We should do something more specific.
          */
         if (intel->gen >= 6 &&
             scan_inst->is_math() &&
             (has_source_modifiers || inst->src[0].file == UNIFORM)) {
            interfered = true;
            break;
         }

         /* The accumulator result appears to get used for the
          * conditional modifier generation.  When negating a UD
          * value, there is a 33rd bit generated for the sign in the
          * accumulator value, so now you can't check, for example,
          * equality with a 32-bit value.  See piglit fs-op-neg-uint.
          */
         if (scan_inst->conditional_mod &&
             inst->src[0].negate &&
             inst->src[0].type == BRW_REGISTER_TYPE_UD) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (fs_inst *scan_inst = inst;
           !scan_inst->is_tail_sentinel();
           scan_inst = (fs_inst *)scan_inst->next) {
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               fs_reg new_src = inst->src[0];
               /* abs at the use discards any negate from the MOV source. */
               if (scan_inst->src[i].abs) {
                  new_src.negate = 0;
                  new_src.abs = 1;
               }
               /* Compose the use's negate with the MOV source's negate. */
               new_src.negate ^= scan_inst->src[i].negate;
               scan_inst->src[i] = new_src;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1684
1685
1686 bool
1687 fs_visitor::compute_to_mrf()
1688 {
1689 bool progress = false;
1690 int next_ip = 0;
1691
1692 calculate_live_intervals();
1693
1694 foreach_list_safe(node, &this->instructions) {
1695 fs_inst *inst = (fs_inst *)node;
1696
1697 int ip = next_ip;
1698 next_ip++;
1699
1700 if (inst->opcode != BRW_OPCODE_MOV ||
1701 inst->predicate ||
1702 inst->dst.file != MRF || inst->src[0].file != GRF ||
1703 inst->dst.type != inst->src[0].type ||
1704 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1705 continue;
1706
1707 /* Work out which hardware MRF registers are written by this
1708 * instruction.
1709 */
1710 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1711 int mrf_high;
1712 if (inst->dst.reg & BRW_MRF_COMPR4) {
1713 mrf_high = mrf_low + 4;
1714 } else if (dispatch_width == 16 &&
1715 (!inst->force_uncompressed && !inst->force_sechalf)) {
1716 mrf_high = mrf_low + 1;
1717 } else {
1718 mrf_high = mrf_low;
1719 }
1720
1721 /* Can't compute-to-MRF this GRF if someone else was going to
1722 * read it later.
1723 */
1724 if (this->virtual_grf_use[inst->src[0].reg] > ip)
1725 continue;
1726
1727 /* Found a move of a GRF to a MRF. Let's see if we can go
1728 * rewrite the thing that made this GRF to write into the MRF.
1729 */
1730 fs_inst *scan_inst;
1731 for (scan_inst = (fs_inst *)inst->prev;
1732 scan_inst->prev != NULL;
1733 scan_inst = (fs_inst *)scan_inst->prev) {
1734 if (scan_inst->dst.file == GRF &&
1735 scan_inst->dst.reg == inst->src[0].reg) {
1736 /* Found the last thing to write our reg we want to turn
1737 * into a compute-to-MRF.
1738 */
1739
1740 /* SENDs can only write to GRFs, so no compute-to-MRF. */
1741 if (scan_inst->mlen) {
1742 break;
1743 }
1744
1745 /* If it's predicated, it (probably) didn't populate all
1746 * the channels. We might be able to rewrite everything
1747 * that writes that reg, but it would require smarter
1748 * tracking to delay the rewriting until complete success.
1749 */
1750 if (scan_inst->predicate)
1751 break;
1752
1753 /* If it's half of register setup and not the same half as
1754 * our MOV we're trying to remove, bail for now.
1755 */
1756 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1757 scan_inst->force_sechalf != inst->force_sechalf) {
1758 break;
1759 }
1760
1761 /* SEND instructions can't have MRF as a destination. */
1762 if (scan_inst->mlen)
1763 break;
1764
1765 if (intel->gen >= 6) {
1766 /* gen6 math instructions must have the destination be
1767 * GRF, so no compute-to-MRF for them.
1768 */
1769 if (scan_inst->is_math()) {
1770 break;
1771 }
1772 }
1773
1774 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1775 /* Found the creator of our MRF's source value. */
1776 scan_inst->dst.file = MRF;
1777 scan_inst->dst.reg = inst->dst.reg;
1778 scan_inst->saturate |= inst->saturate;
1779 inst->remove();
1780 progress = true;
1781 }
1782 break;
1783 }
1784
1785 /* We don't handle flow control here. Most computation of
1786 * values that end up in MRFs are shortly before the MRF
1787 * write anyway.
1788 */
1789 if (scan_inst->opcode == BRW_OPCODE_DO ||
1790 scan_inst->opcode == BRW_OPCODE_WHILE ||
1791 scan_inst->opcode == BRW_OPCODE_ELSE ||
1792 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1793 break;
1794 }
1795
1796 /* You can't read from an MRF, so if someone else reads our
1797 * MRF's source GRF that we wanted to rewrite, that stops us.
1798 */
1799 bool interfered = false;
1800 for (int i = 0; i < 3; i++) {
1801 if (scan_inst->src[i].file == GRF &&
1802 scan_inst->src[i].reg == inst->src[0].reg &&
1803 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1804 interfered = true;
1805 }
1806 }
1807 if (interfered)
1808 break;
1809
1810 if (scan_inst->dst.file == MRF) {
1811 /* If somebody else writes our MRF here, we can't
1812 * compute-to-MRF before that.
1813 */
1814 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1815 int scan_mrf_high;
1816
1817 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1818 scan_mrf_high = scan_mrf_low + 4;
1819 } else if (dispatch_width == 16 &&
1820 (!scan_inst->force_uncompressed &&
1821 !scan_inst->force_sechalf)) {
1822 scan_mrf_high = scan_mrf_low + 1;
1823 } else {
1824 scan_mrf_high = scan_mrf_low;
1825 }
1826
1827 if (mrf_low == scan_mrf_low ||
1828 mrf_low == scan_mrf_high ||
1829 mrf_high == scan_mrf_low ||
1830 mrf_high == scan_mrf_high) {
1831 break;
1832 }
1833 }
1834
1835 if (scan_inst->mlen > 0) {
1836 /* Found a SEND instruction, which means that there are
1837 * live values in MRFs from base_mrf to base_mrf +
1838 * scan_inst->mlen - 1. Don't go pushing our MRF write up
1839 * above it.
1840 */
1841 if (mrf_low >= scan_inst->base_mrf &&
1842 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1843 break;
1844 }
1845 if (mrf_high >= scan_inst->base_mrf &&
1846 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1847 break;
1848 }
1849 }
1850 }
1851 }
1852
1853 if (progress)
1854 live_intervals_valid = false;
1855
1856 return progress;
1857 }
1858
1859 /**
1860 * Walks through basic blocks, looking for repeated MRF writes and
1861 * removing the later ones.
1862 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* Tracks, per MRF number, the last simple MOV that wrote it (NULL if
    * unknown or invalidated).
    */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_list_safe(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         /* Basic block boundary: forget everything we tracked. */
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      /* If this MOV writes the exact same value the MRF already holds,
       * drop it.
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      /* Record this MOV as the current contents of its MRF. */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicate) {
         last_mrf_move[inst->dst.reg] = inst;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1937
1938 /**
1939 * Possibly returns an instruction that set up @param reg.
1940 *
1941 * Sometimes we want to take the result of some expression/variable
1942 * dereference tree and rewrite the instruction generating the result
1943 * of the tree. When processing the tree, we know that the
1944 * instructions generated are all writing temporaries that are dead
1945 * outside of this tree. So, if we have some instructions that write
1946 * a temporary, we're free to point that temp write somewhere else.
1947 *
1948 * Note that this doesn't guarantee that the instruction generated
1949 * only reg -- it might be the size=4 destination of a texture instruction.
1950 */
1951 fs_inst *
1952 fs_visitor::get_instruction_generating_reg(fs_inst *start,
1953 fs_inst *end,
1954 fs_reg reg)
1955 {
1956 if (end == start ||
1957 end->predicate ||
1958 end->force_uncompressed ||
1959 end->force_sechalf ||
1960 !reg.equals(end->dst)) {
1961 return NULL;
1962 } else {
1963 return end;
1964 }
1965 }
1966
/* Lays out the gen6+ thread payload registers (c->nr_payload_regs and the
 * per-mode barycentric / depth / W register indices) from the interpolation
 * modes and the program's inputs/outputs.
 */
void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   /* NOTE(review): WPOS being read gates both the source-depth and the
    * source-W payload registers below -- the same flag is used for both.
    */
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch.*/

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}
2024
/**
 * Drives one compile of the fragment shader at this->dispatch_width:
 * payload setup, IR visit, optimization loop, register allocation and
 * native code generation.  Returns false if the compile failed.
 */
bool
fs_visitor::run()
{
   uint32_t prog_offset_16 = 0;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (dispatch_width == 16) {
      /* We have to do a compaction pass now, or the one at the end of
       * execution will squash down where our prog_offset start needs
       * to be.
       */
      brw_compact_instructions(p);

      /* align to 64 byte boundary. */
      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
         brw_NOP(p);
      }

      /* Save off the start of this 16-wide program in case we succeed. */
      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }

   if (0) {
      /* Debug path: emit a trivial shader instead of compiling. */
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         /* No GLSL shader: translate the ARB fragment program instead. */
         emit_fragment_program_code();
      }
      if (failed)
         return false;

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

      /* Run the optimization passes to a fixed point. */
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         /* Keep retrying allocation; assign_regs() spills on failure and
          * only fail()s when it can make no further progress.
          */
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   generate_code();

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
      c->prog_data.prog_offset_16 = prog_offset_16;

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   return !failed;
}
2147
/**
 * Compiles the fragment shader: always builds the 8-wide program, and
 * additionally attempts a 16-wide program on gen5+ when no pull constants
 * are needed.  Returns false (and sets prog->InfoLog) on failure.
 */
bool
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   /* Snapshot GPU-busy state and time so we can report compile stalls. */
   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(c, prog, fp, 8);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return false;
   }

   /* A failed 16-wide compile is not fatal; we fall back to 8-wide. */
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      fs_visitor v2(c, prog, fp, 16);
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      }
   }

   c->prog_data.dispatch_width = 8;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return true;
}
2216
/**
 * Compiles the fragment program at link time with a guessed default key,
 * so a likely-matching compiled program is already cached at draw time.
 * The previous wm program state is saved and restored around the compile.
 */
bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   if (intel->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   /* Name != 0 means a GLSL shader; ARB programs (Name == 0) set the mask
    * per-input in the loop below instead.
    */
   if (prog->Name != 0)
      key.proj_attrib_mask = 0xffffffff;

   if (intel->gen < 6)
      key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);

   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
         continue;

      if (prog->Name == 0)
         key.proj_attrib_mask |= 1 << i;

      if (intel->gen < 6) {
         int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

         if (vp_index >= 0)
            key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
      }
   }

   key.clamp_fragment_color = true;

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   /* Compile with the guessed key, then restore the previously-bound
    * program so precompiling doesn't disturb current state.
    */
   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}