i965/fs: Rename the existing pull constant load opcode.
[mesa.git] src/mesa/drivers/dri/i965/brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 ALU1(NOT)
150 ALU1(MOV)
151 ALU1(FRC)
152 ALU1(RNDD)
153 ALU1(RNDE)
154 ALU1(RNDZ)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(SHL)
162 ALU2(SHR)
163 ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
185 /**
186  * CMP: Sets the low bit of each destination channel to the result
187  * of the comparison, leaves the upper bits undefined, and updates
188  * the flag register with the packed 16 bits of the result.
189 */
190 fs_inst *
191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
192 {
193 fs_inst *inst;
194
195 /* Take the instruction:
196 *
197 * CMP null<d> src0<f> src1<f>
198 *
199 * Original gen4 does type conversion to the destination type before
200 * comparison, producing garbage results for floating point comparisons.
201 * gen5 does the comparison on the execution type (resolved source types),
202 * so dst type doesn't matter. gen6 does comparison and then uses the
203 * result as if it was the dst type with no conversion, which happens to
204 * mostly work out for float-interpreted-as-int since our comparisons are
205 * for >0, =0, <0.
206 */
207 if (intel->gen == 4) {
208 dst.type = src0.type;
209 if (dst.file == FIXED_HW_REG)
210 dst.fixed_hw_reg.type = dst.type;
211 }
212
213 resolve_ud_negate(&src0);
214 resolve_ud_negate(&src1);
215
216 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
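
/* A sketch of typical usage (register names illustrative, not actual
 * generated code): lowering a GLSL "a < b" into a 0/1 boolean value:
 *
 *    emit(CMP(dst, a, b, BRW_CONDITIONAL_L));
 *    emit(AND(dst, dst, fs_reg(1)));
 *
 * The AND is needed because, per the comment above, only the low bit of
 * each destination channel is defined after the CMP.
 */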
221
222 bool
223 fs_inst::equals(fs_inst *inst)
224 {
225 return (opcode == inst->opcode &&
226 dst.equals(inst->dst) &&
227 src[0].equals(inst->src[0]) &&
228 src[1].equals(inst->src[1]) &&
229 src[2].equals(inst->src[2]) &&
230 saturate == inst->saturate &&
231 predicate == inst->predicate &&
232 conditional_mod == inst->conditional_mod &&
233 mlen == inst->mlen &&
234 base_mrf == inst->base_mrf &&
235 sampler == inst->sampler &&
236 target == inst->target &&
237 eot == inst->eot &&
238 header_present == inst->header_present &&
239 shadow_compare == inst->shadow_compare &&
240 offset == inst->offset);
241 }
242
243 int
244 fs_inst::regs_written()
245 {
246 if (is_tex())
247 return 4;
248
249    /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions write two
250     * registers, but we don't currently use them, nor do we have opcodes for them.
251 */
252
253 return 1;
254 }
255
256 bool
257 fs_inst::overwrites_reg(const fs_reg &reg)
258 {
259 return (reg.file == dst.file &&
260 reg.reg == dst.reg &&
261 reg.reg_offset >= dst.reg_offset &&
262 reg.reg_offset < dst.reg_offset + regs_written());
263 }
264
265 bool
266 fs_inst::is_tex()
267 {
268 return (opcode == SHADER_OPCODE_TEX ||
269 opcode == FS_OPCODE_TXB ||
270 opcode == SHADER_OPCODE_TXD ||
271 opcode == SHADER_OPCODE_TXF ||
272 opcode == SHADER_OPCODE_TXL ||
273 opcode == SHADER_OPCODE_TXS);
274 }
275
276 bool
277 fs_inst::is_math()
278 {
279 return (opcode == SHADER_OPCODE_RCP ||
280 opcode == SHADER_OPCODE_RSQ ||
281 opcode == SHADER_OPCODE_SQRT ||
282 opcode == SHADER_OPCODE_EXP2 ||
283 opcode == SHADER_OPCODE_LOG2 ||
284 opcode == SHADER_OPCODE_SIN ||
285 opcode == SHADER_OPCODE_COS ||
286 opcode == SHADER_OPCODE_INT_QUOTIENT ||
287 opcode == SHADER_OPCODE_INT_REMAINDER ||
288 opcode == SHADER_OPCODE_POW);
289 }
290
291 void
292 fs_reg::init()
293 {
294 memset(this, 0, sizeof(*this));
295 this->smear = -1;
296 }
297
298 /** Generic unset register constructor. */
299 fs_reg::fs_reg()
300 {
301 init();
302 this->file = BAD_FILE;
303 }
304
305 /** Immediate value constructor. */
306 fs_reg::fs_reg(float f)
307 {
308 init();
309 this->file = IMM;
310 this->type = BRW_REGISTER_TYPE_F;
311 this->imm.f = f;
312 }
313
314 /** Immediate value constructor. */
315 fs_reg::fs_reg(int32_t i)
316 {
317 init();
318 this->file = IMM;
319 this->type = BRW_REGISTER_TYPE_D;
320 this->imm.i = i;
321 }
322
323 /** Immediate value constructor. */
324 fs_reg::fs_reg(uint32_t u)
325 {
326 init();
327 this->file = IMM;
328 this->type = BRW_REGISTER_TYPE_UD;
329 this->imm.u = u;
330 }
331
332 /** Fixed brw_reg Immediate value constructor. */
333 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
334 {
335 init();
336 this->file = FIXED_HW_REG;
337 this->fixed_hw_reg = fixed_hw_reg;
338 this->type = fixed_hw_reg.type;
339 }
340
341 bool
342 fs_reg::equals(const fs_reg &r) const
343 {
344 return (file == r.file &&
345 reg == r.reg &&
346 reg_offset == r.reg_offset &&
347 type == r.type &&
348 negate == r.negate &&
349 abs == r.abs &&
350 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
351 sizeof(fixed_hw_reg)) == 0 &&
352 smear == r.smear &&
353 imm.u == r.imm.u);
354 }
355
356 bool
357 fs_reg::is_zero() const
358 {
359 if (file != IMM)
360 return false;
361
362 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
363 }
364
365 bool
366 fs_reg::is_one() const
367 {
368 if (file != IMM)
369 return false;
370
371 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
372 }
373
374 int
375 fs_visitor::type_size(const struct glsl_type *type)
376 {
377 unsigned int size, i;
378
379 switch (type->base_type) {
380 case GLSL_TYPE_UINT:
381 case GLSL_TYPE_INT:
382 case GLSL_TYPE_FLOAT:
383 case GLSL_TYPE_BOOL:
384 return type->components();
385 case GLSL_TYPE_ARRAY:
386 return type_size(type->fields.array) * type->length;
387 case GLSL_TYPE_STRUCT:
388 size = 0;
389 for (i = 0; i < type->length; i++) {
390 size += type_size(type->fields.structure[i].type);
391 }
392 return size;
393 case GLSL_TYPE_SAMPLER:
394 /* Samplers take up no register space, since they're baked in at
395 * link time.
396 */
397 return 0;
398 default:
399 assert(!"not reached");
400 return 0;
401 }
402 }
403
404 void
405 fs_visitor::fail(const char *format, ...)
406 {
407 va_list va;
408 char *msg;
409
410 if (failed)
411 return;
412
413 failed = true;
414
415 va_start(va, format);
416 msg = ralloc_vasprintf(mem_ctx, format, va);
417 va_end(va);
418 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
419
420 this->fail_msg = msg;
421
422 if (INTEL_DEBUG & DEBUG_WM) {
423 fprintf(stderr, "%s", msg);
424 }
425 }
426
427 fs_inst *
428 fs_visitor::emit(enum opcode opcode)
429 {
430 return emit(fs_inst(opcode));
431 }
432
433 fs_inst *
434 fs_visitor::emit(enum opcode opcode, fs_reg dst)
435 {
436 return emit(fs_inst(opcode, dst));
437 }
438
439 fs_inst *
440 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
441 {
442 return emit(fs_inst(opcode, dst, src0));
443 }
444
445 fs_inst *
446 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
447 {
448 return emit(fs_inst(opcode, dst, src0, src1));
449 }
450
451 fs_inst *
452 fs_visitor::emit(enum opcode opcode, fs_reg dst,
453 fs_reg src0, fs_reg src1, fs_reg src2)
454 {
455 return emit(fs_inst(opcode, dst, src0, src1, src2));
456 }
457
458 void
459 fs_visitor::push_force_uncompressed()
460 {
461 force_uncompressed_stack++;
462 }
463
464 void
465 fs_visitor::pop_force_uncompressed()
466 {
467 force_uncompressed_stack--;
468 assert(force_uncompressed_stack >= 0);
469 }
470
471 void
472 fs_visitor::push_force_sechalf()
473 {
474 force_sechalf_stack++;
475 }
476
477 void
478 fs_visitor::pop_force_sechalf()
479 {
480 force_sechalf_stack--;
481 assert(force_sechalf_stack >= 0);
482 }
483
484 /**
485 * Returns how many MRFs an FS opcode will write over.
486 *
487 * Note that this is not the 0 or 1 implied writes in an actual gen
488 * instruction -- the FS opcodes often generate MOVs in addition.
489 */
490 int
491 fs_visitor::implied_mrf_writes(fs_inst *inst)
492 {
493 if (inst->mlen == 0)
494 return 0;
495
496 switch (inst->opcode) {
497 case SHADER_OPCODE_RCP:
498 case SHADER_OPCODE_RSQ:
499 case SHADER_OPCODE_SQRT:
500 case SHADER_OPCODE_EXP2:
501 case SHADER_OPCODE_LOG2:
502 case SHADER_OPCODE_SIN:
503 case SHADER_OPCODE_COS:
504 return 1 * dispatch_width / 8;
505 case SHADER_OPCODE_POW:
506 case SHADER_OPCODE_INT_QUOTIENT:
507 case SHADER_OPCODE_INT_REMAINDER:
508 return 2 * dispatch_width / 8;
509 case SHADER_OPCODE_TEX:
510 case FS_OPCODE_TXB:
511 case SHADER_OPCODE_TXD:
512 case SHADER_OPCODE_TXF:
513 case SHADER_OPCODE_TXL:
514 case SHADER_OPCODE_TXS:
515 return 1;
516 case FS_OPCODE_FB_WRITE:
517 return 2;
518 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
519 case FS_OPCODE_UNSPILL:
520 return 1;
521 case FS_OPCODE_SPILL:
522 return 2;
523 default:
524 assert(!"not reached");
525 return inst->mlen;
526 }
527 }
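
/* Worked example: a SIMD16 SHADER_OPCODE_POW writes two operands at
 * dispatch_width / 8 = 2 MRFs each, so the function above reports
 * 2 * 16 / 8 = 4 MRF registers for its message payload.
 */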
528
529 int
530 fs_visitor::virtual_grf_alloc(int size)
531 {
532 if (virtual_grf_array_size <= virtual_grf_count) {
533 if (virtual_grf_array_size == 0)
534 virtual_grf_array_size = 16;
535 else
536 virtual_grf_array_size *= 2;
537 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
538 virtual_grf_array_size);
539 }
540 virtual_grf_sizes[virtual_grf_count] = size;
541 return virtual_grf_count++;
542 }
543
544 /** Fixed HW reg constructor. */
545 fs_reg::fs_reg(enum register_file file, int reg)
546 {
547 init();
548 this->file = file;
549 this->reg = reg;
550 this->type = BRW_REGISTER_TYPE_F;
551 }
552
553 /** Fixed HW reg constructor. */
554 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
555 {
556 init();
557 this->file = file;
558 this->reg = reg;
559 this->type = type;
560 }
561
562 /** Automatic reg constructor. */
563 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
564 {
565 init();
566
567 this->file = GRF;
568 this->reg = v->virtual_grf_alloc(v->type_size(type));
569 this->reg_offset = 0;
570 this->type = brw_type_for_base_type(type);
571 }
572
573 fs_reg *
574 fs_visitor::variable_storage(ir_variable *var)
575 {
576 return (fs_reg *)hash_table_find(this->variable_ht, var);
577 }
578
579 void
580 import_uniforms_callback(const void *key,
581 void *data,
582 void *closure)
583 {
584 struct hash_table *dst_ht = (struct hash_table *)closure;
585 const fs_reg *reg = (const fs_reg *)data;
586
587 if (reg->file != UNIFORM)
588 return;
589
590 hash_table_insert(dst_ht, data, key);
591 }
592
593 /* For 16-wide, we need to follow the uniform setup from the 8-wide dispatch.
594  * This brings in those uniform definitions.
595 */
596 void
597 fs_visitor::import_uniforms(fs_visitor *v)
598 {
599 hash_table_call_foreach(v->variable_ht,
600 import_uniforms_callback,
601 variable_ht);
602 this->params_remap = v->params_remap;
603 }
604
605 /* Our support for uniforms is piggy-backed on the struct
606 * gl_fragment_program, because that's where the values actually
607 * get stored, rather than in some global gl_shader_program uniform
608 * store.
609 */
610 int
611 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
612 {
613 unsigned int offset = 0;
614
615 if (type->is_matrix()) {
616 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
617 type->vector_elements,
618 1);
619
620 for (unsigned int i = 0; i < type->matrix_columns; i++) {
621 offset += setup_uniform_values(loc + offset, column);
622 }
623
624 return offset;
625 }
626
627 switch (type->base_type) {
628 case GLSL_TYPE_FLOAT:
629 case GLSL_TYPE_UINT:
630 case GLSL_TYPE_INT:
631 case GLSL_TYPE_BOOL:
632 for (unsigned int i = 0; i < type->vector_elements; i++) {
633 unsigned int param = c->prog_data.nr_params++;
634
635 this->param_index[param] = loc;
636 this->param_offset[param] = i;
637 }
638 return 1;
639
640 case GLSL_TYPE_STRUCT:
641 for (unsigned int i = 0; i < type->length; i++) {
642 offset += setup_uniform_values(loc + offset,
643 type->fields.structure[i].type);
644 }
645 return offset;
646
647 case GLSL_TYPE_ARRAY:
648 for (unsigned int i = 0; i < type->length; i++) {
649 offset += setup_uniform_values(loc + offset, type->fields.array);
650 }
651 return offset;
652
653 case GLSL_TYPE_SAMPLER:
654 /* The sampler takes up a slot, but we don't use any values from it. */
655 return 1;
656
657 default:
658 assert(!"not reached");
659 return 0;
660 }
661 }
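
/* A small worked example of the recursion above: a mat2 uniform at
 * location loc is handled as two float-vector columns.  Each column
 * appends vector_elements = 2 entries to param_index/param_offset but
 * returns 1 (one parameter location per column), so the matrix as a
 * whole adds four params and returns an offset of 2.
 */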
662
663
664 /* Our support for builtin uniforms is even scarier than non-builtin.
665 * It sits on top of the PROG_STATE_VAR parameters that are
666 * automatically updated from GL context state.
667 */
668 void
669 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
670 {
671 const ir_state_slot *const slots = ir->state_slots;
672 assert(ir->state_slots != NULL);
673
674 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
675 /* This state reference has already been setup by ir_to_mesa, but we'll
676 * get the same index back here.
677 */
678 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
679 (gl_state_index *)slots[i].tokens);
680
681 /* Add each of the unique swizzles of the element as a parameter.
682 * This'll end up matching the expected layout of the
683 * array/matrix/structure we're trying to fill in.
684 */
685 int last_swiz = -1;
686 for (unsigned int j = 0; j < 4; j++) {
687 int swiz = GET_SWZ(slots[i].swizzle, j);
688 if (swiz == last_swiz)
689 break;
690 last_swiz = swiz;
691
692 this->param_index[c->prog_data.nr_params] = index;
693 this->param_offset[c->prog_data.nr_params] = swiz;
694 c->prog_data.nr_params++;
695 }
696 }
697 }
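
/* For instance, a state value swizzled as SWIZZLE_XXXX (a scalar
 * splatted across all components) contributes a single param, because
 * the loop above stops at the first repeated swizzle component; a
 * plain SWIZZLE_XYZW vec4 contributes four.
 */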
698
699 fs_reg *
700 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
701 {
702 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
703 fs_reg wpos = *reg;
704 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
705
706 /* gl_FragCoord.x */
707 if (ir->pixel_center_integer) {
708 emit(MOV(wpos, this->pixel_x));
709 } else {
710 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
711 }
712 wpos.reg_offset++;
713
714 /* gl_FragCoord.y */
715 if (!flip && ir->pixel_center_integer) {
716 emit(MOV(wpos, this->pixel_y));
717 } else {
718 fs_reg pixel_y = this->pixel_y;
719 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
720
721 if (flip) {
722 pixel_y.negate = true;
723 offset += c->key.drawable_height - 1.0;
724 }
725
726 emit(ADD(wpos, pixel_y, fs_reg(offset)));
727 }
728 wpos.reg_offset++;
729
730 /* gl_FragCoord.z */
731 if (intel->gen >= 6) {
732 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
733 } else {
734 emit(FS_OPCODE_LINTERP, wpos,
735 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
736 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
737 interp_reg(FRAG_ATTRIB_WPOS, 2));
738 }
739 wpos.reg_offset++;
740
741 /* gl_FragCoord.w: Already set up in emit_interpolation */
742 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
743
744 return reg;
745 }
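
/* A sketch of the flipped gl_FragCoord.y math above, with half-integer
 * pixel centers: wpos.y = -pixel_y + (drawable_height - 1.0 + 0.5),
 * which mirrors the y axis so gl_FragCoord matches GL's lower-left
 * origin when drawing to the window system rather than to an FBO.
 */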
746
747 fs_inst *
748 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
749 glsl_interp_qualifier interpolation_mode,
750 bool is_centroid)
751 {
752 brw_wm_barycentric_interp_mode barycoord_mode;
753 if (is_centroid) {
754 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
755 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
756 else
757 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
758 } else {
759 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
760 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
761 else
762 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
763 }
764 return emit(FS_OPCODE_LINTERP, attr,
765 this->delta_x[barycoord_mode],
766 this->delta_y[barycoord_mode], interp);
767 }
768
769 fs_reg *
770 fs_visitor::emit_general_interpolation(ir_variable *ir)
771 {
772 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
773 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
774 fs_reg attr = *reg;
775
776 unsigned int array_elements;
777 const glsl_type *type;
778
779 if (ir->type->is_array()) {
780 array_elements = ir->type->length;
781 if (array_elements == 0) {
782 fail("dereferenced array '%s' has length 0\n", ir->name);
783 }
784 type = ir->type->fields.array;
785 } else {
786 array_elements = 1;
787 type = ir->type;
788 }
789
790 glsl_interp_qualifier interpolation_mode =
791 ir->determine_interpolation_mode(c->key.flat_shade);
792
793 int location = ir->location;
794 for (unsigned int i = 0; i < array_elements; i++) {
795 for (unsigned int j = 0; j < type->matrix_columns; j++) {
796 if (urb_setup[location] == -1) {
797 /* If there's no incoming setup data for this slot, don't
798 * emit interpolation for it.
799 */
800 attr.reg_offset += type->vector_elements;
801 location++;
802 continue;
803 }
804
805 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
806 /* Constant interpolation (flat shading) case. The SF has
807 * handed us defined values in only the constant offset
808 * field of the setup reg.
809 */
810 for (unsigned int k = 0; k < type->vector_elements; k++) {
811 struct brw_reg interp = interp_reg(location, k);
812 interp = suboffset(interp, 3);
813 interp.type = reg->type;
814 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
815 attr.reg_offset++;
816 }
817 } else {
818 /* Smooth/noperspective interpolation case. */
819 for (unsigned int k = 0; k < type->vector_elements; k++) {
820 /* FINISHME: At some point we probably want to push
821 * this farther by giving similar treatment to the
822 * other potentially constant components of the
823 * attribute, as well as making brw_vs_constval.c
824 * handle varyings other than gl_TexCoord.
825 */
826 if (location >= FRAG_ATTRIB_TEX0 &&
827 location <= FRAG_ATTRIB_TEX7 &&
828 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
829 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
830 } else {
831 struct brw_reg interp = interp_reg(location, k);
832 emit_linterp(attr, fs_reg(interp), interpolation_mode,
833 ir->centroid);
834 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
835 /* Get the pixel/sample mask into f0 so that we know
836 * which pixels are lit. Then, for each channel that is
837 * unlit, replace the centroid data with non-centroid
838 * data.
839 */
840 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
841 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
842 interpolation_mode, false);
843 inst->predicate = BRW_PREDICATE_NORMAL;
844 inst->predicate_inverse = true;
845 }
846 if (intel->gen < 6) {
847 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
848 }
849 }
850 attr.reg_offset++;
851 }
852
853 }
854 location++;
855 }
856 }
857
858 return reg;
859 }
860
861 fs_reg *
862 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
863 {
864 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
865
866 /* The frontfacing comes in as a bit in the thread payload. */
867 if (intel->gen >= 6) {
868 emit(BRW_OPCODE_ASR, *reg,
869 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
870 fs_reg(15));
871 emit(BRW_OPCODE_NOT, *reg, *reg);
872 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
873 } else {
874 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
875 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
876 * us front face
877 */
878 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
879 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
880 }
881
882 return reg;
883 }
884
885 fs_inst *
886 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
887 {
888 switch (opcode) {
889 case SHADER_OPCODE_RCP:
890 case SHADER_OPCODE_RSQ:
891 case SHADER_OPCODE_SQRT:
892 case SHADER_OPCODE_EXP2:
893 case SHADER_OPCODE_LOG2:
894 case SHADER_OPCODE_SIN:
895 case SHADER_OPCODE_COS:
896 break;
897 default:
898 assert(!"not reached: bad math opcode");
899 return NULL;
900 }
901
902 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
903 * might be able to do better by doing execsize = 1 math and then
904 * expanding that result out, but we would need to be careful with
905 * masking.
906 *
907 * Gen 6 hardware ignores source modifiers (negate and abs) on math
908 * instructions, so we also move to a temp to set those up.
909 */
910 if (intel->gen == 6 && (src.file == UNIFORM ||
911 src.abs ||
912 src.negate)) {
913 fs_reg expanded = fs_reg(this, glsl_type::float_type);
914 emit(BRW_OPCODE_MOV, expanded, src);
915 src = expanded;
916 }
917
918 fs_inst *inst = emit(opcode, dst, src);
919
920 if (intel->gen < 6) {
921 inst->base_mrf = 2;
922 inst->mlen = dispatch_width / 8;
923 }
924
925 return inst;
926 }
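
/* For example (tmp is illustrative), a gen6 "RSQ dst, -x" is emitted as
 *
 *    MOV tmp, -x
 *    RSQ dst, tmp
 *
 * so the negate is resolved by the MOV and the math instruction sees a
 * plain GRF source with no modifiers.
 */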
927
928 fs_inst *
929 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
930 {
931 int base_mrf = 2;
932 fs_inst *inst;
933
934 switch (opcode) {
935 case SHADER_OPCODE_POW:
936 case SHADER_OPCODE_INT_QUOTIENT:
937 case SHADER_OPCODE_INT_REMAINDER:
938 break;
939 default:
940 assert(!"not reached: unsupported binary math opcode.");
941 return NULL;
942 }
943
944 if (intel->gen >= 7) {
945 inst = emit(opcode, dst, src0, src1);
946 } else if (intel->gen == 6) {
947 /* Can't do hstride == 0 args to gen6 math, so expand it out.
948 *
949 * The hardware ignores source modifiers (negate and abs) on math
950 * instructions, so we also move to a temp to set those up.
951 */
952 if (src0.file == UNIFORM || src0.abs || src0.negate) {
953 fs_reg expanded = fs_reg(this, glsl_type::float_type);
954 expanded.type = src0.type;
955 emit(BRW_OPCODE_MOV, expanded, src0);
956 src0 = expanded;
957 }
958
959 if (src1.file == UNIFORM || src1.abs || src1.negate) {
960 fs_reg expanded = fs_reg(this, glsl_type::float_type);
961 expanded.type = src1.type;
962 emit(BRW_OPCODE_MOV, expanded, src1);
963 src1 = expanded;
964 }
965
966 inst = emit(opcode, dst, src0, src1);
967 } else {
968 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
969 * "Message Payload":
970 *
971 * "Operand0[7]. For the INT DIV functions, this operand is the
972 * denominator."
973 * ...
974 * "Operand1[7]. For the INT DIV functions, this operand is the
975 * numerator."
976 */
977 bool is_int_div = opcode != SHADER_OPCODE_POW;
978 fs_reg &op0 = is_int_div ? src1 : src0;
979 fs_reg &op1 = is_int_div ? src0 : src1;
980
981 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
982 inst = emit(opcode, dst, op0, reg_null_f);
983
984 inst->base_mrf = base_mrf;
985 inst->mlen = 2 * dispatch_width / 8;
986 }
987 return inst;
988 }
989
990 /**
991 * To be called after the last _mesa_add_state_reference() call, to
992 * set up prog_data.param[] for assign_curb_setup() and
993 * setup_pull_constants().
994 */
995 void
996 fs_visitor::setup_paramvalues_refs()
997 {
998 if (dispatch_width != 8)
999 return;
1000
1001 /* Set up the pointers to ParamValues now that that array is finalized. */
1002 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1003 c->prog_data.param[i] =
1004 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
1005 this->param_offset[i];
1006 }
1007 }
1008
1009 void
1010 fs_visitor::assign_curb_setup()
1011 {
1012 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1013 if (dispatch_width == 8) {
1014 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1015 } else {
1016 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1017 }
1018
1019 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1020 foreach_list(node, &this->instructions) {
1021 fs_inst *inst = (fs_inst *)node;
1022
1023 for (unsigned int i = 0; i < 3; i++) {
1024 if (inst->src[i].file == UNIFORM) {
1025 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1026 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1027 constant_nr / 8,
1028 constant_nr % 8);
1029
1030 inst->src[i].file = FIXED_HW_REG;
1031 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1032 }
1033 }
1034 }
1035 }
1036
1037 void
1038 fs_visitor::calculate_urb_setup()
1039 {
1040 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1041 urb_setup[i] = -1;
1042 }
1043
1044 int urb_next = 0;
1045 /* Figure out where each of the incoming setup attributes lands. */
1046 if (intel->gen >= 6) {
1047 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1048 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1049 urb_setup[i] = urb_next++;
1050 }
1051 }
1052 } else {
1053 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1054 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1055 /* Point size is packed into the header, not as a general attribute */
1056 if (i == VERT_RESULT_PSIZ)
1057 continue;
1058
1059 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1060 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1061
1062 /* The back color slot is skipped when the front color is
1063 * also written to. In addition, some slots can be
1064 * written in the vertex shader and not read in the
1065 * fragment shader. So the register number must always be
1066 * incremented, mapped or not.
1067 */
1068 if (fp_index >= 0)
1069 urb_setup[fp_index] = urb_next;
1070 urb_next++;
1071 }
1072 }
1073
1074 /*
1075     * It's an FS-only attribute, and we did interpolation for this attribute
1076     * in the SF thread. So, count it here, too.
1077 *
1078 * See compile_sf_prog() for more info.
1079 */
1080 if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1081 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1082 }
1083
1084 /* Each attribute is 4 setup channels, each of which is half a reg. */
1085 c->prog_data.urb_read_length = urb_next * 2;
1086 }
1087
1088 void
1089 fs_visitor::assign_urb_setup()
1090 {
1091 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1092
1093    /* Offset all the urb_setup[] indices by the actual position of the
1094 * setup regs, now that the location of the constants has been chosen.
1095 */
1096 foreach_list(node, &this->instructions) {
1097 fs_inst *inst = (fs_inst *)node;
1098
1099 if (inst->opcode == FS_OPCODE_LINTERP) {
1100 assert(inst->src[2].file == FIXED_HW_REG);
1101 inst->src[2].fixed_hw_reg.nr += urb_start;
1102 }
1103
1104 if (inst->opcode == FS_OPCODE_CINTERP) {
1105 assert(inst->src[0].file == FIXED_HW_REG);
1106 inst->src[0].fixed_hw_reg.nr += urb_start;
1107 }
1108 }
1109
1110 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1111 }
1112
1113 /**
1114 * Split large virtual GRFs into separate components if we can.
1115 *
1116 * This is mostly duplicated with what brw_fs_vector_splitting does,
1117 * but that's really conservative because it's afraid of doing
1118 * splitting that doesn't result in real progress after the rest of
1119 * the optimization phases, which would cause infinite looping in
1120 * optimization. We can do it once here, safely. This also has the
1121 * opportunity to split interpolated values, or maybe even uniforms,
1122 * which we don't have at the IR level.
1123 *
1124 * We want to split, because virtual GRFs are what we register
1125 * allocate and spill (due to contiguousness requirements for some
1126 * instructions), and they're what we naturally generate in the
1127 * codegen process, but most virtual GRFs don't actually need to be
1128 * contiguous sets of GRFs. If we split, we'll end up with reduced
1129 * live intervals and better dead code elimination and coalescing.
1130 */
1131 void
1132 fs_visitor::split_virtual_grfs()
1133 {
1134 int num_vars = this->virtual_grf_count;
1135 bool split_grf[num_vars];
1136 int new_virtual_grf[num_vars];
1137
1138    /* Try to split anything larger than one register. */
1139 for (int i = 0; i < num_vars; i++) {
1140 if (this->virtual_grf_sizes[i] != 1)
1141 split_grf[i] = true;
1142 else
1143 split_grf[i] = false;
1144 }
1145
1146 if (brw->has_pln &&
1147 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1148 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1149 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1150 * Gen6, that was the only supported interpolation mode, and since Gen6,
1151 * delta_x and delta_y are in fixed hardware registers.
1152 */
1153 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1154 false;
1155 }
1156
1157 foreach_list(node, &this->instructions) {
1158 fs_inst *inst = (fs_inst *)node;
1159
1160 /* If there's a SEND message that requires contiguous destination
1161 * registers, no splitting is allowed.
1162 */
1163 if (inst->regs_written() > 1) {
1164 split_grf[inst->dst.reg] = false;
1165 }
1166 }
1167
1168 /* Allocate new space for split regs. Note that the virtual
1169 * numbers will be contiguous.
1170 */
1171 for (int i = 0; i < num_vars; i++) {
1172 if (split_grf[i]) {
1173 new_virtual_grf[i] = virtual_grf_alloc(1);
1174 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1175 int reg = virtual_grf_alloc(1);
1176 assert(reg == new_virtual_grf[i] + j - 1);
1177 (void) reg;
1178 }
1179 this->virtual_grf_sizes[i] = 1;
1180 }
1181 }
1182
1183 foreach_list(node, &this->instructions) {
1184 fs_inst *inst = (fs_inst *)node;
1185
1186 if (inst->dst.file == GRF &&
1187 split_grf[inst->dst.reg] &&
1188 inst->dst.reg_offset != 0) {
1189 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1190 inst->dst.reg_offset - 1);
1191 inst->dst.reg_offset = 0;
1192 }
1193 for (int i = 0; i < 3; i++) {
1194 if (inst->src[i].file == GRF &&
1195 split_grf[inst->src[i].reg] &&
1196 inst->src[i].reg_offset != 0) {
1197 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1198 inst->src[i].reg_offset - 1);
1199 inst->src[i].reg_offset = 0;
1200 }
1201 }
1202 }
1203 this->live_intervals_valid = false;
1204 }
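
/* A hypothetical before/after for the pass above: given vgrf4 of size 3,
 * two new size-1 registers are allocated (say vgrf10 and vgrf11 -- the
 * numbers are illustrative), uses of vgrf4+1 become vgrf10+0 and uses of
 * vgrf4+2 become vgrf11+0, while vgrf4+0 keeps using the original, now
 * size-1, vgrf4.
 */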
1205
1206 /**
1207 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1208 *
1209 * During code generation, we create tons of temporary variables, many of
1210 * which get immediately killed and are never used again. Yet, in later
1211 * optimization and analysis passes, such as compute_live_intervals, we need
1212 * to loop over all the virtual GRFs. Compacting them can save a lot of
1213 * overhead.
1214 */
1215 void
1216 fs_visitor::compact_virtual_grfs()
1217 {
1218 /* Mark which virtual GRFs are used, and count how many. */
1219 int remap_table[this->virtual_grf_count];
1220 memset(remap_table, -1, sizeof(remap_table));
1221
1222 foreach_list(node, &this->instructions) {
1223 const fs_inst *inst = (const fs_inst *) node;
1224
1225 if (inst->dst.file == GRF)
1226 remap_table[inst->dst.reg] = 0;
1227
1228 for (int i = 0; i < 3; i++) {
1229 if (inst->src[i].file == GRF)
1230 remap_table[inst->src[i].reg] = 0;
1231 }
1232 }
1233
1234 /* In addition to registers used in instructions, fs_visitor keeps
1235 * direct references to certain special values which must be patched:
1236 */
1237 fs_reg *special[] = {
1238 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1239 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1240 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1241 &delta_x[0], &delta_x[1], &delta_x[2],
1242 &delta_x[3], &delta_x[4], &delta_x[5],
1243 &delta_y[0], &delta_y[1], &delta_y[2],
1244 &delta_y[3], &delta_y[4], &delta_y[5],
1245 };
1246 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1247 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1248
1249 /* Treat all special values as used, to be conservative */
1250 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1251 if (special[i]->file == GRF)
1252 remap_table[special[i]->reg] = 0;
1253 }
1254
1255 /* Compact the GRF arrays. */
1256 int new_index = 0;
1257 for (int i = 0; i < this->virtual_grf_count; i++) {
1258 if (remap_table[i] != -1) {
1259 remap_table[i] = new_index;
1260 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1261 if (live_intervals_valid) {
1262 virtual_grf_use[new_index] = virtual_grf_use[i];
1263 virtual_grf_def[new_index] = virtual_grf_def[i];
1264 }
1265 ++new_index;
1266 }
1267 }
1268
1269 this->virtual_grf_count = new_index;
1270
1271 /* Patch all the instructions to use the newly renumbered registers */
1272 foreach_list(node, &this->instructions) {
1273 fs_inst *inst = (fs_inst *) node;
1274
1275 if (inst->dst.file == GRF)
1276 inst->dst.reg = remap_table[inst->dst.reg];
1277
1278 for (int i = 0; i < 3; i++) {
1279 if (inst->src[i].file == GRF)
1280 inst->src[i].reg = remap_table[inst->src[i].reg];
1281 }
1282 }
1283
1284 /* Patch all the references to special values */
1285 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1286 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1287 special[i]->reg = remap_table[special[i]->reg];
1288 }
1289 }
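
/* Illustrative remap: if vgrf0 and vgrf2 are used but vgrf1 is not,
 * remap_table ends up as {0, -1, 1}: vgrf2 is renumbered to vgrf1, its
 * size (and live-interval data, if valid) slides down with it, and
 * virtual_grf_count drops from 3 to 2.
 */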
1290
1291 bool
1292 fs_visitor::remove_dead_constants()
1293 {
1294 if (dispatch_width == 8) {
1295 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1296
1297 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1298 this->params_remap[i] = -1;
1299
1300 /* Find which params are still in use. */
1301 foreach_list(node, &this->instructions) {
1302 fs_inst *inst = (fs_inst *)node;
1303
1304 for (int i = 0; i < 3; i++) {
1305 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1306
1307 if (inst->src[i].file != UNIFORM)
1308 continue;
1309
1310 assert(constant_nr < (int)c->prog_data.nr_params);
1311
1312 /* For now, set this to non-negative. We'll give it the
1313 * actual new number in a moment, in order to keep the
1314 * register numbers nicely ordered.
1315 */
1316 this->params_remap[constant_nr] = 0;
1317 }
1318 }
1319
1320 /* Figure out what the new numbers for the params will be. At some
1321 * point when we're doing uniform array access, we're going to want
1322 * to keep the distinction between .reg and .reg_offset, but for
1323 * now we don't care.
1324 */
1325 unsigned int new_nr_params = 0;
1326 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1327 if (this->params_remap[i] != -1) {
1328 this->params_remap[i] = new_nr_params++;
1329 }
1330 }
1331
1332 /* Update the list of params to be uploaded to match our new numbering. */
1333 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1334 int remapped = this->params_remap[i];
1335
1336 if (remapped == -1)
1337 continue;
1338
1339 /* We've already done setup_paramvalues_refs() so no need to worry
1340 * about param_index and param_offset.
1341 */
1342 c->prog_data.param[remapped] = c->prog_data.param[i];
1343 }
1344
1345 c->prog_data.nr_params = new_nr_params;
1346 } else {
1347 /* This should have been generated in the 8-wide pass already. */
1348 assert(this->params_remap);
1349 }
1350
1351 /* Now do the renumbering of the shader to remove unused params. */
1352 foreach_list(node, &this->instructions) {
1353 fs_inst *inst = (fs_inst *)node;
1354
1355 for (int i = 0; i < 3; i++) {
1356 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1357
1358 if (inst->src[i].file != UNIFORM)
1359 continue;
1360
1361 assert(this->params_remap[constant_nr] != -1);
1362 inst->src[i].reg = this->params_remap[constant_nr];
1363 inst->src[i].reg_offset = 0;
1364 }
1365 }
1366
1367 return true;
1368 }
1369
1370 /**
1371 * Choose accesses from the UNIFORM file to demote to using the pull
1372 * constant buffer.
1373 *
1374  * We allow a fragment shader to have more than the GL-specified minimum
1375  * maximum number of fragment shader uniform components (64). If there
1376  * are too many of these, they'd fill up all of the register space.
1377 * So, this will push some of them out to the pull constant buffer and
1378 * update the program to load them.
1379 */
1380 void
1381 fs_visitor::setup_pull_constants()
1382 {
1383 /* Only allow 16 registers (128 uniform components) as push constants. */
1384 unsigned int max_uniform_components = 16 * 8;
1385 if (c->prog_data.nr_params <= max_uniform_components)
1386 return;
1387
1388 if (dispatch_width == 16) {
1389 fail("Pull constants not supported in 16-wide\n");
1390 return;
1391 }
1392
1393 /* Just demote the end of the list. We could probably do better
1394 * here, demoting things that are rarely used in the program first.
1395 */
1396 int pull_uniform_base = max_uniform_components;
1397 int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;
1398
1399 foreach_list(node, &this->instructions) {
1400 fs_inst *inst = (fs_inst *)node;
1401
1402 for (int i = 0; i < 3; i++) {
1403 if (inst->src[i].file != UNIFORM)
1404 continue;
1405
1406 int uniform_nr = inst->src[i].reg + inst->src[i].reg_offset;
1407 if (uniform_nr < pull_uniform_base)
1408 continue;
1409
1410 fs_reg dst = fs_reg(this, glsl_type::float_type);
1411 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1412 fs_reg offset = fs_reg((unsigned)(((uniform_nr -
1413 pull_uniform_base) * 4) & ~15));
1414 fs_inst *pull =
1415 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1416 dst, index, offset);
1417 pull->ir = inst->ir;
1418 pull->annotation = inst->annotation;
1419 pull->base_mrf = 14;
1420 pull->mlen = 1;
1421
1422 inst->insert_before(pull);
1423
1424 inst->src[i].file = GRF;
1425 inst->src[i].reg = dst.reg;
1426 inst->src[i].reg_offset = 0;
1427 inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
1428 }
1429 }
1430
1431 for (int i = 0; i < pull_uniform_count; i++) {
1432 c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
1433 }
1434 c->prog_data.nr_params -= pull_uniform_count;
1435 c->prog_data.nr_pull_params = pull_uniform_count;
1436 }
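
/* Worked numbers for the demotion above: with nr_params == 200, the
 * first 128 float components stay as push constants and the remaining
 * 72 become pull constants.  A use of uniform component 130 turns into
 * a FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD from 16-byte-aligned offset
 * ((130 - 128) * 4) & ~15 == 0, with smear == (130 - 128) & 3 == 2
 * selecting the third channel of the fetched vec4.
 */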
1437
1438 bool
1439 fs_visitor::opt_algebraic()
1440 {
1441 bool progress = false;
1442
1443 foreach_list(node, &this->instructions) {
1444 fs_inst *inst = (fs_inst *)node;
1445
1446 switch (inst->opcode) {
1447 case BRW_OPCODE_MUL:
1448 if (inst->src[1].file != IMM)
1449 continue;
1450
1451 /* a * 1.0 = a */
1452 if (inst->src[1].is_one()) {
1453 inst->opcode = BRW_OPCODE_MOV;
1454 inst->src[1] = reg_undef;
1455 progress = true;
1456 break;
1457 }
1458
1459 /* a * 0.0 = 0.0 */
1460 if (inst->src[1].is_zero()) {
1461 inst->opcode = BRW_OPCODE_MOV;
1462 inst->src[0] = inst->src[1];
1463 inst->src[1] = reg_undef;
1464 progress = true;
1465 break;
1466 }
1467
1468 break;
1469 case BRW_OPCODE_ADD:
1470 if (inst->src[1].file != IMM)
1471 continue;
1472
1473 /* a + 0.0 = a */
1474 if (inst->src[1].is_zero()) {
1475 inst->opcode = BRW_OPCODE_MOV;
1476 inst->src[1] = reg_undef;
1477 progress = true;
1478 break;
1479 }
1480 break;
1481 default:
1482 break;
1483 }
1484 }
1485
1486 return progress;
1487 }
1488
1489 /**
1490  * Must be called after calculate_live_intervals() to remove unused
1491  * writes to registers -- register allocation will fail otherwise
1492  * because something def'd but not used won't be considered to
1493 * interfere with other regs.
1494 */
1495 bool
1496 fs_visitor::dead_code_eliminate()
1497 {
1498 bool progress = false;
1499 int pc = 0;
1500
1501 calculate_live_intervals();
1502
1503 foreach_list_safe(node, &this->instructions) {
1504 fs_inst *inst = (fs_inst *)node;
1505
1506 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1507 inst->remove();
1508 progress = true;
1509 }
1510
1511 pc++;
1512 }
1513
1514 if (progress)
1515 live_intervals_valid = false;
1516
1517 return progress;
1518 }
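
/* The test above reads: if the last use of the destination GRF, as
 * recorded by the live-interval analysis, is at or before the current
 * instruction, the write can never be read and is removed.  E.g. a MOV
 * into vgrf7 at ip 12 is dead when virtual_grf_use[7] <= 12.
 */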
1519
1520 /**
1521 * Implements a second type of register coalescing: This one checks if
1522 * the two regs involved in a raw move don't interfere, in which case
1523  * they can both be stored in the same place and the MOV removed.
1524 */
1525 bool
1526 fs_visitor::register_coalesce_2()
1527 {
1528 bool progress = false;
1529
1530 calculate_live_intervals();
1531
1532 foreach_list_safe(node, &this->instructions) {
1533 fs_inst *inst = (fs_inst *)node;
1534
1535 if (inst->opcode != BRW_OPCODE_MOV ||
1536 inst->predicate ||
1537 inst->saturate ||
1538 inst->src[0].file != GRF ||
1539 inst->src[0].negate ||
1540 inst->src[0].abs ||
1541 inst->src[0].smear != -1 ||
1542 inst->dst.file != GRF ||
1543 inst->dst.type != inst->src[0].type ||
1544 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1545 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1546 continue;
1547 }
1548
1549 int reg_from = inst->src[0].reg;
1550 assert(inst->src[0].reg_offset == 0);
1551 int reg_to = inst->dst.reg;
1552 int reg_to_offset = inst->dst.reg_offset;
1553
1554 foreach_list_safe(node, &this->instructions) {
1555 fs_inst *scan_inst = (fs_inst *)node;
1556
1557 if (scan_inst->dst.file == GRF &&
1558 scan_inst->dst.reg == reg_from) {
1559 scan_inst->dst.reg = reg_to;
1560 scan_inst->dst.reg_offset = reg_to_offset;
1561 }
1562 for (int i = 0; i < 3; i++) {
1563 if (scan_inst->src[i].file == GRF &&
1564 scan_inst->src[i].reg == reg_from) {
1565 scan_inst->src[i].reg = reg_to;
1566 scan_inst->src[i].reg_offset = reg_to_offset;
1567 }
1568 }
1569 }
1570
1571 inst->remove();
1572 live_intervals_valid = false;
1573 progress = true;
1574 continue;
1575 }
1576
1577 return progress;
1578 }
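
/* Sketch: for a raw "MOV vgrf8, vgrf5" where vgrf5 is size 1, the types
 * match, and the two registers' live ranges don't interfere, every def
 * and use of vgrf5 is renamed to vgrf8 and the MOV itself is deleted.
 */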
1579
1580 bool
1581 fs_visitor::register_coalesce()
1582 {
1583 bool progress = false;
1584 int if_depth = 0;
1585 int loop_depth = 0;
1586
1587 foreach_list_safe(node, &this->instructions) {
1588 fs_inst *inst = (fs_inst *)node;
1589
1590 /* Make sure that we dominate the instructions we're going to
1591 * scan for interfering with our coalescing, or we won't have
1592 * scanned enough to see if anything interferes with our
1593 * coalescing. We don't dominate the following instructions if
1594 * we're in a loop or an if block.
1595 */
1596 switch (inst->opcode) {
1597 case BRW_OPCODE_DO:
1598 loop_depth++;
1599 break;
1600 case BRW_OPCODE_WHILE:
1601 loop_depth--;
1602 break;
1603 case BRW_OPCODE_IF:
1604 if_depth++;
1605 break;
1606 case BRW_OPCODE_ENDIF:
1607 if_depth--;
1608 break;
1609 default:
1610 break;
1611 }
1612 if (loop_depth || if_depth)
1613 continue;
1614
1615 if (inst->opcode != BRW_OPCODE_MOV ||
1616 inst->predicate ||
1617 inst->saturate ||
1618 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1619 inst->src[0].file != UNIFORM)||
1620 inst->dst.type != inst->src[0].type)
1621 continue;
1622
1623 bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;
1624
1625 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1626 * them: check for no writes to either one until the exit of the
1627 * program.
1628 */
1629 bool interfered = false;
1630
1631 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1632 !scan_inst->is_tail_sentinel();
1633 scan_inst = (fs_inst *)scan_inst->next) {
1634 if (scan_inst->dst.file == GRF) {
1635 if (scan_inst->overwrites_reg(inst->dst) ||
1636 scan_inst->overwrites_reg(inst->src[0])) {
1637 interfered = true;
1638 break;
1639 }
1640 }
1641
1642 /* The gen6 MATH instruction can't handle source modifiers or
1643 * unusual register regions, so avoid coalescing those for
1644 * now. We should do something more specific.
1645 */
1646 if (intel->gen >= 6 &&
1647 scan_inst->is_math() &&
1648 (has_source_modifiers || inst->src[0].file == UNIFORM)) {
1649 interfered = true;
1650 break;
1651 }
1652
1653 /* The accumulator result appears to get used for the
1654 * conditional modifier generation. When negating a UD
1655 * value, there is a 33rd bit generated for the sign in the
1656 * accumulator value, so now you can't check, for example,
1657 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1658 */
1659 if (scan_inst->conditional_mod &&
1660 inst->src[0].negate &&
1661 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1662 interfered = true;
1663 break;
1664 }
1665 }
1666 if (interfered) {
1667 continue;
1668 }
1669
1670 /* Rewrite the later usage to point at the source of the move to
1671 * be removed.
1672 */
1673 for (fs_inst *scan_inst = inst;
1674 !scan_inst->is_tail_sentinel();
1675 scan_inst = (fs_inst *)scan_inst->next) {
1676 for (int i = 0; i < 3; i++) {
1677 if (scan_inst->src[i].file == GRF &&
1678 scan_inst->src[i].reg == inst->dst.reg &&
1679 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1680 fs_reg new_src = inst->src[0];
1681 if (scan_inst->src[i].abs) {
1682 new_src.negate = 0;
1683 new_src.abs = 1;
1684 }
1685 new_src.negate ^= scan_inst->src[i].negate;
1686 scan_inst->src[i] = new_src;
1687 }
1688 }
1689 }
1690
1691 inst->remove();
1692 progress = true;
1693 }
1694
1695 if (progress)
1696 live_intervals_valid = false;
1697
1698 return progress;
1699 }
1700
1701
1702 bool
1703 fs_visitor::compute_to_mrf()
1704 {
1705 bool progress = false;
1706 int next_ip = 0;
1707
1708 calculate_live_intervals();
1709
1710 foreach_list_safe(node, &this->instructions) {
1711 fs_inst *inst = (fs_inst *)node;
1712
1713 int ip = next_ip;
1714 next_ip++;
1715
1716 if (inst->opcode != BRW_OPCODE_MOV ||
1717 inst->predicate ||
1718 inst->dst.file != MRF || inst->src[0].file != GRF ||
1719 inst->dst.type != inst->src[0].type ||
1720 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1721 continue;
1722
1723 /* Work out which hardware MRF registers are written by this
1724 * instruction.
1725 */
1726 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1727 int mrf_high;
1728 if (inst->dst.reg & BRW_MRF_COMPR4) {
1729 mrf_high = mrf_low + 4;
1730 } else if (dispatch_width == 16 &&
1731 (!inst->force_uncompressed && !inst->force_sechalf)) {
1732 mrf_high = mrf_low + 1;
1733 } else {
1734 mrf_high = mrf_low;
1735 }
1736
1737 /* Can't compute-to-MRF this GRF if someone else was going to
1738 * read it later.
1739 */
1740 if (this->virtual_grf_use[inst->src[0].reg] > ip)
1741 continue;
1742
1743       /* Found a move of a GRF to an MRF. Let's see if we can rewrite
1744        * the instruction that produced this GRF to write into the MRF instead.
1745 */
1746 fs_inst *scan_inst;
1747 for (scan_inst = (fs_inst *)inst->prev;
1748 scan_inst->prev != NULL;
1749 scan_inst = (fs_inst *)scan_inst->prev) {
1750 if (scan_inst->dst.file == GRF &&
1751 scan_inst->dst.reg == inst->src[0].reg) {
1752 /* Found the last thing to write our reg we want to turn
1753 * into a compute-to-MRF.
1754 */
1755
1756 /* SENDs can only write to GRFs, so no compute-to-MRF. */
1757 if (scan_inst->mlen) {
1758 break;
1759 }
1760
1761 /* If it's predicated, it (probably) didn't populate all
1762 * the channels. We might be able to rewrite everything
1763 * that writes that reg, but it would require smarter
1764 * tracking to delay the rewriting until complete success.
1765 */
1766 if (scan_inst->predicate)
1767 break;
1768
1769 /* If it's half of register setup and not the same half as
1770 * our MOV we're trying to remove, bail for now.
1771 */
1772 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1773 scan_inst->force_sechalf != inst->force_sechalf) {
1774 break;
1775 }
1776
1777 /* SEND instructions can't have MRF as a destination. */
1778 if (scan_inst->mlen)
1779 break;
1780
1781 if (intel->gen >= 6) {
1782 /* gen6 math instructions must have the destination be
1783 * GRF, so no compute-to-MRF for them.
1784 */
1785 if (scan_inst->is_math()) {
1786 break;
1787 }
1788 }
1789
1790 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1791 /* Found the creator of our MRF's source value. */
1792 scan_inst->dst.file = MRF;
1793 scan_inst->dst.reg = inst->dst.reg;
1794 scan_inst->saturate |= inst->saturate;
1795 inst->remove();
1796 progress = true;
1797 }
1798 break;
1799 }
1800
1801          /* We don't handle flow control here. Most computation of
1802           * values that end up in MRFs happens shortly before the MRF
1803           * write anyway.
1804 */
1805 if (scan_inst->opcode == BRW_OPCODE_DO ||
1806 scan_inst->opcode == BRW_OPCODE_WHILE ||
1807 scan_inst->opcode == BRW_OPCODE_ELSE ||
1808 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1809 break;
1810 }
1811
1812 /* You can't read from an MRF, so if someone else reads our
1813 * MRF's source GRF that we wanted to rewrite, that stops us.
1814 */
1815 bool interfered = false;
1816 for (int i = 0; i < 3; i++) {
1817 if (scan_inst->src[i].file == GRF &&
1818 scan_inst->src[i].reg == inst->src[0].reg &&
1819 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1820 interfered = true;
1821 }
1822 }
1823 if (interfered)
1824 break;
1825
1826 if (scan_inst->dst.file == MRF) {
1827 /* If somebody else writes our MRF here, we can't
1828 * compute-to-MRF before that.
1829 */
1830 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1831 int scan_mrf_high;
1832
1833 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1834 scan_mrf_high = scan_mrf_low + 4;
1835 } else if (dispatch_width == 16 &&
1836 (!scan_inst->force_uncompressed &&
1837 !scan_inst->force_sechalf)) {
1838 scan_mrf_high = scan_mrf_low + 1;
1839 } else {
1840 scan_mrf_high = scan_mrf_low;
1841 }
1842
1843 if (mrf_low == scan_mrf_low ||
1844 mrf_low == scan_mrf_high ||
1845 mrf_high == scan_mrf_low ||
1846 mrf_high == scan_mrf_high) {
1847 break;
1848 }
1849 }
1850
1851 if (scan_inst->mlen > 0) {
1852 /* Found a SEND instruction, which means that there are
1853 * live values in MRFs from base_mrf to base_mrf +
1854 * scan_inst->mlen - 1. Don't go pushing our MRF write up
1855 * above it.
1856 */
1857 if (mrf_low >= scan_inst->base_mrf &&
1858 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
1859 break;
1860 }
1861 if (mrf_high >= scan_inst->base_mrf &&
1862 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
1863 break;
1864 }
1865 }
1866 }
1867 }
1868
1869 if (progress)
1870 live_intervals_valid = false;
1871
1872 return progress;
1873 }
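
/* The transform above, sketched on hypothetical registers:
 *
 *    ADD vgrf6, vgrf1, vgrf2
 *    MOV m4, vgrf6
 *
 * becomes, when vgrf6 has no later readers and nothing in between
 * interferes,
 *
 *    ADD m4, vgrf1, vgrf2
 */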
1874
1875 /**
1876 * Walks through basic blocks, looking for repeated MRF writes and
1877 * removing the later ones.
1878 */
1879 bool
1880 fs_visitor::remove_duplicate_mrf_writes()
1881 {
1882 fs_inst *last_mrf_move[16];
1883 bool progress = false;
1884
1885 /* Need to update the MRF tracking for compressed instructions. */
1886 if (dispatch_width == 16)
1887 return false;
1888
1889 memset(last_mrf_move, 0, sizeof(last_mrf_move));
1890
1891 foreach_list_safe(node, &this->instructions) {
1892 fs_inst *inst = (fs_inst *)node;
1893
1894 switch (inst->opcode) {
1895 case BRW_OPCODE_DO:
1896 case BRW_OPCODE_WHILE:
1897 case BRW_OPCODE_IF:
1898 case BRW_OPCODE_ELSE:
1899 case BRW_OPCODE_ENDIF:
1900 memset(last_mrf_move, 0, sizeof(last_mrf_move));
1901 continue;
1902 default:
1903 break;
1904 }
1905
1906 if (inst->opcode == BRW_OPCODE_MOV &&
1907 inst->dst.file == MRF) {
1908 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
1909 if (prev_inst && inst->equals(prev_inst)) {
1910 inst->remove();
1911 progress = true;
1912 continue;
1913 }
1914 }
1915
1916 /* Clear out the last-write records for MRFs that were overwritten. */
1917 if (inst->dst.file == MRF) {
1918 last_mrf_move[inst->dst.reg] = NULL;
1919 }
1920
1921 if (inst->mlen > 0) {
1922 /* Found a SEND instruction, which will include two or fewer
1923 * implied MRF writes. We could do better here.
1924 */
1925 for (int i = 0; i < implied_mrf_writes(inst); i++) {
1926 last_mrf_move[inst->base_mrf + i] = NULL;
1927 }
1928 }
1929
1930 /* Clear out any MRF move records whose sources got overwritten. */
1931 if (inst->dst.file == GRF) {
1932 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
1933 if (last_mrf_move[i] &&
1934 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
1935 last_mrf_move[i] = NULL;
1936 }
1937 }
1938 }
1939
1940 if (inst->opcode == BRW_OPCODE_MOV &&
1941 inst->dst.file == MRF &&
1942 inst->src[0].file == GRF &&
1943 !inst->predicate) {
1944 last_mrf_move[inst->dst.reg] = inst;
1945 }
1946 }
1947
1948 if (progress)
1949 live_intervals_valid = false;
1950
1951 return progress;
1952 }
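
/* E.g. two identical "MOV m2, vgrf3" instructions in one basic block,
 * with no intervening write to m2 or vgrf3 and no SEND consuming m2,
 * are reduced to just the first MOV.  The tracking array is reset at
 * every control-flow instruction because those end the basic block.
 */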
1953
1954 void
1955 fs_visitor::dump_instruction(fs_inst *inst)
1956 {
1957 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
1958 opcode_descs[inst->opcode].name) {
1959 printf("%s", opcode_descs[inst->opcode].name);
1960 } else {
1961 printf("op%d", inst->opcode);
1962 }
1963 if (inst->saturate)
1964 printf(".sat");
1965 printf(" ");
1966
1967 switch (inst->dst.file) {
1968 case GRF:
1969 printf("vgrf%d", inst->dst.reg);
1970 if (inst->dst.reg_offset)
1971 printf("+%d", inst->dst.reg_offset);
1972 break;
1973 case MRF:
1974 printf("m%d", inst->dst.reg);
1975 break;
1976 case BAD_FILE:
1977 printf("(null)");
1978 break;
1979 case UNIFORM:
1980 printf("***u%d***", inst->dst.reg);
1981 break;
1982 default:
1983 printf("???");
1984 break;
1985 }
1986 printf(", ");
1987
1988 for (int i = 0; i < 3; i++) {
1989 if (inst->src[i].negate)
1990 printf("-");
1991 if (inst->src[i].abs)
1992 printf("|");
1993 switch (inst->src[i].file) {
1994 case GRF:
1995 printf("vgrf%d", inst->src[i].reg);
1996 if (inst->src[i].reg_offset)
1997 printf("+%d", inst->src[i].reg_offset);
1998 break;
1999 case MRF:
2000 printf("***m%d***", inst->src[i].reg);
2001 break;
2002 case UNIFORM:
2003 printf("u%d", inst->src[i].reg);
2004 if (inst->src[i].reg_offset)
2005 printf(".%d", inst->src[i].reg_offset);
2006 break;
2007 case BAD_FILE:
2008 printf("(null)");
2009 break;
2010 default:
2011 printf("???");
2012 break;
2013 }
2014 if (inst->src[i].abs)
2015 printf("|");
2016
2017       if (i < 2)   /* separator between sources, not after the last one */
2018 printf(", ");
2019 }
2020
2021 printf(" ");
2022
2023 if (inst->force_uncompressed)
2024 printf("1sthalf ");
2025
2026 if (inst->force_sechalf)
2027 printf("2ndhalf ");
2028
2029 printf("\n");
2030 }
2031
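/**
 * Prints every instruction in the list, each prefixed with its position
 * (IP) in the instruction stream.
 */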
void
fs_visitor::dump_instructions()
{
   int ip = 0;
   foreach_list(node, &this->instructions) {
      fs_inst *inst = (fs_inst *)node;
      printf("%d: ", ip++);
      dump_instruction(inst);
   }
}

/**
 * Possibly returns an instruction that set up @param reg.
 *
 * Sometimes we want to take the result of some expression/variable
 * dereference tree and rewrite the instruction generating the result
 * of the tree.  When processing the tree, we know that the
 * instructions generated are all writing temporaries that are dead
 * outside of this tree.  So, if we have some instructions that write
 * a temporary, we're free to point that temp write somewhere else.
 *
 * Note that this doesn't guarantee that the returned instruction wrote
 * only @param reg -- it might be the size=4 destination of a texture
 * instruction.
 */
fs_inst *
fs_visitor::get_instruction_generating_reg(fs_inst *start,
                                           fs_inst *end,
                                           fs_reg reg)
{
   if (end == start ||
       end->predicate ||
       end->force_uncompressed ||
       end->force_sechalf ||
       !reg.equals(end->dst)) {
      return NULL;
   } else {
      return end;
   }
}

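/**
 * Computes the layout of the gen6+ thread payload: the fixed R0-1 header,
 * one set of barycentric coordinates per enabled interpolation mode, and
 * optionally the interpolated source depth and W.  The resulting register
 * indices are recorded in the compile state for later use.
 */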
void
fs_visitor::setup_payload_gen6()
{
   struct intel_context *intel = &brw->intel;
   bool uses_depth =
      (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
   unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;

   assert(intel->gen >= 6);

   /* R0-1: masks, pixel X/Y coordinates. */
   c->nr_payload_regs = 2;
   /* R2: only for 32-pixel dispatch. */

   /* R3-26: barycentric interpolation coordinates.  These appear in the
    * same order that they appear in the brw_wm_barycentric_interp_mode
    * enum.  Each set of coordinates occupies 2 registers if dispatch width
    * == 8 and 4 registers if dispatch width == 16.  Coordinates only
    * appear if they were enabled using the "Barycentric Interpolation
    * Mode" bits in WM_STATE.
    */
   for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
      if (barycentric_interp_modes & (1 << i)) {
         c->barycentric_coord_reg[i] = c->nr_payload_regs;
         c->nr_payload_regs += 2;
         if (dispatch_width == 16) {
            c->nr_payload_regs += 2;
         }
      }
   }

   /* R27: interpolated depth if uses source depth */
   if (uses_depth) {
      c->source_depth_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R28: interpolated depth if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W.  The same
    * uses_depth flag applies here, since both source depth and W are
    * derived from the shader reading WPOS.
    */
   if (uses_depth) {
      c->source_w_reg = c->nr_payload_regs;
      c->nr_payload_regs++;
      if (dispatch_width == 16) {
         /* R30: interpolated W if not 8-wide. */
         c->nr_payload_regs++;
      }
   }
   /* R31: MSAA position offsets. */
   /* R32-: bary for 32-pixel. */
   /* R58-59: interp W for 32-pixel. */

   if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
      c->source_depth_to_render_target = true;
   }
}

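/**
 * Runs the compile for one dispatch width: builds the FS IR from the
 * shader, runs the optimization passes, and allocates registers.  Returns
 * false on failure, with the reason recorded in fail_msg.
 */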
bool
fs_visitor::run()
{
   uint32_t orig_nr_params = c->prog_data.nr_params;

   if (intel->gen >= 6)
      setup_payload_gen6();
   else
      setup_payload_gen4();

   if (0) {
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (The visitor only descends into
       * functions called "main".)
       */
      if (shader) {
         foreach_list(node, &*shader->ir) {
            ir_instruction *ir = (ir_instruction *)node;
            base_ir = ir;
            this->result = reg_undef;
            ir->accept(this);
         }
      } else {
         emit_fragment_program_code();
      }
      base_ir = NULL;
      if (failed)
         return false;

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

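      /* Run the optimization passes to a fixed point: each pass can
       * expose new opportunities for the others, so iterate until none
       * of them reports progress.
       */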
      bool progress;
      do {
         progress = false;

         compact_virtual_grfs();

         progress = remove_duplicate_mrf_writes() || progress;

         progress = opt_algebraic() || progress;
         progress = opt_cse() || progress;
         progress = opt_copy_propagate() || progress;
         progress = dead_code_eliminate() || progress;
         progress = register_coalesce() || progress;
         progress = register_coalesce_2() || progress;
         progress = compute_to_mrf() || progress;
      } while (progress);

      remove_dead_constants();

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         for (int i = 0; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   if (dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
      (void) orig_nr_params;
   }

   return !failed;
}

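/**
 * Compiles the fragment program to native code: runs an 8-wide compile,
 * optionally a 16-wide one as well, and hands both instruction lists to
 * the generator.  Returns the generated assembly, or NULL on failure.
 */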
const unsigned *
brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
               struct gl_fragment_program *fp,
               struct gl_shader_program *prog,
               unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

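   /* Under INTEL_DEBUG=perf, record whether the GPU was already busy so
    * that we can report compiles that stall rendering.
    */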
   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];

   if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
      if (shader) {
         printf("GLSL IR for native fragment shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_fragment_program %d ir for native fragment shader\n",
                fp->Base.Id);
         _mesa_print_program(&fp->Base);
      }
   }

   /* Now the main event: Visit the shader IR and generate our FS IR for it.
    */
   fs_visitor v(brw, c, prog, fp, 8);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);

      _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
                    v.fail_msg);

      return NULL;
   }

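   /* Try a 16-wide compile as well (gen5+ only, and only when no pull
    * constants are in use).  Failure here is not fatal; we simply stay
    * with the 8-wide program.
    */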
   exec_list *simd16_instructions = NULL;
   fs_visitor v2(brw, c, prog, fp, 16);
   if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
      v2.import_uniforms(&v);
      if (!v2.run()) {
         perf_debug("16-wide shader failed to compile, falling back to "
                    "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
      } else {
         simd16_instructions = &v2.instructions;
      }
   }

   c->prog_data.dispatch_width = 8;

   fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   simd16_instructions,
                                                   final_assembly_size);

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once)
         brw_wm_debug_recompile(brw, prog, &c->key);
      shader->compiled_once = true;

      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("FS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
   }

   return generated;
}

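/**
 * Precompiles the fragment shader at link time using a guessed program
 * key, so that a state-dependent recompile (and its stall) can usually
 * be avoided at draw time.  The current WM program offset/data are saved
 * and restored so the precompile doesn't disturb bound state.
 */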
bool
brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
{
   struct brw_context *brw = brw_context(ctx);
   struct intel_context *intel = &brw->intel;
   struct brw_wm_prog_key key;

   if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
      return true;

   struct gl_fragment_program *fp = (struct gl_fragment_program *)
      prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
   struct brw_fragment_program *bfp = brw_fragment_program(fp);
   bool program_uses_dfdy = fp->UsesDFdy;

   memset(&key, 0, sizeof(key));

   if (intel->gen < 6) {
      if (fp->UsesKill)
         key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;

      if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
         key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;

      /* Just assume depth testing. */
      key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
      key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
   }

   if (prog->Name != 0)
      key.proj_attrib_mask = 0xffffffff;

   if (intel->gen < 6)
      key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);

   for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
         continue;

      if (prog->Name == 0)
         key.proj_attrib_mask |= 1 << i;

      if (intel->gen < 6) {
         int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);

         if (vp_index >= 0)
            key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
      }
   }

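   /* Guess that fragment color clamping is enabled; if the actual state
    * differs, a recompile with the real key happens at draw time.
    */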
   key.clamp_fragment_color = true;

   for (int i = 0; i < MAX_SAMPLERS; i++) {
      if (fp->Base.ShadowSamplers & (1 << i)) {
         /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
         key.tex.swizzles[i] =
            MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
      } else {
         /* Color sampler: assume no swizzling. */
         key.tex.swizzles[i] = SWIZZLE_XYZW;
      }
   }

   if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
      key.drawable_height = ctx->DrawBuffer->Height;
   }

   if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
      key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
   }

   key.nr_color_regions = 1;

   key.program_string_id = bfp->id;

   uint32_t old_prog_offset = brw->wm.prog_offset;
   struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;

   bool success = do_wm_prog(brw, prog, bfp, &key);

   brw->wm.prog_offset = old_prog_offset;
   brw->wm.prog_data = old_prog_data;

   return success;
}