i965/fs: Add support for uniform array access with a variable index.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "main/fbobject.h"
39 #include "program/prog_parameter.h"
40 #include "program/prog_print.h"
41 #include "program/register_allocate.h"
42 #include "program/sampler.h"
43 #include "program/hash_table.h"
44 #include "brw_context.h"
45 #include "brw_eu.h"
46 #include "brw_wm.h"
47 }
48 #include "brw_fs.h"
49 #include "glsl/glsl_types.h"
50 #include "glsl/ir_print_visitor.h"
51
52 void
53 fs_inst::init()
54 {
55 memset(this, 0, sizeof(*this));
56 this->opcode = BRW_OPCODE_NOP;
57 this->conditional_mod = BRW_CONDITIONAL_NONE;
58
59 this->dst = reg_undef;
60 this->src[0] = reg_undef;
61 this->src[1] = reg_undef;
62 this->src[2] = reg_undef;
63 }
64
65 fs_inst::fs_inst()
66 {
67 init();
68 }
69
70 fs_inst::fs_inst(enum opcode opcode)
71 {
72 init();
73 this->opcode = opcode;
74 }
75
76 fs_inst::fs_inst(enum opcode opcode, fs_reg dst)
77 {
78 init();
79 this->opcode = opcode;
80 this->dst = dst;
81
82 if (dst.file == GRF)
83 assert(dst.reg_offset >= 0);
84 }
85
86 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0)
87 {
88 init();
89 this->opcode = opcode;
90 this->dst = dst;
91 this->src[0] = src0;
92
93 if (dst.file == GRF)
94 assert(dst.reg_offset >= 0);
95 if (src[0].file == GRF)
96 assert(src[0].reg_offset >= 0);
97 }
98
99 fs_inst::fs_inst(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
100 {
101 init();
102 this->opcode = opcode;
103 this->dst = dst;
104 this->src[0] = src0;
105 this->src[1] = src1;
106
107 if (dst.file == GRF)
108 assert(dst.reg_offset >= 0);
109 if (src[0].file == GRF)
110 assert(src[0].reg_offset >= 0);
111 if (src[1].file == GRF)
112 assert(src[1].reg_offset >= 0);
113 }
114
115 fs_inst::fs_inst(enum opcode opcode, fs_reg dst,
116 fs_reg src0, fs_reg src1, fs_reg src2)
117 {
118 init();
119 this->opcode = opcode;
120 this->dst = dst;
121 this->src[0] = src0;
122 this->src[1] = src1;
123 this->src[2] = src2;
124
125 if (dst.file == GRF)
126 assert(dst.reg_offset >= 0);
127 if (src[0].file == GRF)
128 assert(src[0].reg_offset >= 0);
129 if (src[1].file == GRF)
130 assert(src[1].reg_offset >= 0);
131 if (src[2].file == GRF)
132 assert(src[2].reg_offset >= 0);
133 }
134
135 #define ALU1(op) \
136 fs_inst * \
137 fs_visitor::op(fs_reg dst, fs_reg src0) \
138 { \
139 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0); \
140 }
141
142 #define ALU2(op) \
143 fs_inst * \
144 fs_visitor::op(fs_reg dst, fs_reg src0, fs_reg src1) \
145 { \
146 return new(mem_ctx) fs_inst(BRW_OPCODE_##op, dst, src0, src1); \
147 }
148
149 ALU1(NOT)
150 ALU1(MOV)
151 ALU1(FRC)
152 ALU1(RNDD)
153 ALU1(RNDE)
154 ALU1(RNDZ)
155 ALU2(ADD)
156 ALU2(MUL)
157 ALU2(MACH)
158 ALU2(AND)
159 ALU2(OR)
160 ALU2(XOR)
161 ALU2(SHL)
162 ALU2(SHR)
163 ALU2(ASR)
164
165 /** Gen4 predicated IF. */
166 fs_inst *
167 fs_visitor::IF(uint32_t predicate)
168 {
169 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF);
170 inst->predicate = predicate;
171 return inst;
172 }
173
174 /** Gen6+ IF with embedded comparison. */
175 fs_inst *
176 fs_visitor::IF(fs_reg src0, fs_reg src1, uint32_t condition)
177 {
178 assert(intel->gen >= 6);
179 fs_inst *inst = new(mem_ctx) fs_inst(BRW_OPCODE_IF,
180 reg_null_d, src0, src1);
181 inst->conditional_mod = condition;
182 return inst;
183 }
184
185 /**
186 * CMP: Sets the low bit of the destination channels with the result
187 * of the comparison, while the upper bits are undefined, and updates
188 * the flag register with the packed 16 bits of the result.
189 */
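/* A typical use, as a sketch (names here are hypothetical): compare two
 * floats, then predicate a following instruction on the flag result:
 *
 *    emit(CMP(reg_null_f, a, b, BRW_CONDITIONAL_GE));
 *    fs_inst *mov = emit(BRW_OPCODE_MOV, dst, a);
 *    mov->predicate = BRW_PREDICATE_NORMAL;
 */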
190 fs_inst *
191 fs_visitor::CMP(fs_reg dst, fs_reg src0, fs_reg src1, uint32_t condition)
192 {
193 fs_inst *inst;
194
195 /* Take the instruction:
196 *
197 * CMP null<d> src0<f> src1<f>
198 *
199 * Original gen4 does type conversion to the destination type before
200 * comparison, producing garbage results for floating point comparisons.
201 * gen5 does the comparison on the execution type (resolved source types),
202 * so dst type doesn't matter. gen6 does comparison and then uses the
203 * result as if it was the dst type with no conversion, which happens to
204 * mostly work out for float-interpreted-as-int since our comparisons are
205 * for >0, =0, <0.
206 */
207 if (intel->gen == 4) {
208 dst.type = src0.type;
209 if (dst.file == FIXED_HW_REG)
210 dst.fixed_hw_reg.type = dst.type;
211 }
212
213 resolve_ud_negate(&src0);
214 resolve_ud_negate(&src1);
215
216 inst = new(mem_ctx) fs_inst(BRW_OPCODE_CMP, dst, src0, src1);
217 inst->conditional_mod = condition;
218
219 return inst;
220 }
221
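/* Emit the instruction sequence for a variably-indexed pull constant
 * load.  On gen7 this is a single send from GRFs; earlier gens go
 * through MRFs, and the offset units differ: e.g., dword element 5 of
 * the constant buffer is sent as 5 on gen6, but as the dword-aligned
 * byte offset 5 * 4 = 20 on gen4/5.
 */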
222 exec_list
223 fs_visitor::VARYING_PULL_CONSTANT_LOAD(fs_reg dst, fs_reg surf_index,
224 fs_reg offset)
225 {
226 exec_list instructions;
227 fs_inst *inst;
228
229 if (intel->gen >= 7) {
230 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7,
231 dst, surf_index, offset);
232 instructions.push_tail(inst);
233 } else {
234 int base_mrf = 13;
235 bool header_present = true;
236
237 fs_reg mrf = fs_reg(MRF, base_mrf + header_present);
238 mrf.type = BRW_REGISTER_TYPE_D;
239
240 /* On gen6+ we want the dword offset passed in, but on gen4/5 we need a
241 * dword-aligned byte offset.
242 */
243 if (intel->gen == 6) {
244 instructions.push_tail(MOV(mrf, offset));
245 } else {
246 instructions.push_tail(MUL(mrf, offset, fs_reg(4)));
247 }
249 inst = new(mem_ctx) fs_inst(FS_OPCODE_VARYING_PULL_CONSTANT_LOAD,
250 dst, surf_index);
251 inst->header_present = header_present;
252 inst->base_mrf = base_mrf;
253 inst->mlen = header_present + dispatch_width / 8;
254
255 instructions.push_tail(inst);
256 }
257
258 return instructions;
259 }
260
261 bool
262 fs_inst::equals(fs_inst *inst)
263 {
264 return (opcode == inst->opcode &&
265 dst.equals(inst->dst) &&
266 src[0].equals(inst->src[0]) &&
267 src[1].equals(inst->src[1]) &&
268 src[2].equals(inst->src[2]) &&
269 saturate == inst->saturate &&
270 predicate == inst->predicate &&
271 conditional_mod == inst->conditional_mod &&
272 mlen == inst->mlen &&
273 base_mrf == inst->base_mrf &&
274 sampler == inst->sampler &&
275 target == inst->target &&
276 eot == inst->eot &&
277 header_present == inst->header_present &&
278 shadow_compare == inst->shadow_compare &&
279 offset == inst->offset);
280 }
281
282 int
283 fs_inst::regs_written()
284 {
285 if (is_tex())
286 return 4;
287
288 /* The SINCOS and INT_DIV_QUOTIENT_AND_REMAINDER math functions return 2,
289 * but we don't currently use them...nor do we have an opcode for them.
290 */
291
292 return 1;
293 }
294
295 bool
296 fs_inst::overwrites_reg(const fs_reg &reg)
297 {
298 return (reg.file == dst.file &&
299 reg.reg == dst.reg &&
300 reg.reg_offset >= dst.reg_offset &&
301 reg.reg_offset < dst.reg_offset + regs_written());
302 }
303
304 bool
305 fs_inst::is_tex()
306 {
307 return (opcode == SHADER_OPCODE_TEX ||
308 opcode == FS_OPCODE_TXB ||
309 opcode == SHADER_OPCODE_TXD ||
310 opcode == SHADER_OPCODE_TXF ||
311 opcode == SHADER_OPCODE_TXL ||
312 opcode == SHADER_OPCODE_TXS);
313 }
314
315 bool
316 fs_inst::is_math()
317 {
318 return (opcode == SHADER_OPCODE_RCP ||
319 opcode == SHADER_OPCODE_RSQ ||
320 opcode == SHADER_OPCODE_SQRT ||
321 opcode == SHADER_OPCODE_EXP2 ||
322 opcode == SHADER_OPCODE_LOG2 ||
323 opcode == SHADER_OPCODE_SIN ||
324 opcode == SHADER_OPCODE_COS ||
325 opcode == SHADER_OPCODE_INT_QUOTIENT ||
326 opcode == SHADER_OPCODE_INT_REMAINDER ||
327 opcode == SHADER_OPCODE_POW);
328 }
329
330 bool
331 fs_inst::is_send_from_grf()
332 {
333 return opcode == FS_OPCODE_VARYING_PULL_CONSTANT_LOAD_GEN7;
334 }
335
336 bool
337 fs_visitor::can_do_source_mods(fs_inst *inst)
338 {
339 if (intel->gen == 6 && inst->is_math())
340 return false;
341
342 if (inst->is_send_from_grf())
343 return false;
344
345 return true;
346 }
347
348 void
349 fs_reg::init()
350 {
351 memset(this, 0, sizeof(*this));
352 this->smear = -1;
353 }
354
355 /** Generic unset register constructor. */
356 fs_reg::fs_reg()
357 {
358 init();
359 this->file = BAD_FILE;
360 }
361
362 /** Immediate value constructor. */
363 fs_reg::fs_reg(float f)
364 {
365 init();
366 this->file = IMM;
367 this->type = BRW_REGISTER_TYPE_F;
368 this->imm.f = f;
369 }
370
371 /** Immediate value constructor. */
372 fs_reg::fs_reg(int32_t i)
373 {
374 init();
375 this->file = IMM;
376 this->type = BRW_REGISTER_TYPE_D;
377 this->imm.i = i;
378 }
379
380 /** Immediate value constructor. */
381 fs_reg::fs_reg(uint32_t u)
382 {
383 init();
384 this->file = IMM;
385 this->type = BRW_REGISTER_TYPE_UD;
386 this->imm.u = u;
387 }
388
389 /** Fixed brw_reg Immediate value constructor. */
390 fs_reg::fs_reg(struct brw_reg fixed_hw_reg)
391 {
392 init();
393 this->file = FIXED_HW_REG;
394 this->fixed_hw_reg = fixed_hw_reg;
395 this->type = fixed_hw_reg.type;
396 }
397
398 bool
399 fs_reg::equals(const fs_reg &r) const
400 {
401 return (file == r.file &&
402 reg == r.reg &&
403 reg_offset == r.reg_offset &&
404 type == r.type &&
405 negate == r.negate &&
406 abs == r.abs &&
407 !reladdr && !r.reladdr &&
408 memcmp(&fixed_hw_reg, &r.fixed_hw_reg,
409 sizeof(fixed_hw_reg)) == 0 &&
410 smear == r.smear &&
411 imm.u == r.imm.u);
412 }
413
414 bool
415 fs_reg::is_zero() const
416 {
417 if (file != IMM)
418 return false;
419
420 return type == BRW_REGISTER_TYPE_F ? imm.f == 0.0 : imm.i == 0;
421 }
422
423 bool
424 fs_reg::is_one() const
425 {
426 if (file != IMM)
427 return false;
428
429 return type == BRW_REGISTER_TYPE_F ? imm.f == 1.0 : imm.i == 1;
430 }
431
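/* Returns the number of 32-bit components a GLSL type occupies in our
 * register layout; e.g., a mat3 is its 9 float components, a vec4[2]
 * array is 2 * 4 = 8, and a sampler is 0 since it's baked in at link
 * time.
 */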
432 int
433 fs_visitor::type_size(const struct glsl_type *type)
434 {
435 unsigned int size, i;
436
437 switch (type->base_type) {
438 case GLSL_TYPE_UINT:
439 case GLSL_TYPE_INT:
440 case GLSL_TYPE_FLOAT:
441 case GLSL_TYPE_BOOL:
442 return type->components();
443 case GLSL_TYPE_ARRAY:
444 return type_size(type->fields.array) * type->length;
445 case GLSL_TYPE_STRUCT:
446 size = 0;
447 for (i = 0; i < type->length; i++) {
448 size += type_size(type->fields.structure[i].type);
449 }
450 return size;
451 case GLSL_TYPE_SAMPLER:
452 /* Samplers take up no register space, since they're baked in at
453 * link time.
454 */
455 return 0;
456 default:
457 assert(!"not reached");
458 return 0;
459 }
460 }
461
462 void
463 fs_visitor::fail(const char *format, ...)
464 {
465 va_list va;
466 char *msg;
467
468 if (failed)
469 return;
470
471 failed = true;
472
473 va_start(va, format);
474 msg = ralloc_vasprintf(mem_ctx, format, va);
475 va_end(va);
476 msg = ralloc_asprintf(mem_ctx, "FS compile failed: %s\n", msg);
477
478 this->fail_msg = msg;
479
480 if (INTEL_DEBUG & DEBUG_WM) {
481 fprintf(stderr, "%s", msg);
482 }
483 }
484
485 fs_inst *
486 fs_visitor::emit(enum opcode opcode)
487 {
488 return emit(fs_inst(opcode));
489 }
490
491 fs_inst *
492 fs_visitor::emit(enum opcode opcode, fs_reg dst)
493 {
494 return emit(fs_inst(opcode, dst));
495 }
496
497 fs_inst *
498 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0)
499 {
500 return emit(fs_inst(opcode, dst, src0));
501 }
502
503 fs_inst *
504 fs_visitor::emit(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
505 {
506 return emit(fs_inst(opcode, dst, src0, src1));
507 }
508
509 fs_inst *
510 fs_visitor::emit(enum opcode opcode, fs_reg dst,
511 fs_reg src0, fs_reg src1, fs_reg src2)
512 {
513 return emit(fs_inst(opcode, dst, src0, src1, src2));
514 }
515
516 void
517 fs_visitor::push_force_uncompressed()
518 {
519 force_uncompressed_stack++;
520 }
521
522 void
523 fs_visitor::pop_force_uncompressed()
524 {
525 force_uncompressed_stack--;
526 assert(force_uncompressed_stack >= 0);
527 }
528
529 void
530 fs_visitor::push_force_sechalf()
531 {
532 force_sechalf_stack++;
533 }
534
535 void
536 fs_visitor::pop_force_sechalf()
537 {
538 force_sechalf_stack--;
539 assert(force_sechalf_stack >= 0);
540 }
541
542 /**
543 * Returns how many MRFs an FS opcode will write over.
544 *
545 * Note that this is not the 0 or 1 implied writes in an actual gen
546 * instruction -- the FS opcodes often generate MOVs in addition.
547 */
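/* For example, a SIMD16 SHADER_OPCODE_POW writes 2 * 16 / 8 = 4 MRFs
 * (two operands at two registers each), while a SIMD8 SHADER_OPCODE_SIN
 * writes only 1 * 8 / 8 = 1.
 */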
548 int
549 fs_visitor::implied_mrf_writes(fs_inst *inst)
550 {
551 if (inst->mlen == 0)
552 return 0;
553
554 switch (inst->opcode) {
555 case SHADER_OPCODE_RCP:
556 case SHADER_OPCODE_RSQ:
557 case SHADER_OPCODE_SQRT:
558 case SHADER_OPCODE_EXP2:
559 case SHADER_OPCODE_LOG2:
560 case SHADER_OPCODE_SIN:
561 case SHADER_OPCODE_COS:
562 return 1 * dispatch_width / 8;
563 case SHADER_OPCODE_POW:
564 case SHADER_OPCODE_INT_QUOTIENT:
565 case SHADER_OPCODE_INT_REMAINDER:
566 return 2 * dispatch_width / 8;
567 case SHADER_OPCODE_TEX:
568 case FS_OPCODE_TXB:
569 case SHADER_OPCODE_TXD:
570 case SHADER_OPCODE_TXF:
571 case SHADER_OPCODE_TXL:
572 case SHADER_OPCODE_TXS:
573 return 1;
574 case FS_OPCODE_FB_WRITE:
575 return 2;
576 case FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD:
577 case FS_OPCODE_UNSPILL:
578 return 1;
579 case FS_OPCODE_VARYING_PULL_CONSTANT_LOAD:
580 return inst->header_present;
581 case FS_OPCODE_SPILL:
582 return 2;
583 default:
584 assert(!"not reached");
585 return inst->mlen;
586 }
587 }
588
589 int
590 fs_visitor::virtual_grf_alloc(int size)
591 {
592 if (virtual_grf_array_size <= virtual_grf_count) {
593 if (virtual_grf_array_size == 0)
594 virtual_grf_array_size = 16;
595 else
596 virtual_grf_array_size *= 2;
597 virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
598 virtual_grf_array_size);
599 }
600 virtual_grf_sizes[virtual_grf_count] = size;
601 return virtual_grf_count++;
602 }
603
604 /** Fixed HW reg constructor. */
605 fs_reg::fs_reg(enum register_file file, int reg)
606 {
607 init();
608 this->file = file;
609 this->reg = reg;
610 this->type = BRW_REGISTER_TYPE_F;
611 }
612
613 /** Fixed HW reg constructor. */
614 fs_reg::fs_reg(enum register_file file, int reg, uint32_t type)
615 {
616 init();
617 this->file = file;
618 this->reg = reg;
619 this->type = type;
620 }
621
622 /** Automatic reg constructor. */
623 fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
624 {
625 init();
626
627 this->file = GRF;
628 this->reg = v->virtual_grf_alloc(v->type_size(type));
629 this->reg_offset = 0;
630 this->type = brw_type_for_base_type(type);
631 }
632
633 fs_reg *
634 fs_visitor::variable_storage(ir_variable *var)
635 {
636 return (fs_reg *)hash_table_find(this->variable_ht, var);
637 }
638
639 void
640 import_uniforms_callback(const void *key,
641 void *data,
642 void *closure)
643 {
644 struct hash_table *dst_ht = (struct hash_table *)closure;
645 const fs_reg *reg = (const fs_reg *)data;
646
647 if (reg->file != UNIFORM)
648 return;
649
650 hash_table_insert(dst_ht, data, key);
651 }
652
653 /* For 16-wide, we reuse the uniform setup from the 8-wide dispatch.
654 * This brings in those uniform definitions.
655 */
656 void
657 fs_visitor::import_uniforms(fs_visitor *v)
658 {
659 hash_table_call_foreach(v->variable_ht,
660 import_uniforms_callback,
661 variable_ht);
662 this->params_remap = v->params_remap;
663 }
664
665 /* Our support for uniforms is piggy-backed on the struct
666 * gl_fragment_program, because that's where the values actually
667 * get stored, rather than in some global gl_shader_program uniform
668 * store.
669 */
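/* For example (a sketch), a mat2 uniform recurses into two vec2
 * columns; each column appends vector_elements == 2 entries to
 * param_index/param_offset and returns 1, so the matrix consumes 4
 * params across 2 locations.
 */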
670 int
671 fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
672 {
673 unsigned int offset = 0;
674
675 if (type->is_matrix()) {
676 const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
677 type->vector_elements,
678 1);
679
680 for (unsigned int i = 0; i < type->matrix_columns; i++) {
681 offset += setup_uniform_values(loc + offset, column);
682 }
683
684 return offset;
685 }
686
687 switch (type->base_type) {
688 case GLSL_TYPE_FLOAT:
689 case GLSL_TYPE_UINT:
690 case GLSL_TYPE_INT:
691 case GLSL_TYPE_BOOL:
692 for (unsigned int i = 0; i < type->vector_elements; i++) {
693 unsigned int param = c->prog_data.nr_params++;
694
695 this->param_index[param] = loc;
696 this->param_offset[param] = i;
697 }
698 return 1;
699
700 case GLSL_TYPE_STRUCT:
701 for (unsigned int i = 0; i < type->length; i++) {
702 offset += setup_uniform_values(loc + offset,
703 type->fields.structure[i].type);
704 }
705 return offset;
706
707 case GLSL_TYPE_ARRAY:
708 for (unsigned int i = 0; i < type->length; i++) {
709 offset += setup_uniform_values(loc + offset, type->fields.array);
710 }
711 return offset;
712
713 case GLSL_TYPE_SAMPLER:
714 /* The sampler takes up a slot, but we don't use any values from it. */
715 return 1;
716
717 default:
718 assert(!"not reached");
719 return 0;
720 }
721 }
722
723
724 /* Our support for builtin uniforms is even scarier than non-builtin.
725 * It sits on top of the PROG_STATE_VAR parameters that are
726 * automatically updated from GL context state.
727 */
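/* For example, a state slot swizzled XYZW contributes four params
 * (offsets 0..3), while one swizzled XXXX stops at the first repeated
 * component and contributes just one.
 */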
728 void
729 fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
730 {
731 const ir_state_slot *const slots = ir->state_slots;
732 assert(ir->state_slots != NULL);
733
734 for (unsigned int i = 0; i < ir->num_state_slots; i++) {
735 /* This state reference has already been setup by ir_to_mesa, but we'll
736 * get the same index back here.
737 */
738 int index = _mesa_add_state_reference(this->fp->Base.Parameters,
739 (gl_state_index *)slots[i].tokens);
740
741 /* Add each of the unique swizzles of the element as a parameter.
742 * This'll end up matching the expected layout of the
743 * array/matrix/structure we're trying to fill in.
744 */
745 int last_swiz = -1;
746 for (unsigned int j = 0; j < 4; j++) {
747 int swiz = GET_SWZ(slots[i].swizzle, j);
748 if (swiz == last_swiz)
749 break;
750 last_swiz = swiz;
751
752 this->param_index[c->prog_data.nr_params] = index;
753 this->param_offset[c->prog_data.nr_params] = swiz;
754 c->prog_data.nr_params++;
755 }
756 }
757 }
758
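/* Sets up gl_FragCoord.  Whether .y needs flipping depends on both the
 * declared origin and the render target: e.g., with the GL default
 * origin_lower_left while drawing to a window, flip = !false ^ false is
 * true, so pixel_y gets negated and offset by drawable_height - 1.
 */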
759 fs_reg *
760 fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
761 {
762 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
763 fs_reg wpos = *reg;
764 bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;
765
766 /* gl_FragCoord.x */
767 if (ir->pixel_center_integer) {
768 emit(MOV(wpos, this->pixel_x));
769 } else {
770 emit(ADD(wpos, this->pixel_x, fs_reg(0.5f)));
771 }
772 wpos.reg_offset++;
773
774 /* gl_FragCoord.y */
775 if (!flip && ir->pixel_center_integer) {
776 emit(MOV(wpos, this->pixel_y));
777 } else {
778 fs_reg pixel_y = this->pixel_y;
779 float offset = (ir->pixel_center_integer ? 0.0 : 0.5);
780
781 if (flip) {
782 pixel_y.negate = true;
783 offset += c->key.drawable_height - 1.0;
784 }
785
786 emit(ADD(wpos, pixel_y, fs_reg(offset)));
787 }
788 wpos.reg_offset++;
789
790 /* gl_FragCoord.z */
791 if (intel->gen >= 6) {
792 emit(MOV(wpos, fs_reg(brw_vec8_grf(c->source_depth_reg, 0))));
793 } else {
794 emit(FS_OPCODE_LINTERP, wpos,
795 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
796 this->delta_y[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC],
797 interp_reg(FRAG_ATTRIB_WPOS, 2));
798 }
799 wpos.reg_offset++;
800
801 /* gl_FragCoord.w: Already set up in emit_interpolation */
802 emit(BRW_OPCODE_MOV, wpos, this->wpos_w);
803
804 return reg;
805 }
806
807 fs_inst *
808 fs_visitor::emit_linterp(const fs_reg &attr, const fs_reg &interp,
809 glsl_interp_qualifier interpolation_mode,
810 bool is_centroid)
811 {
812 brw_wm_barycentric_interp_mode barycoord_mode;
813 if (is_centroid) {
814 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
815 barycoord_mode = BRW_WM_PERSPECTIVE_CENTROID_BARYCENTRIC;
816 else
817 barycoord_mode = BRW_WM_NONPERSPECTIVE_CENTROID_BARYCENTRIC;
818 } else {
819 if (interpolation_mode == INTERP_QUALIFIER_SMOOTH)
820 barycoord_mode = BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC;
821 else
822 barycoord_mode = BRW_WM_NONPERSPECTIVE_PIXEL_BARYCENTRIC;
823 }
824 return emit(FS_OPCODE_LINTERP, attr,
825 this->delta_x[barycoord_mode],
826 this->delta_y[barycoord_mode], interp);
827 }
828
829 fs_reg *
830 fs_visitor::emit_general_interpolation(ir_variable *ir)
831 {
832 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
833 reg->type = brw_type_for_base_type(ir->type->get_scalar_type());
834 fs_reg attr = *reg;
835
836 unsigned int array_elements;
837 const glsl_type *type;
838
839 if (ir->type->is_array()) {
840 array_elements = ir->type->length;
841 if (array_elements == 0) {
842 fail("dereferenced array '%s' has length 0\n", ir->name);
843 }
844 type = ir->type->fields.array;
845 } else {
846 array_elements = 1;
847 type = ir->type;
848 }
849
850 glsl_interp_qualifier interpolation_mode =
851 ir->determine_interpolation_mode(c->key.flat_shade);
852
853 int location = ir->location;
854 for (unsigned int i = 0; i < array_elements; i++) {
855 for (unsigned int j = 0; j < type->matrix_columns; j++) {
856 if (urb_setup[location] == -1) {
857 /* If there's no incoming setup data for this slot, don't
858 * emit interpolation for it.
859 */
860 attr.reg_offset += type->vector_elements;
861 location++;
862 continue;
863 }
864
865 if (interpolation_mode == INTERP_QUALIFIER_FLAT) {
866 /* Constant interpolation (flat shading) case. The SF has
867 * handed us defined values in only the constant offset
868 * field of the setup reg.
869 */
870 for (unsigned int k = 0; k < type->vector_elements; k++) {
871 struct brw_reg interp = interp_reg(location, k);
872 interp = suboffset(interp, 3);
873 interp.type = reg->type;
874 emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
875 attr.reg_offset++;
876 }
877 } else {
878 /* Smooth/noperspective interpolation case. */
879 for (unsigned int k = 0; k < type->vector_elements; k++) {
880 /* FINISHME: At some point we probably want to push
881 * this farther by giving similar treatment to the
882 * other potentially constant components of the
883 * attribute, as well as making brw_vs_constval.c
884 * handle varyings other than gl_TexCoord.
885 */
886 if (location >= FRAG_ATTRIB_TEX0 &&
887 location <= FRAG_ATTRIB_TEX7 &&
888 k == 3 && !(c->key.proj_attrib_mask & (1 << location))) {
889 emit(BRW_OPCODE_MOV, attr, fs_reg(1.0f));
890 } else {
891 struct brw_reg interp = interp_reg(location, k);
892 emit_linterp(attr, fs_reg(interp), interpolation_mode,
893 ir->centroid);
894 if (brw->needs_unlit_centroid_workaround && ir->centroid) {
895 /* Get the pixel/sample mask into f0 so that we know
896 * which pixels are lit. Then, for each channel that is
897 * unlit, replace the centroid data with non-centroid
898 * data.
899 */
900 emit(FS_OPCODE_MOV_DISPATCH_TO_FLAGS, attr);
901 fs_inst *inst = emit_linterp(attr, fs_reg(interp),
902 interpolation_mode, false);
903 inst->predicate = BRW_PREDICATE_NORMAL;
904 inst->predicate_inverse = true;
905 }
906 if (intel->gen < 6) {
907 emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
908 }
909 }
910 attr.reg_offset++;
911 }
912
913 }
914 location++;
915 }
916 }
917
918 return reg;
919 }
920
921 fs_reg *
922 fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
923 {
924 fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
925
926 /* The frontfacing comes in as a bit in the thread payload. */
927 if (intel->gen >= 6) {
928 emit(BRW_OPCODE_ASR, *reg,
929 fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
930 fs_reg(15));
931 emit(BRW_OPCODE_NOT, *reg, *reg);
932 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
933 } else {
934 struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
935 /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
936 * us front face
937 */
938 emit(CMP(*reg, fs_reg(r1_6ud), fs_reg(1u << 31), BRW_CONDITIONAL_L));
939 emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
940 }
941
942 return reg;
943 }
944
945 fs_inst *
946 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src)
947 {
948 switch (opcode) {
949 case SHADER_OPCODE_RCP:
950 case SHADER_OPCODE_RSQ:
951 case SHADER_OPCODE_SQRT:
952 case SHADER_OPCODE_EXP2:
953 case SHADER_OPCODE_LOG2:
954 case SHADER_OPCODE_SIN:
955 case SHADER_OPCODE_COS:
956 break;
957 default:
958 assert(!"not reached: bad math opcode");
959 return NULL;
960 }
961
962 /* Can't do hstride == 0 args to gen6 math, so expand it out. We
963 * might be able to do better by doing execsize = 1 math and then
964 * expanding that result out, but we would need to be careful with
965 * masking.
966 *
967 * Gen 6 hardware ignores source modifiers (negate and abs) on math
968 * instructions, so we also move to a temp to set those up.
969 */
970 if (intel->gen == 6 && (src.file == UNIFORM ||
971 src.abs ||
972 src.negate)) {
973 fs_reg expanded = fs_reg(this, glsl_type::float_type);
974 emit(BRW_OPCODE_MOV, expanded, src);
975 src = expanded;
976 }
977
978 fs_inst *inst = emit(opcode, dst, src);
979
980 if (intel->gen < 6) {
981 inst->base_mrf = 2;
982 inst->mlen = dispatch_width / 8;
983 }
984
985 return inst;
986 }
987
988 fs_inst *
989 fs_visitor::emit_math(enum opcode opcode, fs_reg dst, fs_reg src0, fs_reg src1)
990 {
991 int base_mrf = 2;
992 fs_inst *inst;
993
994 switch (opcode) {
995 case SHADER_OPCODE_POW:
996 case SHADER_OPCODE_INT_QUOTIENT:
997 case SHADER_OPCODE_INT_REMAINDER:
998 break;
999 default:
1000 assert(!"not reached: unsupported binary math opcode.");
1001 return NULL;
1002 }
1003
1004 if (intel->gen >= 7) {
1005 inst = emit(opcode, dst, src0, src1);
1006 } else if (intel->gen == 6) {
1007 /* Can't do hstride == 0 args to gen6 math, so expand it out.
1008 *
1009 * The hardware ignores source modifiers (negate and abs) on math
1010 * instructions, so we also move to a temp to set those up.
1011 */
1012 if (src0.file == UNIFORM || src0.abs || src0.negate) {
1013 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1014 expanded.type = src0.type;
1015 emit(BRW_OPCODE_MOV, expanded, src0);
1016 src0 = expanded;
1017 }
1018
1019 if (src1.file == UNIFORM || src1.abs || src1.negate) {
1020 fs_reg expanded = fs_reg(this, glsl_type::float_type);
1021 expanded.type = src1.type;
1022 emit(BRW_OPCODE_MOV, expanded, src1);
1023 src1 = expanded;
1024 }
1025
1026 inst = emit(opcode, dst, src0, src1);
1027 } else {
1028 /* From the Ironlake PRM, Volume 4, Part 1, Section 6.1.13
1029 * "Message Payload":
1030 *
1031 * "Operand0[7]. For the INT DIV functions, this operand is the
1032 * denominator."
1033 * ...
1034 * "Operand1[7]. For the INT DIV functions, this operand is the
1035 * numerator."
1036 */
1037 bool is_int_div = opcode != SHADER_OPCODE_POW;
1038 fs_reg &op0 = is_int_div ? src1 : src0;
1039 fs_reg &op1 = is_int_div ? src0 : src1;
1040
1041 emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1, op1.type), op1);
1042 inst = emit(opcode, dst, op0, reg_null_f);
1043
1044 inst->base_mrf = base_mrf;
1045 inst->mlen = 2 * dispatch_width / 8;
1046 }
1047 return inst;
1048 }
1049
1050 /**
1051 * To be called after the last _mesa_add_state_reference() call, to
1052 * set up prog_data.param[] for assign_curb_setup() and
1053 * setup_pull_constants().
1054 */
1055 void
1056 fs_visitor::setup_paramvalues_refs()
1057 {
1058 if (dispatch_width != 8)
1059 return;
1060
1061 /* Set up the pointers to ParamValues now that the array is finalized. */
1062 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1063 c->prog_data.param[i] =
1064 (const float *)fp->Base.Parameters->ParameterValues[this->param_index[i]] +
1065 this->param_offset[i];
1066 }
1067 }
1068
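/* Maps UNIFORM-file registers to their fixed locations in the CURBE
 * payload; e.g., with nr_payload_regs == 2, uniform 11 lands at
 * brw_vec1_grf(2 + 11 / 8, 11 % 8), i.e. g3.3.
 */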
1069 void
1070 fs_visitor::assign_curb_setup()
1071 {
1072 c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
1073 if (dispatch_width == 8) {
1074 c->prog_data.first_curbe_grf = c->nr_payload_regs;
1075 } else {
1076 c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
1077 }
1078
1079 /* Map the offsets in the UNIFORM file to fixed HW regs. */
1080 foreach_list(node, &this->instructions) {
1081 fs_inst *inst = (fs_inst *)node;
1082
1083 for (unsigned int i = 0; i < 3; i++) {
1084 if (inst->src[i].file == UNIFORM) {
1085 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1086 struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
1087 constant_nr / 8,
1088 constant_nr % 8);
1089
1090 inst->src[i].file = FIXED_HW_REG;
1091 inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
1092 }
1093 }
1094 }
1095 }
1096
1097 void
1098 fs_visitor::calculate_urb_setup()
1099 {
1100 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1101 urb_setup[i] = -1;
1102 }
1103
1104 int urb_next = 0;
1105 /* Figure out where each of the incoming setup attributes lands. */
1106 if (intel->gen >= 6) {
1107 for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
1108 if (fp->Base.InputsRead & BITFIELD64_BIT(i)) {
1109 urb_setup[i] = urb_next++;
1110 }
1111 }
1112 } else {
1113 /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
1114 for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
1115 /* Point size is packed into the header, not as a general attribute */
1116 if (i == VERT_RESULT_PSIZ)
1117 continue;
1118
1119 if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
1120 int fp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
1121
1122 /* The back color slot is skipped when the front color is
1123 * also written to. In addition, some slots can be
1124 * written in the vertex shader and not read in the
1125 * fragment shader. So the register number must always be
1126 * incremented, mapped or not.
1127 */
1128 if (fp_index >= 0)
1129 urb_setup[fp_index] = urb_next;
1130 urb_next++;
1131 }
1132 }
1133
1134 /*
1135 * It's an FS-only attribute, and we did the interpolation for this
1136 * attribute in the SF thread. So count it here, too.
1137 *
1138 * See compile_sf_prog() for more info.
1139 */
1140 if (fp->Base.InputsRead & BITFIELD64_BIT(FRAG_ATTRIB_PNTC))
1141 urb_setup[FRAG_ATTRIB_PNTC] = urb_next++;
1142 }
1143
1144 /* Each attribute is 4 setup channels, each of which is half a reg. */
1145 c->prog_data.urb_read_length = urb_next * 2;
1146 }
1147
1148 void
1149 fs_visitor::assign_urb_setup()
1150 {
1151 int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;
1152
1153 /* Offset all the urb_setup[] index by the actual position of the
1154 * setup regs, now that the location of the constants has been chosen.
1155 */
1156 foreach_list(node, &this->instructions) {
1157 fs_inst *inst = (fs_inst *)node;
1158
1159 if (inst->opcode == FS_OPCODE_LINTERP) {
1160 assert(inst->src[2].file == FIXED_HW_REG);
1161 inst->src[2].fixed_hw_reg.nr += urb_start;
1162 }
1163
1164 if (inst->opcode == FS_OPCODE_CINTERP) {
1165 assert(inst->src[0].file == FIXED_HW_REG);
1166 inst->src[0].fixed_hw_reg.nr += urb_start;
1167 }
1168 }
1169
1170 this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
1171 }
1172
1173 /**
1174 * Split large virtual GRFs into separate components if we can.
1175 *
1176 * This is mostly duplicated with what brw_fs_vector_splitting does,
1177 * but that's really conservative because it's afraid of doing
1178 * splitting that doesn't result in real progress after the rest of
1179 * the optimization phases, which would cause infinite looping in
1180 * optimization. We can do it once here, safely. This also has the
1181 * opportunity to split interpolated values, or maybe even uniforms,
1182 * which we don't have at the IR level.
1183 *
1184 * We want to split, because virtual GRFs are what we register
1185 * allocate and spill (due to contiguousness requirements for some
1186 * instructions), and they're what we naturally generate in the
1187 * codegen process, but most virtual GRFs don't actually need to be
1188 * contiguous sets of GRFs. If we split, we'll end up with reduced
1189 * live intervals and better dead code elimination and coalescing.
1190 */
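/* As a sketch, a size-4 virtual GRF becomes four size-1 GRFs with
 * contiguous new numbers; a use of the old reg at reg_offset 2 is
 * rewritten to new_virtual_grf + 1 at reg_offset 0 (offset 0 keeps the
 * original register number).
 */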
1191 void
1192 fs_visitor::split_virtual_grfs()
1193 {
1194 int num_vars = this->virtual_grf_count;
1195 bool split_grf[num_vars];
1196 int new_virtual_grf[num_vars];
1197
1198 /* Try to split anything > 1 sized. */
1199 for (int i = 0; i < num_vars; i++) {
1200 if (this->virtual_grf_sizes[i] != 1)
1201 split_grf[i] = true;
1202 else
1203 split_grf[i] = false;
1204 }
1205
1206 if (brw->has_pln &&
1207 this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].file == GRF) {
1208 /* PLN opcodes rely on the delta_xy being contiguous. We only have to
1209 * check this for BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC, because prior to
1210 * Gen6, that was the only supported interpolation mode, and since Gen6,
1211 * delta_x and delta_y are in fixed hardware registers.
1212 */
1213 split_grf[this->delta_x[BRW_WM_PERSPECTIVE_PIXEL_BARYCENTRIC].reg] =
1214 false;
1215 }
1216
1217 foreach_list(node, &this->instructions) {
1218 fs_inst *inst = (fs_inst *)node;
1219
1220 /* If there's a SEND message that requires contiguous destination
1221 * registers, no splitting is allowed.
1222 */
1223 if (inst->regs_written() > 1) {
1224 split_grf[inst->dst.reg] = false;
1225 }
1226 }
1227
1228 /* Allocate new space for split regs. Note that the virtual
1229 * numbers will be contiguous.
1230 */
1231 for (int i = 0; i < num_vars; i++) {
1232 if (split_grf[i]) {
1233 new_virtual_grf[i] = virtual_grf_alloc(1);
1234 for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
1235 int reg = virtual_grf_alloc(1);
1236 assert(reg == new_virtual_grf[i] + j - 1);
1237 (void) reg;
1238 }
1239 this->virtual_grf_sizes[i] = 1;
1240 }
1241 }
1242
1243 foreach_list(node, &this->instructions) {
1244 fs_inst *inst = (fs_inst *)node;
1245
1246 if (inst->dst.file == GRF &&
1247 split_grf[inst->dst.reg] &&
1248 inst->dst.reg_offset != 0) {
1249 inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
1250 inst->dst.reg_offset - 1);
1251 inst->dst.reg_offset = 0;
1252 }
1253 for (int i = 0; i < 3; i++) {
1254 if (inst->src[i].file == GRF &&
1255 split_grf[inst->src[i].reg] &&
1256 inst->src[i].reg_offset != 0) {
1257 inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
1258 inst->src[i].reg_offset - 1);
1259 inst->src[i].reg_offset = 0;
1260 }
1261 }
1262 }
1263 this->live_intervals_valid = false;
1264 }
1265
1266 /**
1267 * Remove unused virtual GRFs and compact the virtual_grf_* arrays.
1268 *
1269 * During code generation, we create tons of temporary variables, many of
1270 * which get immediately killed and are never used again. Yet, in later
1271 * optimization and analysis passes, such as compute_live_intervals, we need
1272 * to loop over all the virtual GRFs. Compacting them can save a lot of
1273 * overhead.
1274 */
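/* E.g., if only vgrf0 and vgrf5 of six registers are still referenced,
 * remap_table ends up {0, -1, -1, -1, -1, 1} and every use of vgrf5 is
 * patched to read vgrf1.
 */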
1275 void
1276 fs_visitor::compact_virtual_grfs()
1277 {
1278 /* Mark which virtual GRFs are used, and count how many. */
1279 int remap_table[this->virtual_grf_count];
1280 memset(remap_table, -1, sizeof(remap_table));
1281
1282 foreach_list(node, &this->instructions) {
1283 const fs_inst *inst = (const fs_inst *) node;
1284
1285 if (inst->dst.file == GRF)
1286 remap_table[inst->dst.reg] = 0;
1287
1288 for (int i = 0; i < 3; i++) {
1289 if (inst->src[i].file == GRF)
1290 remap_table[inst->src[i].reg] = 0;
1291 }
1292 }
1293
1294 /* In addition to registers used in instructions, fs_visitor keeps
1295 * direct references to certain special values which must be patched:
1296 */
1297 fs_reg *special[] = {
1298 &frag_depth, &pixel_x, &pixel_y, &pixel_w, &wpos_w, &dual_src_output,
1299 &outputs[0], &outputs[1], &outputs[2], &outputs[3],
1300 &outputs[4], &outputs[5], &outputs[6], &outputs[7],
1301 &delta_x[0], &delta_x[1], &delta_x[2],
1302 &delta_x[3], &delta_x[4], &delta_x[5],
1303 &delta_y[0], &delta_y[1], &delta_y[2],
1304 &delta_y[3], &delta_y[4], &delta_y[5],
1305 };
1306 STATIC_ASSERT(BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT == 6);
1307 STATIC_ASSERT(BRW_MAX_DRAW_BUFFERS == 8);
1308
1309 /* Treat all special values as used, to be conservative */
1310 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1311 if (special[i]->file == GRF)
1312 remap_table[special[i]->reg] = 0;
1313 }
1314
1315 /* Compact the GRF arrays. */
1316 int new_index = 0;
1317 for (int i = 0; i < this->virtual_grf_count; i++) {
1318 if (remap_table[i] != -1) {
1319 remap_table[i] = new_index;
1320 virtual_grf_sizes[new_index] = virtual_grf_sizes[i];
1321 if (live_intervals_valid) {
1322 virtual_grf_use[new_index] = virtual_grf_use[i];
1323 virtual_grf_def[new_index] = virtual_grf_def[i];
1324 }
1325 ++new_index;
1326 }
1327 }
1328
1329 this->virtual_grf_count = new_index;
1330
1331 /* Patch all the instructions to use the newly renumbered registers */
1332 foreach_list(node, &this->instructions) {
1333 fs_inst *inst = (fs_inst *) node;
1334
1335 if (inst->dst.file == GRF)
1336 inst->dst.reg = remap_table[inst->dst.reg];
1337
1338 for (int i = 0; i < 3; i++) {
1339 if (inst->src[i].file == GRF)
1340 inst->src[i].reg = remap_table[inst->src[i].reg];
1341 }
1342 }
1343
1344 /* Patch all the references to special values */
1345 for (unsigned i = 0; i < ARRAY_SIZE(special); i++) {
1346 if (special[i]->file == GRF && remap_table[special[i]->reg] != -1)
1347 special[i]->reg = remap_table[special[i]->reg];
1348 }
1349 }
1350
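/* Drops push-constant params that no instruction still reads (for
 * instance, after reladdr accesses have been moved to the pull buffer),
 * compacting c->prog_data.param[] and renumbering the surviving UNIFORM
 * sources.
 */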
1351 bool
1352 fs_visitor::remove_dead_constants()
1353 {
1354 if (dispatch_width == 8) {
1355 this->params_remap = ralloc_array(mem_ctx, int, c->prog_data.nr_params);
1356
1357 for (unsigned int i = 0; i < c->prog_data.nr_params; i++)
1358 this->params_remap[i] = -1;
1359
1360 /* Find which params are still in use. */
1361 foreach_list(node, &this->instructions) {
1362 fs_inst *inst = (fs_inst *)node;
1363
1364 for (int i = 0; i < 3; i++) {
1365 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1366
1367 if (inst->src[i].file != UNIFORM)
1368 continue;
1369
1370 assert(constant_nr < (int)c->prog_data.nr_params);
1371
1372 /* For now, set this to non-negative. We'll give it the
1373 * actual new number in a moment, in order to keep the
1374 * register numbers nicely ordered.
1375 */
1376 this->params_remap[constant_nr] = 0;
1377 }
1378 }
1379
1380 /* Figure out what the new numbers for the params will be. At some
1381 * point when we're doing uniform array access, we're going to want
1382 * to keep the distinction between .reg and .reg_offset, but for
1383 * now we don't care.
1384 */
1385 unsigned int new_nr_params = 0;
1386 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1387 if (this->params_remap[i] != -1) {
1388 this->params_remap[i] = new_nr_params++;
1389 }
1390 }
1391
1392 /* Update the list of params to be uploaded to match our new numbering. */
1393 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1394 int remapped = this->params_remap[i];
1395
1396 if (remapped == -1)
1397 continue;
1398
1399 /* We've already done setup_paramvalues_refs() so no need to worry
1400 * about param_index and param_offset.
1401 */
1402 c->prog_data.param[remapped] = c->prog_data.param[i];
1403 }
1404
1405 c->prog_data.nr_params = new_nr_params;
1406 } else {
1407 /* This should have been generated in the 8-wide pass already. */
1408 assert(this->params_remap);
1409 }
1410
1411 /* Now do the renumbering of the shader to remove unused params. */
1412 foreach_list(node, &this->instructions) {
1413 fs_inst *inst = (fs_inst *)node;
1414
1415 for (int i = 0; i < 3; i++) {
1416 int constant_nr = inst->src[i].reg + inst->src[i].reg_offset;
1417
1418 if (inst->src[i].file != UNIFORM)
1419 continue;
1420
1421 assert(this->params_remap[constant_nr] != -1);
1422 inst->src[i].reg = this->params_remap[constant_nr];
1423 inst->src[i].reg_offset = 0;
1424 }
1425 }
1426
1427 return true;
1428 }
1429
1430 /*
1431 * Implements array access of uniforms by inserting a
1432 * PULL_CONSTANT_LOAD instruction.
1433 *
1434 * Unlike temporary GRF array access (where we don't support it due to
1435 * the difficulty of doing relative addressing on instruction
1436 * destinations), we could potentially do array access of uniforms
1437 * that were loaded in GRF space as push constants. In real-world
1438 * usage we've seen, though, the arrays being used are always larger
1439 * than we could load as push constants, so just always move all
1440 * uniform array access out to a pull constant buffer.
1441 */
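/* As a sketch, for a shader doing
 *
 *    uniform float coeff[64];
 *    ... coeff[i] ...
 *
 * the reladdr'd UNIFORM source is replaced by roughly:
 *
 *    ADD offset, reladdr, <pull_constant_loc + reg_offset>
 *    VARYING_PULL_CONSTANT_LOAD temp, surf_index, offset
 *
 * after which the instruction reads temp instead.
 */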
1442 void
1443 fs_visitor::move_uniform_array_access_to_pull_constants()
1444 {
1445 int pull_constant_loc[c->prog_data.nr_params];
1446
1447 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1448 pull_constant_loc[i] = -1;
1449 }
1450
1451 /* Walk through and find array access of uniforms. Put a copy of that
1452 * uniform in the pull constant buffer.
1453 *
1454 * Note that we don't move constant-indexed accesses to arrays. No
1455 * testing has been done of the performance impact of this choice.
1456 */
1457 foreach_list_safe(node, &this->instructions) {
1458 fs_inst *inst = (fs_inst *)node;
1459
1460 for (int i = 0 ; i < 3; i++) {
1461 if (inst->src[i].file != UNIFORM || !inst->src[i].reladdr)
1462 continue;
1463
1464 int uniform = inst->src[i].reg;
1465
1466 /* If this array isn't already present in the pull constant buffer,
1467 * add it.
1468 */
1469 if (pull_constant_loc[uniform] == -1) {
1470 const float **values = &c->prog_data.param[uniform];
1471
1472 pull_constant_loc[uniform] = c->prog_data.nr_pull_params;
1473
1474 assert(param_size[uniform]);
1475
1476 for (int j = 0; j < param_size[uniform]; j++) {
1477 c->prog_data.pull_param[c->prog_data.nr_pull_params++] =
1478 values[j];
1479 }
1480 }
1481
1482 /* Set up the annotation tracking for new generated instructions. */
1483 base_ir = inst->ir;
1484 current_annotation = inst->annotation;
1485
1486 fs_reg offset = fs_reg(this, glsl_type::int_type);
1487 inst->insert_before(ADD(offset, *inst->src[i].reladdr,
1488 fs_reg(pull_constant_loc[uniform] +
1489 inst->src[i].reg_offset)));
1490
1491 fs_reg surf_index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1492 fs_reg temp = fs_reg(this, glsl_type::float_type);
1493 exec_list list = VARYING_PULL_CONSTANT_LOAD(temp,
1494 surf_index, offset);
1495 inst->insert_before(&list);
1496
1497 inst->src[i].file = temp.file;
1498 inst->src[i].reg = temp.reg;
1499 inst->src[i].reg_offset = temp.reg_offset;
1500 inst->src[i].reladdr = NULL;
1501 }
1502 }
1503 }
1504
1505 /**
1506 * Choose accesses from the UNIFORM file to demote to using the pull
1507 * constant buffer.
1508 *
1509 * We allow a fragment shader to have more than the specified minimum
1510 * maximum number of fragment shader uniform components (64). If
1511 * there are too many of these, they'd fill up all of register space.
1512 * So, this will push some of them out to the pull constant buffer and
1513 * update the program to load them.
1514 */
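/* For example, with 200 float params and the 16-register (128
 * component) push limit, params 128..199 are demoted; each use then
 * loads the 16-byte-aligned block holding the value and uses smear
 * (pull_index & 3) to pick the component within it.
 */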
1515 void
1516 fs_visitor::setup_pull_constants()
1517 {
1518 /* Only allow 16 registers (128 uniform components) as push constants. */
1519 unsigned int max_uniform_components = 16 * 8;
1520 if (c->prog_data.nr_params <= max_uniform_components)
1521 return;
1522
1523 if (dispatch_width == 16) {
1524 fail("Pull constants not supported in 16-wide\n");
1525 return;
1526 }
1527
1528 /* Just demote the end of the list. We could probably do better
1529 * here, demoting things that are rarely used in the program first.
1530 */
1531 unsigned int pull_uniform_base = max_uniform_components;
1532
1533 int pull_constant_loc[c->prog_data.nr_params];
1534 for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
1535 if (i < pull_uniform_base) {
1536 pull_constant_loc[i] = -1;
1537 } else {
1538 pull_constant_loc[i] = -1;
1539 /* If our constant is already being uploaded for reladdr purposes,
1540 * reuse it.
1541 */
1542 for (unsigned int j = 0; j < c->prog_data.nr_pull_params; j++) {
1543 if (c->prog_data.pull_param[j] == c->prog_data.param[i]) {
1544 pull_constant_loc[i] = j;
1545 break;
1546 }
1547 }
1548 if (pull_constant_loc[i] == -1) {
1549 int pull_index = c->prog_data.nr_pull_params++;
1550 c->prog_data.pull_param[pull_index] = c->prog_data.param[i];
1551 pull_constant_loc[i] = pull_index;
1552 }
1553 }
1554 }
1555 c->prog_data.nr_params = pull_uniform_base;
1556
1557 foreach_list(node, &this->instructions) {
1558 fs_inst *inst = (fs_inst *)node;
1559
1560 for (int i = 0; i < 3; i++) {
1561 if (inst->src[i].file != UNIFORM)
1562 continue;
1563
1564 int pull_index = pull_constant_loc[inst->src[i].reg +
1565 inst->src[i].reg_offset];
1566 if (pull_index == -1)
1567 continue;
1568
1569 assert(!inst->src[i].reladdr);
1570
1571 fs_reg dst = fs_reg(this, glsl_type::float_type);
1572 fs_reg index = fs_reg((unsigned)SURF_INDEX_FRAG_CONST_BUFFER);
1573 fs_reg offset = fs_reg((unsigned)(pull_index * 4) & ~15);
1574 fs_inst *pull =
1575 new(mem_ctx) fs_inst(FS_OPCODE_UNIFORM_PULL_CONSTANT_LOAD,
1576 dst, index, offset);
1577 pull->ir = inst->ir;
1578 pull->annotation = inst->annotation;
1579 pull->base_mrf = 14;
1580 pull->mlen = 1;
1581
1582 inst->insert_before(pull);
1583
1584 inst->src[i].file = GRF;
1585 inst->src[i].reg = dst.reg;
1586 inst->src[i].reg_offset = 0;
1587 inst->src[i].smear = pull_index & 3;
1588 }
1589 }
1590 }
1591
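/* Local algebraic simplifications, e.g. rewriting MUL dst, a, 1.0f as
 * MOV dst, a and MUL dst, a, 0.0f as MOV dst, 0.0f.
 */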
1592 bool
1593 fs_visitor::opt_algebraic()
1594 {
1595 bool progress = false;
1596
1597 foreach_list(node, &this->instructions) {
1598 fs_inst *inst = (fs_inst *)node;
1599
1600 switch (inst->opcode) {
1601 case BRW_OPCODE_MUL:
1602 if (inst->src[1].file != IMM)
1603 continue;
1604
1605 /* a * 1.0 = a */
1606 if (inst->src[1].is_one()) {
1607 inst->opcode = BRW_OPCODE_MOV;
1608 inst->src[1] = reg_undef;
1609 progress = true;
1610 break;
1611 }
1612
1613 /* a * 0.0 = 0.0 */
1614 if (inst->src[1].is_zero()) {
1615 inst->opcode = BRW_OPCODE_MOV;
1616 inst->src[0] = inst->src[1];
1617 inst->src[1] = reg_undef;
1618 progress = true;
1619 break;
1620 }
1621
1622 break;
1623 case BRW_OPCODE_ADD:
1624 if (inst->src[1].file != IMM)
1625 continue;
1626
1627 /* a + 0.0 = a */
1628 if (inst->src[1].is_zero()) {
1629 inst->opcode = BRW_OPCODE_MOV;
1630 inst->src[1] = reg_undef;
1631 progress = true;
1632 break;
1633 }
1634 break;
1635 default:
1636 break;
1637 }
1638 }
1639
1640 return progress;
1641 }
1642
1643 /**
1644 * Must be called after calculate_live_intervals() to remove unused
1645 * writes to registers -- register allocation will fail otherwise
1646 * because something defined but not used won't be considered to
1647 * interfere with other regs.
1648 */
1649 bool
1650 fs_visitor::dead_code_eliminate()
1651 {
1652 bool progress = false;
1653 int pc = 0;
1654
1655 calculate_live_intervals();
1656
1657 foreach_list_safe(node, &this->instructions) {
1658 fs_inst *inst = (fs_inst *)node;
1659
1660 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1661 inst->remove();
1662 progress = true;
1663 }
1664
1665 pc++;
1666 }
1667
1668 if (progress)
1669 live_intervals_valid = false;
1670
1671 return progress;
1672 }
1673
1674 /**
1675 * Implements a second type of register coalescing: This one checks if
1676 * the two regs involved in a raw move don't interfere, in which case
1677 * they can both be stored in the same place and the MOV removed.
1678 */
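/* Sketch: given "MOV vgrf5, vgrf3" where the two live ranges don't
 * interfere, every def and use of vgrf3 is renamed to vgrf5 and the
 * MOV deleted.
 */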
1679 bool
1680 fs_visitor::register_coalesce_2()
1681 {
1682 bool progress = false;
1683
1684 calculate_live_intervals();
1685
1686 foreach_list_safe(node, &this->instructions) {
1687 fs_inst *inst = (fs_inst *)node;
1688
1689 if (inst->opcode != BRW_OPCODE_MOV ||
1690 inst->predicate ||
1691 inst->saturate ||
1692 inst->src[0].file != GRF ||
1693 inst->src[0].negate ||
1694 inst->src[0].abs ||
1695 inst->src[0].smear != -1 ||
1696 inst->dst.file != GRF ||
1697 inst->dst.type != inst->src[0].type ||
1698 virtual_grf_sizes[inst->src[0].reg] != 1 ||
1699 virtual_grf_interferes(inst->dst.reg, inst->src[0].reg)) {
1700 continue;
1701 }
1702
1703 int reg_from = inst->src[0].reg;
1704 assert(inst->src[0].reg_offset == 0);
1705 int reg_to = inst->dst.reg;
1706 int reg_to_offset = inst->dst.reg_offset;
1707
1708 foreach_list_safe(node, &this->instructions) {
1709 fs_inst *scan_inst = (fs_inst *)node;
1710
1711 if (scan_inst->dst.file == GRF &&
1712 scan_inst->dst.reg == reg_from) {
1713 scan_inst->dst.reg = reg_to;
1714 scan_inst->dst.reg_offset = reg_to_offset;
1715 }
1716 for (int i = 0; i < 3; i++) {
1717 if (scan_inst->src[i].file == GRF &&
1718 scan_inst->src[i].reg == reg_from) {
1719 scan_inst->src[i].reg = reg_to;
1720 scan_inst->src[i].reg_offset = reg_to_offset;
1721 }
1722 }
1723 }
1724
1725 inst->remove();
1726 live_intervals_valid = false;
1727 progress = true;
1728 continue;
1729 }
1730
1731 return progress;
1732 }
1733
1734 bool
1735 fs_visitor::register_coalesce()
1736 {
1737 bool progress = false;
1738 int if_depth = 0;
1739 int loop_depth = 0;
1740
1741 foreach_list_safe(node, &this->instructions) {
1742 fs_inst *inst = (fs_inst *)node;
1743
1744 /* Make sure that we dominate the instructions we're going to
1745 * scan for interfering with our coalescing, or we won't have
1746 * scanned enough to see if anything interferes with our
1747 * coalescing. We don't dominate the following instructions if
1748 * we're in a loop or an if block.
1749 */
1750 switch (inst->opcode) {
1751 case BRW_OPCODE_DO:
1752 loop_depth++;
1753 break;
1754 case BRW_OPCODE_WHILE:
1755 loop_depth--;
1756 break;
1757 case BRW_OPCODE_IF:
1758 if_depth++;
1759 break;
1760 case BRW_OPCODE_ENDIF:
1761 if_depth--;
1762 break;
1763 default:
1764 break;
1765 }
1766 if (loop_depth || if_depth)
1767 continue;
1768
1769 if (inst->opcode != BRW_OPCODE_MOV ||
1770 inst->predicate ||
1771 inst->saturate ||
1772 inst->dst.file != GRF || (inst->src[0].file != GRF &&
1773 inst->src[0].file != UNIFORM) ||
1774 inst->dst.type != inst->src[0].type)
1775 continue;
1776
1777 bool has_source_modifiers = (inst->src[0].abs ||
1778 inst->src[0].negate ||
1779 inst->src[0].file == UNIFORM);
1780
1781 /* Found a move of a GRF to a GRF. Let's see if we can coalesce
1782 * them: check for no writes to either one until the exit of the
1783 * program.
1784 */
1785 bool interfered = false;
1786
1787 for (fs_inst *scan_inst = (fs_inst *)inst->next;
1788 !scan_inst->is_tail_sentinel();
1789 scan_inst = (fs_inst *)scan_inst->next) {
1790 if (scan_inst->dst.file == GRF) {
1791 if (scan_inst->overwrites_reg(inst->dst) ||
1792 scan_inst->overwrites_reg(inst->src[0])) {
1793 interfered = true;
1794 break;
1795 }
1796 }
1797
1798 /* The gen6 MATH instruction can't handle source modifiers or
1799 * unusual register regions, so avoid coalescing those for
1800 * now. We should do something more specific.
1801 */
1802 if (has_source_modifiers && !can_do_source_mods(scan_inst)) {
1803 interfered = true;
1804 break;
1805 }
1806
1807 /* The accumulator result appears to get used for the
1808 * conditional modifier generation. When negating a UD
1809 * value, there is a 33rd bit generated for the sign in the
1810 * accumulator value, so now you can't check, for example,
1811 * equality with a 32-bit value. See piglit fs-op-neg-uint.
1812 */
1813 if (scan_inst->conditional_mod &&
1814 inst->src[0].negate &&
1815 inst->src[0].type == BRW_REGISTER_TYPE_UD) {
1816 interfered = true;
1817 break;
1818 }
1819 }
1820 if (interfered) {
1821 continue;
1822 }
1823
1824 /* Rewrite the later usage to point at the source of the move to
1825 * be removed.
1826 */
1827 for (fs_inst *scan_inst = inst;
1828 !scan_inst->is_tail_sentinel();
1829 scan_inst = (fs_inst *)scan_inst->next) {
1830 for (int i = 0; i < 3; i++) {
1831 if (scan_inst->src[i].file == GRF &&
1832 scan_inst->src[i].reg == inst->dst.reg &&
1833 scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
1834 fs_reg new_src = inst->src[0];
1835 if (scan_inst->src[i].abs) {
1836 new_src.negate = 0;
1837 new_src.abs = 1;
1838 }
1839 new_src.negate ^= scan_inst->src[i].negate;
1840 scan_inst->src[i] = new_src;
1841 }
1842 }
1843 }
1844
1845 inst->remove();
1846 progress = true;
1847 }
1848
1849 if (progress)
1850 live_intervals_valid = false;
1851
1852 return progress;
1853 }
1854
1855
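/* Compute-to-MRF: when a GRF's only remaining use is a raw MOV into an
 * MRF for message setup, rewrite the producer to target the MRF
 * directly, e.g.
 *
 *    ADD vgrf4, vgrf2, vgrf3
 *    MOV m5, vgrf4
 *
 * becomes ADD m5, vgrf2, vgrf3 when vgrf4 has no later read.
 */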
1856 bool
1857 fs_visitor::compute_to_mrf()
1858 {
1859 bool progress = false;
1860 int next_ip = 0;
1861
1862 calculate_live_intervals();
1863
1864 foreach_list_safe(node, &this->instructions) {
1865 fs_inst *inst = (fs_inst *)node;
1866
1867 int ip = next_ip;
1868 next_ip++;
1869
1870 if (inst->opcode != BRW_OPCODE_MOV ||
1871 inst->predicate ||
1872 inst->dst.file != MRF || inst->src[0].file != GRF ||
1873 inst->dst.type != inst->src[0].type ||
1874 inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
1875 continue;
1876
1877 /* Work out which hardware MRF registers are written by this
1878 * instruction.
1879 */
1880 int mrf_low = inst->dst.reg & ~BRW_MRF_COMPR4;
1881 int mrf_high;
1882 if (inst->dst.reg & BRW_MRF_COMPR4) {
1883 mrf_high = mrf_low + 4;
1884 } else if (dispatch_width == 16 &&
1885 (!inst->force_uncompressed && !inst->force_sechalf)) {
1886 mrf_high = mrf_low + 1;
1887 } else {
1888 mrf_high = mrf_low;
1889 }
1890
1891 /* Can't compute-to-MRF this GRF if someone else was going to
1892 * read it later.
1893 */
1894 if (this->virtual_grf_use[inst->src[0].reg] > ip)
1895 continue;
1896
1897 /* Found a move of a GRF to a MRF. Let's see if we can go
1898 * rewrite the thing that made this GRF to write into the MRF.
1899 */
1900 fs_inst *scan_inst;
1901 for (scan_inst = (fs_inst *)inst->prev;
1902 scan_inst->prev != NULL;
1903 scan_inst = (fs_inst *)scan_inst->prev) {
1904 if (scan_inst->dst.file == GRF &&
1905 scan_inst->dst.reg == inst->src[0].reg) {
1906 /* Found the last thing to write our reg we want to turn
1907 * into a compute-to-MRF.
1908 */
1909
1910 /* SENDs can only write to GRFs, so no compute-to-MRF. */
1911 if (scan_inst->mlen) {
1912 break;
1913 }
1914
1915 /* If it's predicated, it (probably) didn't populate all
1916 * the channels. We might be able to rewrite everything
1917 * that writes that reg, but it would require smarter
1918 * tracking to delay the rewriting until complete success.
1919 */
1920 if (scan_inst->predicate)
1921 break;
1922
1923 /* If it's half of register setup and not the same half as
1924 * our MOV we're trying to remove, bail for now.
1925 */
1926 if (scan_inst->force_uncompressed != inst->force_uncompressed ||
1927 scan_inst->force_sechalf != inst->force_sechalf) {
1928 break;
1929 }
1930
1935 if (intel->gen >= 6) {
1936 /* gen6 math instructions must have the destination be
1937 * GRF, so no compute-to-MRF for them.
1938 */
1939 if (scan_inst->is_math()) {
1940 break;
1941 }
1942 }
1943
1944 if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
1945 /* Found the creator of our MRF's source value. */
1946 scan_inst->dst.file = MRF;
1947 scan_inst->dst.reg = inst->dst.reg;
1948 scan_inst->saturate |= inst->saturate;
1949 inst->remove();
1950 progress = true;
1951 }
1952 break;
1953 }
1954
1955 /* We don't handle flow control here. Most computation of
1956 * values that end up in MRFs are shortly before the MRF
1957 * write anyway.
1958 */
1959 if (scan_inst->opcode == BRW_OPCODE_DO ||
1960 scan_inst->opcode == BRW_OPCODE_WHILE ||
1961 scan_inst->opcode == BRW_OPCODE_ELSE ||
1962 scan_inst->opcode == BRW_OPCODE_ENDIF) {
1963 break;
1964 }
1965
1966 /* You can't read from an MRF, so if someone else reads our
1967 * MRF's source GRF that we wanted to rewrite, that stops us.
1968 */
1969 bool interfered = false;
1970 for (int i = 0; i < 3; i++) {
1971 if (scan_inst->src[i].file == GRF &&
1972 scan_inst->src[i].reg == inst->src[0].reg &&
1973 scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
1974 interfered = true;
1975 }
1976 }
1977 if (interfered)
1978 break;
1979
1980 if (scan_inst->dst.file == MRF) {
1981 /* If somebody else writes our MRF here, we can't
1982 * compute-to-MRF before that.
1983 */
1984 int scan_mrf_low = scan_inst->dst.reg & ~BRW_MRF_COMPR4;
1985 int scan_mrf_high;
1986
1987 if (scan_inst->dst.reg & BRW_MRF_COMPR4) {
1988 scan_mrf_high = scan_mrf_low + 4;
1989 } else if (dispatch_width == 16 &&
1990 (!scan_inst->force_uncompressed &&
1991 !scan_inst->force_sechalf)) {
1992 scan_mrf_high = scan_mrf_low + 1;
1993 } else {
1994 scan_mrf_high = scan_mrf_low;
1995 }
1996
1997 if (mrf_low == scan_mrf_low ||
1998 mrf_low == scan_mrf_high ||
1999 mrf_high == scan_mrf_low ||
2000 mrf_high == scan_mrf_high) {
2001 break;
2002 }
2003 }
2004
2005 if (scan_inst->mlen > 0) {
2006 /* Found a SEND instruction, which means that there are
2007 * live values in MRFs from base_mrf to base_mrf +
2008 * scan_inst->mlen - 1. Don't go pushing our MRF write up
2009 * above it.
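             * (For instance, base_mrf == 2 with mlen == 3 keeps m2..m4
             * live across the SEND.)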
2010 */
2011 if (mrf_low >= scan_inst->base_mrf &&
2012 mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
2013 break;
2014 }
2015 if (mrf_high >= scan_inst->base_mrf &&
2016 mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
2017 break;
2018 }
2019 }
2020 }
2021 }
2022
2023 if (progress)
2024 live_intervals_valid = false;
2025
2026 return progress;
2027 }
2028
2029 /**
2030 * Walks through basic blocks, looking for repeated MRF writes and
2031 * removing the later ones.
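 *
 * For example, given two identical moves with no intervening write to
 * either register:
 *
 *    mov m3, vgrf7
 *    ...
 *    mov m3, vgrf7
 *
 * the second MOV is removed.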
2032 */
2033 bool
2034 fs_visitor::remove_duplicate_mrf_writes()
2035 {
2036 fs_inst *last_mrf_move[16];
2037 bool progress = false;
2038
2039    /* The MRF tracking below doesn't yet handle compressed instructions, so bail. */
2040 if (dispatch_width == 16)
2041 return false;
2042
2043 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2044
2045 foreach_list_safe(node, &this->instructions) {
2046 fs_inst *inst = (fs_inst *)node;
2047
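      /* At any control-flow instruction, forget all tracked writes: a
       * previously seen MRF write may sit on a branch that doesn't
       * dominate this point.
       */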
2048 switch (inst->opcode) {
2049 case BRW_OPCODE_DO:
2050 case BRW_OPCODE_WHILE:
2051 case BRW_OPCODE_IF:
2052 case BRW_OPCODE_ELSE:
2053 case BRW_OPCODE_ENDIF:
2054 memset(last_mrf_move, 0, sizeof(last_mrf_move));
2055 continue;
2056 default:
2057 break;
2058 }
2059
2060 if (inst->opcode == BRW_OPCODE_MOV &&
2061 inst->dst.file == MRF) {
2062 fs_inst *prev_inst = last_mrf_move[inst->dst.reg];
2063 if (prev_inst && inst->equals(prev_inst)) {
2064 inst->remove();
2065 progress = true;
2066 continue;
2067 }
2068 }
2069
2070       /* Clear out the last-write record for the MRF being overwritten. */
2071 if (inst->dst.file == MRF) {
2072 last_mrf_move[inst->dst.reg] = NULL;
2073 }
2074
2075 if (inst->mlen > 0) {
2076          /* Found a SEND instruction; clear the tracking for every MRF
2077           * register it implicitly writes.
2078           */
2079 for (int i = 0; i < implied_mrf_writes(inst); i++) {
2080 last_mrf_move[inst->base_mrf + i] = NULL;
2081 }
2082 }
2083
2084 /* Clear out any MRF move records whose sources got overwritten. */
2085 if (inst->dst.file == GRF) {
2086 for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
2087 if (last_mrf_move[i] &&
2088 last_mrf_move[i]->src[0].reg == inst->dst.reg) {
2089 last_mrf_move[i] = NULL;
2090 }
2091 }
2092 }
2093
2094 if (inst->opcode == BRW_OPCODE_MOV &&
2095 inst->dst.file == MRF &&
2096 inst->src[0].file == GRF &&
2097 !inst->predicate) {
2098 last_mrf_move[inst->dst.reg] = inst;
2099 }
2100 }
2101
2102 if (progress)
2103 live_intervals_valid = false;
2104
2105 return progress;
2106 }
2107
2108 void
2109 fs_visitor::dump_instruction(fs_inst *inst)
2110 {
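   /* Prints one human-readable line per instruction.  For an
    * unpredicated ADD of a vgrf and a uniform, the output looks
    * roughly like:
    *
    *    add vgrf7, vgrf5, u1, (null)
    */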
2111 if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
2112 opcode_descs[inst->opcode].name) {
2113 printf("%s", opcode_descs[inst->opcode].name);
2114 } else {
2115 printf("op%d", inst->opcode);
2116 }
2117 if (inst->saturate)
2118 printf(".sat");
2119 printf(" ");
2120
2121 switch (inst->dst.file) {
2122 case GRF:
2123 printf("vgrf%d", inst->dst.reg);
2124 if (inst->dst.reg_offset)
2125 printf("+%d", inst->dst.reg_offset);
2126 break;
2127 case MRF:
2128 printf("m%d", inst->dst.reg);
2129 break;
2130 case BAD_FILE:
2131 printf("(null)");
2132 break;
2133 case UNIFORM:
2134 printf("***u%d***", inst->dst.reg);
2135 break;
2136 default:
2137 printf("???");
2138 break;
2139 }
2140 printf(", ");
2141
2142 for (int i = 0; i < 3; i++) {
2143 if (inst->src[i].negate)
2144 printf("-");
2145 if (inst->src[i].abs)
2146 printf("|");
2147 switch (inst->src[i].file) {
2148 case GRF:
2149 printf("vgrf%d", inst->src[i].reg);
2150 if (inst->src[i].reg_offset)
2151 printf("+%d", inst->src[i].reg_offset);
2152 break;
2153 case MRF:
2154 printf("***m%d***", inst->src[i].reg);
2155 break;
2156 case UNIFORM:
2157 printf("u%d", inst->src[i].reg);
2158 if (inst->src[i].reg_offset)
2159 printf(".%d", inst->src[i].reg_offset);
2160 break;
2161 case BAD_FILE:
2162 printf("(null)");
2163 break;
2164 default:
2165 printf("???");
2166 break;
2167 }
2168 if (inst->src[i].abs)
2169 printf("|");
2170
2171       if (i < 2)
2172          printf(", ");
2173 }
2174
2175 printf(" ");
2176
2177 if (inst->force_uncompressed)
2178 printf("1sthalf ");
2179
2180 if (inst->force_sechalf)
2181 printf("2ndhalf ");
2182
2183 printf("\n");
2184 }
2185
2186 void
2187 fs_visitor::dump_instructions()
2188 {
2189 int ip = 0;
2190 foreach_list(node, &this->instructions) {
2191 fs_inst *inst = (fs_inst *)node;
2192 printf("%d: ", ip++);
2193 dump_instruction(inst);
2194 }
2195 }
2196
2197 /**
2198 * Possibly returns an instruction that set up @param reg.
2199 *
2200 * Sometimes we want to take the result of some expression/variable
2201 * dereference tree and rewrite the instruction generating the result
2202 * of the tree. When processing the tree, we know that the
2203 * instructions generated are all writing temporaries that are dead
2204 * outside of this tree. So, if we have some instructions that write
2205 * a temporary, we're free to point that temp write somewhere else.
2206 *
2207  * Note that this doesn't guarantee that the returned instruction wrote only
2208  * @param reg -- it might be the size=4 destination of a texture instruction.
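 *
 * Typical use: after emitting the instructions for an expression tree,
 * pass the first and last emitted instructions plus the result register;
 * a non-NULL return means the caller may retarget that instruction's
 * destination instead of emitting an extra MOV.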
2209 */
2210 fs_inst *
2211 fs_visitor::get_instruction_generating_reg(fs_inst *start,
2212 fs_inst *end,
2213 fs_reg reg)
2214 {
2215 if (end == start ||
2216 end->predicate ||
2217 end->force_uncompressed ||
2218 end->force_sechalf ||
2219 reg.reladdr ||
2220 !reg.equals(end->dst)) {
2221 return NULL;
2222 } else {
2223 return end;
2224 }
2225 }
2226
2227 void
2228 fs_visitor::setup_payload_gen6()
2229 {
2230 struct intel_context *intel = &brw->intel;
2231 bool uses_depth =
2232 (fp->Base.InputsRead & (1 << FRAG_ATTRIB_WPOS)) != 0;
2233 unsigned barycentric_interp_modes = c->prog_data.barycentric_interp_modes;
2234
2235 assert(intel->gen >= 6);
2236
2237 /* R0-1: masks, pixel X/Y coordinates. */
2238 c->nr_payload_regs = 2;
2239    /* R2: only for 32-pixel dispatch. */
2240
2241 /* R3-26: barycentric interpolation coordinates. These appear in the
2242 * same order that they appear in the brw_wm_barycentric_interp_mode
2243 * enum. Each set of coordinates occupies 2 registers if dispatch width
2244 * == 8 and 4 registers if dispatch width == 16. Coordinates only
2245 * appear if they were enabled using the "Barycentric Interpolation
2246 * Mode" bits in WM_STATE.
2247 */
2248 for (int i = 0; i < BRW_WM_BARYCENTRIC_INTERP_MODE_COUNT; ++i) {
2249 if (barycentric_interp_modes & (1 << i)) {
2250 c->barycentric_coord_reg[i] = c->nr_payload_regs;
2251 c->nr_payload_regs += 2;
2252 if (dispatch_width == 16) {
2253 c->nr_payload_regs += 2;
2254 }
2255 }
2256 }
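   /* For example, a 16-wide shader using only one barycentric mode
    * reaches this point with nr_payload_regs == 6: R0-1 plus four
    * registers of barycentric coordinates.
    */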
2257
2258    /* R27: interpolated depth, if the shader uses source depth. */
2259 if (uses_depth) {
2260 c->source_depth_reg = c->nr_payload_regs;
2261 c->nr_payload_regs++;
2262 if (dispatch_width == 16) {
2263 /* R28: interpolated depth if not 8-wide. */
2264 c->nr_payload_regs++;
2265 }
2266 }
2267 /* R29: interpolated W set if GEN6_WM_USES_SOURCE_W. */
2268 if (uses_depth) {
2269 c->source_w_reg = c->nr_payload_regs;
2270 c->nr_payload_regs++;
2271 if (dispatch_width == 16) {
2272 /* R30: interpolated W if not 8-wide. */
2273 c->nr_payload_regs++;
2274 }
2275 }
2276 /* R31: MSAA position offsets. */
2277 /* R32-: bary for 32-pixel. */
2278 /* R58-59: interp W for 32-pixel. */
2279
2280 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH)) {
2281 c->source_depth_to_render_target = true;
2282 }
2283 }
2284
2285 bool
2286 fs_visitor::run()
2287 {
2288 uint32_t orig_nr_params = c->prog_data.nr_params;
2289
2290 if (intel->gen >= 6)
2291 setup_payload_gen6();
2292 else
2293 setup_payload_gen4();
2294
2295 if (0) {
2296 emit_dummy_fs();
2297 } else {
2298 calculate_urb_setup();
2299 if (intel->gen < 6)
2300 emit_interpolation_setup_gen4();
2301 else
2302 emit_interpolation_setup_gen6();
2303
2304       /* Generate FS IR for main().  (The visitor only descends into
2305        * functions called "main".)
2306        */
2307 if (shader) {
2308 foreach_list(node, &*shader->ir) {
2309 ir_instruction *ir = (ir_instruction *)node;
2310 base_ir = ir;
2311 this->result = reg_undef;
2312 ir->accept(this);
2313 }
2314 } else {
2315 emit_fragment_program_code();
2316 }
2317 base_ir = NULL;
2318 if (failed)
2319 return false;
2320
2321 emit_fb_writes();
2322
2323 split_virtual_grfs();
2324
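      /* Uniform array access with a variable index can't easily live in
       * the push-constant payload, so it is rewritten to use pull
       * constants before the remaining uniforms get laid out.
       */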
2325 setup_paramvalues_refs();
2326 move_uniform_array_access_to_pull_constants();
2327 setup_pull_constants();
2328
2329 bool progress;
2330 do {
2331 progress = false;
2332
2333 compact_virtual_grfs();
2334
2335 progress = remove_duplicate_mrf_writes() || progress;
2336
2337 progress = opt_algebraic() || progress;
2338 progress = opt_cse() || progress;
2339 progress = opt_copy_propagate() || progress;
2340 progress = dead_code_eliminate() || progress;
2341 progress = register_coalesce() || progress;
2342 progress = register_coalesce_2() || progress;
2343 progress = compute_to_mrf() || progress;
2344 } while (progress);
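      /* The passes above run to a fixed point because each can expose
       * work for another: e.g. copy propagation may turn
       * "mov a, b; add c, a, d" into "add c, b, d", leaving the MOV
       * for dead_code_eliminate() on a later iteration.
       */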
2345
2346 remove_dead_constants();
2347
2348 schedule_instructions();
2349
2350 assign_curb_setup();
2351 assign_urb_setup();
2352
2353 if (0) {
2354 /* Debug of register spilling: Go spill everything. */
2355 for (int i = 0; i < virtual_grf_count; i++) {
2356 spill_reg(i);
2357 }
2358 }
2359
2360 if (0)
2361 assign_regs_trivial();
2362 else {
2363 while (!assign_regs()) {
2364 if (failed)
2365 break;
2366 }
2367 }
2368 }
2369 assert(force_uncompressed_stack == 0);
2370 assert(force_sechalf_stack == 0);
2371
2372 if (failed)
2373 return false;
2374
2375 if (dispatch_width == 8) {
2376 c->prog_data.reg_blocks = brw_register_blocks(grf_used);
2377 } else {
2378 c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
2379
2380 /* Make sure we didn't try to sneak in an extra uniform */
2381 assert(orig_nr_params == c->prog_data.nr_params);
2382 (void) orig_nr_params;
2383 }
2384
2385 return !failed;
2386 }
2387
2388 const unsigned *
2389 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c,
2390 struct gl_fragment_program *fp,
2391 struct gl_shader_program *prog,
2392 unsigned *final_assembly_size)
2393 {
2394 struct intel_context *intel = &brw->intel;
2395 bool start_busy = false;
2396 float start_time = 0;
2397
2398 if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
2399 start_busy = (intel->batch.last_bo &&
2400 drm_intel_bo_busy(intel->batch.last_bo));
2401 start_time = get_time();
2402 }
2403
2404 struct brw_shader *shader = NULL;
2405 if (prog)
2406 shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
2407
2408 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
2409 if (shader) {
2410 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
2411 _mesa_print_ir(shader->ir, NULL);
2412 printf("\n\n");
2413 } else {
2414          printf("ARB_fragment_program %d IR for native fragment shader\n",
2415 fp->Base.Id);
2416 _mesa_print_program(&fp->Base);
2417 }
2418 }
2419
2420 /* Now the main event: Visit the shader IR and generate our FS IR for it.
2421 */
2422 fs_visitor v(brw, c, prog, fp, 8);
2423 if (!v.run()) {
2424 prog->LinkStatus = false;
2425 ralloc_strcat(&prog->InfoLog, v.fail_msg);
2426
2427 _mesa_problem(NULL, "Failed to compile fragment shader: %s\n",
2428 v.fail_msg);
2429
2430 return NULL;
2431 }
2432
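   /* Also try a 16-wide compile, but only on gen5+ and only when no
    * pull constants are in use; if it fails, the 8-wide program alone
    * is used.
    */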
2433 exec_list *simd16_instructions = NULL;
2434 fs_visitor v2(brw, c, prog, fp, 16);
2435 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
2436 v2.import_uniforms(&v);
2437 if (!v2.run()) {
2438 perf_debug("16-wide shader failed to compile, falling back to "
2439 "8-wide at a 10-20%% performance cost: %s", v2.fail_msg);
2440 } else {
2441 simd16_instructions = &v2.instructions;
2442 }
2443 }
2444
2445 c->prog_data.dispatch_width = 8;
2446
2447 fs_generator g(brw, c, prog, fp, v.dual_src_output.file != BAD_FILE);
2448 const unsigned *generated = g.generate_assembly(&v.instructions,
2449 simd16_instructions,
2450 final_assembly_size);
2451
2452 if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
2453 if (shader->compiled_once)
2454 brw_wm_debug_recompile(brw, prog, &c->key);
2455 shader->compiled_once = true;
2456
2457 if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
2458 perf_debug("FS compile took %.03f ms and stalled the GPU\n",
2459 (get_time() - start_time) * 1000);
2460 }
2461 }
2462
2463 return generated;
2464 }
2465
2466 bool
2467 brw_fs_precompile(struct gl_context *ctx, struct gl_shader_program *prog)
2468 {
2469 struct brw_context *brw = brw_context(ctx);
2470 struct intel_context *intel = &brw->intel;
2471 struct brw_wm_prog_key key;
2472
2473 if (!prog->_LinkedShaders[MESA_SHADER_FRAGMENT])
2474 return true;
2475
2476 struct gl_fragment_program *fp = (struct gl_fragment_program *)
2477 prog->_LinkedShaders[MESA_SHADER_FRAGMENT]->Program;
2478 struct brw_fragment_program *bfp = brw_fragment_program(fp);
2479 bool program_uses_dfdy = fp->UsesDFdy;
2480
2481 memset(&key, 0, sizeof(key));
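   /* No draw-time state is available at link time, so build a guess at
    * the most likely program key; if the guess matches the key seen at
    * draw time, this precompiled program should get reused.
    */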
2482
2483 if (intel->gen < 6) {
2484 if (fp->UsesKill)
2485 key.iz_lookup |= IZ_PS_KILL_ALPHATEST_BIT;
2486
2487 if (fp->Base.OutputsWritten & BITFIELD64_BIT(FRAG_RESULT_DEPTH))
2488 key.iz_lookup |= IZ_PS_COMPUTES_DEPTH_BIT;
2489
2490 /* Just assume depth testing. */
2491 key.iz_lookup |= IZ_DEPTH_TEST_ENABLE_BIT;
2492 key.iz_lookup |= IZ_DEPTH_WRITE_ENABLE_BIT;
2493 }
2494
2495 if (prog->Name != 0)
2496 key.proj_attrib_mask = 0xffffffff;
2497
2498 if (intel->gen < 6)
2499 key.vp_outputs_written |= BITFIELD64_BIT(FRAG_ATTRIB_WPOS);
2500
2501 for (int i = 0; i < FRAG_ATTRIB_MAX; i++) {
2502 if (!(fp->Base.InputsRead & BITFIELD64_BIT(i)))
2503 continue;
2504
2505 if (prog->Name == 0)
2506 key.proj_attrib_mask |= 1 << i;
2507
2508 if (intel->gen < 6) {
2509 int vp_index = _mesa_vert_result_to_frag_attrib((gl_vert_result) i);
2510
2511 if (vp_index >= 0)
2512 key.vp_outputs_written |= BITFIELD64_BIT(vp_index);
2513 }
2514 }
2515
2516 key.clamp_fragment_color = true;
2517
2518 for (int i = 0; i < MAX_SAMPLERS; i++) {
2519 if (fp->Base.ShadowSamplers & (1 << i)) {
2520 /* Assume DEPTH_TEXTURE_MODE is the default: X, X, X, 1 */
2521 key.tex.swizzles[i] =
2522 MAKE_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_ONE);
2523 } else {
2524 /* Color sampler: assume no swizzling. */
2525 key.tex.swizzles[i] = SWIZZLE_XYZW;
2526 }
2527 }
2528
2529 if (fp->Base.InputsRead & FRAG_BIT_WPOS) {
2530 key.drawable_height = ctx->DrawBuffer->Height;
2531 }
2532
2533 if ((fp->Base.InputsRead & FRAG_BIT_WPOS) || program_uses_dfdy) {
2534 key.render_to_fbo = _mesa_is_user_fbo(ctx->DrawBuffer);
2535 }
2536
2537 key.nr_color_regions = 1;
2538
2539 key.program_string_id = bfp->id;
2540
2541 uint32_t old_prog_offset = brw->wm.prog_offset;
2542 struct brw_wm_prog_data *old_prog_data = brw->wm.prog_data;
2543
2544 bool success = do_wm_prog(brw, prog, bfp, &key);
2545
2546 brw->wm.prog_offset = old_prog_offset;
2547 brw->wm.prog_data = old_prog_data;
2548
2549 return success;
2550 }