i965/fs: Split the GLSL IR -> FS LIR visitor to brw_fs_visitor.cpp.
[mesa.git] / src / mesa / drivers / dri / i965 / brw_fs.cpp
1 /*
2 * Copyright © 2010 Intel Corporation
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8 * and/or sell copies of the Software, and to permit persons to whom the
9 * Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21 * IN THE SOFTWARE.
22 */
23
24 /** @file brw_fs.cpp
25 *
26 * This file drives the GLSL IR -> LIR translation, contains the
27 * optimizations on the LIR, and drives the generation of native code
28 * from the LIR.
29 */
30
31 extern "C" {
32
33 #include <sys/types.h>
34
35 #include "main/macros.h"
36 #include "main/shaderobj.h"
37 #include "main/uniforms.h"
38 #include "program/prog_parameter.h"
39 #include "program/prog_print.h"
40 #include "program/register_allocate.h"
41 #include "program/sampler.h"
42 #include "program/hash_table.h"
43 #include "brw_context.h"
44 #include "brw_eu.h"
45 #include "brw_wm.h"
46 }
47 #include "brw_shader.h"
48 #include "brw_fs.h"
49 #include "../glsl/glsl_types.h"
50 #include "../glsl/ir_print_visitor.h"
51
52 #define MAX_INSTRUCTION (1 << 30)
53
54 int
55 fs_visitor::type_size(const struct glsl_type *type)
56 {
57 unsigned int size, i;
58
59 switch (type->base_type) {
60 case GLSL_TYPE_UINT:
61 case GLSL_TYPE_INT:
62 case GLSL_TYPE_FLOAT:
63 case GLSL_TYPE_BOOL:
64 return type->components();
65 case GLSL_TYPE_ARRAY:
66 return type_size(type->fields.array) * type->length;
67 case GLSL_TYPE_STRUCT:
68 size = 0;
69 for (i = 0; i < type->length; i++) {
70 size += type_size(type->fields.structure[i].type);
71 }
72 return size;
73 case GLSL_TYPE_SAMPLER:
74 /* Samplers take up no register space, since they're baked in at
75 * link time.
76 */
77 return 0;
78 default:
79 assert(!"not reached");
80 return 0;
81 }
82 }
83
/**
 * Marks the compile as failed and, if WM debugging is enabled, prints the
 * printf-style reason to stderr.
 *
 * Only the first failure is recorded/reported; later calls are no-ops so
 * the original cause of failure isn't drowned out by follow-on errors.
 */
void
fs_visitor::fail(const char *format, ...)
{
   if (!failed) {
      failed = true;

      if (INTEL_DEBUG & DEBUG_WM) {
         fprintf(stderr, "FS compile failed: ");

         va_list va;
         va_start(va, format);
         vfprintf(stderr, format, va);
         va_end(va);
      }
   }
}
100
/** Begins a region where instructions are emitted uncompressed (8-wide). */
void
fs_visitor::push_force_uncompressed()
{
   force_uncompressed_stack++;
}
106
/** Ends the innermost push_force_uncompressed() region. */
void
fs_visitor::pop_force_uncompressed()
{
   force_uncompressed_stack--;
   /* A pop without a matching push indicates a bookkeeping bug. */
   assert(force_uncompressed_stack >= 0);
}
113
/** Begins a region where instructions target the second half of a pair. */
void
fs_visitor::push_force_sechalf()
{
   force_sechalf_stack++;
}
119
/** Ends the innermost push_force_sechalf() region. */
void
fs_visitor::pop_force_sechalf()
{
   force_sechalf_stack--;
   /* A pop without a matching push indicates a bookkeeping bug. */
   assert(force_sechalf_stack >= 0);
}
126
/**
 * Returns how many MRFs an FS opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the FS opcodes often generate MOVs in addition.
 */
int
fs_visitor::implied_mrf_writes(fs_inst *inst)
{
   /* No message payload means no MRFs are written. */
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      /* One-operand math: payload size scales with dispatch width. */
      return 1 * c->dispatch_width / 8;
   case FS_OPCODE_POW:
      /* Two-operand math: twice the payload. */
      return 2 * c->dispatch_width / 8;
   case FS_OPCODE_TEX:
   case FS_OPCODE_TXB:
   case FS_OPCODE_TXD:
   case FS_OPCODE_TXL:
      return 1;
   case FS_OPCODE_FB_WRITE:
      return 2;
   case FS_OPCODE_PULL_CONSTANT_LOAD:
   case FS_OPCODE_UNSPILL:
      return 1;
   case FS_OPCODE_SPILL:
      return 2;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
167
/**
 * Allocates a new virtual GRF of \p size registers and returns its index,
 * growing the virtual_grf_sizes tracking array as needed.
 */
int
fs_visitor::virtual_grf_alloc(int size)
{
   if (virtual_grf_array_size <= virtual_grf_next) {
      /* Grow geometrically to amortize reallocation cost. */
      if (virtual_grf_array_size == 0)
         virtual_grf_array_size = 16;
      else
         virtual_grf_array_size *= 2;
      virtual_grf_sizes = reralloc(mem_ctx, virtual_grf_sizes, int,
                                   virtual_grf_array_size);

      /* This slot is always unused. */
      virtual_grf_sizes[0] = 0;
   }
   virtual_grf_sizes[virtual_grf_next] = size;
   return virtual_grf_next++;
}
185
/** Fixed HW reg constructor; the register type defaults to float. */
fs_reg::fs_reg(enum register_file file, int hw_reg)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = BRW_REGISTER_TYPE_F;
}
194
/** Fixed HW reg constructor with an explicit register type. */
fs_reg::fs_reg(enum register_file file, int hw_reg, uint32_t type)
{
   init();
   this->file = file;
   this->hw_reg = hw_reg;
   this->type = type;
}
203
/**
 * Automatic reg constructor: allocates a fresh virtual GRF sized to hold
 * the given GLSL type, typed to match it.
 */
fs_reg::fs_reg(class fs_visitor *v, const struct glsl_type *type)
{
   init();

   this->file = GRF;
   this->reg = v->virtual_grf_alloc(v->type_size(type));
   this->reg_offset = 0;
   this->type = brw_type_for_base_type(type);
}
214
/**
 * Returns the register storage previously assigned to \p var, or NULL if
 * the variable has no storage yet.
 */
fs_reg *
fs_visitor::variable_storage(ir_variable *var)
{
   return (fs_reg *)hash_table_find(this->variable_ht, var);
}
220
221 void
222 import_uniforms_callback(const void *key,
223 void *data,
224 void *closure)
225 {
226 struct hash_table *dst_ht = (struct hash_table *)closure;
227 const fs_reg *reg = (const fs_reg *)data;
228
229 if (reg->file != UNIFORM)
230 return;
231
232 hash_table_insert(dst_ht, data, key);
233 }
234
/* For 16-wide, we need to follow from the uniform setup of 8-wide dispatch.
 * This brings in those uniform definitions (only UNIFORM-file registers are
 * copied over -- see import_uniforms_callback).
 */
void
fs_visitor::import_uniforms(struct hash_table *src_variable_ht)
{
   hash_table_call_foreach(src_variable_ht,
                           import_uniforms_callback,
                           variable_ht);
}
245
/* Our support for uniforms is piggy-backed on the struct
 * gl_fragment_program, because that's where the values actually
 * get stored, rather than in some global gl_shader_program uniform
 * store.
 *
 * Recursively walks \p type starting at parameter location \p loc,
 * recording a param_index/param_offset pair (and a float conversion mode)
 * for every scalar component.  Returns the number of parameter locations
 * consumed.
 */
int
fs_visitor::setup_uniform_values(int loc, const glsl_type *type)
{
   unsigned int offset = 0;

   if (type->is_matrix()) {
      /* Decompose a matrix into its column vectors. */
      const glsl_type *column = glsl_type::get_instance(GLSL_TYPE_FLOAT,
                                                        type->vector_elements,
                                                        1);

      for (unsigned int i = 0; i < type->matrix_columns; i++) {
         offset += setup_uniform_values(loc + offset, column);
      }

      return offset;
   }

   switch (type->base_type) {
   case GLSL_TYPE_FLOAT:
   case GLSL_TYPE_UINT:
   case GLSL_TYPE_INT:
   case GLSL_TYPE_BOOL:
      for (unsigned int i = 0; i < type->vector_elements; i++) {
         unsigned int param = c->prog_data.nr_params++;

         assert(param < ARRAY_SIZE(c->prog_data.param));

         /* Values are stored as floats; record how to convert back to
          * the declared base type at upload time.
          */
         switch (type->base_type) {
         case GLSL_TYPE_FLOAT:
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
            break;
         case GLSL_TYPE_UINT:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2U;
            break;
         case GLSL_TYPE_INT:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2I;
            break;
         case GLSL_TYPE_BOOL:
            c->prog_data.param_convert[param] = PARAM_CONVERT_F2B;
            break;
         default:
            assert(!"not reached");
            c->prog_data.param_convert[param] = PARAM_NO_CONVERT;
            break;
         }
         this->param_index[param] = loc;
         this->param_offset[param] = i;
      }
      /* A whole vector occupies a single uniform location. */
      return 1;

   case GLSL_TYPE_STRUCT:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset,
                                        type->fields.structure[i].type);
      }
      return offset;

   case GLSL_TYPE_ARRAY:
      for (unsigned int i = 0; i < type->length; i++) {
         offset += setup_uniform_values(loc + offset, type->fields.array);
      }
      return offset;

   case GLSL_TYPE_SAMPLER:
      /* The sampler takes up a slot, but we don't use any values from it. */
      return 1;

   default:
      assert(!"not reached");
      return 0;
   }
}
323
324
/* Our support for builtin uniforms is even scarier than non-builtin.
 * It sits on top of the PROG_STATE_VAR parameters that are
 * automatically updated from GL context state.
 */
void
fs_visitor::setup_builtin_uniform_values(ir_variable *ir)
{
   const ir_state_slot *const slots = ir->state_slots;
   assert(ir->state_slots != NULL);

   for (unsigned int i = 0; i < ir->num_state_slots; i++) {
      /* This state reference has already been setup by ir_to_mesa, but we'll
       * get the same index back here.
       */
      int index = _mesa_add_state_reference(this->fp->Base.Parameters,
                                            (gl_state_index *)slots[i].tokens);

      /* Add each of the unique swizzles of the element as a parameter.
       * This'll end up matching the expected layout of the
       * array/matrix/structure we're trying to fill in.
       */
      int last_swiz = -1;
      for (unsigned int j = 0; j < 4; j++) {
         int swiz = GET_SWZ(slots[i].swizzle, j);
         /* A repeated swizzle marks the end of the unique components. */
         if (swiz == last_swiz)
            break;
         last_swiz = swiz;

         c->prog_data.param_convert[c->prog_data.nr_params] =
            PARAM_NO_CONVERT;
         this->param_index[c->prog_data.nr_params] = index;
         this->param_offset[c->prog_data.nr_params] = swiz;
         c->prog_data.nr_params++;
      }
   }
}
361
/**
 * Emits the instructions that compute gl_FragCoord (x, y, z, w) into a
 * freshly allocated register, honoring the layout qualifiers
 * (origin_upper_left / pixel_center_integer) and render-to-FBO Y flipping.
 */
fs_reg *
fs_visitor::emit_fragcoord_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   fs_reg wpos = *reg;
   bool flip = !ir->origin_upper_left ^ c->key.render_to_fbo;

   /* gl_FragCoord.x */
   if (ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_x);
   } else {
      /* Half-pixel offset for the default pixel-center convention. */
      emit(BRW_OPCODE_ADD, wpos, this->pixel_x, fs_reg(0.5f));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.y */
   if (!flip && ir->pixel_center_integer) {
      emit(BRW_OPCODE_MOV, wpos, this->pixel_y);
   } else {
      fs_reg pixel_y = this->pixel_y;
      float offset = (ir->pixel_center_integer ? 0.0 : 0.5);

      if (flip) {
         /* Flip Y by negating and offsetting from the drawable height. */
         pixel_y.negate = true;
         offset += c->key.drawable_height - 1.0;
      }

      emit(BRW_OPCODE_ADD, wpos, pixel_y, fs_reg(offset));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.z */
   if (intel->gen >= 6) {
      /* Gen6+ delivers source depth directly in the thread payload. */
      emit(BRW_OPCODE_MOV, wpos,
           fs_reg(brw_vec8_grf(c->source_depth_reg, 0)));
   } else {
      emit(FS_OPCODE_LINTERP, wpos, this->delta_x, this->delta_y,
           interp_reg(FRAG_ATTRIB_WPOS, 2));
   }
   wpos.reg_offset++;

   /* gl_FragCoord.w: Already set up in emit_interpolation */
   emit(BRW_OPCODE_MOV, wpos, this->wpos_w);

   return reg;
}
408
/**
 * Emits interpolation instructions for an ordinary fragment shader input
 * variable (possibly an array or matrix), using flat (constant)
 * interpolation for gl_Color inputs under flat shading and linear
 * interpolation otherwise.  Returns the register holding the result.
 */
fs_reg *
fs_visitor::emit_general_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);
   /* Interpolation is always in floating point regs. */
   reg->type = BRW_REGISTER_TYPE_F;
   fs_reg attr = *reg;

   unsigned int array_elements;
   const glsl_type *type;

   if (ir->type->is_array()) {
      array_elements = ir->type->length;
      if (array_elements == 0) {
         fail("dereferenced array '%s' has length 0\n", ir->name);
      }
      type = ir->type->fields.array;
   } else {
      array_elements = 1;
      type = ir->type;
   }

   int location = ir->location;
   for (unsigned int i = 0; i < array_elements; i++) {
      for (unsigned int j = 0; j < type->matrix_columns; j++) {
         if (urb_setup[location] == -1) {
            /* If there's no incoming setup data for this slot, don't
             * emit interpolation for it.
             */
            attr.reg_offset += type->vector_elements;
            location++;
            continue;
         }

         bool is_gl_Color =
            location == FRAG_ATTRIB_COL0 || location == FRAG_ATTRIB_COL1;

         if (c->key.flat_shade && is_gl_Color) {
            /* Constant interpolation (flat shading) case. The SF has
             * handed us defined values in only the constant offset
             * field of the setup reg.
             */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               interp = suboffset(interp, 3);
               emit(FS_OPCODE_CINTERP, attr, fs_reg(interp));
               attr.reg_offset++;
            }
         } else {
            /* Perspective interpolation case. */
            for (unsigned int k = 0; k < type->vector_elements; k++) {
               struct brw_reg interp = interp_reg(location, k);
               emit(FS_OPCODE_LINTERP, attr,
                    this->delta_x, this->delta_y, fs_reg(interp));
               attr.reg_offset++;
            }

            if (intel->gen < 6) {
               /* Pre-gen6 the interpolated value must be multiplied by
                * pixel_w to undo the perspective division.
                */
               attr.reg_offset -= type->vector_elements;
               for (unsigned int k = 0; k < type->vector_elements; k++) {
                  emit(BRW_OPCODE_MUL, attr, attr, this->pixel_w);
                  attr.reg_offset++;
               }
            }
         }
         location++;
      }
   }

   return reg;
}
480
/**
 * Emits instructions computing gl_FrontFacing (1 for front-facing, 0 for
 * back-facing) from the facing bit in the thread payload.
 */
fs_reg *
fs_visitor::emit_frontfacing_interpolation(ir_variable *ir)
{
   fs_reg *reg = new(this->mem_ctx) fs_reg(this, ir->type);

   /* The frontfacing comes in as a bit in the thread payload. */
   if (intel->gen >= 6) {
      /* ASR by 15 replicates the sign-adjacent bit, then NOT/AND reduce
       * it to a 0/1 value.
       */
      emit(BRW_OPCODE_ASR, *reg,
           fs_reg(retype(brw_vec1_grf(0, 0), BRW_REGISTER_TYPE_D)),
           fs_reg(15));
      emit(BRW_OPCODE_NOT, *reg, *reg);
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1));
   } else {
      struct brw_reg r1_6ud = retype(brw_vec1_grf(1, 6), BRW_REGISTER_TYPE_UD);
      /* bit 31 is "primitive is back face", so checking < (1 << 31) gives
       * us front face
       */
      fs_inst *inst = emit(BRW_OPCODE_CMP, *reg,
                           fs_reg(r1_6ud),
                           fs_reg(1u << 31));
      inst->conditional_mod = BRW_CONDITIONAL_L;
      emit(BRW_OPCODE_AND, *reg, *reg, fs_reg(1u));
   }

   return reg;
}
507
/**
 * Emits a one-source math instruction (RCP, RSQ, SQRT, EXP2, LOG2, SIN,
 * COS), handling the per-generation restrictions on operands.
 */
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src)
{
   switch (opcode) {
   case FS_OPCODE_RCP:
   case FS_OPCODE_RSQ:
   case FS_OPCODE_SQRT:
   case FS_OPCODE_EXP2:
   case FS_OPCODE_LOG2:
   case FS_OPCODE_SIN:
   case FS_OPCODE_COS:
      break;
   default:
      assert(!"not reached: bad math opcode");
      return NULL;
   }

   /* Can't do hstride == 0 args to gen6 math, so expand it out. We
    * might be able to do better by doing execsize = 1 math and then
    * expanding that result out, but we would need to be careful with
    * masking.
    *
    * The hardware ignores source modifiers (negate and abs) on math
    * instructions, so we also move to a temp to set those up.
    */
   if (intel->gen >= 6 && (src.file == UNIFORM ||
                           src.abs ||
                           src.negate)) {
      fs_reg expanded = fs_reg(this, glsl_type::float_type);
      emit(BRW_OPCODE_MOV, expanded, src);
      src = expanded;
   }

   fs_inst *inst = emit(opcode, dst, src);

   if (intel->gen < 6) {
      /* Pre-gen6 math is a send message with its operand in the MRFs. */
      inst->base_mrf = 2;
      inst->mlen = c->dispatch_width / 8;
   }

   return inst;
}
550
/**
 * Emits a two-source math instruction (currently only POW), handling the
 * per-generation operand restrictions.
 */
fs_inst *
fs_visitor::emit_math(fs_opcodes opcode, fs_reg dst, fs_reg src0, fs_reg src1)
{
   int base_mrf = 2;
   fs_inst *inst;

   assert(opcode == FS_OPCODE_POW);

   if (intel->gen >= 6) {
      /* Can't do hstride == 0 args to gen6 math, so expand it out.
       *
       * The hardware ignores source modifiers (negate and abs) on math
       * instructions, so we also move to a temp to set those up.
       */
      if (src0.file == UNIFORM || src0.abs || src0.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(BRW_OPCODE_MOV, expanded, src0);
         src0 = expanded;
      }

      if (src1.file == UNIFORM || src1.abs || src1.negate) {
         fs_reg expanded = fs_reg(this, glsl_type::float_type);
         emit(BRW_OPCODE_MOV, expanded, src1);
         src1 = expanded;
      }

      inst = emit(opcode, dst, src0, src1);
   } else {
      /* Pre-gen6: the second operand is passed through the MRFs. */
      emit(BRW_OPCODE_MOV, fs_reg(MRF, base_mrf + 1), src1);
      inst = emit(opcode, dst, src0, reg_null_f);

      inst->base_mrf = base_mrf;
      inst->mlen = 2 * c->dispatch_width / 8;
   }
   return inst;
}
587
/**
 * To be called after the last _mesa_add_state_reference() call, to
 * set up prog_data.param[] for assign_curb_setup() and
 * setup_pull_constants().
 *
 * Only done for the 8-wide compile; the 16-wide compile shares the
 * 8-wide uniform layout (see import_uniforms()).
 */
void
fs_visitor::setup_paramvalues_refs()
{
   if (c->dispatch_width != 8)
      return;

   /* Set up the pointers to ParamValues now that that array is finalized. */
   for (unsigned int i = 0; i < c->prog_data.nr_params; i++) {
      c->prog_data.param[i] =
         fp->Base.Parameters->ParameterValues[this->param_index[i]] +
         this->param_offset[i];
   }
}
606
/**
 * Assigns push-constant (CURB) locations: computes the CURB read length
 * and rewrites every UNIFORM-file source operand to the fixed hardware
 * register it will arrive in after the payload registers.
 */
void
fs_visitor::assign_curb_setup()
{
   /* Eight constants fit per register. */
   c->prog_data.curb_read_length = ALIGN(c->prog_data.nr_params, 8) / 8;
   if (c->dispatch_width == 8) {
      c->prog_data.first_curbe_grf = c->nr_payload_regs;
   } else {
      c->prog_data.first_curbe_grf_16 = c->nr_payload_regs;
   }

   /* Map the offsets in the UNIFORM file to fixed HW regs. */
   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      for (unsigned int i = 0; i < 3; i++) {
         if (inst->src[i].file == UNIFORM) {
            int constant_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
            struct brw_reg brw_reg = brw_vec1_grf(c->nr_payload_regs +
                                                  constant_nr / 8,
                                                  constant_nr % 8);

            inst->src[i].file = FIXED_HW_REG;
            inst->src[i].fixed_hw_reg = retype(brw_reg, inst->src[i].type);
         }
      }
   }
}
634
/**
 * Decides which URB slot each incoming fragment attribute lands in,
 * filling urb_setup[] (-1 for unused attributes) and computing the URB
 * read length.
 */
void
fs_visitor::calculate_urb_setup()
{
   for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
      urb_setup[i] = -1;
   }

   int urb_next = 0;
   /* Figure out where each of the incoming setup attributes lands. */
   if (intel->gen >= 6) {
      /* Gen6+: slots follow the fragment program's InputsRead bits. */
      for (unsigned int i = 0; i < FRAG_ATTRIB_MAX; i++) {
         if (brw->fragment_program->Base.InputsRead & BITFIELD64_BIT(i)) {
            urb_setup[i] = urb_next++;
         }
      }
   } else {
      /* FINISHME: The sf doesn't map VS->FS inputs for us very well. */
      for (unsigned int i = 0; i < VERT_RESULT_MAX; i++) {
         if (c->key.vp_outputs_written & BITFIELD64_BIT(i)) {
            int fp_index;

            /* Translate vertex result indices to fragment attribute
             * indices; outputs with no FS equivalent get -1.
             */
            if (i >= VERT_RESULT_VAR0)
               fp_index = i - (VERT_RESULT_VAR0 - FRAG_ATTRIB_VAR0);
            else if (i <= VERT_RESULT_TEX7)
               fp_index = i;
            else
               fp_index = -1;

            if (fp_index >= 0)
               urb_setup[fp_index] = urb_next++;
         }
      }
   }

   /* Each attribute is 4 setup channels, each of which is half a reg. */
   c->prog_data.urb_read_length = urb_next * 2;
}
672
/**
 * Rebases the interpolation setup register references now that the
 * payload and CURB sizes are known, and records the first GRF available
 * for register allocation.
 */
void
fs_visitor::assign_urb_setup()
{
   int urb_start = c->nr_payload_regs + c->prog_data.curb_read_length;

   /* Offset all the urb_setup[] index by the actual position of the
    * setup regs, now that the location of the constants has been chosen.
    */
   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      if (inst->opcode == FS_OPCODE_LINTERP) {
         assert(inst->src[2].file == FIXED_HW_REG);
         inst->src[2].fixed_hw_reg.nr += urb_start;
      }

      if (inst->opcode == FS_OPCODE_CINTERP) {
         assert(inst->src[0].file == FIXED_HW_REG);
         inst->src[0].fixed_hw_reg.nr += urb_start;
      }
   }

   this->first_non_payload_grf = urb_start + c->prog_data.urb_read_length;
}
697
/**
 * Split large virtual GRFs into separate components if we can.
 *
 * This is mostly duplicated with what brw_fs_vector_splitting does,
 * but that's really conservative because it's afraid of doing
 * splitting that doesn't result in real progress after the rest of
 * the optimization phases, which would cause infinite looping in
 * optimization. We can do it once here, safely. This also has the
 * opportunity to split interpolated values, or maybe even uniforms,
 * which we don't have at the IR level.
 *
 * We want to split, because virtual GRFs are what we register
 * allocate and spill (due to contiguousness requirements for some
 * instructions), and they're what we naturally generate in the
 * codegen process, but most virtual GRFs don't actually need to be
 * contiguous sets of GRFs. If we split, we'll end up with reduced
 * live intervals and better dead code elimination and coalescing.
 */
void
fs_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_next;
   /* NOTE(review): VLAs here rely on a GCC extension in C++; sized by the
    * current virtual GRF count.
    */
   bool split_grf[num_vars];
   int new_virtual_grf[num_vars];

   /* Try to split anything > 0 sized. */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] != 1)
         split_grf[i] = true;
      else
         split_grf[i] = false;
   }

   if (brw->has_pln) {
      /* PLN opcodes rely on the delta_xy being contiguous. */
      split_grf[this->delta_x.reg] = false;
   }

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      /* Texturing produces 4 contiguous registers, so no splitting. */
      if (inst->is_tex()) {
         split_grf[inst->dst.reg] = false;
      }
   }

   /* Allocate new space for split regs. Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (split_grf[i]) {
         new_virtual_grf[i] = virtual_grf_alloc(1);
         for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
            int reg = virtual_grf_alloc(1);
            assert(reg == new_virtual_grf[i] + j - 1);
            (void) reg;
         }
         this->virtual_grf_sizes[i] = 1;
      }
   }

   /* Rewrite all GRF references at nonzero offsets to point at the new
    * single-register virtual GRFs.
    */
   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      if (inst->dst.file == GRF &&
          split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
782
/**
 * Choose accesses from the UNIFORM file to demote to using the pull
 * constant buffer.
 *
 * We allow a fragment shader to have more than the specified minimum
 * maximum number of fragment shader uniform components (64). If
 * there are too many of these, they'd fill up all of register space.
 * So, this will push some of them out to the pull constant buffer and
 * update the program to load them.
 */
void
fs_visitor::setup_pull_constants()
{
   /* Only allow 16 registers (128 uniform components) as push constants. */
   unsigned int max_uniform_components = 16 * 8;
   if (c->prog_data.nr_params <= max_uniform_components)
      return;

   if (c->dispatch_width == 16) {
      fail("Pull constants not supported in 16-wide\n");
      return;
   }

   /* Just demote the end of the list. We could probably do better
    * here, demoting things that are rarely used in the program first.
    */
   int pull_uniform_base = max_uniform_components;
   int pull_uniform_count = c->prog_data.nr_params - pull_uniform_base;

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         int uniform_nr = inst->src[i].hw_reg + inst->src[i].reg_offset;
         if (uniform_nr < pull_uniform_base)
            continue;

         /* Insert a load of the demoted constant right before its use,
          * then rewrite the operand to read the loaded register.
          */
         fs_reg dst = fs_reg(this, glsl_type::float_type);
         fs_inst *pull = new(mem_ctx) fs_inst(FS_OPCODE_PULL_CONSTANT_LOAD,
                                              dst);
         /* Loads are 16-byte aligned; smear below selects the component. */
         pull->offset = ((uniform_nr - pull_uniform_base) * 4) & ~15;
         pull->ir = inst->ir;
         pull->annotation = inst->annotation;
         pull->base_mrf = 14;
         pull->mlen = 1;

         inst->insert_before(pull);

         inst->src[i].file = GRF;
         inst->src[i].reg = dst.reg;
         inst->src[i].reg_offset = 0;
         inst->src[i].smear = (uniform_nr - pull_uniform_base) & 3;
      }
   }

   /* Move the demoted parameters over to the pull-param arrays. */
   for (int i = 0; i < pull_uniform_count; i++) {
      c->prog_data.pull_param[i] = c->prog_data.param[pull_uniform_base + i];
      c->prog_data.pull_param_convert[i] =
         c->prog_data.param_convert[pull_uniform_base + i];
   }
   c->prog_data.nr_params -= pull_uniform_count;
   c->prog_data.nr_pull_params = pull_uniform_count;
}
849
850 void
851 fs_visitor::calculate_live_intervals()
852 {
853 int num_vars = this->virtual_grf_next;
854 int *def = ralloc_array(mem_ctx, int, num_vars);
855 int *use = ralloc_array(mem_ctx, int, num_vars);
856 int loop_depth = 0;
857 int loop_start = 0;
858
859 if (this->live_intervals_valid)
860 return;
861
862 for (int i = 0; i < num_vars; i++) {
863 def[i] = MAX_INSTRUCTION;
864 use[i] = -1;
865 }
866
867 int ip = 0;
868 foreach_iter(exec_list_iterator, iter, this->instructions) {
869 fs_inst *inst = (fs_inst *)iter.get();
870
871 if (inst->opcode == BRW_OPCODE_DO) {
872 if (loop_depth++ == 0)
873 loop_start = ip;
874 } else if (inst->opcode == BRW_OPCODE_WHILE) {
875 loop_depth--;
876
877 if (loop_depth == 0) {
878 /* Patches up the use of vars marked for being live across
879 * the whole loop.
880 */
881 for (int i = 0; i < num_vars; i++) {
882 if (use[i] == loop_start) {
883 use[i] = ip;
884 }
885 }
886 }
887 } else {
888 for (unsigned int i = 0; i < 3; i++) {
889 if (inst->src[i].file == GRF && inst->src[i].reg != 0) {
890 int reg = inst->src[i].reg;
891
892 if (!loop_depth) {
893 use[reg] = ip;
894 } else {
895 def[reg] = MIN2(loop_start, def[reg]);
896 use[reg] = loop_start;
897
898 /* Nobody else is going to go smash our start to
899 * later in the loop now, because def[reg] now
900 * points before the bb header.
901 */
902 }
903 }
904 }
905 if (inst->dst.file == GRF && inst->dst.reg != 0) {
906 int reg = inst->dst.reg;
907
908 if (!loop_depth) {
909 def[reg] = MIN2(def[reg], ip);
910 } else {
911 def[reg] = MIN2(def[reg], loop_start);
912 }
913 }
914 }
915
916 ip++;
917 }
918
919 ralloc_free(this->virtual_grf_def);
920 ralloc_free(this->virtual_grf_use);
921 this->virtual_grf_def = def;
922 this->virtual_grf_use = use;
923
924 this->live_intervals_valid = true;
925 }
926
/**
 * Attempts to move immediate constants into the immediate
 * constant slot of following instructions.
 *
 * Immediate constants are a bit tricky -- they have to be in the last
 * operand slot, you can't do abs/negate on them,
 *
 * Returns true if any instruction was rewritten (which invalidates the
 * cached live intervals).
 */

bool
fs_visitor::propagate_constants()
{
   bool progress = false;

   calculate_live_intervals();

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      /* Only unconditional immediate-to-GRF MOVs of matching type are
       * candidates; in 16-wide, half-only MOVs are skipped too.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicated ||
          inst->dst.file != GRF || inst->src[0].file != IMM ||
          inst->dst.type != inst->src[0].type ||
          (c->dispatch_width == 16 &&
           (inst->force_uncompressed || inst->force_sechalf)))
         continue;

      /* Don't bother with cases where we should have had the
       * operation on the constant folded in GLSL already.
       */
      if (inst->saturate)
         continue;

      /* Found a move of a constant to a GRF. Find anything else using the GRF
       * before it's written, and replace it with the constant if we can.
       */
      exec_list_iterator scan_iter = iter;
      scan_iter.next();
      for (; scan_iter.has_next(); scan_iter.next()) {
         fs_inst *scan_inst = (fs_inst *)scan_iter.get();

         /* Stop at control flow; we don't dominate past it. */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         for (int i = 2; i >= 0; i--) {
            if (scan_inst->src[i].file != GRF ||
                scan_inst->src[i].reg != inst->dst.reg ||
                scan_inst->src[i].reg_offset != inst->dst.reg_offset)
               continue;

            /* Don't bother with cases where we should have had the
             * operation on the constant folded in GLSL already.
             */
            if (scan_inst->src[i].negate || scan_inst->src[i].abs)
               continue;

            /* NOTE(review): opcodes not listed below are intentionally
             * left untouched (no default case); immediates must sit in
             * the last operand slot.
             */
            switch (scan_inst->opcode) {
            case BRW_OPCODE_MOV:
               scan_inst->src[i] = inst->src[0];
               progress = true;
               break;

            case BRW_OPCODE_MUL:
            case BRW_OPCODE_ADD:
               if (i == 1) {
                  scan_inst->src[i] = inst->src[0];
                  progress = true;
               } else if (i == 0 && scan_inst->src[1].file != IMM) {
                  /* Fit this constant in by commuting the operands */
                  scan_inst->src[0] = scan_inst->src[1];
                  scan_inst->src[1] = inst->src[0];
                  progress = true;
               }
               break;

            case BRW_OPCODE_CMP:
               if (i == 1) {
                  scan_inst->src[i] = inst->src[0];
                  progress = true;
               } else if (i == 0 && scan_inst->src[1].file != IMM) {
                  uint32_t new_cmod;

                  new_cmod = brw_swap_cmod(scan_inst->conditional_mod);
                  if (new_cmod != ~0u) {
                     /* Fit this constant in by swapping the operands and
                      * flipping the test
                      */
                     scan_inst->src[0] = scan_inst->src[1];
                     scan_inst->src[1] = inst->src[0];
                     scan_inst->conditional_mod = new_cmod;
                     progress = true;
                  }
               }
               break;

            case BRW_OPCODE_SEL:
               if (i == 1) {
                  scan_inst->src[i] = inst->src[0];
                  progress = true;
               } else if (i == 0 && scan_inst->src[1].file != IMM) {
                  /* Fit this constant in by swapping the operands and
                   * flipping the predicate
                   */
                  scan_inst->src[0] = scan_inst->src[1];
                  scan_inst->src[1] = inst->src[0];
                  scan_inst->predicate_inverse = !scan_inst->predicate_inverse;
                  progress = true;
               }
               break;
            }
         }

         /* Stop when the destination is overwritten (a texture write
          * clobbers the whole 4-register range).
          */
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->dst.reg &&
             (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
              scan_inst->is_tex())) {
            break;
         }
      }
   }

   if (progress)
      this->live_intervals_valid = false;

   return progress;
}
1056 /**
1057 * Must be called after calculate_live_intervales() to remove unused
1058 * writes to registers -- register allocation will fail otherwise
1059 * because something deffed but not used won't be considered to
1060 * interfere with other regs.
1061 */
1062 bool
1063 fs_visitor::dead_code_eliminate()
1064 {
1065 bool progress = false;
1066 int pc = 0;
1067
1068 calculate_live_intervals();
1069
1070 foreach_iter(exec_list_iterator, iter, this->instructions) {
1071 fs_inst *inst = (fs_inst *)iter.get();
1072
1073 if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
1074 inst->remove();
1075 progress = true;
1076 }
1077
1078 pc++;
1079 }
1080
1081 if (progress)
1082 live_intervals_valid = false;
1083
1084 return progress;
1085 }
1086
/**
 * Eliminates GRF-to-GRF MOVs by rewriting later readers of the MOV's
 * destination to read its source instead, when neither register is
 * rewritten before the end of the program.  Only runs at the top level
 * (outside loops and if blocks) where the MOV dominates the rest of the
 * instruction stream.  Returns true on any change.
 */
bool
fs_visitor::register_coalesce()
{
   bool progress = false;
   int if_depth = 0;
   int loop_depth = 0;

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      /* Make sure that we dominate the instructions we're going to
       * scan for interfering with our coalescing, or we won't have
       * scanned enough to see if anything interferes with our
       * coalescing. We don't dominate the following instructions if
       * we're in a loop or an if block.
       */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
         loop_depth++;
         break;
      case BRW_OPCODE_WHILE:
         loop_depth--;
         break;
      case BRW_OPCODE_IF:
         if_depth++;
         break;
      case BRW_OPCODE_ENDIF:
         if_depth--;
         break;
      }
      if (loop_depth || if_depth)
         continue;

      /* Candidates are plain, unconditional, unsaturated GRF-to-GRF MOVs
       * with matching types.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicated ||
          inst->saturate ||
          inst->dst.file != GRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type)
         continue;

      bool has_source_modifiers = inst->src[0].abs || inst->src[0].negate;

      /* Found a move of a GRF to a GRF. Let's see if we can coalesce
       * them: check for no writes to either one until the exit of the
       * program.
       */
      bool interfered = false;
      exec_list_iterator scan_iter = iter;
      scan_iter.next();
      for (; scan_iter.has_next(); scan_iter.next()) {
         fs_inst *scan_inst = (fs_inst *)scan_iter.get();

         if (scan_inst->dst.file == GRF) {
            /* A texture write clobbers its whole 4-register range. */
            if (scan_inst->dst.reg == inst->dst.reg &&
                (scan_inst->dst.reg_offset == inst->dst.reg_offset ||
                 scan_inst->is_tex())) {
               interfered = true;
               break;
            }
            if (scan_inst->dst.reg == inst->src[0].reg &&
                (scan_inst->dst.reg_offset == inst->src[0].reg_offset ||
                 scan_inst->is_tex())) {
               interfered = true;
               break;
            }
         }

         /* The gen6 MATH instruction can't handle source modifiers, so avoid
          * coalescing those for now. We should do something more specific.
          */
         if (intel->gen >= 6 && scan_inst->is_math() && has_source_modifiers) {
            interfered = true;
            break;
         }
      }
      if (interfered) {
         continue;
      }

      /* Rewrite the later usage to point at the source of the move to
       * be removed.
       */
      for (exec_list_iterator scan_iter = iter; scan_iter.has_next();
           scan_iter.next()) {
         fs_inst *scan_inst = (fs_inst *)scan_iter.get();

         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->dst.reg &&
                scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
               scan_inst->src[i].reg = inst->src[0].reg;
               scan_inst->src[i].reg_offset = inst->src[0].reg_offset;
               scan_inst->src[i].abs |= inst->src[0].abs;
               scan_inst->src[i].negate ^= inst->src[0].negate;
               scan_inst->src[i].smear = inst->src[0].smear;
            }
         }
      }

      inst->remove();
      progress = true;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
1195
1196
/**
 * Tries to remove MOVs from a GRF into an MRF by retargeting the
 * instruction that produced the GRF value to write directly into the
 * MRF.  Returns true if any MOV was eliminated.
 */
bool
fs_visitor::compute_to_mrf()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      int ip = next_ip;
      next_ip++;

      /* Only raw, same-type GRF-to-MRF MOVs without source modifiers or
       * smear are candidates.
       */
      if (inst->opcode != BRW_OPCODE_MOV ||
          inst->predicated ||
          inst->dst.file != MRF || inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].smear != -1)
         continue;

      /* Work out which hardware MRF registers are written by this
       * instruction.
       */
      int mrf_low = inst->dst.hw_reg & ~BRW_MRF_COMPR4;
      int mrf_high;
      if (inst->dst.hw_reg & BRW_MRF_COMPR4) {
         /* COMPR4 addressing: the second half lands 4 registers up. */
         mrf_high = mrf_low + 4;
      } else if (c->dispatch_width == 16 &&
                 (!inst->force_uncompressed && !inst->force_sechalf)) {
         /* A compressed 16-wide write covers two consecutive MRFs. */
         mrf_high = mrf_low + 1;
      } else {
         mrf_high = mrf_low;
      }

      /* Can't compute-to-MRF this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
         continue;

      /* Found a move of a GRF to a MRF.  Let's see if we can go
       * rewrite the thing that made this GRF to write into the MRF.
       */
      fs_inst *scan_inst;
      /* Walk backwards from the MOV; scan_inst->prev == NULL marks the
       * list head sentinel, where the scan stops.
       */
      for (scan_inst = (fs_inst *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (fs_inst *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg) {
            /* Found the last thing to write our reg we want to turn
             * into a compute-to-MRF.
             */

            if (scan_inst->is_tex()) {
               /* texturing writes several continuous regs, so we can't
                * compute-to-mrf that.
                */
               break;
            }

            /* If it's predicated, it (probably) didn't populate all
             * the channels.  We might be able to rewrite everything
             * that writes that reg, but it would require smarter
             * tracking to delay the rewriting until complete success.
             */
            if (scan_inst->predicated)
               break;

            /* If it's half of register setup and not the same half as
             * our MOV we're trying to remove, bail for now.
             */
            if (scan_inst->force_uncompressed != inst->force_uncompressed ||
                scan_inst->force_sechalf != inst->force_sechalf) {
               break;
            }

            /* SEND instructions can't have MRF as a destination. */
            if (scan_inst->mlen)
               break;

            if (intel->gen >= 6) {
               /* gen6 math instructions must have the destination be
                * GRF, so no compute-to-MRF for them.
                */
               if (scan_inst->is_math()) {
                  break;
               }
            }

            if (scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               /* Found the creator of our MRF's source value.  Retarget
                * it to write the MRF directly and drop the MOV.
                */
               scan_inst->dst.file = MRF;
               scan_inst->dst.hw_reg = inst->dst.hw_reg;
               scan_inst->saturate |= inst->saturate;
               inst->remove();
               progress = true;
            }
            break;
         }

         /* We don't handle flow control here.  Most computation of
          * values that end up in MRFs are shortly before the MRF
          * write anyway.
          */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         /* You can't read from an MRF, so if someone else reads our
          * MRF's source GRF that we wanted to rewrite, that stops us.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         if (scan_inst->dst.file == MRF) {
            /* If somebody else writes our MRF here, we can't
             * compute-to-MRF before that.
             */
            int scan_mrf_low = scan_inst->dst.hw_reg & ~BRW_MRF_COMPR4;
            int scan_mrf_high;

            /* Same MRF-range computation as for our MOV above. */
            if (scan_inst->dst.hw_reg & BRW_MRF_COMPR4) {
               scan_mrf_high = scan_mrf_low + 4;
            } else if (c->dispatch_width == 16 &&
                       (!scan_inst->force_uncompressed &&
                        !scan_inst->force_sechalf)) {
               scan_mrf_high = scan_mrf_low + 1;
            } else {
               scan_mrf_high = scan_mrf_low;
            }

            if (mrf_low == scan_mrf_low ||
                mrf_low == scan_mrf_high ||
                mrf_high == scan_mrf_low ||
                mrf_high == scan_mrf_high) {
               break;
            }
         }

         if (scan_inst->mlen > 0) {
            /* Found a SEND instruction, which means that there are
             * live values in MRFs from base_mrf to base_mrf +
             * scan_inst->mlen - 1.  Don't go pushing our MRF write up
             * above it.
             */
            if (mrf_low >= scan_inst->base_mrf &&
                mrf_low < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
            if (mrf_high >= scan_inst->base_mrf &&
                mrf_high < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         }
      }
   }

   return progress;
}
1368
/**
 * Walks through basic blocks, looking for repeated MRF writes and
 * removing the later ones.
 */
bool
fs_visitor::remove_duplicate_mrf_writes()
{
   /* Tracks, per hardware MRF number, the last GRF-to-MRF MOV seen in
    * the current basic block (NULL when unknown/invalidated).
    */
   fs_inst *last_mrf_move[16];
   bool progress = false;

   /* Need to update the MRF tracking for compressed instructions. */
   if (c->dispatch_width == 16)
      return false;

   memset(last_mrf_move, 0, sizeof(last_mrf_move));

   foreach_iter(exec_list_iterator, iter, this->instructions) {
      fs_inst *inst = (fs_inst *)iter.get();

      /* Control flow ends the basic block: forget all tracked moves. */
      switch (inst->opcode) {
      case BRW_OPCODE_DO:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_IF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_ENDIF:
         memset(last_mrf_move, 0, sizeof(last_mrf_move));
         continue;
      default:
         break;
      }

      /* If this MOV writes the same value the MRF already holds, drop it. */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF) {
         fs_inst *prev_inst = last_mrf_move[inst->dst.hw_reg];
         if (prev_inst && inst->equals(prev_inst)) {
            inst->remove();
            progress = true;
            continue;
         }
      }

      /* Clear out the last-write records for MRFs that were overwritten. */
      if (inst->dst.file == MRF) {
         last_mrf_move[inst->dst.hw_reg] = NULL;
      }

      if (inst->mlen > 0) {
         /* Found a SEND instruction, which will include two or fewer
          * implied MRF writes.  We could do better here.
          */
         for (int i = 0; i < implied_mrf_writes(inst); i++) {
            last_mrf_move[inst->base_mrf + i] = NULL;
         }
      }

      /* Clear out any MRF move records whose sources got overwritten. */
      if (inst->dst.file == GRF) {
         for (unsigned int i = 0; i < Elements(last_mrf_move); i++) {
            if (last_mrf_move[i] &&
                last_mrf_move[i]->src[0].reg == inst->dst.reg) {
               last_mrf_move[i] = NULL;
            }
         }
      }

      /* Record this unpredicated GRF-to-MRF move as the MRF's current
       * contents.  (Done after the invalidations above so a move never
       * records itself as stale.)
       */
      if (inst->opcode == BRW_OPCODE_MOV &&
          inst->dst.file == MRF &&
          inst->src[0].file == GRF &&
          !inst->predicated) {
         last_mrf_move[inst->dst.hw_reg] = inst;
      }
   }

   return progress;
}
1444
1445 bool
1446 fs_visitor::virtual_grf_interferes(int a, int b)
1447 {
1448 int start = MAX2(this->virtual_grf_def[a], this->virtual_grf_def[b]);
1449 int end = MIN2(this->virtual_grf_use[a], this->virtual_grf_use[b]);
1450
1451 /* We can't handle dead register writes here, without iterating
1452 * over the whole instruction stream to find every single dead
1453 * write to that register to compare to the live interval of the
1454 * other register. Just assert that dead_code_eliminate() has been
1455 * called.
1456 */
1457 assert((this->virtual_grf_use[a] != -1 ||
1458 this->virtual_grf_def[a] == MAX_INSTRUCTION) &&
1459 (this->virtual_grf_use[b] != -1 ||
1460 this->virtual_grf_def[b] == MAX_INSTRUCTION));
1461
1462 /* If the register is used to store 16 values of less than float
1463 * size (only the case for pixel_[xy]), then we can't allocate
1464 * another dword-sized thing to that register that would be used in
1465 * the same instruction. This is because when the GPU decodes (for
1466 * example):
1467 *
1468 * (declare (in ) vec4 gl_FragCoord@0x97766a0)
1469 * add(16) g6<1>F g6<8,8,1>UW 0.5F { align1 compr };
1470 *
1471 * it's actually processed as:
1472 * add(8) g6<1>F g6<8,8,1>UW 0.5F { align1 };
1473 * add(8) g7<1>F g6.8<8,8,1>UW 0.5F { align1 sechalf };
1474 *
1475 * so our second half values in g6 got overwritten in the first
1476 * half.
1477 */
1478 if (c->dispatch_width == 16 && (this->pixel_x.reg == a ||
1479 this->pixel_x.reg == b ||
1480 this->pixel_y.reg == a ||
1481 this->pixel_y.reg == b)) {
1482 return start <= end;
1483 }
1484
1485 return start < end;
1486 }
1487
/**
 * Drives the full compile for this visitor's dispatch width: emits the
 * FS IR from the GLSL IR, runs the optimization passes, allocates
 * registers, and generates native code.  Returns false on failure.
 */
bool
fs_visitor::run()
{
   uint32_t prog_offset_16 = 0;
   uint32_t orig_nr_params = c->prog_data.nr_params;

   brw_wm_payload_setup(brw, c);

   if (c->dispatch_width == 16) {
      /* align to 64 byte boundary. */
      while ((c->func.nr_insn * sizeof(struct brw_instruction)) % 64) {
         brw_NOP(p);
      }

      /* Save off the start of this 16-wide program in case we succeed. */
      prog_offset_16 = c->func.nr_insn * sizeof(struct brw_instruction);

      brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
   }

   if (0) {
      /* Debug path: emit a trivial shader instead of the real one. */
      emit_dummy_fs();
   } else {
      calculate_urb_setup();
      if (intel->gen < 6)
         emit_interpolation_setup_gen4();
      else
         emit_interpolation_setup_gen6();

      /* Generate FS IR for main().  (the visitor only descends into
       * functions called "main").
       */
      foreach_iter(exec_list_iterator, iter, *shader->ir) {
         ir_instruction *ir = (ir_instruction *)iter.get();
         base_ir = ir;
         this->result = reg_undef;
         ir->accept(this);
      }

      emit_fb_writes();

      split_virtual_grfs();

      setup_paramvalues_refs();
      setup_pull_constants();

      /* Run the optimization passes to a fixed point: each pass can
       * expose opportunities for the others.
       */
      bool progress;
      do {
         progress = false;

         progress = remove_duplicate_mrf_writes() || progress;

         progress = propagate_constants() || progress;
         progress = register_coalesce() || progress;
         progress = compute_to_mrf() || progress;
         progress = dead_code_eliminate() || progress;
      } while (progress);

      schedule_instructions();

      assign_curb_setup();
      assign_urb_setup();

      if (0) {
         /* Debug of register spilling: Go spill everything. */
         int virtual_grf_count = virtual_grf_next;
         for (int i = 1; i < virtual_grf_count; i++) {
            spill_reg(i);
         }
      }

      if (0)
         assign_regs_trivial();
      else {
         /* Retry allocation (spilling as needed) until it succeeds or
          * the compile is marked failed.
          */
         while (!assign_regs()) {
            if (failed)
               break;
         }
      }
   }
   assert(force_uncompressed_stack == 0);
   assert(force_sechalf_stack == 0);

   if (failed)
      return false;

   generate_code();

   if (c->dispatch_width == 8) {
      c->prog_data.reg_blocks = brw_register_blocks(grf_used);
   } else {
      c->prog_data.reg_blocks_16 = brw_register_blocks(grf_used);
      c->prog_data.prog_offset_16 = prog_offset_16;

      /* Make sure we didn't try to sneak in an extra uniform */
      assert(orig_nr_params == c->prog_data.nr_params);
   }

   return !failed;
}
1588
1589 bool
1590 brw_wm_fs_emit(struct brw_context *brw, struct brw_wm_compile *c)
1591 {
1592 struct intel_context *intel = &brw->intel;
1593 struct gl_context *ctx = &intel->ctx;
1594 struct gl_shader_program *prog = ctx->Shader.CurrentFragmentProgram;
1595
1596 if (!prog)
1597 return false;
1598
1599 struct brw_shader *shader =
1600 (brw_shader *) prog->_LinkedShaders[MESA_SHADER_FRAGMENT];
1601 if (!shader)
1602 return false;
1603
1604 if (unlikely(INTEL_DEBUG & DEBUG_WM)) {
1605 printf("GLSL IR for native fragment shader %d:\n", prog->Name);
1606 _mesa_print_ir(shader->ir, NULL);
1607 printf("\n\n");
1608 }
1609
1610 /* Now the main event: Visit the shader IR and generate our FS IR for it.
1611 */
1612 c->dispatch_width = 8;
1613
1614 fs_visitor v(c, shader);
1615 if (!v.run()) {
1616 /* FINISHME: Cleanly fail, test at link time, etc. */
1617 assert(!"not reached");
1618 return false;
1619 }
1620
1621 if (intel->gen >= 5 && c->prog_data.nr_pull_params == 0) {
1622 c->dispatch_width = 16;
1623 fs_visitor v2(c, shader);
1624 v2.import_uniforms(v.variable_ht);
1625 v2.run();
1626 }
1627
1628 c->prog_data.dispatch_width = 8;
1629
1630 return true;
1631 }