/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "glsl/ir_print_visitor.h"

extern "C" {
#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}

#define MAX_INSTRUCTION (1 << 30)

using namespace brw;

namespace brw {

/**
 * Common helper for constructing swizzles. When only a subset of
 * channels of a vec4 are used, we don't want to reference the other
 * channels, as that will tell optimization passes that those other
 * channels are used.
 */
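/* For example, swizzle_for_size(2) returns BRW_SWIZZLE4(X, Y, Y, Y): the
 * last used channel is repeated into the unused slots, so no read of Z or W
 * is ever implied.
 */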
unsigned
swizzle_for_size(int size)
{
   static const unsigned size_swizzles[4] = {
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
   };

   assert((size >= 1) && (size <= 4));
   return size_swizzles[size - 1];
}

void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}

src_reg::src_reg(register_file file, int reg, const glsl_type *type)
{
   init();

   this->file = file;
   this->reg = reg;
   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
      this->swizzle = swizzle_for_size(type->vector_elements);
   else
      this->swizzle = SWIZZLE_XYZW;
}

/** Generic unset register constructor. */
src_reg::src_reg()
{
   init();
}

src_reg::src_reg(float f)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

src_reg::src_reg(uint32_t u)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

src_reg::src_reg(int32_t i)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

src_reg::src_reg(dst_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;

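   /* Convert the writemask to an equivalent swizzle: each enabled channel
    * appears in order, and the last enabled channel fills the remaining
    * slots (e.g. writemask .xz becomes swizzle .xzzz).
    */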
   int swizzles[4];
   int next_chan = 0;
   int last = 0;

   for (int i = 0; i < 4; i++) {
      if (!(reg.writemask & (1 << i)))
         continue;

      swizzles[next_chan++] = last = i;
   }

   for (; next_chan < 4; next_chan++) {
      swizzles[next_chan] = last;
   }

   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                                swizzles[2], swizzles[3]);
}

bool
vec4_instruction::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXF_MS ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}

dst_reg::dst_reg()
{
   init();
}

dst_reg::dst_reg(register_file file, int reg)
{
   init();

   this->file = file;
   this->reg = reg;
}

dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
                 int writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}

dst_reg::dst_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
}

dst_reg::dst_reg(src_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->writemask = WRITEMASK_XYZW;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
}

bool
vec4_instruction::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

bool
vec4_instruction::is_send_from_grf()
{
   return opcode == SHADER_OPCODE_SHADER_TIME_ADD;
}

bool
vec4_visitor::can_do_source_mods(vec4_instruction *inst)
{
   if (intel->gen == 6 && inst->is_math())
      return false;

   if (inst->is_send_from_grf())
      return false;

   return true;
}

/**
 * Returns how many MRFs an opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the generate_* functions generate additional MOVs
 * for setup.
 */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case VS_OPCODE_SCRATCH_READ:
      return 2;
   case VS_OPCODE_SCRATCH_WRITE:
      return 3;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

bool
src_reg::equals(src_reg *r)
{
   return (file == r->file &&
           reg == r->reg &&
           reg_offset == r->reg_offset &&
           type == r->type &&
           negate == r->negate &&
           abs == r->abs &&
           swizzle == r->swizzle &&
           !reladdr && !r->reladdr &&
           memcmp(&fixed_hw_reg, &r->fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           imm.u == r->imm.u);
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
vec4_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

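      /* A GRF write is dead if the register's last read
       * (virtual_grf_use) is at or before this instruction.
       */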
      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

void
vec4_visitor::split_uniform_registers()
{
   /* Prior to this, uniforms have been in an array sized according to
    * the number of vector uniforms present, sparsely filled (so an
    * aggregate results in reg indices being skipped over). Now we're
    * going to cut those aggregates up so each .reg index is one
    * vector. The goal is to make elimination of unused uniform
    * components easier later.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         assert(!inst->src[i].reladdr);

         inst->src[i].reg += inst->src[i].reg_offset;
         inst->src[i].reg_offset = 0;
      }
   }

   /* Update that everything is now vector-sized. */
   for (int i = 0; i < this->uniforms; i++) {
      this->uniform_size[i] = 1;
   }
}

void
vec4_visitor::pack_uniform_registers()
{
   bool uniform_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(uniform_used, 0, sizeof(uniform_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program. We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like Wine.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         uniform_used[inst->src[i].reg] = true;
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      int size = this->uniform_vector_size[src];

      if (!uniform_used[src]) {
         this->uniform_vector_size[src] = 0;
         continue;
      }

      int dst;
      /* Find the lowest place we can slot this uniform in. */
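      /* First fit: a vec2, for example, can share a slot with another
       * vec2 that was packed there earlier.
       */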
      for (dst = 0; dst < src; dst++) {
         if (this->uniform_vector_size[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         new_loc[src] = dst;
         new_chan[src] = this->uniform_vector_size[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            c->prog_data.param[dst * 4 + new_chan[src] + j] =
               c->prog_data.param[src * 4 + j];
         }

         this->uniform_vector_size[dst] += size;
         this->uniform_vector_size[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].reg = new_loc[src];

         int sx = BRW_GET_SWZ(inst->src[i].swizzle, 0) + new_chan[src];
         int sy = BRW_GET_SWZ(inst->src[i].swizzle, 1) + new_chan[src];
         int sz = BRW_GET_SWZ(inst->src[i].swizzle, 2) + new_chan[src];
         int sw = BRW_GET_SWZ(inst->src[i].swizzle, 3) + new_chan[src];
         inst->src[i].swizzle = BRW_SWIZZLE4(sx, sy, sz, sw);
      }
   }
}

bool
src_reg::is_zero() const
{
   if (file != IMM)
      return false;

   if (type == BRW_REGISTER_TYPE_F) {
      return imm.f == 0.0;
   } else {
      return imm.i == 0;
   }
}

bool
src_reg::is_one() const
{
   if (file != IMM)
      return false;

   if (type == BRW_REGISTER_TYPE_F) {
      return imm.f == 1.0;
   } else {
      return imm.i == 1;
   }
}

/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons. One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_ADD:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               assert(!"not reached");
               inst->src[0] = src_reg(0.0f);
               break;
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      default:
         break;
      }
   }

   if (progress)
      this->live_intervals_valid = false;

   return progress;
}

/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants. We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
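         /* values[0..3] point at the four channel constants of this
          * uniform vec4.
          */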
         const float **values = &prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         if (pull_constant_loc[i / 4] == -1) {
            assert(prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}

bool
vec4_instruction::can_reswizzle_dst(int dst_writemask,
                                    int swizzle,
                                    int swizzle_mask)
{
   /* If this instruction sets anything not referenced by swizzle, then we'd
    * totally break it when we reswizzle.
    */
   if (dst.writemask & ~swizzle_mask)
      return false;

   switch (opcode) {
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP2:
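      /* A dot product writes the same scalar result to every enabled
       * channel of the destination, so the writemask can be moved freely.
       */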
      return true;
   default:
      /* Check if there happens to be no reswizzling required. */
      for (int c = 0; c < 4; c++) {
         int bit = 1 << BRW_GET_SWZ(swizzle, c);
         /* Skip components of the swizzle not used by the dst. */
         if (!(dst_writemask & (1 << c)))
            continue;

         /* We don't do the reswizzling yet, so just sanity check that we
          * don't have to.
          */
         if (bit != (1 << c))
            return false;
      }
      return true;
   }
}

/**
 * For any channels in the swizzle's source that were populated by this
 * instruction, rewrite the instruction to put the appropriate result directly
 * in those channels.
 *
 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy c.yy_x
 */
void
vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle)
{
   int new_writemask = 0;

   switch (opcode) {
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP2:
      for (int c = 0; c < 4; c++) {
         int bit = 1 << BRW_GET_SWZ(swizzle, c);
         /* Skip components of the swizzle not used by the dst. */
         if (!(dst_writemask & (1 << c)))
            continue;
         /* If we were populating this component, then populate the
          * corresponding channel of the new dst.
          */
         if (dst.writemask & bit)
            new_writemask |= (1 << c);
      }
      dst.writemask = new_writemask;
      break;
   default:
      for (int c = 0; c < 4; c++) {
         int bit = 1 << BRW_GET_SWZ(swizzle, c);
         /* Skip components of the swizzle not used by the dst. */
         if (!(dst_writemask & (1 << c)))
            continue;

         /* We don't do the reswizzling yet, so just sanity check that we
          * don't have to.
          */
         assert(bit == (1 << c));
      }
      break;
   }
}

/*
 * Tries to reduce extra MOV instructions by taking temporary GRFs that are
 * just written and then MOVed into another reg, and making the original
 * write of the GRF write directly to the final destination instead.
 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating. To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      bool chans_needed[4] = {false, false, false, false};
      int chans_remaining = 0;
      int swizzle_mask = 0;
      for (int i = 0; i < 4; i++) {
         int chan = BRW_GET_SWZ(inst->src[0].swizzle, i);

         if (!(inst->dst.writemask & (1 << i)))
            continue;

         swizzle_mask |= (1 << chan);

         if (!chans_needed[chan]) {
            chans_needed[chan] = true;
            chans_remaining++;
         }
      }

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *scan_inst;
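      /* Walk backwards from the MOV; the list head sentinel has
       * prev == NULL, which terminates the scan.
       */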
      for (scan_inst = (vec4_instruction *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (vec4_instruction *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg &&
             scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (intel->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle_dst(inst->dst.writemask,
                                              inst->src[0].swizzle,
                                              swizzle_mask)) {
               break;
            }

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate) {
               for (int i = 0; i < 4; i++) {
                  if (scan_inst->dst.writemask & (1 << i) &&
                      chans_needed[i]) {
                     chans_needed[i] = false;
                     chans_remaining--;
                  }
               }
            }

            if (chans_remaining == 0)
               break;
         }

         /* We don't handle flow control here. Most computation of values
          * that could be coalesced happens just before their use.
          */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us. If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         /* If somebody else writes our destination here, we can't coalesce
          * before that.
          */
         if (scan_inst->dst.file == inst->dst.file &&
             scan_inst->dst.reg == inst->dst.reg) {
            break;
         }

         /* Check for reads of the register we're trying to coalesce into. We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (scan_inst->src[i].file == inst->dst.file &&
                   scan_inst->src[i].reg == inst->dst.reg &&
                   scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
                  interfered = true;
               }
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have a MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value. Now go rewrite the instruction stream
          * between the two.
          */

         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle_dst(inst->dst.writemask,
                                        inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove();
         progress = true;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Splits virtual GRFs requesting more than one contiguous physical register.
 *
 * We initially create large virtual GRFs for temporary structures, arrays,
 * and matrices, so that the dereference visitor functions can add reg_offsets
 * to work their way down to the actual member being accessed. But when it
 * comes to optimization, we'd like to treat each register as individual
 * storage if possible.
 *
 * So far, the only thing that might prevent splitting is a send message from
 * a GRF on IVB.
 */
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   int new_virtual_grf[num_vars];
   bool split_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Try to split anything larger than one register. */
   for (int i = 0; i < num_vars; i++) {
      split_grf[i] = this->virtual_grf_sizes[i] != 1;
   }

   /* Check that the instructions are compatible with the registers we're
    * trying to split.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      /* If there's a SEND message loading from a GRF on gen7+, it needs to be
       * contiguous. Assume that the GRF for the SEND is always in src[0].
       */
      if (inst->is_send_from_grf()) {
         split_grf[inst->src[0].reg] = false;
      }
   }

   /* Allocate new space for split regs. Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (!split_grf[i])
         continue;

      new_virtual_grf[i] = virtual_grf_alloc(1);
      for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
         int reg = virtual_grf_alloc(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->virtual_grf_sizes[i] = 1;
   }

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      if (inst->dst.file == GRF && split_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF && split_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

void
vec4_visitor::dump_instruction(vec4_instruction *inst)
{
   if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
       opcode_descs[inst->opcode].name) {
      printf("%s ", opcode_descs[inst->opcode].name);
   } else {
      printf("op%d ", inst->opcode);
   }

   switch (inst->dst.file) {
   case GRF:
      printf("vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
      break;
   case MRF:
      printf("m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   default:
      printf("???");
      break;
   }
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      printf(".");
      if (inst->dst.writemask & 1)
         printf("x");
      if (inst->dst.writemask & 2)
         printf("y");
      if (inst->dst.writemask & 4)
         printf("z");
      if (inst->dst.writemask & 8)
         printf("w");
   }
   printf(", ");

   for (int i = 0; i < 3; i++) {
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         printf("attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            printf("%fF", inst->src[i].imm.f);
            break;
         case BRW_REGISTER_TYPE_D:
            printf("%dD", inst->src[i].imm.i);
            break;
         case BRW_REGISTER_TYPE_UD:
            printf("%uU", inst->src[i].imm.u);
            break;
         default:
            printf("???");
            break;
         }
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      default:
         printf("???");
         break;
      }

      if (inst->src[i].reg_offset)
         printf(".%d", inst->src[i].reg_offset);

      static const char *chans[4] = {"x", "y", "z", "w"};
      printf(".");
      for (int c = 0; c < 4; c++) {
         printf("%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
      }

      if (i < 2)
         printf(", ");
   }

   printf("\n");
}

void
vec4_visitor::dump_instructions()
{
   int ip = 0;
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;
      printf("%d: ", ip++);
      dump_instruction(inst);
   }
}

int
vec4_visitor::setup_attributes(int payload_reg)
{
   int nr_attributes;
   int attribute_map[VERT_ATTRIB_MAX + 1];

   nr_attributes = 0;
   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
         attribute_map[i] = payload_reg + nr_attributes;
         nr_attributes++;
      }
   }

   /* VertexID is stored by the VF as the last vertex element, but we
    * don't represent it with a flag in inputs_read, so we call it
    * VERT_ATTRIB_MAX.
    */
   if (prog_data->uses_vertexid) {
      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
      nr_attributes++;
   }

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      /* We have to support ATTR as a destination for GL_FIXED fixup. */
      if (inst->dst.file == ATTR) {
         int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];

         struct brw_reg reg = brw_vec8_grf(grf, 0);
         reg.type = inst->dst.type;
         reg.dw1.bits.writemask = inst->dst.writemask;

         inst->dst.file = HW_REG;
         inst->dst.fixed_hw_reg = reg;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];

         struct brw_reg reg = brw_vec8_grf(grf, 0);
         reg.dw1.bits.swizzle = inst->src[i].swizzle;
         reg.type = inst->src[i].type;
         if (inst->src[i].abs)
            reg = brw_abs(reg);
         if (inst->src[i].negate)
            reg = negate(reg);

         inst->src[i].file = HW_REG;
         inst->src[i].fixed_hw_reg = reg;
      }
   }

   /* The BSpec says we always have to read at least one thing from
    * the VF, and it appears that the hardware wedges otherwise.
    */
   if (nr_attributes == 0)
      nr_attributes = 1;

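   /* urb_read_length is counted in pairs of attribute vec4s, so round up. */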
   prog_data->urb_read_length = (nr_attributes + 1) / 2;

   unsigned vue_entries = MAX2(nr_attributes, c->prog_data.vue_map.num_slots);

   if (intel->gen == 6)
      c->prog_data.urb_entry_size = ALIGN(vue_entries, 8) / 8;
   else
      c->prog_data.urb_entry_size = ALIGN(vue_entries, 4) / 4;

   return payload_reg + nr_attributes;
}

int
vec4_visitor::setup_uniforms(int reg)
{
   /* The pre-gen6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
    */
   if (intel->gen < 6 && this->uniforms == 0) {
      this->uniform_vector_size[this->uniforms] = 1;

      for (unsigned int i = 0; i < 4; i++) {
         unsigned int slot = this->uniforms * 4 + i;
         static float zero = 0.0;
         c->prog_data.param[slot] = &zero;
      }

      this->uniforms++;
      reg++;
   } else {
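      /* Two vec4 uniforms are packed into each push constant register. */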
      reg += ALIGN(uniforms, 2) / 2;
   }

   c->prog_data.nr_params = this->uniforms * 4;

   c->prog_data.curb_read_length = reg - 1;

   return reg;
}

void
vec4_visitor::setup_payload(void)
{
   int reg = 0;

   /* The payload always contains important data in g0, which contains
    * the URB handles that are passed on to the URB write at the end
    * of the thread. So, we always start push constants at g1.
    */
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_attributes(reg);

   this->first_non_payload_grf = reg;
}

src_reg
vec4_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                BRW_ARF_TIMESTAMP,
                                0,
                                BRW_REGISTER_TYPE_UD,
                                BRW_VERTICAL_STRIDE_0,
                                BRW_WIDTH_4,
                                BRW_HORIZONTAL_STRIDE_4,
                                BRW_SWIZZLE_XYZW,
                                WRITEMASK_XYZW));

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   vec4_instruction *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
vec4_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";
   src_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = shader_end_time;
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
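
   /* The IF below is taken only when the reset flag tested above is zero. */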
   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   shader_start_time.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, shader_start_time, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles. Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), src_reg(-2u)));

   emit_shader_time_write(ST_VS, src_reg(diff));
   emit_shader_time_write(ST_VS_WRITTEN, src_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(ST_VS_RESET, src_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                     src_reg value)
{
   int shader_time_index = brw_get_shader_time_index(brw, prog, &vp->Base,
                                                     type);

   dst_reg dst =
      dst_reg(this, glsl_type::get_array_instance(glsl_type::vec4_type, 2));
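
   /* The message payload is two consecutive vec4s: the buffer offset goes in
    * the first and the value to accumulate in the second.
    */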
   dst_reg offset = dst;
   dst_reg time = dst;
   time.reg_offset++;

   offset.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset, src_reg(shader_time_index * SHADER_TIME_STRIDE)));

   time.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time, src_reg(value)));

   emit(SHADER_OPCODE_SHADER_TIME_ADD, dst_reg(), src_reg(dst));
}

bool
vec4_visitor::run()
{
   sanity_param_count = vp->Base.Parameters->NumParameters;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   emit_attribute_fixups();

   /* Generate VS IR for main(). (The visitor only descends into
    * functions called "main".)
    */
   if (shader) {
      visit_instructions(shader->ir);
   } else {
      emit_vertex_program_code();
   }
   base_ir = NULL;

   if (c->key.userclip_active && !c->key.uses_clip_distance)
      setup_uniform_clipplane_values();

   emit_urb_writes();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be. This pass may allocate new
    * virtual GRFs, so we want to do it early. It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   if (shader) {
      move_grf_array_access_to_scratch();
      move_uniform_array_access_to_pull_constants();
   } else {
      /* The ARB_vertex_program frontend emits pull constant loads directly
       * rather than using reladdr, so we don't need to walk through all the
       * instructions looking for things to move. There isn't anything.
       *
       * We do still need to split things to vec4 size.
       */
      split_uniform_registers();
   }
   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

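   /* Run the optimization passes to a fixed point; each pass can expose new
    * opportunities for the others.
    */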
   bool progress;
   do {
      progress = false;
      progress = dead_code_eliminate() || progress;
      progress = opt_copy_propagation() || progress;
      progress = opt_algebraic() || progress;
      progress = opt_register_coalesce() || progress;
   } while (progress);

   if (failed)
      return false;

   setup_payload();

   if (false) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = virtual_grf_count;
      float spill_costs[virtual_grf_count];
      bool no_spill[virtual_grf_count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   while (!reg_allocate()) {
      if (failed)
         break;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory. Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == vp->Base.Parameters->NumParameters);

   return !failed;
}

} /* namespace brw */

extern "C" {

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_vs_emit(struct brw_context *brw,
            struct gl_shader_program *prog,
            struct brw_vs_compile *c,
            void *mem_ctx,
            unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(intel->perf_debug)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      if (shader) {
         printf("GLSL IR for native vertex shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_vertex_program %d for native vertex shader\n",
                c->vp->program.Base.Id);
         _mesa_print_program(&c->vp->program.Base);
      }
   }

   vec4_visitor v(brw, c, prog, shader, mem_ctx);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);
      return NULL;
   }

   vec4_generator g(brw, c, prog, mem_ctx);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   final_assembly_size);

   if (unlikely(intel->perf_debug) && shader) {
      if (shader->compiled_once) {
         brw_vs_debug_recompile(brw, prog, &c->key);
      }
      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("VS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
      shader->compiled_once = true;
   }

   return generated;
}

} /* extern "C" */