i965/vs/gen7: Allow MATH instructions to have MRF as a destination
[mesa.git] / src/mesa/drivers/dri/i965/brw_vec4.cpp
/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "glsl/ir_print_visitor.h"

extern "C" {
#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}

#define MAX_INSTRUCTION (1 << 30)

using namespace brw;

namespace brw {

/**
 * Common helper for constructing swizzles.  When only a subset of
 * channels of a vec4 are used, we don't want to reference the other
 * channels, as that will tell optimization passes that those other
 * channels are used.
 */
unsigned
swizzle_for_size(int size)
{
   static const unsigned size_swizzles[4] = {
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
   };

   assert((size >= 1) && (size <= 4));
   return size_swizzles[size - 1];
}
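
/* Worked example (editor's illustration, not original source): for a vec2,
 * swizzle_for_size(2) returns BRW_SWIZZLE4(X, Y, Y, Y), so references to the
 * value read .xyyy.  Repeating .y in the unused channels keeps liveness and
 * dead-code passes from treating .z/.w of the register as live.
 */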

void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}

src_reg::src_reg(register_file file, int reg, const glsl_type *type)
{
   init();

   this->file = file;
   this->reg = reg;
   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
      this->swizzle = swizzle_for_size(type->vector_elements);
   else
      this->swizzle = SWIZZLE_XYZW;
}

/** Generic unset register constructor. */
src_reg::src_reg()
{
   init();
}

src_reg::src_reg(float f)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

src_reg::src_reg(uint32_t u)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

src_reg::src_reg(int32_t i)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

src_reg::src_reg(dst_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;

   int swizzles[4];
   int next_chan = 0;
   int last = 0;

   for (int i = 0; i < 4; i++) {
      if (!(reg.writemask & (1 << i)))
         continue;

      swizzles[next_chan++] = last = i;
   }

   for (; next_chan < 4; next_chan++) {
      swizzles[next_chan] = last;
   }

   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                                swizzles[2], swizzles[3]);
}
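
/* Worked example: converting a dst_reg with writemask .xz yields swizzles
 * {X, Z, Z, Z}, i.e. the resulting src_reg reads .xzzz -- the enabled
 * channels in order, with the last enabled channel repeated for the rest.
 */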

bool
vec4_instruction::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}

dst_reg::dst_reg()
{
   init();
}

dst_reg::dst_reg(register_file file, int reg)
{
   init();

   this->file = file;
   this->reg = reg;
}

dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
                 int writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}

dst_reg::dst_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
}

dst_reg::dst_reg(src_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->writemask = WRITEMASK_XYZW;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
}

bool
vec4_instruction::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

/**
 * Returns how many MRFs an opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the generate_* functions generate additional MOVs
 * for setup.
 */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case VS_OPCODE_SCRATCH_READ:
      return 2;
   case VS_OPCODE_SCRATCH_WRITE:
      return 3;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}
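
/* Editor's note on the counts above: a unary math function needs one MRF
 * for its operand and POW one per operand; a scratch write presumably sends
 * a header, an address, and the data payload, matching its count of 3.
 */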

bool
src_reg::equals(src_reg *r)
{
   return (file == r->file &&
           reg == r->reg &&
           reg_offset == r->reg_offset &&
           type == r->type &&
           negate == r->negate &&
           abs == r->abs &&
           swizzle == r->swizzle &&
           !reladdr && !r->reladdr &&
           memcmp(&fixed_hw_reg, &r->fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           imm.u == r->imm.u);
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but not used won't be considered to
 * interfere with other regs.
 */
bool
vec4_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

void
vec4_visitor::split_uniform_registers()
{
   /* Prior to this, uniforms have been in an array sized according to
    * the number of vector uniforms present, sparsely filled (so an
    * aggregate results in reg indices being skipped over).  Now we're
    * going to cut those aggregates up so each .reg index is one
    * vector.  The goal is to make elimination of unused uniform
    * components easier later.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         assert(!inst->src[i].reladdr);

         inst->src[i].reg += inst->src[i].reg_offset;
         inst->src[i].reg_offset = 0;
      }
   }

   /* Update that everything is now vector-sized. */
   for (int i = 0; i < this->uniforms; i++) {
      this->uniform_size[i] = 1;
   }
}
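
/* Worked example (hypothetical register numbers): a mat2 uniform occupying
 * .reg 3 is accessed as reg 3 with reg_offset 0 or 1 before this pass;
 * afterwards its two columns are simply regs 3 and 4 with reg_offset 0.
 */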

void
vec4_visitor::pack_uniform_registers()
{
   bool uniform_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(uniform_used, 0, sizeof(uniform_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program.  We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         uniform_used[inst->src[i].reg] = true;
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
   for (int src = 0; src < uniforms; src++) {
      int size = this->uniform_vector_size[src];

      if (!uniform_used[src]) {
         this->uniform_vector_size[src] = 0;
         continue;
      }

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (this->uniform_vector_size[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         new_loc[src] = dst;
         new_chan[src] = this->uniform_vector_size[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            c->prog_data.param[dst * 4 + new_chan[src] + j] =
               c->prog_data.param[src * 4 + j];
         }

         this->uniform_vector_size[dst] += size;
         this->uniform_vector_size[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].reg = new_loc[src];

         int sx = BRW_GET_SWZ(inst->src[i].swizzle, 0) + new_chan[src];
         int sy = BRW_GET_SWZ(inst->src[i].swizzle, 1) + new_chan[src];
         int sz = BRW_GET_SWZ(inst->src[i].swizzle, 2) + new_chan[src];
         int sw = BRW_GET_SWZ(inst->src[i].swizzle, 3) + new_chan[src];
         inst->src[i].swizzle = BRW_SWIZZLE4(sx, sy, sz, sw);
      }
   }
}
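
/* Worked example: two live float uniforms that started in separate vec4
 * slots (sizes 1 and 1) pack into channels 0 and 1 of slot 0; reads of the
 * second are rewritten from u1.xxxx to u0.yyyy by the swizzle shift above.
 */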

bool
src_reg::is_zero() const
{
   if (file != IMM)
      return false;

   if (type == BRW_REGISTER_TYPE_F) {
      return imm.f == 0.0;
   } else {
      return imm.i == 0;
   }
}

bool
src_reg::is_one() const
{
   if (file != IMM)
      return false;

   if (type == BRW_REGISTER_TYPE_F) {
      return imm.f == 1.0;
   } else {
      return imm.i == 1;
   }
}

/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons.  One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_ADD:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               assert(!"not reached");
               inst->src[0] = src_reg(0.0f);
               break;
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      default:
         break;
      }
   }

   if (progress)
      this->live_intervals_valid = false;

   return progress;
}
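
/* For example, "MUL dst, a, 1.0f" is rewritten in place to "MOV dst, a",
 * and "ADD dst, a, 0" likewise becomes "MOV dst, a"; src[1] is reset to a
 * default src_reg (BAD_FILE) so the now-unary MOV has a single operand.
 */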

/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants.  We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
         const float **values = &prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         if (pull_constant_loc[i / 4] == -1) {
            assert(prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}
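
/* Worked example: with 70 vec4 uniforms (280 components), the 64-vec4
 * (256-component) budget above is exceeded, so uniforms 64..69 get
 * pull-constant slots and each read of them becomes a load into a fresh
 * vec4 temporary.
 */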

bool
vec4_instruction::can_reswizzle_dst(int dst_writemask,
                                    int swizzle,
                                    int swizzle_mask)
{
   /* If this instruction sets anything not referenced by swizzle, then we'd
    * totally break it when we reswizzle.
    */
   if (dst.writemask & ~swizzle_mask)
      return false;

   switch (opcode) {
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP2:
      return true;
   default:
      /* Check if there happens to be no reswizzling required. */
      for (int c = 0; c < 4; c++) {
         int bit = 1 << BRW_GET_SWZ(swizzle, c);
         /* Skip components of the swizzle not used by the dst. */
         if (!(dst_writemask & (1 << c)))
            continue;

         /* We don't do the reswizzling yet, so just sanity check that we
          * don't have to.
          */
         if (bit != (1 << c))
            return false;
      }
      return true;
   }
}

/**
 * For any channels in the swizzle's source that were populated by this
 * instruction, rewrite the instruction to put the appropriate result directly
 * in those channels.
 *
 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy_x c.yy_x
 */
void
vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle)
{
   int new_writemask = 0;

   switch (opcode) {
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP2:
      for (int c = 0; c < 4; c++) {
         int bit = 1 << BRW_GET_SWZ(swizzle, c);
         /* Skip components of the swizzle not used by the dst. */
         if (!(dst_writemask & (1 << c)))
            continue;
         /* If we were populating this component, then populate the
          * corresponding channel of the new dst.
          */
         if (dst.writemask & bit)
            new_writemask |= (1 << c);
      }
      dst.writemask = new_writemask;
      break;
   default:
      for (int c = 0; c < 4; c++) {
         int bit = 1 << BRW_GET_SWZ(swizzle, c);
         /* Skip components of the swizzle not used by the dst. */
         if (!(dst_writemask & (1 << c)))
            continue;

         /* We don't do the reswizzling yet, so just sanity check that we
          * don't have to.
          */
         assert(bit == (1 << c));
      }
      break;
   }
}
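
/* Worked example for the dot-product case: if "DP4 tmp.x, a, b" feeds
 * "MOV out.xyzw, tmp.xxxx", then reswizzle_dst(WRITEMASK_XYZW, XXXX) turns
 * the DP4's writemask into .xyzw, yielding "DP4 out.xyzw, a, b" -- dot
 * products broadcast the same scalar to every enabled channel, so any
 * reswizzle is safe for them.
 */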

/*
 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
 * just written and then MOVed into another reg and making the original write
 * of the GRF write directly to the final destination instead.
 */
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating.  To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      bool chans_needed[4] = {false, false, false, false};
      int chans_remaining = 0;
      int swizzle_mask = 0;
      for (int i = 0; i < 4; i++) {
         int chan = BRW_GET_SWZ(inst->src[0].swizzle, i);

         if (!(inst->dst.writemask & (1 << i)))
            continue;

         swizzle_mask |= (1 << chan);

         if (!chans_needed[chan]) {
            chans_needed[chan] = true;
            chans_remaining++;
         }
      }

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *scan_inst;
      for (scan_inst = (vec4_instruction *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (vec4_instruction *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg &&
             scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (intel->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle_dst(inst->dst.writemask,
                                              inst->src[0].swizzle,
                                              swizzle_mask)) {
               break;
            }

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate) {
               for (int i = 0; i < 4; i++) {
                  if (scan_inst->dst.writemask & (1 << i) &&
                      chans_needed[i]) {
                     chans_needed[i] = false;
                     chans_remaining--;
                  }
               }
            }

            if (chans_remaining == 0)
               break;
         }

         /* We don't handle flow control here.  Most computation of values
          * that could be coalesced happens just before their use.
          */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us.  If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         /* If somebody else writes our destination here, we can't coalesce
          * before that.
          */
         if (scan_inst->dst.file == inst->dst.file &&
             scan_inst->dst.reg == inst->dst.reg) {
            break;
         }

         /* Check for reads of the register we're trying to coalesce into.  We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (scan_inst->src[i].file == inst->dst.file &&
                   scan_inst->src[i].reg == inst->dst.reg &&
                   scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
                  interfered = true;
               }
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have an MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value.  Now go rewrite the instruction stream
          * between the two.
          */

         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle_dst(inst->dst.writemask,
                                        inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove();
         progress = true;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}
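
/* Worked example (hypothetical register numbers): the pair
 *
 *    MUL vgrf4.xyzw, vgrf1, vgrf2
 *    MOV m3.xyzw, vgrf4.xyzw
 *
 * with vgrf4 unread afterwards collapses to a single
 * "MUL m3.xyzw, vgrf1, vgrf2", eliminating both the temporary and the MOV.
 */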

/**
 * Splits virtual GRFs requesting more than one contiguous physical register.
 *
 * We initially create large virtual GRFs for temporary structures, arrays,
 * and matrices, so that the dereference visitor functions can add reg_offsets
 * to work their way down to the actual member being accessed.
 *
 * Unlike in the FS visitor, though, we have no SEND messages that return more
 * than 1 register.  We also don't do any array access in register space,
 * which would have required contiguous physical registers.  Thus, all those
 * large virtual GRFs can be split up into independent single-register virtual
 * GRFs, making allocation and optimization easier.
 */
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   int new_virtual_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Allocate new space for split regs.  Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] == 1)
         continue;

      new_virtual_grf[i] = virtual_grf_alloc(1);
      for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
         int reg = virtual_grf_alloc(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->virtual_grf_sizes[i] = 1;
   }

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      if (inst->dst.file == GRF &&
          new_virtual_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             new_virtual_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}
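
/* Worked example (hypothetical numbers): a mat3 temporary allocated as
 * vgrf5 with size 3 keeps reg_offset 0 in vgrf5, while freshly allocated
 * single-size vgrfs (say vgrf12 and vgrf13) take over reg_offsets 1 and 2,
 * so an access to vgrf5.2 is rewritten as vgrf13.0.
 */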

void
vec4_visitor::dump_instruction(vec4_instruction *inst)
{
   if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
       opcode_descs[inst->opcode].name) {
      printf("%s ", opcode_descs[inst->opcode].name);
   } else {
      printf("op%d ", inst->opcode);
   }

   switch (inst->dst.file) {
   case GRF:
      printf("vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
      break;
   case MRF:
      printf("m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   default:
      printf("???");
      break;
   }
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      printf(".");
      if (inst->dst.writemask & 1)
         printf("x");
      if (inst->dst.writemask & 2)
         printf("y");
      if (inst->dst.writemask & 4)
         printf("z");
      if (inst->dst.writemask & 8)
         printf("w");
   }
   printf(", ");

   for (int i = 0; i < 3; i++) {
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         printf("attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      default:
         printf("???");
         break;
      }

      if (inst->src[i].reg_offset)
         printf(".%d", inst->src[i].reg_offset);

      static const char *chans[4] = {"x", "y", "z", "w"};
      printf(".");
      for (int c = 0; c < 4; c++) {
         printf("%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
      }

      if (i < 2)
         printf(", ");
   }

   printf("\n");
}
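
/* Sample output (illustrative, with hypothetical registers):
 *
 *    mul vgrf2.0.xy, vgrf0.xyyy, u1.wwww, (null).xxxx
 *
 * i.e. opcode, destination with reg_offset and writemask, then the three
 * source slots with their swizzles.
 */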

void
vec4_visitor::dump_instructions()
{
   int ip = 0;
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;
      printf("%d: ", ip++);
      dump_instruction(inst);
   }
}

int
vec4_visitor::setup_attributes(int payload_reg)
{
   int nr_attributes;
   int attribute_map[VERT_ATTRIB_MAX + 1];

   nr_attributes = 0;
   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
         attribute_map[i] = payload_reg + nr_attributes;
         nr_attributes++;
      }
   }

   /* VertexID is stored by the VF as the last vertex element, but we
    * don't represent it with a flag in inputs_read, so we call it
    * VERT_ATTRIB_MAX.
    */
   if (prog_data->uses_vertexid) {
      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
      nr_attributes++;
   }

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      /* We have to support ATTR as a destination for GL_FIXED fixup. */
      if (inst->dst.file == ATTR) {
         int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];

         struct brw_reg reg = brw_vec8_grf(grf, 0);
         reg.type = inst->dst.type;
         reg.dw1.bits.writemask = inst->dst.writemask;

         inst->dst.file = HW_REG;
         inst->dst.fixed_hw_reg = reg;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];

         struct brw_reg reg = brw_vec8_grf(grf, 0);
         reg.dw1.bits.swizzle = inst->src[i].swizzle;
         reg.type = inst->src[i].type;
         if (inst->src[i].abs)
            reg = brw_abs(reg);
         if (inst->src[i].negate)
            reg = negate(reg);

         inst->src[i].file = HW_REG;
         inst->src[i].fixed_hw_reg = reg;
      }
   }

   /* The BSpec says we always have to read at least one thing from
    * the VF, and it appears that the hardware wedges otherwise.
    */
   if (nr_attributes == 0)
      nr_attributes = 1;

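   /* The URB read length is in units of attribute pairs, judging from the
    * rounding below: e.g. 5 enabled attributes read as 3 units.
    */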
   prog_data->urb_read_length = (nr_attributes + 1) / 2;

   unsigned vue_entries = MAX2(nr_attributes, c->prog_data.vue_map.num_slots);

   if (intel->gen == 6)
      c->prog_data.urb_entry_size = ALIGN(vue_entries, 8) / 8;
   else
      c->prog_data.urb_entry_size = ALIGN(vue_entries, 4) / 4;

   return payload_reg + nr_attributes;
}

int
vec4_visitor::setup_uniforms(int reg)
{
   /* The pre-gen6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
    */
   if (intel->gen < 6 && this->uniforms == 0) {
      this->uniform_vector_size[this->uniforms] = 1;

      for (unsigned int i = 0; i < 4; i++) {
         unsigned int slot = this->uniforms * 4 + i;
         static float zero = 0.0;
         c->prog_data.param[slot] = &zero;
      }

      this->uniforms++;
      reg++;
   } else {
      reg += ALIGN(uniforms, 2) / 2;
   }

   c->prog_data.nr_params = this->uniforms * 4;

   c->prog_data.curb_read_length = reg - 1;

   return reg;
}

void
vec4_visitor::setup_payload(void)
{
   int reg = 0;

   /* The payload always contains important data in g0, which contains
    * the URB handles that are passed on to the URB write at the end
    * of the thread.  So, we always start push constants at g1.
    */
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_attributes(reg);

   this->first_non_payload_grf = reg;
}

src_reg
vec4_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                BRW_ARF_TIMESTAMP,
                                0,
                                BRW_REGISTER_TYPE_UD,
                                BRW_VERTICAL_STRIDE_0,
                                BRW_WIDTH_4,
                                BRW_HORIZONTAL_STRIDE_4,
                                BRW_SWIZZLE_XYZW,
                                WRITEMASK_XYZW));

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   vec4_instruction *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
vec4_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";
   src_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = shader_end_time;
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;

   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   shader_start_time.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, shader_start_time, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles.  Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), src_reg(-2u)));

   emit_shader_time_write(ST_VS, src_reg(diff));
   emit_shader_time_write(ST_VS_WRITTEN, src_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(ST_VS_RESET, src_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                     src_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index < brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

   int base_mrf = 6;

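   /* The message payload is two MRFs: base_mrf carries the offset into the
    * shader-time buffer (shader_time_index * 4) and base_mrf + 1 the value
    * to accumulate, matching the mlen of 2 set below.
    */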
   dst_reg offset_mrf = dst_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, src_reg(shader_time_index * 4)));

   dst_reg time_mrf = dst_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, src_reg(value)));

   vec4_instruction *inst;
   inst = emit(SHADER_OPCODE_SHADER_TIME_ADD);
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

bool
vec4_visitor::run()
{
   sanity_param_count = vp->Base.Parameters->NumParameters;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   emit_attribute_fixups();

   /* Generate VS IR for main().  (the visitor only descends into
    * functions called "main").
    */
   if (shader) {
      visit_instructions(shader->ir);
   } else {
      emit_vertex_program_code();
   }
   base_ir = NULL;

   if (c->key.userclip_active && !c->key.uses_clip_distance)
      setup_uniform_clipplane_values();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_end();

   emit_urb_writes();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be.  This pass may allocate new
    * virtual GRFs, so we want to do it early.  It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   if (shader) {
      move_grf_array_access_to_scratch();
      move_uniform_array_access_to_pull_constants();
   } else {
      /* The ARB_vertex_program frontend emits pull constant loads directly
       * rather than using reladdr, so we don't need to walk through all the
       * instructions looking for things to move.  There isn't anything.
       *
       * We do still need to split things to vec4 size.
       */
      split_uniform_registers();
   }
   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

   bool progress;
   do {
      progress = false;
      progress = dead_code_eliminate() || progress;
      progress = opt_copy_propagation() || progress;
      progress = opt_algebraic() || progress;
      progress = opt_register_coalesce() || progress;
   } while (progress);

   if (failed)
      return false;

   setup_payload();

   if (false) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = virtual_grf_count;
      float spill_costs[virtual_grf_count];
      bool no_spill[virtual_grf_count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   while (!reg_allocate()) {
      if (failed)
         break;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory.  Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == vp->Base.Parameters->NumParameters);

   return !failed;
}

} /* namespace brw */

extern "C" {

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_vs_emit(struct brw_context *brw,
            struct gl_shader_program *prog,
            struct brw_vs_compile *c,
            void *mem_ctx,
            unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(INTEL_DEBUG & DEBUG_PERF)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      if (shader) {
         printf("GLSL IR for native vertex shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_vertex_program %d for native vertex shader\n",
                c->vp->program.Base.Id);
         _mesa_print_program(&c->vp->program.Base);
      }
   }

   vec4_visitor v(brw, c, prog, shader, mem_ctx);
   if (!v.run()) {
      prog->LinkStatus = false;
      ralloc_strcat(&prog->InfoLog, v.fail_msg);
      return NULL;
   }

   vec4_generator g(brw, c, prog, mem_ctx);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   final_assembly_size);

   if (unlikely(INTEL_DEBUG & DEBUG_PERF) && shader) {
      if (shader->compiled_once) {
         brw_vs_debug_recompile(brw, prog, &c->key);
      }
      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("VS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
      shader->compiled_once = true;
   }

   return generated;
}

} /* extern "C" */