/*
 * Copyright © 2011 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 */

#include "brw_vec4.h"
#include "glsl/ir_print_visitor.h"

extern "C" {
#include "main/macros.h"
#include "main/shaderobj.h"
#include "program/prog_print.h"
#include "program/prog_parameter.h"
}

#define MAX_INSTRUCTION (1 << 30)

using namespace brw;

namespace brw {

/**
 * Common helper for constructing swizzles. When only a subset of
 * channels of a vec4 are used, we don't want to reference the other
 * channels, as that will tell optimization passes that those other
 * channels are used.
 */
unsigned
swizzle_for_size(int size)
{
   static const unsigned size_swizzles[4] = {
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_X, SWIZZLE_X, SWIZZLE_X),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_Z),
      BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Z, SWIZZLE_W),
   };

   assert((size >= 1) && (size <= 4));
   return size_swizzles[size - 1];
}
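
/* For example, swizzle_for_size(2) yields
 * BRW_SWIZZLE4(SWIZZLE_X, SWIZZLE_Y, SWIZZLE_Y, SWIZZLE_Y): the unused
 * third and fourth slots repeat .y instead of naming .z or .w, so
 * liveness analysis never sees those channels as read.
 */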

void
src_reg::init()
{
   memset(this, 0, sizeof(*this));

   this->file = BAD_FILE;
}

src_reg::src_reg(register_file file, int reg, const glsl_type *type)
{
   init();

   this->file = file;
   this->reg = reg;
   if (type && (type->is_scalar() || type->is_vector() || type->is_matrix()))
      this->swizzle = swizzle_for_size(type->vector_elements);
   else
      this->swizzle = SWIZZLE_XYZW;
}

/** Generic unset register constructor. */
src_reg::src_reg()
{
   init();
}

src_reg::src_reg(float f)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_F;
   this->imm.f = f;
}

src_reg::src_reg(uint32_t u)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_UD;
   this->imm.u = u;
}

src_reg::src_reg(int32_t i)
{
   init();

   this->file = IMM;
   this->type = BRW_REGISTER_TYPE_D;
   this->imm.i = i;
}

src_reg::src_reg(dst_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;

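   /* Derive a swizzle that reads back what the dst_reg wrote: each
    * channel enabled in the writemask selects itself, in order, and any
    * remaining slots repeat the last written channel. E.g. a .xz
    * writemask becomes an XZZZ swizzle.
    */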
   int swizzles[4];
   int next_chan = 0;
   int last = 0;

   for (int i = 0; i < 4; i++) {
      if (!(reg.writemask & (1 << i)))
         continue;

      swizzles[next_chan++] = last = i;
   }

   for (; next_chan < 4; next_chan++) {
      swizzles[next_chan] = last;
   }

   this->swizzle = BRW_SWIZZLE4(swizzles[0], swizzles[1],
                                swizzles[2], swizzles[3]);
}

bool
vec4_instruction::is_tex()
{
   return (opcode == SHADER_OPCODE_TEX ||
           opcode == SHADER_OPCODE_TXD ||
           opcode == SHADER_OPCODE_TXF ||
           opcode == SHADER_OPCODE_TXF_MS ||
           opcode == SHADER_OPCODE_TXL ||
           opcode == SHADER_OPCODE_TXS);
}

void
dst_reg::init()
{
   memset(this, 0, sizeof(*this));
   this->file = BAD_FILE;
   this->writemask = WRITEMASK_XYZW;
}

dst_reg::dst_reg()
{
   init();
}

dst_reg::dst_reg(register_file file, int reg)
{
   init();

   this->file = file;
   this->reg = reg;
}

dst_reg::dst_reg(register_file file, int reg, const glsl_type *type,
                 int writemask)
{
   init();

   this->file = file;
   this->reg = reg;
   this->type = brw_type_for_base_type(type);
   this->writemask = writemask;
}

dst_reg::dst_reg(struct brw_reg reg)
{
   init();

   this->file = HW_REG;
   this->fixed_hw_reg = reg;
}

dst_reg::dst_reg(src_reg reg)
{
   init();

   this->file = reg.file;
   this->reg = reg.reg;
   this->reg_offset = reg.reg_offset;
   this->type = reg.type;
   this->writemask = WRITEMASK_XYZW;
   this->reladdr = reg.reladdr;
   this->fixed_hw_reg = reg.fixed_hw_reg;
}

bool
vec4_instruction::is_math()
{
   return (opcode == SHADER_OPCODE_RCP ||
           opcode == SHADER_OPCODE_RSQ ||
           opcode == SHADER_OPCODE_SQRT ||
           opcode == SHADER_OPCODE_EXP2 ||
           opcode == SHADER_OPCODE_LOG2 ||
           opcode == SHADER_OPCODE_SIN ||
           opcode == SHADER_OPCODE_COS ||
           opcode == SHADER_OPCODE_INT_QUOTIENT ||
           opcode == SHADER_OPCODE_INT_REMAINDER ||
           opcode == SHADER_OPCODE_POW);
}

/**
 * Returns how many MRFs an opcode will write over.
 *
 * Note that this is not the 0 or 1 implied writes in an actual gen
 * instruction -- the generate_* functions generate additional MOVs
 * for setup.
 */
int
vec4_visitor::implied_mrf_writes(vec4_instruction *inst)
{
   if (inst->mlen == 0)
      return 0;

   switch (inst->opcode) {
   case SHADER_OPCODE_RCP:
   case SHADER_OPCODE_RSQ:
   case SHADER_OPCODE_SQRT:
   case SHADER_OPCODE_EXP2:
   case SHADER_OPCODE_LOG2:
   case SHADER_OPCODE_SIN:
   case SHADER_OPCODE_COS:
      return 1;
   case SHADER_OPCODE_POW:
      return 2;
   case VS_OPCODE_URB_WRITE:
      return 1;
   case VS_OPCODE_PULL_CONSTANT_LOAD:
      return 2;
   case VS_OPCODE_SCRATCH_READ:
      return 2;
   case VS_OPCODE_SCRATCH_WRITE:
      return 3;
   case SHADER_OPCODE_SHADER_TIME_ADD:
      return 0;
   default:
      assert(!"not reached");
      return inst->mlen;
   }
}

bool
src_reg::equals(src_reg *r)
{
   return (file == r->file &&
           reg == r->reg &&
           reg_offset == r->reg_offset &&
           type == r->type &&
           negate == r->negate &&
           abs == r->abs &&
           swizzle == r->swizzle &&
           !reladdr && !r->reladdr &&
           memcmp(&fixed_hw_reg, &r->fixed_hw_reg,
                  sizeof(fixed_hw_reg)) == 0 &&
           imm.u == r->imm.u);
}

/**
 * Must be called after calculate_live_intervals() to remove unused
 * writes to registers -- register allocation will fail otherwise
 * because something def'd but never used won't be considered to
 * interfere with other regs.
 */
bool
vec4_visitor::dead_code_eliminate()
{
   bool progress = false;
   int pc = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

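      /* If the last use of this GRF is at or before this instruction,
       * nothing later reads the value being written, so the write is
       * dead.
       */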
      if (inst->dst.file == GRF && this->virtual_grf_use[inst->dst.reg] <= pc) {
         inst->remove();
         progress = true;
      }

      pc++;
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

void
vec4_visitor::split_uniform_registers()
{
   /* Prior to this, uniforms have been in an array sized according to
    * the number of vector uniforms present, sparsely filled (so an
    * aggregate results in reg indices being skipped over). Now we're
    * going to cut those aggregates up so each .reg index is one
    * vector. The goal is to make elimination of unused uniform
    * components easier later.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         assert(!inst->src[i].reladdr);

         inst->src[i].reg += inst->src[i].reg_offset;
         inst->src[i].reg_offset = 0;
      }
   }

   /* Update that everything is now vector-sized. */
   for (int i = 0; i < this->uniforms; i++) {
      this->uniform_size[i] = 1;
   }
}

void
vec4_visitor::pack_uniform_registers()
{
   bool uniform_used[this->uniforms];
   int new_loc[this->uniforms];
   int new_chan[this->uniforms];

   memset(uniform_used, 0, sizeof(uniform_used));
   memset(new_loc, 0, sizeof(new_loc));
   memset(new_chan, 0, sizeof(new_chan));

   /* Find which uniform vectors are actually used by the program. We
    * expect unused vector elements when we've moved array access out
    * to pull constants, and from some GLSL code generators like wine.
    */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM)
            continue;

         uniform_used[inst->src[i].reg] = true;
      }
   }

   int new_uniform_count = 0;

   /* Now, figure out a packing of the live uniform vectors into our
    * push constants.
    */
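   /* For example, if u0 is a live vec2 and u1 a live float, u1 packs
    * into u0's vec4 at channel 2 (new_loc[1] = 0, new_chan[1] = 2), and
    * u1's swizzles get shifted up by two channels below.
    */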
   for (int src = 0; src < uniforms; src++) {
      int size = this->uniform_vector_size[src];

      if (!uniform_used[src]) {
         this->uniform_vector_size[src] = 0;
         continue;
      }

      int dst;
      /* Find the lowest place we can slot this uniform in. */
      for (dst = 0; dst < src; dst++) {
         if (this->uniform_vector_size[dst] + size <= 4)
            break;
      }

      if (src == dst) {
         new_loc[src] = dst;
         new_chan[src] = 0;
      } else {
         new_loc[src] = dst;
         new_chan[src] = this->uniform_vector_size[dst];

         /* Move the references to the data */
         for (int j = 0; j < size; j++) {
            c->prog_data.param[dst * 4 + new_chan[src] + j] =
               c->prog_data.param[src * 4 + j];
         }

         this->uniform_vector_size[dst] += size;
         this->uniform_vector_size[src] = 0;
      }

      new_uniform_count = MAX2(new_uniform_count, dst + 1);
   }

   this->uniforms = new_uniform_count;

   /* Now, update the instructions for our repacked uniforms. */
   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         int src = inst->src[i].reg;

         if (inst->src[i].file != UNIFORM)
            continue;

         inst->src[i].reg = new_loc[src];

         int sx = BRW_GET_SWZ(inst->src[i].swizzle, 0) + new_chan[src];
         int sy = BRW_GET_SWZ(inst->src[i].swizzle, 1) + new_chan[src];
         int sz = BRW_GET_SWZ(inst->src[i].swizzle, 2) + new_chan[src];
         int sw = BRW_GET_SWZ(inst->src[i].swizzle, 3) + new_chan[src];
         inst->src[i].swizzle = BRW_SWIZZLE4(sx, sy, sz, sw);
      }
   }
}

bool
src_reg::is_zero() const
{
   if (file != IMM)
      return false;

   if (type == BRW_REGISTER_TYPE_F) {
      return imm.f == 0.0;
   } else {
      return imm.i == 0;
   }
}

bool
src_reg::is_one() const
{
   if (file != IMM)
      return false;

   if (type == BRW_REGISTER_TYPE_F) {
      return imm.f == 1.0;
   } else {
      return imm.i == 1;
   }
}

/**
 * Does algebraic optimizations (0 * a = 0, 1 * a = a, a + 0 = a).
 *
 * While GLSL IR also performs this optimization, we end up with it in
 * our instruction stream for a couple of reasons. One is that we
 * sometimes generate silly instructions, for example in array access
 * where we'll generate "ADD offset, index, base" even if base is 0.
 * The other is that GLSL IR's constant propagation doesn't track the
 * components of aggregates, so some VS patterns (initialize matrix to
 * 0, accumulate in vertex blending factors) end up breaking down to
 * instructions involving 0.
 */
bool
vec4_visitor::opt_algebraic()
{
   bool progress = false;

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      switch (inst->opcode) {
      case BRW_OPCODE_ADD:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;

      case BRW_OPCODE_MUL:
         if (inst->src[1].is_zero()) {
            inst->opcode = BRW_OPCODE_MOV;
            switch (inst->src[0].type) {
            case BRW_REGISTER_TYPE_F:
               inst->src[0] = src_reg(0.0f);
               break;
            case BRW_REGISTER_TYPE_D:
               inst->src[0] = src_reg(0);
               break;
            case BRW_REGISTER_TYPE_UD:
               inst->src[0] = src_reg(0u);
               break;
            default:
               assert(!"not reached");
               inst->src[0] = src_reg(0.0f);
               break;
            }
            inst->src[1] = src_reg();
            progress = true;
         } else if (inst->src[1].is_one()) {
            inst->opcode = BRW_OPCODE_MOV;
            inst->src[1] = src_reg();
            progress = true;
         }
         break;
      default:
         break;
      }
   }

   if (progress)
      this->live_intervals_valid = false;

   return progress;
}

/**
 * Only a limited number of hardware registers may be used for push
 * constants, so this turns access to the overflowed constants into
 * pull constants.
 */
void
vec4_visitor::move_push_constants_to_pull_constants()
{
   int pull_constant_loc[this->uniforms];

   /* Only allow 32 registers (256 uniform components) as push constants,
    * which is the limit on gen6.
    */
   int max_uniform_components = 32 * 8;
   if (this->uniforms * 4 <= max_uniform_components)
      return;

   /* Make some sort of choice as to which uniforms get sent to pull
    * constants. We could potentially do something clever here like
    * look for the most infrequently used uniform vec4s, but leave
    * that for later.
    */
   for (int i = 0; i < this->uniforms * 4; i += 4) {
      pull_constant_loc[i / 4] = -1;

      if (i >= max_uniform_components) {
         const float **values = &prog_data->param[i];

         /* Try to find an existing copy of this uniform in the pull
          * constants if it was part of an array access already.
          */
         for (unsigned int j = 0; j < prog_data->nr_pull_params; j += 4) {
            int matches;

            for (matches = 0; matches < 4; matches++) {
               if (prog_data->pull_param[j + matches] != values[matches])
                  break;
            }

            if (matches == 4) {
               pull_constant_loc[i / 4] = j / 4;
               break;
            }
         }

         if (pull_constant_loc[i / 4] == -1) {
            assert(prog_data->nr_pull_params % 4 == 0);
            pull_constant_loc[i / 4] = prog_data->nr_pull_params / 4;

            for (int j = 0; j < 4; j++) {
               prog_data->pull_param[prog_data->nr_pull_params++] = values[j];
            }
         }
      }
   }

   /* Now actually rewrite usage of the things we've moved to pull
    * constants.
    */
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != UNIFORM ||
             pull_constant_loc[inst->src[i].reg] == -1)
            continue;

         int uniform = inst->src[i].reg;

         dst_reg temp = dst_reg(this, glsl_type::vec4_type);

         emit_pull_constant_load(inst, temp, inst->src[i],
                                 pull_constant_loc[uniform]);

         inst->src[i].file = temp.file;
         inst->src[i].reg = temp.reg;
         inst->src[i].reg_offset = temp.reg_offset;
         inst->src[i].reladdr = NULL;
      }
   }

   /* Repack push constants to remove the now-unused ones. */
   pack_uniform_registers();
}

bool
vec4_instruction::can_reswizzle_dst(int dst_writemask,
                                    int swizzle,
                                    int swizzle_mask)
{
   /* If this instruction sets anything not referenced by swizzle, then we'd
    * totally break it when we reswizzle.
    */
   if (dst.writemask & ~swizzle_mask)
      return false;

   switch (opcode) {
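   /* Dot products write the same scalar result to every enabled dst
    * channel, so their writemask can be relocated freely.
    */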
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP2:
      return true;
   default:
      /* Check if there happens to be no reswizzling required. */
      for (int c = 0; c < 4; c++) {
         int bit = 1 << BRW_GET_SWZ(swizzle, c);
         /* Skip components of the swizzle not used by the dst. */
         if (!(dst_writemask & (1 << c)))
            continue;

         /* We don't do the reswizzling yet, so just sanity check that we
          * don't have to.
          */
         if (bit != (1 << c))
            return false;
      }
      return true;
   }
}

/**
 * For any channels in the swizzle's source that were populated by this
 * instruction, rewrite the instruction to put the appropriate result directly
 * in those channels.
 *
 * e.g. for swizzle=yywx, MUL a.xy b c -> MUL a.yy_x b.yy c.yy_x
 */
void
vec4_instruction::reswizzle_dst(int dst_writemask, int swizzle)
{
   int new_writemask = 0;

   switch (opcode) {
   case BRW_OPCODE_DP4:
   case BRW_OPCODE_DP3:
   case BRW_OPCODE_DP2:
      for (int c = 0; c < 4; c++) {
         int bit = 1 << BRW_GET_SWZ(swizzle, c);
         /* Skip components of the swizzle not used by the dst. */
         if (!(dst_writemask & (1 << c)))
            continue;
         /* If we were populating this component, then populate the
          * corresponding channel of the new dst.
          */
         if (dst.writemask & bit)
            new_writemask |= (1 << c);
      }
      dst.writemask = new_writemask;
      break;
   default:
      for (int c = 0; c < 4; c++) {
         int bit = 1 << BRW_GET_SWZ(swizzle, c);
         /* Skip components of the swizzle not used by the dst. */
         if (!(dst_writemask & (1 << c)))
            continue;

         /* We don't do the reswizzling yet, so just sanity check that we
          * don't have to.
          */
         assert(bit == (1 << c));
      }
      break;
   }
}

/*
 * Tries to reduce extra MOV instructions by taking temporary GRFs that get
 * just written and then MOVed into another reg and making the original write
 * of the GRF write directly to the final destination instead.
 */
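/* For example,
 *
 *    MUL vgrf4, vgrf2, vgrf3
 *    MOV m3, vgrf4
 *
 * becomes a single MUL m3, vgrf2, vgrf3, provided nothing else reads
 * vgrf4 afterwards and the MRF restrictions checked below are met.
 */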
bool
vec4_visitor::opt_register_coalesce()
{
   bool progress = false;
   int next_ip = 0;

   calculate_live_intervals();

   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      int ip = next_ip;
      next_ip++;

      if (inst->opcode != BRW_OPCODE_MOV ||
          (inst->dst.file != GRF && inst->dst.file != MRF) ||
          inst->predicate ||
          inst->src[0].file != GRF ||
          inst->dst.type != inst->src[0].type ||
          inst->src[0].abs || inst->src[0].negate || inst->src[0].reladdr)
         continue;

      bool to_mrf = (inst->dst.file == MRF);

      /* Can't coalesce this GRF if someone else was going to
       * read it later.
       */
      if (this->virtual_grf_use[inst->src[0].reg] > ip)
         continue;

      /* We need to check interference with the final destination between this
       * instruction and the earliest instruction involved in writing the GRF
       * we're eliminating. To do that, keep track of which of our source
       * channels we've seen initialized.
       */
      bool chans_needed[4] = {false, false, false, false};
      int chans_remaining = 0;
      int swizzle_mask = 0;
      for (int i = 0; i < 4; i++) {
         int chan = BRW_GET_SWZ(inst->src[0].swizzle, i);

         if (!(inst->dst.writemask & (1 << i)))
            continue;

         swizzle_mask |= (1 << chan);

         if (!chans_needed[chan]) {
            chans_needed[chan] = true;
            chans_remaining++;
         }
      }

      /* Now walk up the instruction stream trying to see if we can rewrite
       * everything writing to the temporary to write into the destination
       * instead.
       */
      vec4_instruction *scan_inst;
      for (scan_inst = (vec4_instruction *)inst->prev;
           scan_inst->prev != NULL;
           scan_inst = (vec4_instruction *)scan_inst->prev) {
         if (scan_inst->dst.file == GRF &&
             scan_inst->dst.reg == inst->src[0].reg &&
             scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
            /* Found something writing to the reg we want to coalesce away. */
            if (to_mrf) {
               /* SEND instructions can't have MRF as a destination. */
               if (scan_inst->mlen)
                  break;

               if (intel->gen == 6) {
                  /* gen6 math instructions must have the destination be
                   * GRF, so no compute-to-MRF for them.
                   */
                  if (scan_inst->is_math()) {
                     break;
                  }
               }
            }

            /* If we can't handle the swizzle, bail. */
            if (!scan_inst->can_reswizzle_dst(inst->dst.writemask,
                                              inst->src[0].swizzle,
                                              swizzle_mask)) {
               break;
            }

            /* Mark which channels we found unconditional writes for. */
            if (!scan_inst->predicate) {
               for (int i = 0; i < 4; i++) {
                  if (scan_inst->dst.writemask & (1 << i) &&
                      chans_needed[i]) {
                     chans_needed[i] = false;
                     chans_remaining--;
                  }
               }
            }

            if (chans_remaining == 0)
               break;
         }

         /* We don't handle flow control here. Most computation of values
          * that could be coalesced happens just before their use.
          */
         if (scan_inst->opcode == BRW_OPCODE_DO ||
             scan_inst->opcode == BRW_OPCODE_WHILE ||
             scan_inst->opcode == BRW_OPCODE_ELSE ||
             scan_inst->opcode == BRW_OPCODE_ENDIF) {
            break;
         }

         /* You can't read from an MRF, so if someone else reads our MRF's
          * source GRF that we wanted to rewrite, that stops us. If it's a
          * GRF we're trying to coalesce to, we don't actually handle
          * rewriting sources so bail in that case as well.
          */
         bool interfered = false;
         for (int i = 0; i < 3; i++) {
            if (scan_inst->src[i].file == GRF &&
                scan_inst->src[i].reg == inst->src[0].reg &&
                scan_inst->src[i].reg_offset == inst->src[0].reg_offset) {
               interfered = true;
            }
         }
         if (interfered)
            break;

         /* If somebody else writes our destination here, we can't coalesce
          * before that.
          */
         if (scan_inst->dst.file == inst->dst.file &&
             scan_inst->dst.reg == inst->dst.reg) {
            break;
         }

         /* Check for reads of the register we're trying to coalesce into. We
          * can't go rewriting instructions above that to put some other value
          * in the register instead.
          */
         if (to_mrf && scan_inst->mlen > 0) {
            if (inst->dst.reg >= scan_inst->base_mrf &&
                inst->dst.reg < scan_inst->base_mrf + scan_inst->mlen) {
               break;
            }
         } else {
            for (int i = 0; i < 3; i++) {
               if (scan_inst->src[i].file == inst->dst.file &&
                   scan_inst->src[i].reg == inst->dst.reg &&
                   scan_inst->src[i].reg_offset == inst->dst.reg_offset) {
                  interfered = true;
               }
            }
            if (interfered)
               break;
         }
      }

      if (chans_remaining == 0) {
         /* If we've made it here, we have an MOV we want to coalesce out, and
          * a scan_inst pointing to the earliest instruction involved in
          * computing the value. Now go rewrite the instruction stream
          * between the two.
          */

         while (scan_inst != inst) {
            if (scan_inst->dst.file == GRF &&
                scan_inst->dst.reg == inst->src[0].reg &&
                scan_inst->dst.reg_offset == inst->src[0].reg_offset) {
               scan_inst->reswizzle_dst(inst->dst.writemask,
                                        inst->src[0].swizzle);
               scan_inst->dst.file = inst->dst.file;
               scan_inst->dst.reg = inst->dst.reg;
               scan_inst->dst.reg_offset = inst->dst.reg_offset;
               scan_inst->saturate |= inst->saturate;
            }
            scan_inst = (vec4_instruction *)scan_inst->next;
         }
         inst->remove();
         progress = true;
      }
   }

   if (progress)
      live_intervals_valid = false;

   return progress;
}

/**
 * Splits virtual GRFs requesting more than one contiguous physical register.
 *
 * We initially create large virtual GRFs for temporary structures, arrays,
 * and matrices, so that the dereference visitor functions can add reg_offsets
 * to work their way down to the actual member being accessed.
 *
 * Unlike in the FS visitor, though, we have no SEND messages that return more
 * than 1 register. We also don't do any array access in register space,
 * which would have required contiguous physical registers. Thus, all those
 * large virtual GRFs can be split up into independent single-register virtual
 * GRFs, making allocation and optimization easier.
 */
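/* For example, a mat4 temporary is initially one size-4 virtual GRF
 * addressed with reg_offsets 0..3; after this pass each column lives in
 * its own size-1 virtual GRF.
 */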
void
vec4_visitor::split_virtual_grfs()
{
   int num_vars = this->virtual_grf_count;
   int new_virtual_grf[num_vars];

   memset(new_virtual_grf, 0, sizeof(new_virtual_grf));

   /* Allocate new space for split regs. Note that the virtual
    * numbers will be contiguous.
    */
   for (int i = 0; i < num_vars; i++) {
      if (this->virtual_grf_sizes[i] == 1)
         continue;

      new_virtual_grf[i] = virtual_grf_alloc(1);
      for (int j = 2; j < this->virtual_grf_sizes[i]; j++) {
         int reg = virtual_grf_alloc(1);
         assert(reg == new_virtual_grf[i] + j - 1);
         (void) reg;
      }
      this->virtual_grf_sizes[i] = 1;
   }

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      if (inst->dst.file == GRF &&
          new_virtual_grf[inst->dst.reg] &&
          inst->dst.reg_offset != 0) {
         inst->dst.reg = (new_virtual_grf[inst->dst.reg] +
                          inst->dst.reg_offset - 1);
         inst->dst.reg_offset = 0;
      }
      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file == GRF &&
             new_virtual_grf[inst->src[i].reg] &&
             inst->src[i].reg_offset != 0) {
            inst->src[i].reg = (new_virtual_grf[inst->src[i].reg] +
                                inst->src[i].reg_offset - 1);
            inst->src[i].reg_offset = 0;
         }
      }
   }
   this->live_intervals_valid = false;
}

void
vec4_visitor::dump_instruction(vec4_instruction *inst)
{
   if (inst->opcode < ARRAY_SIZE(opcode_descs) &&
       opcode_descs[inst->opcode].name) {
      printf("%s ", opcode_descs[inst->opcode].name);
   } else {
      printf("op%d ", inst->opcode);
   }

   switch (inst->dst.file) {
   case GRF:
      printf("vgrf%d.%d", inst->dst.reg, inst->dst.reg_offset);
      break;
   case MRF:
      printf("m%d", inst->dst.reg);
      break;
   case BAD_FILE:
      printf("(null)");
      break;
   default:
      printf("???");
      break;
   }
   if (inst->dst.writemask != WRITEMASK_XYZW) {
      printf(".");
      if (inst->dst.writemask & 1)
         printf("x");
      if (inst->dst.writemask & 2)
         printf("y");
      if (inst->dst.writemask & 4)
         printf("z");
      if (inst->dst.writemask & 8)
         printf("w");
   }
   printf(", ");

   for (int i = 0; i < 3; i++) {
      switch (inst->src[i].file) {
      case GRF:
         printf("vgrf%d", inst->src[i].reg);
         break;
      case ATTR:
         printf("attr%d", inst->src[i].reg);
         break;
      case UNIFORM:
         printf("u%d", inst->src[i].reg);
         break;
      case IMM:
         switch (inst->src[i].type) {
         case BRW_REGISTER_TYPE_F:
            printf("%fF", inst->src[i].imm.f);
            break;
         case BRW_REGISTER_TYPE_D:
            printf("%dD", inst->src[i].imm.i);
            break;
         case BRW_REGISTER_TYPE_UD:
            printf("%uU", inst->src[i].imm.u);
            break;
         default:
            printf("???");
            break;
         }
         break;
      case BAD_FILE:
         printf("(null)");
         break;
      default:
         printf("???");
         break;
      }

      if (inst->src[i].reg_offset)
         printf(".%d", inst->src[i].reg_offset);

      static const char *chans[4] = {"x", "y", "z", "w"};
      printf(".");
      for (int c = 0; c < 4; c++) {
         printf("%s", chans[BRW_GET_SWZ(inst->src[i].swizzle, c)]);
      }

      if (i < 2)
         printf(", ");
   }

   printf("\n");
}

void
vec4_visitor::dump_instructions()
{
   int ip = 0;
   foreach_list_safe(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;
      printf("%d: ", ip++);
      dump_instruction(inst);
   }
}

int
vec4_visitor::setup_attributes(int payload_reg)
{
   int nr_attributes;
   int attribute_map[VERT_ATTRIB_MAX + 1];

   nr_attributes = 0;
   for (int i = 0; i < VERT_ATTRIB_MAX; i++) {
      if (prog_data->inputs_read & BITFIELD64_BIT(i)) {
         attribute_map[i] = payload_reg + nr_attributes;
         nr_attributes++;
      }
   }

   /* VertexID is stored by the VF as the last vertex element, but we
    * don't represent it with a flag in inputs_read, so we call it
    * VERT_ATTRIB_MAX.
    */
   if (prog_data->uses_vertexid) {
      attribute_map[VERT_ATTRIB_MAX] = payload_reg + nr_attributes;
      nr_attributes++;
   }

   foreach_list(node, &this->instructions) {
      vec4_instruction *inst = (vec4_instruction *)node;

      /* We have to support ATTR as a destination for GL_FIXED fixup. */
      if (inst->dst.file == ATTR) {
         int grf = attribute_map[inst->dst.reg + inst->dst.reg_offset];

         struct brw_reg reg = brw_vec8_grf(grf, 0);
         reg.type = inst->dst.type;
         reg.dw1.bits.writemask = inst->dst.writemask;

         inst->dst.file = HW_REG;
         inst->dst.fixed_hw_reg = reg;
      }

      for (int i = 0; i < 3; i++) {
         if (inst->src[i].file != ATTR)
            continue;

         int grf = attribute_map[inst->src[i].reg + inst->src[i].reg_offset];

         struct brw_reg reg = brw_vec8_grf(grf, 0);
         reg.dw1.bits.swizzle = inst->src[i].swizzle;
         reg.type = inst->src[i].type;
         if (inst->src[i].abs)
            reg = brw_abs(reg);
         if (inst->src[i].negate)
            reg = negate(reg);

         inst->src[i].file = HW_REG;
         inst->src[i].fixed_hw_reg = reg;
      }
   }

   /* The BSpec says we always have to read at least one thing from
    * the VF, and it appears that the hardware wedges otherwise.
    */
   if (nr_attributes == 0)
      nr_attributes = 1;

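   /* The URB read length appears to be counted in units that cover two
    * vec4 slots each, hence the round-up to an even attribute count.
    */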
   prog_data->urb_read_length = (nr_attributes + 1) / 2;

   unsigned vue_entries = MAX2(nr_attributes, c->prog_data.vue_map.num_slots);

   if (intel->gen == 6)
      c->prog_data.urb_entry_size = ALIGN(vue_entries, 8) / 8;
   else
      c->prog_data.urb_entry_size = ALIGN(vue_entries, 4) / 4;

   return payload_reg + nr_attributes;
}

int
vec4_visitor::setup_uniforms(int reg)
{
   /* The pre-gen6 VS requires that some push constants get loaded no
    * matter what, or the GPU would hang.
    */
   if (intel->gen < 6 && this->uniforms == 0) {
      this->uniform_vector_size[this->uniforms] = 1;

      for (unsigned int i = 0; i < 4; i++) {
         unsigned int slot = this->uniforms * 4 + i;
         static float zero = 0.0;
         c->prog_data.param[slot] = &zero;
      }

      this->uniforms++;
      reg++;
   } else {
      reg += ALIGN(uniforms, 2) / 2;
   }

   c->prog_data.nr_params = this->uniforms * 4;

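   /* Push constants started at g1 (see setup_payload), so reg - 1 is
    * the number of payload registers holding constants.
    */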
   c->prog_data.curb_read_length = reg - 1;

   return reg;
}

void
vec4_visitor::setup_payload(void)
{
   int reg = 0;

   /* The payload always contains important data in g0, which contains
    * the URB handles that are passed on to the URB write at the end
    * of the thread. So, we always start push constants at g1.
    */
   reg++;

   reg = setup_uniforms(reg);

   reg = setup_attributes(reg);

   this->first_non_payload_grf = reg;
}

src_reg
vec4_visitor::get_timestamp()
{
   assert(intel->gen >= 7);

   src_reg ts = src_reg(brw_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                BRW_ARF_TIMESTAMP,
                                0,
                                BRW_REGISTER_TYPE_UD,
                                BRW_VERTICAL_STRIDE_0,
                                BRW_WIDTH_4,
                                BRW_HORIZONTAL_STRIDE_4,
                                BRW_SWIZZLE_XYZW,
                                WRITEMASK_XYZW));

   dst_reg dst = dst_reg(this, glsl_type::uvec4_type);

   vec4_instruction *mov = emit(MOV(dst, ts));
   /* We want to read the 3 fields we care about (mostly field 0, but also 2)
    * even if it's not enabled in the dispatch.
    */
   mov->force_writemask_all = true;

   return src_reg(dst);
}

void
vec4_visitor::emit_shader_time_begin()
{
   current_annotation = "shader time start";
   shader_start_time = get_timestamp();
}

void
vec4_visitor::emit_shader_time_end()
{
   current_annotation = "shader time end";
   src_reg shader_end_time = get_timestamp();

   /* Check that there weren't any timestamp reset events (assuming these
    * were the only two timestamp reads that happened).
    */
   src_reg reset_end = shader_end_time;
   reset_end.swizzle = BRW_SWIZZLE_ZZZZ;
   vec4_instruction *test = emit(AND(dst_null_d(), reset_end, src_reg(1u)));
   test->conditional_mod = BRW_CONDITIONAL_Z;
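
   /* Bit 0 of the timestamp's .z channel indicates a reset event; the
    * IF below only accumulates the delta when that bit is clear.
    */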

   emit(IF(BRW_PREDICATE_NORMAL));

   /* Take the current timestamp and get the delta. */
   shader_start_time.negate = true;
   dst_reg diff = dst_reg(this, glsl_type::uint_type);
   emit(ADD(diff, shader_start_time, shader_end_time));

   /* If there were no instructions between the two timestamp gets, the diff
    * is 2 cycles. Remove that overhead, so I can forget about that when
    * trying to determine the time taken for single instructions.
    */
   emit(ADD(diff, src_reg(diff), src_reg(-2u)));

   emit_shader_time_write(ST_VS, src_reg(diff));
   emit_shader_time_write(ST_VS_WRITTEN, src_reg(1u));
   emit(BRW_OPCODE_ELSE);
   emit_shader_time_write(ST_VS_RESET, src_reg(1u));
   emit(BRW_OPCODE_ENDIF);
}

void
vec4_visitor::emit_shader_time_write(enum shader_time_shader_type type,
                                     src_reg value)
{
   /* Choose an index in the buffer and set up tracking information for our
    * printouts.
    */
   int shader_time_index = brw->shader_time.num_entries++;
   assert(shader_time_index <= brw->shader_time.max_entries);
   brw->shader_time.types[shader_time_index] = type;
   if (prog) {
      _mesa_reference_shader_program(ctx,
                                     &brw->shader_time.programs[shader_time_index],
                                     prog);
   }

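   /* The SHADER_TIME_ADD send takes a two-register payload: the offset
    * into the shader time buffer, then the value to accumulate; mlen is
    * 2 accordingly.
    */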
   int base_mrf = 6;

   dst_reg offset_mrf = dst_reg(MRF, base_mrf);
   offset_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(offset_mrf, src_reg(shader_time_index * SHADER_TIME_STRIDE)));

   dst_reg time_mrf = dst_reg(MRF, base_mrf + 1);
   time_mrf.type = BRW_REGISTER_TYPE_UD;
   emit(MOV(time_mrf, src_reg(value)));

   vec4_instruction *inst;
   inst = emit(SHADER_OPCODE_SHADER_TIME_ADD);
   inst->base_mrf = base_mrf;
   inst->mlen = 2;
}

bool
vec4_visitor::run()
{
   sanity_param_count = vp->Base.Parameters->NumParameters;

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_begin();

   emit_attribute_fixups();

   /* Generate VS IR for main(). (the visitor only descends into
    * functions called "main").
    */
   if (shader) {
      visit_instructions(shader->ir);
   } else {
      emit_vertex_program_code();
   }
   base_ir = NULL;

   if (c->key.userclip_active && !c->key.uses_clip_distance)
      setup_uniform_clipplane_values();

   if (INTEL_DEBUG & DEBUG_SHADER_TIME)
      emit_shader_time_end();

   emit_urb_writes();

   /* Before any optimization, push array accesses out to scratch
    * space where we need them to be. This pass may allocate new
    * virtual GRFs, so we want to do it early. It also makes sure
    * that we have reladdr computations available for CSE, since we'll
    * often do repeated subexpressions for those.
    */
   if (shader) {
      move_grf_array_access_to_scratch();
      move_uniform_array_access_to_pull_constants();
   } else {
      /* The ARB_vertex_program frontend emits pull constant loads directly
       * rather than using reladdr, so we don't need to walk through all the
       * instructions looking for things to move. There isn't anything.
       *
       * We do still need to split things to vec4 size.
       */
      split_uniform_registers();
   }
   pack_uniform_registers();
   move_push_constants_to_pull_constants();
   split_virtual_grfs();

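   /* Iterate the optimization passes to a fixed point: each pass can
    * expose new opportunities for the others (e.g. copy propagation can
    * leave a MOV unread, which dead code elimination then removes).
    */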
   bool progress;
   do {
      progress = false;
      progress = dead_code_eliminate() || progress;
      progress = opt_copy_propagation() || progress;
      progress = opt_algebraic() || progress;
      progress = opt_register_coalesce() || progress;
   } while (progress);

   if (failed)
      return false;

   setup_payload();

   if (false) {
      /* Debug of register spilling: Go spill everything. */
      const int grf_count = virtual_grf_count;
      float spill_costs[virtual_grf_count];
      bool no_spill[virtual_grf_count];
      evaluate_spill_costs(spill_costs, no_spill);
      for (int i = 0; i < grf_count; i++) {
         if (no_spill[i])
            continue;
         spill_reg(i);
      }
   }

   while (!reg_allocate()) {
      if (failed)
         break;
   }

   /* If any state parameters were appended, then ParameterValues could have
    * been realloced, in which case the driver uniform storage set up by
    * _mesa_associate_uniform_storage() would point to freed memory. Make
    * sure that didn't happen.
    */
   assert(sanity_param_count == vp->Base.Parameters->NumParameters);

   return !failed;
}

} /* namespace brw */

extern "C" {

/**
 * Compile a vertex shader.
 *
 * Returns the final assembly and the program's size.
 */
const unsigned *
brw_vs_emit(struct brw_context *brw,
            struct gl_shader_program *prog,
            struct brw_vs_compile *c,
            void *mem_ctx,
            unsigned *final_assembly_size)
{
   struct intel_context *intel = &brw->intel;
   bool start_busy = false;
   float start_time = 0;

   if (unlikely(intel->perf_debug)) {
      start_busy = (intel->batch.last_bo &&
                    drm_intel_bo_busy(intel->batch.last_bo));
      start_time = get_time();
   }

   struct brw_shader *shader = NULL;
   if (prog)
      shader = (brw_shader *) prog->_LinkedShaders[MESA_SHADER_VERTEX];

   if (unlikely(INTEL_DEBUG & DEBUG_VS)) {
      if (shader) {
         printf("GLSL IR for native vertex shader %d:\n", prog->Name);
         _mesa_print_ir(shader->ir, NULL);
         printf("\n\n");
      } else {
         printf("ARB_vertex_program %d for native vertex shader\n",
                c->vp->program.Base.Id);
         _mesa_print_program(&c->vp->program.Base);
      }
   }

   vec4_visitor v(brw, c, prog, shader, mem_ctx);
   if (!v.run()) {
      if (prog) {
         prog->LinkStatus = false;
         ralloc_strcat(&prog->InfoLog, v.fail_msg);
      }
      return NULL;
   }

   vec4_generator g(brw, c, prog, mem_ctx);
   const unsigned *generated = g.generate_assembly(&v.instructions,
                                                   final_assembly_size);

   if (unlikely(intel->perf_debug) && shader) {
      if (shader->compiled_once) {
         brw_vs_debug_recompile(brw, prog, &c->key);
      }
      if (start_busy && !drm_intel_bo_busy(intel->batch.last_bo)) {
         perf_debug("VS compile took %.03f ms and stalled the GPU\n",
                    (get_time() - start_time) * 1000);
      }
      shader->compiled_once = true;
   }

   return generated;
}

} /* extern "C" */