r600g/sb: fix allocation of indirectly addressed input arrays
[mesa.git] / src / gallium / drivers / r600 / sb / sb_ra_init.cpp
1 /*
2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Vadim Girlin
25 */
26
27 #define RA_DEBUG 0
28
29 #if RA_DEBUG
30 #define RA_DUMP(q) do { q } while (0)
31 #else
32 #define RA_DUMP(q)
33 #endif
34
35 #include <cstring>
36 #include <iostream>
37 #include <iomanip>
38
39 #include "sb_bc.h"
40 #include "sb_shader.h"
41
42 #include "sb_pass.h"
43
44 namespace r600_sb {
45
46 using std::cerr;
47
48 class regbits {
49 typedef uint32_t basetype;
50 static const unsigned bt_bytes = sizeof(basetype);
51 static const unsigned bt_index_shift = 5;
52 static const unsigned bt_index_mask = (1u << bt_index_shift) - 1;
53 static const unsigned bt_bits = bt_bytes << 3;
54 static const unsigned size = MAX_GPR * 4 / bt_bits;
55
56 basetype dta[size];
57
58 unsigned num_temps;
59
60 public:
61
62 regbits(unsigned num_temps) : dta(), num_temps(num_temps) {}
63 regbits(unsigned num_temps, unsigned value) : num_temps(num_temps)
64 { set_all(value); }
65
66 regbits(shader &sh, val_set &vs) : num_temps(sh.get_ctx().alu_temp_gprs)
67 { set_all(1); from_val_set(sh, vs); }
68
69 void set_all(unsigned val);
70 void from_val_set(shader &sh, val_set &vs);
71
72 void set(unsigned index);
73 void clear(unsigned index);
74 bool get(unsigned index);
75
76 void set(unsigned index, unsigned val);
77
78 sel_chan find_free_bit(unsigned start);
79 sel_chan find_free_chans(unsigned mask);
80 sel_chan find_free_array(unsigned size, unsigned mask);
81
82 void dump();
83 };
84
85 // =======================================
86
87 void regbits::dump() {
88 for (unsigned i = 0; i < size * bt_bits; ++i) {
89
90 if (!(i & 31))
91 cerr << "\n";
92
93 if (!(i & 3))
94 cerr << " " << std::setw(3) << (i / 4) << " ";
95
96 cerr << (get(i) ? 1 : 0);
97 }
98 }
99
100
101 void regbits::set_all(unsigned v) {
102 memset(&dta, v ? 0xFF : 0x00, size * bt_bytes);
103 }
104
105 void regbits::from_val_set(shader &sh, val_set& vs) {
106 val_set &s = vs;
107 unsigned g;
108 for (val_set::iterator I = s.begin(sh), E = s.end(sh); I != E; ++I) {
109 value *v = *I;
110 if (v->is_any_gpr()) {
111 g = v->get_final_gpr();
112 if (!g)
113 continue;
114 } else
115 continue;
116
117 assert(g);
118 --g;
119 assert(g < 512);
120 clear(g);
121 }
122 }
123
124 void regbits::set(unsigned index) {
125 unsigned ih = index >> bt_index_shift;
126 unsigned il = index & bt_index_mask;
127 dta[ih] |= ((basetype)1u << il);
128 }
129
130 void regbits::clear(unsigned index) {
131 unsigned ih = index >> bt_index_shift;
132 unsigned il = index & bt_index_mask;
133 assert(ih < size);
134 dta[ih] &= ~((basetype)1u << il);
135 }
136
137 bool regbits::get(unsigned index) {
138 unsigned ih = index >> bt_index_shift;
139 unsigned il = index & bt_index_mask;
140 return dta[ih] & ((basetype)1u << il);
141 }
142
143 void regbits::set(unsigned index, unsigned val) {
144 unsigned ih = index >> bt_index_shift;
145 unsigned il = index & bt_index_mask;
146 basetype bm = 1u << il;
147 dta[ih] = (dta[ih] & ~bm) | (val << il);
148 }
149
150 // free register for ra means the bit is set
151 sel_chan regbits::find_free_bit(unsigned start) {
152 unsigned elt = start >> bt_index_shift;
153 unsigned bit = start & bt_index_mask;
154
155 unsigned end = start < MAX_GPR - num_temps ? MAX_GPR - num_temps : MAX_GPR;
156
157 while (elt < end && !dta[elt]) {
158 ++elt;
159 bit = 0;
160 }
161
162 if (elt >= end)
163 return 0;
164
165 // FIXME this seems broken when not starting from 0
166
167 bit += __builtin_ctz(dta[elt]);
168 return ((elt << bt_index_shift) | bit) + 1;
169 }
170
171 // find free gpr component to use as indirectly addressable array
172 sel_chan regbits::find_free_array(unsigned length, unsigned mask) {
173 unsigned cc[4] = {};
174
175 // FIXME optimize this. though hopefully we won't have a lot of arrays
176 for (unsigned a = 0; a < MAX_GPR - num_temps; ++a) {
177 for(unsigned c = 0; c < MAX_CHAN; ++c) {
178 if (mask & (1 << c)) {
179 if (get((a << 2) | c)) {
180 if (++cc[c] == length)
181 return sel_chan(a - length + 1, c);
182 } else {
183 cc[c] = 0;
184 }
185 }
186 }
187 }
188 return 0;
189 }
190
191 sel_chan regbits::find_free_chans(unsigned mask) {
192 unsigned elt = 0;
193 unsigned bit = 0;
194
195 basetype cd = dta[elt] >> bit;
196
197 do {
198
199 if (!cd) {
200 if (++elt < size)
201 cd = dta[elt];
202 else
203 return 0;
204
205 bit = 0;
206 }
207
208 unsigned p = __builtin_ctz(cd) & ~(basetype)3u;
209
210 if (p > bt_bits - bit) {
211 if (++elt < size)
212 cd = dta[elt];
213 else
214 return 0;
215 bit = 0;
216 }
217
218 bit += p;
219 cd >>= p;
220
221 if ((cd & mask) == mask) {
222 return ((elt << bt_index_shift) | bit) + 1;
223 }
224
225 bit += 4;
226 cd >>= 4;
227
228 } while (1);
229
230 return 0;
231 }
232
233 // ================================
234
235 void ra_init::alloc_arrays() {
236
237 gpr_array_vec &ga = sh.arrays();
238
239 for(gpr_array_vec::iterator I = ga.begin(), E = ga.end(); I != E; ++I) {
240 gpr_array *a = *I;
241
242 RA_DUMP(
243 cerr << "array [" << a->array_size << "] at " << a->base_gpr << "\n";
244 cerr << "\n";
245 );
246
247 // skip preallocated arrays (e.g. with preloaded inputs)
248 if (a->gpr) {
249 RA_DUMP( cerr << " FIXED at " << a->gpr << "\n"; );
250 continue;
251 }
252
253 bool dead = a->is_dead();
254
255 if (dead) {
256 RA_DUMP( cerr << " DEAD\n"; );
257 continue;
258 }
259
260 val_set &s = a->interferences;
261
262
263 for (val_set::iterator I = s.begin(sh), E = s.end(sh); I != E; ++I) {
264 value *v = *I;
265 if (v->array == a)
266 s.remove_val(v);
267 }
268
269 RA_DUMP(
270 cerr << " interf: ";
271 dump::dump_set(sh, s);
272 cerr << "\n";
273 );
274
275 regbits rb(sh, s);
276
277 sel_chan base = rb.find_free_array(a->array_size,
278 (1 << a->base_gpr.chan()));
279
280 RA_DUMP( cerr << " found base: " << base << "\n"; );
281
282 a->gpr = base;
283 }
284 }
285
286
287 int ra_init::run() {
288
289 alloc_arrays();
290
291 ra_node(sh.root);
292 return 0;
293 }
294
295 void ra_init::ra_node(container_node* c) {
296
297 for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
298 node *n = *I;
299 if (n->type == NT_OP) {
300 process_op(n);
301 }
302 if (n->is_container() && !n->is_alu_packed()) {
303 ra_node(static_cast<container_node*>(n));
304 }
305 }
306 }
307
308 void ra_init::process_op(node* n) {
309
310 bool copy = n->is_copy_mov();
311
312 RA_DUMP(
313 cerr << "ra_init: process_op : ";
314 dump::dump_op(n);
315 cerr << "\n";
316 );
317
318 if (n->is_alu_packed()) {
319 for (vvec::iterator I = n->src.begin(), E = n->src.end(); I != E; ++I) {
320 value *v = *I;
321 if (v && v->is_sgpr() && v->constraint &&
322 v->constraint->kind == CK_PACKED_BS) {
323 color_bs_constraint(v->constraint);
324 break;
325 }
326 }
327 }
328
329 if (n->is_fetch_inst() || n->is_cf_inst()) {
330 for (vvec::iterator I = n->src.begin(), E = n->src.end(); I != E; ++I) {
331 value *v = *I;
332 if (v && v->is_sgpr())
333 color(v);
334 }
335 }
336
337 for (vvec::iterator I = n->dst.begin(), E = n->dst.end(); I != E; ++I) {
338 value *v = *I;
339 if (!v)
340 continue;
341 if (v->is_sgpr()) {
342 if (!v->gpr) {
343 if (copy && !v->constraint) {
344 value *s = *(n->src.begin() + (I - n->dst.begin()));
345 assert(s);
346 if (s->is_sgpr()) {
347 assign_color(v, s->gpr);
348 }
349 } else
350 color(v);
351 }
352 }
353 }
354 }
355
356 void ra_init::color_bs_constraint(ra_constraint* c) {
357 vvec &vv = c->values;
358 assert(vv.size() <= 8);
359
360 RA_DUMP(
361 cerr << "color_bs_constraint: ";
362 dump::dump_vec(vv);
363 cerr << "\n";
364 );
365
366 regbits rb(ctx.alu_temp_gprs);
367
368 unsigned chan_count[4] = {};
369 unsigned allowed_chans = 0x0F;
370
371 for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
372 value *v = *I;
373 sel_chan gpr = v->get_final_gpr();
374
375 if (!v || v->is_dead())
376 continue;
377
378 val_set interf;
379
380 if (v->chunk)
381 sh.coal.get_chunk_interferences(v->chunk, interf);
382 else
383 interf = v->interferences;
384
385 RA_DUMP(
386 cerr << " processing " << *v << " interferences : ";
387 dump::dump_set(sh, interf);
388 cerr << "\n";
389 );
390
391 if (gpr) {
392 unsigned chan = gpr.chan();
393 if (chan_count[chan] < 3) {
394 ++chan_count[chan];
395 continue;
396 } else {
397 v->flags &= ~VLF_FIXED;
398 allowed_chans &= ~(1 << chan);
399 assert(allowed_chans);
400 }
401 }
402
403 v->gpr = 0;
404
405 gpr = 1;
406 rb.set_all(1);
407
408
409 rb.from_val_set(sh, interf);
410
411 RA_DUMP(
412 cerr << " regbits : ";
413 rb.dump();
414 cerr << "\n";
415 );
416
417 while (allowed_chans && gpr.sel() < sh.num_nontemp_gpr()) {
418
419 while (rb.get(gpr - 1) == 0)
420 gpr = gpr + 1;
421
422 RA_DUMP(
423 cerr << " trying " << gpr << "\n";
424 );
425
426 unsigned chan = gpr.chan();
427 if (chan_count[chan] < 3) {
428 ++chan_count[chan];
429
430 if (v->chunk) {
431 vvec::iterator F = std::find(v->chunk->values.begin(),
432 v->chunk->values.end(),
433 v);
434 v->chunk->values.erase(F);
435 v->chunk = NULL;
436 }
437
438 assign_color(v, gpr);
439 break;
440 } else {
441 allowed_chans &= ~(1 << chan);
442 }
443 gpr = gpr + 1;
444 }
445
446 if (!gpr) {
447 cerr << "color_bs_constraint: failed...\n";
448 assert(!"coloring failed");
449 }
450 }
451 }
452
453 void ra_init::color(value* v) {
454
455 if (v->constraint && v->constraint->kind == CK_PACKED_BS) {
456 color_bs_constraint(v->constraint);
457 return;
458 }
459
460 if (v->chunk && v->chunk->is_fixed())
461 return;
462
463 RA_DUMP(
464 cerr << "coloring ";
465 dump::dump_val(v);
466 cerr << " interferences ";
467 dump::dump_set(sh, v->interferences);
468 cerr << "\n";
469 );
470
471 if (v->is_reg_pinned()) {
472 assert(v->is_chan_pinned());
473 assign_color(v, v->pin_gpr);
474 return;
475 }
476
477 regbits rb(sh, v->interferences);
478 sel_chan c;
479
480 if (v->is_chan_pinned()) {
481 RA_DUMP( cerr << "chan_pinned = " << v->pin_gpr.chan() << " "; );
482 unsigned mask = 1 << v->pin_gpr.chan();
483 c = rb.find_free_chans(mask) + v->pin_gpr.chan();
484 } else {
485 c = rb.find_free_bit(0);
486 }
487
488 assert(c && c.sel() < 128 - ctx.alu_temp_gprs && "color failed");
489 assign_color(v, c);
490 }
491
492 void ra_init::assign_color(value* v, sel_chan c) {
493 v->gpr = c;
494 RA_DUMP(
495 cerr << "colored ";
496 dump::dump_val(v);
497 cerr << " to " << c << "\n";
498 );
499 }
500
501 // ===================================================
502
503 int ra_split::run() {
504 split(sh.root);
505 return 0;
506 }
507
508 void ra_split::split_phi_src(container_node *loc, container_node *c,
509 unsigned id, bool loop) {
510 for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
511 node *p = *I;
512 value* &v = p->src[id], *d = p->dst[0];
513 assert(v);
514
515 if (!d->is_sgpr() || v->is_undef())
516 continue;
517
518 value *t = sh.create_temp_value();
519 if (loop && id == 0)
520 loc->insert_before(sh.create_copy_mov(t, v));
521 else
522 loc->push_back(sh.create_copy_mov(t, v));
523 v = t;
524
525 sh.coal.add_edge(v, d, coalescer::phi_cost);
526 }
527 }
528
529 void ra_split::split_phi_dst(node* loc, container_node *c, bool loop) {
530 for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
531 node *p = *I;
532 value* &v = p->dst[0];
533 assert(v);
534
535 if (!v->is_sgpr())
536 continue;
537
538 value *t = sh.create_temp_value();
539 node *cp = sh.create_copy_mov(v, t);
540 if (loop)
541 static_cast<container_node*>(loc)->push_front(cp);
542 else
543 loc->insert_after(cp);
544 v = t;
545 }
546 }
547
548
549 void ra_split::init_phi_constraints(container_node *c) {
550 for (node_iterator I = c->begin(), E = c->end(); I != E; ++I) {
551 node *p = *I;
552 ra_constraint *cc = sh.coal.create_constraint(CK_PHI);
553 cc->values.push_back(p->dst[0]);
554
555 for (vvec::iterator I = p->src.begin(), E = p->src.end(); I != E; ++I) {
556 value *v = *I;
557 if (v->is_sgpr())
558 cc->values.push_back(v);
559 }
560
561 cc->update_values();
562 }
563 }
564
565 void ra_split::split(container_node* n) {
566
567 if (n->type == NT_DEPART) {
568 depart_node *d = static_cast<depart_node*>(n);
569 if (d->target->phi)
570 split_phi_src(d, d->target->phi, d->dep_id, false);
571 } else if (n->type == NT_REPEAT) {
572 repeat_node *r = static_cast<repeat_node*>(n);
573 if (r->target->loop_phi)
574 split_phi_src(r, r->target->loop_phi, r->rep_id, true);
575 } else if (n->type == NT_REGION) {
576 region_node *r = static_cast<region_node*>(n);
577 if (r->phi) {
578 split_phi_dst(r, r->phi, false);
579 }
580 if (r->loop_phi) {
581 split_phi_dst(r->get_entry_code_location(), r->loop_phi,
582 true);
583 split_phi_src(r, r->loop_phi, 0, true);
584 }
585 }
586
587 for (node_riterator N, I = n->rbegin(), E = n->rend(); I != E; I = N) {
588 N = I;
589 ++N;
590 node *o = *I;
591 if (o->type == NT_OP) {
592 split_op(o);
593 } else if (o->is_container()) {
594 split(static_cast<container_node*>(o));
595 }
596 }
597
598 if (n->type == NT_REGION) {
599 region_node *r = static_cast<region_node*>(n);
600 if (r->phi)
601 init_phi_constraints(r->phi);
602 if (r->loop_phi)
603 init_phi_constraints(r->loop_phi);
604 }
605 }
606
607 void ra_split::split_op(node* n) {
608 switch(n->subtype) {
609 case NST_ALU_PACKED_INST:
610 split_alu_packed(static_cast<alu_packed_node*>(n));
611 break;
612 case NST_FETCH_INST:
613 case NST_CF_INST:
614 split_vector_inst(n);
615 default:
616 break;
617 }
618 }
619
620 void ra_split::split_packed_ins(alu_packed_node *n) {
621 vvec vv = n->src;
622 vvec sv, dv;
623
624 for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
625
626 value *&v = *I;
627
628 if (v && v->is_any_gpr() && !v->is_undef()) {
629
630 vvec::iterator F = std::find(sv.begin(), sv.end(), v);
631 value *t;
632
633 if (F != sv.end()) {
634 t = *(dv.begin() + (F - sv.begin()));
635 } else {
636 t = sh.create_temp_value();
637 sv.push_back(v);
638 dv.push_back(t);
639 }
640 v = t;
641 }
642 }
643
644 unsigned cnt = sv.size();
645
646 if (cnt > 0) {
647 n->src = vv;
648 for (vvec::iterator SI = sv.begin(), DI = dv.begin(), SE = sv.end();
649 SI != SE; ++SI, ++DI) {
650 n->insert_before(sh.create_copy_mov(*DI, *SI));
651 }
652
653 ra_constraint *c = sh.coal.create_constraint(CK_PACKED_BS);
654 c->values = dv;
655 c->update_values();
656 }
657 }
658
659 // TODO handle other packed ops for cayman
660 void ra_split::split_alu_packed(alu_packed_node* n) {
661 switch (n->op()) {
662 case ALU_OP2_DOT4:
663 case ALU_OP2_CUBE:
664 split_packed_ins(n);
665 break;
666 default:
667 break;
668 }
669 }
670
671 void ra_split::split_vec(vvec &vv, vvec &v1, vvec &v2, bool allow_swz) {
672 unsigned ch = 0;
673 for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I, ++ch) {
674
675 value* &o = *I;
676
677 if (o) {
678
679 assert(!o->is_dead());
680
681 if (o->is_undef())
682 continue;
683
684 if (allow_swz && o->is_float_0_or_1())
685 continue;
686
687 value *t;
688 vvec::iterator F =
689 allow_swz ? find(v2.begin(), v2.end(), o) : v2.end();
690
691 if (F != v2.end()) {
692 t = *(v1.begin() + (F - v2.begin()));
693 } else {
694 t = sh.create_temp_value();
695
696 if (!allow_swz) {
697 t->flags |= VLF_PIN_CHAN;
698 t->pin_gpr = sel_chan(0, ch);
699 }
700
701 v2.push_back(o);
702 v1.push_back(t);
703 }
704 o = t;
705 }
706 }
707 }
708
709 void ra_split::split_vector_inst(node* n) {
710 ra_constraint *c;
711
712 bool call_fs = n->is_cf_op(CF_OP_CALL_FS);
713 bool no_src_swizzle = n->is_cf_inst() && (n->cf_op_flags() & CF_MEM);
714
715 no_src_swizzle |= n->is_fetch_op(FETCH_OP_VFETCH) ||
716 n->is_fetch_op(FETCH_OP_SEMFETCH);
717
718 if (!n->src.empty() && !call_fs) {
719
720 // we may have more than one source vector -
721 // fetch instructions with FF_USEGRAD have gradient values in
722 // src vectors 1 (src[4-7] and 2 (src[8-11])
723
724 unsigned nvec = n->src.size() >> 2;
725 assert(nvec << 2 == n->src.size());
726
727 for (unsigned nv = 0; nv < nvec; ++nv) {
728 vvec sv, tv, nsrc(4);
729 unsigned arg_start = nv << 2;
730
731 std::copy(n->src.begin() + arg_start,
732 n->src.begin() + arg_start + 4,
733 nsrc.begin());
734
735 split_vec(nsrc, tv, sv, !no_src_swizzle);
736
737 unsigned cnt = sv.size();
738
739 if (no_src_swizzle || cnt) {
740
741 std::copy(nsrc.begin(), nsrc.end(), n->src.begin() + arg_start);
742
743 for(unsigned i = 0, s = tv.size(); i < s; ++i) {
744 n->insert_before(sh.create_copy_mov(tv[i], sv[i]));
745 }
746
747 c = sh.coal.create_constraint(CK_SAME_REG);
748 c->values = tv;
749 c->update_values();
750 }
751 }
752 }
753
754 if (!n->dst.empty()) {
755 vvec sv, tv, ndst = n->dst;
756
757 split_vec(ndst, tv, sv, true);
758
759 if (sv.size()) {
760 n->dst = ndst;
761
762 node *lp = n;
763 for(unsigned i = 0, s = tv.size(); i < s; ++i) {
764 lp->insert_after(sh.create_copy_mov(sv[i], tv[i]));
765 lp = lp->next;
766 }
767
768 if (call_fs) {
769 for (unsigned i = 0, cnt = tv.size(); i < cnt; ++i) {
770 value *v = tv[i];
771 value *s = sv[i];
772 if (!v)
773 continue;
774
775 v->flags |= VLF_PIN_REG | VLF_PIN_CHAN;
776 s->flags &= ~(VLF_PIN_REG | VLF_PIN_CHAN);
777 sel_chan sel;
778
779 if (s->is_rel()) {
780 assert(s->rel->is_const());
781 sel = sel_chan(s->select.sel() +
782 s->rel->get_const_value().u,
783 s->select.chan());
784 } else
785 sel = s->select;
786
787 v->gpr = v->pin_gpr = sel;
788 v->fix();
789 }
790 } else {
791 c = sh.coal.create_constraint(CK_SAME_REG);
792 c->values = tv;
793 c->update_values();
794 }
795 }
796 }
797 }
798
799 } // namespace r600_sb