r600/sb: add finalising for lds output queue special values.
[mesa.git] / src / gallium / drivers / r600 / sb / sb_pass.h
1 /*
2 * Copyright 2013 Vadim Girlin <vadimgirlin@gmail.com>
3 *
4 * Permission is hereby granted, free of charge, to any person obtaining a
5 * copy of this software and associated documentation files (the "Software"),
6 * to deal in the Software without restriction, including without limitation
7 * on the rights to use, copy, modify, merge, publish, distribute, sub
8 * license, and/or sell copies of the Software, and to permit persons to whom
9 * the Software is furnished to do so, subject to the following conditions:
10 *
11 * The above copyright notice and this permission notice (including the next
12 * paragraph) shall be included in all copies or substantial portions of the
13 * Software.
14 *
15 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
18 * THE AUTHOR(S) AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
19 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
20 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
21 * USE OR OTHER DEALINGS IN THE SOFTWARE.
22 *
23 * Authors:
24 * Vadim Girlin
25 */
26
27 #ifndef SB_PASS_H_
28 #define SB_PASS_H_
29
30 #include <stack>
31
32 namespace r600_sb {
33
34 class pass {
35 protected:
36 sb_context &ctx;
37 shader &sh;
38
39 public:
40 pass(shader &s);
41
42 virtual int run();
43
44 virtual ~pass() {}
45 };
46
47 class vpass : public pass {
48
49 public:
50
51 vpass(shader &s) : pass(s) {}
52
53 virtual int init();
54 virtual int done();
55
56 virtual int run();
57 virtual void run_on(container_node &n);
58
59 virtual bool visit(node &n, bool enter);
60 virtual bool visit(container_node &n, bool enter);
61 virtual bool visit(alu_group_node &n, bool enter);
62 virtual bool visit(cf_node &n, bool enter);
63 virtual bool visit(alu_node &n, bool enter);
64 virtual bool visit(alu_packed_node &n, bool enter);
65 virtual bool visit(fetch_node &n, bool enter);
66 virtual bool visit(region_node &n, bool enter);
67 virtual bool visit(repeat_node &n, bool enter);
68 virtual bool visit(depart_node &n, bool enter);
69 virtual bool visit(if_node &n, bool enter);
70 virtual bool visit(bb_node &n, bool enter);
71
72 };
73
74 class rev_vpass : public vpass {
75
76 public:
77 rev_vpass(shader &s) : vpass(s) {}
78
79 virtual void run_on(container_node &n);
80 };
81
82
83 // =================== PASSES
84
85 class bytecode;
86
87 class bc_dump : public vpass {
88 using vpass::visit;
89
90 uint32_t *bc_data;
91 unsigned ndw;
92
93 unsigned id;
94
95 unsigned new_group, group_index;
96
97 public:
98
99 bc_dump(shader &s, bytecode *bc = NULL);
100
101 bc_dump(shader &s, uint32_t *bc_ptr, unsigned ndw) :
102 vpass(s), bc_data(bc_ptr), ndw(ndw), id(), new_group(), group_index() {}
103
104 virtual int init();
105 virtual int done();
106
107 virtual bool visit(cf_node &n, bool enter);
108 virtual bool visit(alu_node &n, bool enter);
109 virtual bool visit(fetch_node &n, bool enter);
110
111 void dump_dw(unsigned dw_id, unsigned count = 2);
112
113 void dump(cf_node& n);
114 void dump(alu_node& n);
115 void dump(fetch_node& n);
116 };
117
118
119 class dce_cleanup : public vpass {
120 using vpass::visit;
121
122 bool remove_unused;
123
124 public:
125
126 dce_cleanup(shader &s) : vpass(s),
127 remove_unused(s.dce_flags & DF_REMOVE_UNUSED), nodes_changed(false) {}
128
129 virtual int run();
130
131 virtual bool visit(node &n, bool enter);
132 virtual bool visit(alu_group_node &n, bool enter);
133 virtual bool visit(cf_node &n, bool enter);
134 virtual bool visit(alu_node &n, bool enter);
135 virtual bool visit(alu_packed_node &n, bool enter);
136 virtual bool visit(fetch_node &n, bool enter);
137 virtual bool visit(region_node &n, bool enter);
138 virtual bool visit(container_node &n, bool enter);
139
140 private:
141
142 void cleanup_dst(node &n);
143 bool cleanup_dst_vec(vvec &vv);
144
145 // Did we alter/remove nodes during a single pass?
146 bool nodes_changed;
147 };
148
149
150 class def_use : public pass {
151
152 public:
153
154 def_use(shader &sh) : pass(sh) {}
155
156 virtual int run();
157 void run_on(node *n, bool defs);
158
159 private:
160
161 void process_uses(node *n);
162 void process_defs(node *n, vvec &vv, bool arr_def);
163 void process_phi(container_node *c, bool defs, bool uses);
164 };
165
166
167
168 class dump : public vpass {
169 using vpass::visit;
170
171 int level;
172
173 public:
174
175 dump(shader &s) : vpass(s), level(0) {}
176
177 virtual bool visit(node &n, bool enter);
178 virtual bool visit(container_node &n, bool enter);
179 virtual bool visit(alu_group_node &n, bool enter);
180 virtual bool visit(cf_node &n, bool enter);
181 virtual bool visit(alu_node &n, bool enter);
182 virtual bool visit(alu_packed_node &n, bool enter);
183 virtual bool visit(fetch_node &n, bool enter);
184 virtual bool visit(region_node &n, bool enter);
185 virtual bool visit(repeat_node &n, bool enter);
186 virtual bool visit(depart_node &n, bool enter);
187 virtual bool visit(if_node &n, bool enter);
188 virtual bool visit(bb_node &n, bool enter);
189
190
191 static void dump_op(node &n, const char *name);
192 static void dump_vec(const vvec & vv);
193 static void dump_set(shader &sh, val_set & v);
194
195 static void dump_rels(vvec & vv);
196
197 static void dump_val(value *v);
198 static void dump_op(node *n);
199
200 static void dump_op_list(container_node *c);
201 static void dump_queue(sched_queue &q);
202
203 static void dump_alu(alu_node *n);
204
205 private:
206
207 void indent();
208
209 void dump_common(node &n);
210 void dump_flags(node &n);
211
212 void dump_live_values(container_node &n, bool before);
213 };
214
215
216 // Global Code Motion
217
218 class gcm : public pass {
219
220 sched_queue bu_ready[SQ_NUM];
221 sched_queue bu_ready_next[SQ_NUM];
222 sched_queue bu_ready_early[SQ_NUM];
223 sched_queue ready;
224 sched_queue ready_above;
225
226 container_node pending;
227
228 struct op_info {
229 bb_node* top_bb;
230 bb_node* bottom_bb;
231 op_info() : top_bb(), bottom_bb() {}
232 };
233
234 typedef std::map<node*, op_info> op_info_map;
235
236 typedef std::map<node*, unsigned> nuc_map;
237
238 op_info_map op_map;
239 nuc_map uses;
240
241 typedef std::vector<nuc_map> nuc_stack;
242
243 nuc_stack nuc_stk;
244 unsigned ucs_level;
245
246 bb_node * bu_bb;
247
248 vvec pending_defs;
249
250 node_list pending_nodes;
251
252 unsigned cur_sq;
253
254 // for register pressure tracking in bottom-up pass
255 val_set live;
256 int live_count;
257
258 static const int rp_threshold = 100;
259
260 bool pending_exec_mask_update;
261
262 public:
263
264 gcm(shader &sh) : pass(sh),
265 bu_ready(), bu_ready_next(), bu_ready_early(),
266 ready(), op_map(), uses(), nuc_stk(1), ucs_level(),
267 bu_bb(), pending_defs(), pending_nodes(), cur_sq(),
268 live(), live_count(), pending_exec_mask_update() {}
269
270 virtual int run();
271
272 private:
273
274 void collect_instructions(container_node *c, bool early_pass);
275
276 void sched_early(container_node *n);
277 void td_sched_bb(bb_node *bb);
278 bool td_is_ready(node *n);
279 void td_release_uses(vvec &v);
280 void td_release_val(value *v);
281 void td_schedule(bb_node *bb, node *n);
282
283 void sched_late(container_node *n);
284 void bu_sched_bb(bb_node *bb);
285 void bu_release_defs(vvec &v, bool src);
286 void bu_release_phi_defs(container_node *p, unsigned op);
287 bool bu_is_ready(node *n);
288 void bu_release_val(value *v);
289 void bu_release_op(node * n);
290 void bu_find_best_bb(node *n, op_info &oi);
291 void bu_schedule(container_node *bb, node *n);
292
293 void push_uc_stack();
294 void pop_uc_stack();
295
296 void init_def_count(nuc_map &m, container_node &s);
297 void init_use_count(nuc_map &m, container_node &s);
298 unsigned get_uc_vec(vvec &vv);
299 unsigned get_dc_vec(vvec &vv, bool src);
300
301 void add_ready(node *n);
302
303 void dump_uc_stack();
304
305 unsigned real_alu_count(sched_queue &q, unsigned max);
306
307 // check if we have not less than threshold ready alu instructions
308 bool check_alu_ready_count(unsigned threshold);
309 };
310
311
312 class gvn : public vpass {
313 using vpass::visit;
314
315 public:
316
317 gvn(shader &sh) : vpass(sh) {}
318
319 virtual bool visit(node &n, bool enter);
320 virtual bool visit(cf_node &n, bool enter);
321 virtual bool visit(alu_node &n, bool enter);
322 virtual bool visit(alu_packed_node &n, bool enter);
323 virtual bool visit(fetch_node &n, bool enter);
324 virtual bool visit(region_node &n, bool enter);
325
326 private:
327
328 void process_op(node &n, bool rewrite = true);
329
330 // returns true if the value was rewritten
331 bool process_src(value* &v, bool rewrite);
332
333
334 void process_alu_src_constants(node &n, value* &v);
335 };
336
337
338 class if_conversion : public pass {
339
340 public:
341
342 if_conversion(shader &sh) : pass(sh) {}
343
344 virtual int run();
345
346 bool run_on(region_node *r);
347
348 void convert_kill_instructions(region_node *r, value *em, bool branch,
349 container_node *c);
350
351 bool check_and_convert(region_node *r);
352
353 alu_node* convert_phi(value *select, node *phi);
354
355 };
356
357
358 class liveness : public rev_vpass {
359 using vpass::visit;
360
361 val_set live;
362 bool live_changed;
363
364 public:
365
366 liveness(shader &s) : rev_vpass(s), live_changed(false) {}
367
368 virtual int init();
369
370 virtual bool visit(node &n, bool enter);
371 virtual bool visit(bb_node &n, bool enter);
372 virtual bool visit(container_node &n, bool enter);
373 virtual bool visit(alu_group_node &n, bool enter);
374 virtual bool visit(cf_node &n, bool enter);
375 virtual bool visit(alu_node &n, bool enter);
376 virtual bool visit(alu_packed_node &n, bool enter);
377 virtual bool visit(fetch_node &n, bool enter);
378 virtual bool visit(region_node &n, bool enter);
379 virtual bool visit(repeat_node &n, bool enter);
380 virtual bool visit(depart_node &n, bool enter);
381 virtual bool visit(if_node &n, bool enter);
382
383 private:
384
385 void update_interferences();
386 void process_op(node &n);
387
388 bool remove_val(value *v);
389 bool remove_vec(vvec &v);
390 bool process_outs(node& n);
391 void process_ins(node& n);
392
393 void process_phi_outs(container_node *phi);
394 void process_phi_branch(container_node *phi, unsigned id);
395
396 bool process_maydef(value *v);
397
398 bool add_vec(vvec &vv, bool src);
399
400 void update_src_vec(vvec &vv, bool src);
401 };
402
403
404 struct bool_op_info {
405 bool invert;
406 unsigned int_cvt;
407
408 alu_node *n;
409 };
410
411 class peephole : public pass {
412
413 public:
414
415 peephole(shader &sh) : pass(sh) {}
416
417 virtual int run();
418
419 void run_on(container_node *c);
420
421 void optimize_cc_op(alu_node *a);
422
423 void optimize_cc_op2(alu_node *a);
424 void optimize_CNDcc_op(alu_node *a);
425
426 bool get_bool_op_info(value *b, bool_op_info& bop);
427 bool get_bool_flt_to_int_source(alu_node* &a);
428 void convert_float_setcc(alu_node *f2i, alu_node *s);
429 };
430
431
432 class psi_ops : public rev_vpass {
433 using rev_vpass::visit;
434
435 public:
436
437 psi_ops(shader &s) : rev_vpass(s) {}
438
439 virtual bool visit(node &n, bool enter);
440 virtual bool visit(alu_node &n, bool enter);
441
442 bool try_inline(node &n);
443 bool try_reduce(node &n);
444 bool eliminate(node &n);
445
446 void unpredicate(node *n);
447 };
448
449
450 // check correctness of the generated code, e.g.:
451 // - expected source operand value is the last value written to its gpr,
452 // - all arguments of phi node should be allocated to the same gpr,
453 // TODO other tests
454 class ra_checker : public pass {
455
456 typedef std::map<sel_chan, value *> reg_value_map;
457
458 typedef std::vector<reg_value_map> regmap_stack;
459
460 regmap_stack rm_stack;
461 unsigned rm_stk_level;
462
463 value* prev_dst[5];
464
465 public:
466
467 ra_checker(shader &sh) : pass(sh), rm_stk_level(0), prev_dst() {}
468
469 virtual int run();
470
471 void run_on(container_node *c);
472
473 void dump_error(const error_info &e);
474 void dump_all_errors();
475
476 private:
477
478 reg_value_map& rmap() { return rm_stack[rm_stk_level]; }
479
480 void push_stack();
481 void pop_stack();
482
483 // when going out of the alu clause, values in the clause temporary gprs,
484 // AR, predicate values, PS/PV are destroyed
485 void kill_alu_only_regs();
486 void error(node *n, unsigned id, std::string msg);
487
488 void check_phi_src(container_node *p, unsigned id);
489 void process_phi_dst(container_node *p);
490 void check_alu_group(alu_group_node *g);
491 void process_op_dst(node *n);
492 void check_op_src(node *n);
493 void check_src_vec(node *n, unsigned id, vvec &vv, bool src);
494 void check_value_gpr(node *n, unsigned id, value *v);
495 };
496
497 // =======================================
498
499
500 class ra_coalesce : public pass {
501
502 public:
503
504 ra_coalesce(shader &sh) : pass(sh) {}
505
506 virtual int run();
507 };
508
509
510
511 // =======================================
512
513 class ra_init : public pass {
514
515 public:
516
517 ra_init(shader &sh) : pass(sh), prev_chans() {
518
519 // The parameter below affects register channels distribution.
520 // For cayman (VLIW-4) we're trying to distribute the channels
521 // uniformly, this means significantly better alu slots utilization
522 // at the expense of higher gpr usage. Hopefully this will improve
523 // performance, though it has to be proven with real benchmarks yet.
524 // For VLIW-5 this method could also slightly improve slots
525 // utilization, but increased register pressure seems more significant
526 // and overall performance effect is negative according to some
527 // benchmarks, so it's not used currently. Basically, VLIW-5 doesn't
528 // really need it because trans slot (unrestricted by register write
529 // channel) allows to consume most deviations from uniform channel
530 // distribution.
531 // Value 3 means that for new allocation we'll use channel that differs
532 // from 3 last used channels. 0 for VLIW-5 effectively turns this off.
533
534 ra_tune = sh.get_ctx().is_cayman() ? 3 : 0;
535 }
536
537 virtual int run();
538
539 private:
540
541 unsigned prev_chans;
542 unsigned ra_tune;
543
544 void add_prev_chan(unsigned chan);
545 unsigned get_preferable_chan_mask();
546
547 void ra_node(container_node *c);
548 void process_op(node *n);
549
550 void color(value *v);
551
552 void color_bs_constraint(ra_constraint *c);
553
554 void assign_color(value *v, sel_chan c);
555 void alloc_arrays();
556 };
557
558 // =======================================
559
560 class ra_split : public pass {
561
562 public:
563
564 ra_split(shader &sh) : pass(sh) {}
565
566 virtual int run();
567
568 void split(container_node *n);
569 void split_op(node *n);
570 void split_alu_packed(alu_packed_node *n);
571 void split_vector_inst(node *n);
572
573 void split_packed_ins(alu_packed_node *n);
574
575 #if 0
576 void split_pinned_outs(node *n);
577 #endif
578
579 void split_vec(vvec &vv, vvec &v1, vvec &v2, bool allow_swz);
580
581 void split_phi_src(container_node *loc, container_node *c, unsigned id,
582 bool loop);
583 void split_phi_dst(node *loc, container_node *c, bool loop);
584 void init_phi_constraints(container_node *c);
585 };
586
587
588
589 class ssa_prepare : public vpass {
590 using vpass::visit;
591
592 typedef std::vector<val_set> vd_stk;
593 vd_stk stk;
594
595 unsigned level;
596
597 public:
598 ssa_prepare(shader &s) : vpass(s), level(0) {}
599
600 virtual bool visit(cf_node &n, bool enter);
601 virtual bool visit(alu_node &n, bool enter);
602 virtual bool visit(fetch_node &n, bool enter);
603 virtual bool visit(region_node &n, bool enter);
604 virtual bool visit(repeat_node &n, bool enter);
605 virtual bool visit(depart_node &n, bool enter);
606
607 private:
608
609 void push_stk() {
610 ++level;
611 if (level + 1 > stk.size())
612 stk.resize(level+1);
613 else
614 stk[level].clear();
615 }
616 void pop_stk() {
617 assert(level);
618 --level;
619 stk[level].add_set(stk[level + 1]);
620 }
621
622 void add_defs(node &n);
623
624 val_set & cur_set() { return stk[level]; }
625
626 container_node* create_phi_nodes(int count);
627 };
628
629 class ssa_rename : public vpass {
630 using vpass::visit;
631
632 typedef sb_map<value*, unsigned> def_map;
633
634 def_map def_count;
635 std::stack<def_map> rename_stack;
636
637 typedef std::map<uint32_t, value*> val_map;
638 val_map values;
639
640 public:
641
642 ssa_rename(shader &s) : vpass(s) {}
643
644 virtual int init();
645
646 virtual bool visit(container_node &n, bool enter);
647 virtual bool visit(node &n, bool enter);
648 virtual bool visit(alu_group_node &n, bool enter);
649 virtual bool visit(cf_node &n, bool enter);
650 virtual bool visit(alu_node &n, bool enter);
651 virtual bool visit(alu_packed_node &n, bool enter);
652 virtual bool visit(fetch_node &n, bool enter);
653 virtual bool visit(region_node &n, bool enter);
654 virtual bool visit(repeat_node &n, bool enter);
655 virtual bool visit(depart_node &n, bool enter);
656 virtual bool visit(if_node &n, bool enter);
657
658 private:
659
660 void push(node *phi);
661 void pop();
662
663 unsigned get_index(def_map& m, value* v);
664 void set_index(def_map& m, value* v, unsigned index);
665 unsigned new_index(def_map& m, value* v);
666
667 value* rename_use(node *n, value* v);
668 value* rename_def(node *def, value* v);
669
670 void rename_src_vec(node *n, vvec &vv, bool src);
671 void rename_dst_vec(node *def, vvec &vv, bool set_def);
672
673 void rename_src(node *n);
674 void rename_dst(node *n);
675
676 void rename_phi_args(container_node *phi, unsigned op, bool def);
677
678 void rename_virt(node *n);
679 void rename_virt_val(node *n, value *v);
680 };
681
682 class bc_finalizer : public pass {
683
684 cf_node *last_export[EXP_TYPE_COUNT];
685 cf_node *last_cf;
686
687 unsigned ngpr;
688 unsigned nstack;
689
690 public:
691
692 bc_finalizer(shader &sh) : pass(sh), last_export(), last_cf(), ngpr(),
693 nstack() {}
694
695 virtual int run();
696
697 void finalize_loop(region_node *r);
698 void finalize_if(region_node *r);
699
700 void run_on(container_node *c);
701
702 void insert_rv6xx_load_ar_workaround(alu_group_node *b4);
703 void finalize_alu_group(alu_group_node *g, node *prev_node);
704 bool finalize_alu_src(alu_group_node *g, alu_node *a, alu_group_node *prev_node);
705
706 void emit_set_grad(fetch_node* f);
707 void finalize_fetch(fetch_node *f);
708
709 void finalize_cf(cf_node *c);
710
711 sel_chan translate_kcache(cf_node *alu, value *v);
712
713 void update_ngpr(unsigned gpr);
714 void update_nstack(region_node *r, unsigned add = 0);
715
716 unsigned get_stack_depth(node *n, unsigned &loops, unsigned &ifs,
717 unsigned add = 0);
718
719 void cf_peephole();
720
721 private:
722 void copy_fetch_src(fetch_node &dst, fetch_node &src, unsigned arg_start);
723 void emit_set_texture_offsets(fetch_node &f);
724 };
725
726
727 } // namespace r600_sb
728
729 #endif /* SB_PASS_H_ */