r600/sb: fix a bug emitting ar load from a constant.
[mesa.git] / src / gallium / drivers / r600 / sb / sb_sched.cpp
index c98b8fff764b3734d488925f5a924a5690dbc447..4158317765f0fd059a4a33a1e071a1cdfdcdffcf 100644 (file)
@@ -36,6 +36,7 @@
 #include "sb_shader.h"
 #include "sb_pass.h"
 #include "sb_sched.h"
+#include "eg_sq.h" // V_SQ_CF_INDEX_NONE/0/1
 
 namespace r600_sb {
 
@@ -710,22 +711,24 @@ void alu_group_tracker::update_flags(alu_node* n) {
 }
 
 int post_scheduler::run() {
-       run_on(sh.root);
-       return 0;
+       return run_on(sh.root) ? 0 : 1; // map run_on()'s bool result to a status code: 0 when it returns true, 1 otherwise
 }
 
-void post_scheduler::run_on(container_node* n) {
-
+bool post_scheduler::run_on(container_node* n) { // walks n's children in reverse; returns false as soon as one nested step fails
+       bool r = true; // result of the most recent schedule_bb()/run_on() call; true when no container was visited
        for (node_riterator I = n->rbegin(), E = n->rend(); I != E; ++I) {
                if (I->is_container()) {
                        if (I->subtype == NST_BB) {
                                bb_node* bb = static_cast<bb_node*>(*I);
-                               schedule_bb(bb);
+                               r = schedule_bb(bb);
                        } else {
-                               run_on(static_cast<container_node*>(*I));
+                               r = run_on(static_cast<container_node*>(*I));
                        }
+                       if (!r) // stop at the first failure so it propagates to the caller
+                               break;
                }
        }
+       return r;
 }
 
 void post_scheduler::init_uc_val(container_node *c, value *v) {
@@ -757,7 +760,7 @@ unsigned post_scheduler::init_ucm(container_node *c, node *n) {
        return F == ucm.end() ? 0 : F->second;
 }
 
-void post_scheduler::schedule_bb(bb_node* bb) {
+bool post_scheduler::schedule_bb(bb_node* bb) {
        PSC_DUMP(
                sblog << "scheduling BB " << bb->id << "\n";
                if (!pending.empty())
@@ -781,17 +784,27 @@ void post_scheduler::schedule_bb(bb_node* bb) {
                        sblog << "\n";
                );
 
-               if (n->subtype == NST_ALU_CLAUSE) {
+               // May require emitting ALU ops to load index registers
+               if (n->is_fetch_clause()) {
                        n->remove();
-                       process_alu(static_cast<container_node*>(n));
+                       process_fetch(static_cast<container_node *>(n));
                        continue;
                }
 
+               if (n->is_alu_clause()) {
+                       n->remove();
+                       bool r = process_alu(static_cast<container_node*>(n));
+                       if (r)
+                               continue;
+                       return false;
+               }
+
                n->remove();
                bb->push_front(n);
        }
 
        this->cur_bb = NULL;
+       return true;
 }
 
 void post_scheduler::init_regmap() {
@@ -823,11 +836,113 @@ void post_scheduler::init_regmap() {
        }
 }
 
-void post_scheduler::process_alu(container_node *c) {
+// Build the SET_CF_IDX0/SET_CF_IDX1 ALU op (slot X) selected by ar_idx.
+static alu_node *create_set_idx(shader &sh, unsigned ar_idx) {
+       alu_node *set_idx = sh.create_alu();
+
+       assert(ar_idx == V_SQ_CF_INDEX_0 || ar_idx == V_SQ_CF_INDEX_1);
+       const bool use_idx0 = (ar_idx == V_SQ_CF_INDEX_0);
+
+       set_idx->bc.set_op(use_idx0 ? ALU_OP0_SET_CF_IDX0 : ALU_OP0_SET_CF_IDX1);
+       set_idx->bc.slot = SLOT_X;
+       set_idx->dst.resize(1); // dummy destination, needed for recolor
+
+       PSC_DUMP(
+               sblog << "created IDX load: ";
+               dump::dump_op(set_idx);
+               sblog << "\n";
+       );
+
+       return set_idx;
+}
+
+void post_scheduler::load_index_register(value *v, unsigned ar_idx)
+{ // Emits an ALU clause into cur_bb that loads the CF index register selected by ar_idx (compared against V_SQ_CF_INDEX_*) from v
+       alu.reset();
+
+       if (!sh.get_ctx().is_cayman()) {
+               // Evergreen has to first load address register, then use SET_CF_IDX0/1
+               alu_group_tracker &rt = alu.grp();
+               alu_node *set_idx = create_set_idx(sh, ar_idx);
+               if (!rt.try_reserve(set_idx)) { // a failed reservation is only logged; emission continues regardless
+                       sblog << "can't emit SET_CF_IDX";
+                       dump::dump_op(set_idx);
+                       sblog << "\n";
+               }
+               process_group();
+
+               if (!alu.check_clause_limits()) {
+                       // Can't happen since clause only contains MOVA/SET_CF_IDX0/1
+               }
+               alu.emit_group(); // NOTE(review): this group is emitted before the AR load below; presumably groups are emitted in reverse execution order — confirm
+       }
 
+       alu_group_tracker &rt = alu.grp();
+       alu_node *a = alu.create_ar_load(v, ar_idx == V_SQ_CF_INDEX_1 ? SEL_Z : SEL_Y); // SEL_Z for V_SQ_CF_INDEX_1, SEL_Y otherwise
+
+       if (!rt.try_reserve(a)) { // a failed reservation is only logged; emission continues regardless
+               sblog << "can't emit AR load : ";
+               dump::dump_op(a);
+               sblog << "\n";
+       }
+
+       process_group();
+
+       if (!alu.check_clause_limits()) {
+               // Can't happen since clause only contains MOVA/SET_CF_IDX0/1
+       }
+
+       alu.emit_group();
+       alu.emit_clause(cur_bb); // the index-load clause goes into the block currently being scheduled
+}
+
+void post_scheduler::process_fetch(container_node *c) { // re-inserts fetch clause c into cur_bb, plus an index-register load clause when a fetch uses indexing; an empty c is not re-inserted
        if (c->empty())
                return;
 
+       for (node_iterator N, I = c->begin(), E = c->end(); I != E; I = N) {
+               N = I;
+               ++N; // successor computed up front; the loop advances with I = N
+
+               node *n = *I;
+
+               fetch_node *f = static_cast<fetch_node*>(n); // NOTE(review): unchecked downcast — assumes every child of this clause is a fetch_node
+
+               PSC_DUMP(
+                       sblog << "process_tex ";
+                       dump::dump_op(n);
+                       sblog << "  ";
+               );
+
+               // TODO: If same values used can avoid reloading index register
+               if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ||
+                       f->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) {
+                       unsigned index_mode = f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ?
+                               f->bc.sampler_index_mode : f->bc.resource_index_mode; // the sampler mode wins when both are set
+
+                       // Currently require prior opt passes to use one TEX per indexed op
+                       assert(f->parent->count() == 1);
+
+                       value *v = f->src.back(); // Last src is index offset
+                       assert(v);
+
+                       cur_bb->push_front(c); // put the fetch clause back at the front of the current block
+
+                       load_index_register(v, index_mode); // emit the clause that loads the index register from v
+                       f->src.pop_back(); // Don't need index value any more
+
+                       return; // only the first indexed fetch is handled, per the single-op assert above
+               }
+       }
+
+       cur_bb->push_front(c); // no indexed fetch found: re-insert the clause unchanged
+}
+
+bool post_scheduler::process_alu(container_node *c) {
+
+       if (c->empty())
+               return true;
+
        ucm.clear();
        alu.reset();
 
@@ -855,6 +970,7 @@ void post_scheduler::process_alu(container_node *c) {
 
                if (uc) {
                        n->remove();
+
                        pending.push_back(n);
                        PSC_DUMP( sblog << "pending\n"; );
                } else {
@@ -862,7 +978,7 @@ void post_scheduler::process_alu(container_node *c) {
                }
        }
 
-       schedule_alu(c);
+       return schedule_alu(c);
 }
 
 void post_scheduler::update_local_interferences() {
@@ -997,26 +1113,58 @@ void post_scheduler::init_globals(val_set &s, bool prealloc) {
        }
 }
 
+// Emit a load for each of the two CF index registers that has a pending value.
+void post_scheduler::emit_index_registers() {
+       for (unsigned idx = 0; idx != 2; ++idx) {
+               if (!alu.current_idx[idx])
+                       continue; // nothing pending for this index register
+               regmap = prev_regmap;
+               alu.discard_current_group();
+               load_index_register(alu.current_idx[idx], KC_INDEX_0 + idx);
+               alu.current_idx[idx] = NULL;
+       }
+}
+
 void post_scheduler::emit_clause() {
 
        if (alu.current_ar) {
                emit_load_ar();
                process_group();
+               if (!alu.check_clause_limits()) { // return value deliberately ignored (empty body)
+                       // NOTE(review): "Can't happen since clause only contains MOVA/CF_SET_IDX0/1" — confirm this also holds for a full ALU clause here
+               }
                alu.emit_group();
        }
 
-       alu.emit_clause(cur_bb);
+       if (!alu.is_empty()) { // only emit when the tracker reports content
+               alu.emit_clause(cur_bb);
+       }
+
+       emit_index_registers(); // then emit loads for any pending IDX0/IDX1 values
 }
 
-void post_scheduler::schedule_alu(container_node *c) {
+bool post_scheduler::schedule_alu(container_node *c) {
 
        assert(!ready.empty() || !ready_copies.empty());
 
-       while (1) {
-
+       bool improving = true;
+       int last_pending = pending.count();
+       while (improving) {
                prev_regmap = regmap;
-
                if (!prepare_alu_group()) {
+
+                       int new_pending = pending.count();
+                       improving = (new_pending < last_pending) || (last_pending == 0);
+                       last_pending = new_pending;
+
+                       if (alu.current_idx[0] || alu.current_idx[1]) {
+                               regmap = prev_regmap;
+                               emit_clause();
+                               init_globals(live, false);
+
+                               continue;
+                       }
+
                        if (alu.current_ar) {
                                emit_load_ar();
                                continue;
@@ -1028,6 +1176,7 @@ void post_scheduler::schedule_alu(container_node *c) {
                        regmap = prev_regmap;
                        emit_clause();
                        init_globals(live, false);
+
                        continue;
                }
 
@@ -1050,6 +1199,7 @@ void post_scheduler::schedule_alu(container_node *c) {
                dump::dump_op_list(&pending);
                assert(!"unscheduled pending instructions");
        }
+       return improving;
 }
 
 void post_scheduler::add_interferences(value *v, sb_bitset &rb, val_set &vs) {
@@ -1180,7 +1330,7 @@ void post_scheduler::emit_load_ar() {
        alu.discard_current_group();
 
        alu_group_tracker &rt = alu.grp();
-       alu_node *a = alu.create_ar_load();
+       alu_node *a = alu.create_ar_load(alu.current_ar, SEL_X);
 
        if (!rt.try_reserve(a)) {
                sblog << "can't emit AR load : ";
@@ -1287,6 +1437,42 @@ bool post_scheduler::map_src_val(value *v) {
 }
 
 bool post_scheduler::map_src_vec(vvec &vv, bool src) {
+       if (src) {
+               // Handle possible UBO indexing
+               bool ubo_indexing[2] = { false, false };
+               for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
+                       value *v = *I;
+                       if (!v)
+                               continue;
+
+                       if (v->is_kcache()) {
+                               unsigned index_mode = v->select.kcache_index_mode();
+                               if (index_mode == KC_INDEX_0 || index_mode == KC_INDEX_1) {
+                                       ubo_indexing[index_mode - KC_INDEX_0] = true;
+                               }
+                       }
+               }
+
+               // idx values stored at end of src vec, see bc_parser::prepare_alu_group
+               for (unsigned i = 2; i != 0; i--) {
+                       if (ubo_indexing[i-1]) {
+                               // TODO: skip adding value to kcache reservation somehow, causes
+                               // unnecessary group breaks and cache line locks
+                               value *v = vv.back();
+                               if (alu.current_idx[i-1] && alu.current_idx[i-1] != v) {
+                                       PSC_DUMP(
+                                               sblog << "IDX" << i-1 << " already set to " <<
+                                               *alu.current_idx[i-1] << ", trying to set " << *v << "\n";
+                                       );
+                                       return false;
+                               }
+
+                               alu.current_idx[i-1] = v;
+                               PSC_DUMP(sblog << "IDX" << i-1 << " set to " << *v << "\n";);
+                       }
+               }
+       }
+
        for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) {
                value *v = *I;
                if (!v)
@@ -1352,6 +1538,10 @@ void post_scheduler::dump_regmap() {
                sblog << "    current_AR: " << *alu.current_ar << "\n";
        if (alu.current_pr)
                sblog << "    current_PR: " << *alu.current_pr << "\n";
+       if (alu.current_idx[0])
+               sblog << "    current IDX0: " << *alu.current_idx[0] << "\n";
+       if (alu.current_idx[1])
+               sblog << "    current IDX1: " << *alu.current_idx[1] << "\n";
 }
 
 void post_scheduler::recolor_locals() {
@@ -1441,6 +1631,13 @@ unsigned post_scheduler::try_add_instruction(node *n) {
 
        unsigned avail_slots = rt.avail_slots();
 
+       // Cannot schedule in same clause as instructions using this index value
+       if (!n->dst.empty() && n->dst[0] &&
+               (n->dst[0] == alu.current_idx[0] || n->dst[0] == alu.current_idx[1])) {
+               PSC_DUMP(sblog << "   CF_IDX source: " << *n->dst[0] << "\n";);
+               return 0;
+       }
+
        if (n->is_alu_packed()) {
                alu_packed_node *p = static_cast<alu_packed_node*>(n);
                unsigned slots = p->get_slot_mask();
@@ -1770,7 +1967,7 @@ alu_clause_tracker::alu_clause_tracker(shader &sh)
          grp0(sh), grp1(sh),
          group(), clause(),
          push_exec_mask(),
-         current_ar(), current_pr() {}
+         current_ar(), current_pr(), current_idx() {}
 
 void alu_clause_tracker::emit_group() {
 
@@ -1827,6 +2024,8 @@ bool alu_clause_tracker::check_clause_limits() {
 
        // reserving slots to load AR and PR values
        unsigned reserve_slots = (current_ar ? 1 : 0) + (current_pr ? 1 : 0);
+       // ...and index registers
+       reserve_slots += (current_idx[0] != NULL) + (current_idx[1] != NULL);
 
        if (slot_count + slots > MAX_ALU_SLOTS - reserve_slots)
                return false;
@@ -1892,13 +2091,15 @@ unsigned rp_kcache_tracker::get_lines(kc_lines& lines) {
        unsigned cnt = 0;
 
        for (unsigned i = 0; i < sel_count; ++i) {
-               unsigned line = rp[i];
+               unsigned line = rp[i] & 0x1fffffffu;
+               unsigned index_mode = rp[i] >> 29;
 
                if (!line)
                        return cnt;
 
                --line;
                line = (sel_count == 2) ? line >> 5 : line >> 6;
+               line |= index_mode << 29;
 
                if (lines.insert(line).second)
                        ++cnt;
@@ -1913,14 +2114,18 @@ bool alu_kcache_tracker::update_kc() {
        memcpy(old_kc, kc, sizeof(kc));
 
        for (kc_lines::iterator I = lines.begin(), E = lines.end(); I != E; ++I) {
-               unsigned line = *I;
+               unsigned index_mode = *I >> 29;
+               unsigned line = *I & 0x1fffffffu;
                unsigned bank = line >> 8;
 
+               assert(index_mode <= KC_INDEX_INVALID);
                line &= 0xFF;
 
-               if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line))
-                       ++kc[c-1].mode;
-               else {
+               if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line) &&
+                       kc[c-1].index_mode == index_mode)
+               {
+                       kc[c-1].mode = KC_LOCK_2;
+               } else {
                        if (c == max_kcs) {
                                memcpy(kc, old_kc, sizeof(kc));
                                return false;
@@ -1930,17 +2135,16 @@ bool alu_kcache_tracker::update_kc() {
 
                        kc[c].bank = bank;
                        kc[c].addr = line;
+                       kc[c].index_mode = index_mode;
                        ++c;
                }
        }
        return true;
 }
 
-alu_node* alu_clause_tracker::create_ar_load() {
+alu_node* alu_clause_tracker::create_ar_load(value *v, chan_select ar_channel) {
        alu_node *a = sh.create_alu();
 
-       // FIXME use MOVA_GPR on R6xx
-
        if (sh.get_ctx().uses_mova_gpr) {
                a->bc.set_op(ALU_OP1_MOVA_GPR_INT);
                a->bc.slot = SLOT_TRANS;
@@ -1948,9 +2152,13 @@ alu_node* alu_clause_tracker::create_ar_load() {
                a->bc.set_op(ALU_OP1_MOVA_INT);
                a->bc.slot = SLOT_X;
        }
+       a->bc.dst_chan = ar_channel;
+       if (ar_channel != SEL_X && sh.get_ctx().is_cayman()) {
+               a->bc.dst_gpr = ar_channel == SEL_Y ? CM_V_SQ_MOVA_DST_CF_IDX0 : CM_V_SQ_MOVA_DST_CF_IDX1;
+       }
 
        a->dst.resize(1);
-       a->src.push_back(current_ar);
+       a->src.push_back(v);
 
        PSC_DUMP(
                sblog << "created AR load: ";