From 1befb7ed9856381cbfe874f361fae73b8e331bb4 Mon Sep 17 00:00:00 2001 From: Glenn Kennard Date: Wed, 7 Oct 2015 17:17:33 +0200 Subject: [PATCH] r600g/sb: SB support for UBO indexing Signed-off-by: Glenn Kennard Signed-off-by: Dave Airlie --- src/gallium/drivers/r600/r600_shader.c | 6 - src/gallium/drivers/r600/r600_shader.h | 2 - src/gallium/drivers/r600/sb/sb_bc.h | 4 +- .../drivers/r600/sb/sb_bc_finalize.cpp | 6 +- src/gallium/drivers/r600/sb/sb_bc_parser.cpp | 21 +++- src/gallium/drivers/r600/sb/sb_expr.cpp | 3 +- src/gallium/drivers/r600/sb/sb_ir.h | 7 ++ src/gallium/drivers/r600/sb/sb_sched.cpp | 108 ++++++++++++++++-- src/gallium/drivers/r600/sb/sb_sched.h | 4 + src/gallium/drivers/r600/sb/sb_shader.cpp | 4 +- src/gallium/drivers/r600/sb/sb_shader.h | 2 +- 11 files changed, 140 insertions(+), 27 deletions(-) diff --git a/src/gallium/drivers/r600/r600_shader.c b/src/gallium/drivers/r600/r600_shader.c index 24c3d43b0fa..8efe902a329 100644 --- a/src/gallium/drivers/r600/r600_shader.c +++ b/src/gallium/drivers/r600/r600_shader.c @@ -166,8 +166,6 @@ int r600_pipe_shader_create(struct pipe_context *ctx, if (rctx->b.chip_class <= R700) { use_sb &= (shader->shader.processor_type != TGSI_PROCESSOR_GEOMETRY); } - /* disable SB for shaders using ubo array indexing as it doesn't handle those currently */ - use_sb &= !shader->shader.uses_ubo_indexing; /* disable SB for shaders using doubles */ use_sb &= !shader->shader.uses_doubles; @@ -1250,9 +1248,6 @@ static int tgsi_split_constant(struct r600_shader_ctx *ctx) continue; } - if (ctx->src[i].kc_rel) - ctx->shader->uses_ubo_indexing = true; - if (ctx->src[i].rel) { int chan = inst->Src[i].Indirect.Swizzle; int treg = r600_get_temp(ctx); @@ -1936,7 +1931,6 @@ static int r600_shader_from_tgsi(struct r600_context *rctx, ctx.gs_next_vertex = 0; ctx.gs_stream_output_info = &so; - shader->uses_ubo_indexing = false; ctx.face_gpr = -1; ctx.fixed_pt_position_gpr = -1; ctx.fragcoord_input = -1; diff --git a/src/gallium/drivers/r600/r600_shader.h b/src/gallium/drivers/r600/r600_shader.h index 8ba32ae4999..c240e7110c1 100644 --- a/src/gallium/drivers/r600/r600_shader.h +++ b/src/gallium/drivers/r600/r600_shader.h @@ -75,8 +75,6 @@ struct r600_shader { boolean has_txq_cube_array_z_comp; boolean uses_tex_buffers; boolean gs_prim_id_input; - /* Temporarily workaround SB not handling ubo indexing */ - boolean uses_ubo_indexing; /* Size in bytes of a data item in the ring(s) (single vertex data). Stages with only one ring items 123 will be set to 0. */ diff --git a/src/gallium/drivers/r600/sb/sb_bc.h b/src/gallium/drivers/r600/sb/sb_bc.h index 126750d5c7e..9c2a9170436 100644 --- a/src/gallium/drivers/r600/sb/sb_bc.h +++ b/src/gallium/drivers/r600/sb/sb_bc.h @@ -478,7 +478,9 @@ struct bc_cf { bool is_alu_extended() { assert(op_ptr->flags & CF_ALU); - return kc[2].mode != KC_LOCK_NONE || kc[3].mode != KC_LOCK_NONE; + return kc[2].mode != KC_LOCK_NONE || kc[3].mode != KC_LOCK_NONE || + kc[0].index_mode != KC_INDEX_NONE || kc[1].index_mode != KC_INDEX_NONE || + kc[2].index_mode != KC_INDEX_NONE || kc[3].index_mode != KC_INDEX_NONE; } }; diff --git a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp index 193ade8a661..82826a90921 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_finalize.cpp @@ -515,7 +515,7 @@ void bc_finalizer::copy_fetch_src(fetch_node &dst, fetch_node &src, unsigned arg void bc_finalizer::emit_set_grad(fetch_node* f) { - assert(f->src.size() == 12); + assert(f->src.size() == 12 || f->src.size() == 13); unsigned ops[2] = { FETCH_OP_SET_GRADIENTS_V, FETCH_OP_SET_GRADIENTS_H }; unsigned arg_start = 0; @@ -810,8 +810,8 @@ void bc_finalizer::finalize_cf(cf_node* c) { } sel_chan bc_finalizer::translate_kcache(cf_node* alu, value* v) { - unsigned sel = v->select.sel(); - unsigned bank = sel >> 12; + unsigned sel = v->select.kcache_sel(); + unsigned bank = v->select.kcache_bank(); unsigned chan = v->select.chan(); static const unsigned kc_base[] = {128, 160, 256, 288}; diff --git a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp index 7f712b451c9..28ebfa2ce62 100644 --- a/src/gallium/drivers/r600/sb/sb_bc_parser.cpp +++ b/src/gallium/drivers/r600/sb/sb_bc_parser.cpp @@ -338,6 +338,7 @@ void bc_parser::save_set_cf_index(value *val, unsigned idx) value *bc_parser::get_cf_index_value(unsigned idx) { assert(idx <= 1); + assert(cf_index_value[idx]); return cf_index_value[idx]; } void bc_parser::save_mova(alu_node *mova) @@ -361,6 +362,7 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { for (node_iterator I = g->begin(), E = g->end(); I != E; ++I) { n = static_cast(*I); + bool ubo_indexing[2] = {}; if (!sh->assign_slot(n, slots[cgroup])) { assert(!"alu slot assignment failed"); @@ -460,7 +462,12 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { bc_kcache &kc = cf->bc.kc[kc_set]; kc_addr = (kc.addr << 4) + (sel & 0x1F); - n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan); + n->src[s] = sh->get_kcache_value(kc.bank, kc_addr, src.chan, (alu_kcache_index_mode)kc.index_mode); + + if (kc.index_mode != KC_INDEX_NONE) { + assert(kc.index_mode != KC_LOCK_LOOP); + ubo_indexing[kc.index_mode - KC_INDEX_0] = true; + } } else if (src.sel < MAX_GPR) { value *v = sh->get_gpr_value(true, src.sel, src.chan, src.rel); @@ -497,6 +504,15 @@ int bc_parser::prepare_alu_group(cf_node* cf, alu_group_node *g) { } } } + + // add UBO index values if any as dependencies + if (ubo_indexing[0]) { + n->src.push_back(get_cf_index_value(0)); + } + if (ubo_indexing[1]) { + n->src.push_back(get_cf_index_value(1)); + } + if ((n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX0 || n->bc.dst_gpr == CM_V_SQ_MOVA_DST_CF_IDX1) && ctx.is_cayman()) // Move CF_IDX value into tex instruction operands, scheduler will later re-emit setting of CF_IDX @@ -644,6 +660,9 @@ int bc_parser::prepare_fetch_clause(cf_node *cf) { if (n->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) { n->src.push_back(get_cf_index_value(n->bc.sampler_index_mode == V_SQ_CF_INDEX_1)); } + if (n->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) { + n->src.push_back(get_cf_index_value(n->bc.resource_index_mode == V_SQ_CF_INDEX_1)); + } } } diff --git a/src/gallium/drivers/r600/sb/sb_expr.cpp b/src/gallium/drivers/r600/sb/sb_expr.cpp index 9c2274e65a3..556a05da395 100644 --- a/src/gallium/drivers/r600/sb/sb_expr.cpp +++ b/src/gallium/drivers/r600/sb/sb_expr.cpp @@ -403,7 +403,8 @@ bool expr_handler::fold_alu_op1(alu_node& n) { if ((n.bc.op == ALU_OP1_MOV || n.bc.op == ALU_OP1_MOVA_INT || n.bc.op == ALU_OP1_MOVA_GPR_INT) && n.bc.clamp == 0 && n.bc.omod == 0 - && n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0) { + && n.bc.src[0].abs == 0 && n.bc.src[0].neg == 0 && + n.src.size() == 1 /* RIM/SIM can be appended as additional values */) { assign_source(n.dst[0], v0); return true; } diff --git a/src/gallium/drivers/r600/sb/sb_ir.h b/src/gallium/drivers/r600/sb/sb_ir.h index 560a4a9b284..c612e6c4ec6 100644 --- a/src/gallium/drivers/r600/sb/sb_ir.h +++ b/src/gallium/drivers/r600/sb/sb_ir.h @@ -62,6 +62,13 @@ struct sel_chan static unsigned sel(unsigned idx) { return (idx-1) >> 2; } static unsigned chan(unsigned idx) { return (idx-1) & 3; } + + sel_chan(unsigned bank, unsigned index, + unsigned chan, alu_kcache_index_mode index_mode) + : id(sel_chan((bank << 12) | index | ((unsigned)index_mode << 28), chan).id) {} + unsigned kcache_index_mode() const { return sel() >> 28; } + unsigned kcache_sel() const { return sel() & 0x0fffffffu; } + unsigned kcache_bank() const { return kcache_sel() >> 12; } }; inline sb_ostream& operator <<(sb_ostream& o, sel_chan r) { diff --git a/src/gallium/drivers/r600/sb/sb_sched.cpp b/src/gallium/drivers/r600/sb/sb_sched.cpp index 601445f7dc3..5113b756847 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.cpp +++ b/src/gallium/drivers/r600/sb/sb_sched.cpp @@ -843,7 +843,7 @@ static alu_node *create_set_idx(shader &sh, unsigned ar_idx) { a->dst.resize(1); // Dummy needed for recolor PSC_DUMP( - sblog << "created IDX load: " + sblog << "created IDX load: "; dump::dump_op(a); sblog << "\n"; ); @@ -909,15 +909,21 @@ void post_scheduler::process_fetch(container_node *c) { sblog << " "; ); - if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE) { + // TODO: If same values used can avoid reloading index register + if (f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE || + f->bc.resource_index_mode != V_SQ_CF_INDEX_NONE) { + unsigned index_mode = f->bc.sampler_index_mode != V_SQ_CF_INDEX_NONE ? + f->bc.sampler_index_mode : f->bc.resource_index_mode; + // Currently require prior opt passes to use one TEX per indexed op assert(f->parent->count() == 1); value *v = f->src.back(); // Last src is index offset + assert(v); cur_bb->push_front(c); - load_index_register(v, f->bc.sampler_index_mode); + load_index_register(v, index_mode); f->src.pop_back(); // Don't need index value any more return; @@ -959,6 +965,7 @@ void post_scheduler::process_alu(container_node *c) { if (uc) { n->remove(); + pending.push_back(n); PSC_DUMP( sblog << "pending\n"; ); } else { @@ -1101,6 +1108,18 @@ void post_scheduler::init_globals(val_set &s, bool prealloc) { } } +void post_scheduler::emit_index_registers() { + for (unsigned i = 0; i < 2; i++) { + if (alu.current_idx[i]) { + regmap = prev_regmap; + alu.discard_current_group(); + + load_index_register(alu.current_idx[i], KC_INDEX_0 + i); + alu.current_idx[i] = NULL; + } + } +} + void post_scheduler::emit_clause() { if (alu.current_ar) { @@ -1109,7 +1128,11 @@ void post_scheduler::emit_clause() { alu.emit_group(); } - alu.emit_clause(cur_bb); + if (!alu.is_empty()) { + alu.emit_clause(cur_bb); + } + + emit_index_registers(); } void post_scheduler::schedule_alu(container_node *c) { @@ -1121,6 +1144,14 @@ void post_scheduler::schedule_alu(container_node *c) { prev_regmap = regmap; if (!prepare_alu_group()) { + if (alu.current_idx[0] || alu.current_idx[1]) { + regmap = prev_regmap; + emit_clause(); + init_globals(live, false); + + continue; + } + if (alu.current_ar) { emit_load_ar(); continue; @@ -1132,6 +1163,7 @@ void post_scheduler::schedule_alu(container_node *c) { regmap = prev_regmap; emit_clause(); init_globals(live, false); + continue; } @@ -1391,6 +1423,42 @@ bool post_scheduler::map_src_val(value *v) { } bool post_scheduler::map_src_vec(vvec &vv, bool src) { + if (src) { + // Handle possible UBO indexing + bool ubo_indexing[2] = { false, false }; + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { + value *v = *I; + if (!v) + continue; + + if (v->is_kcache()) { + unsigned index_mode = v->select.kcache_index_mode(); + if (index_mode == KC_INDEX_0 || index_mode == KC_INDEX_1) { + ubo_indexing[index_mode - KC_INDEX_0] = true; + } + } + } + + // idx values stored at end of src vec, see bc_parser::prepare_alu_group + for (unsigned i = 2; i != 0; i--) { + if (ubo_indexing[i-1]) { + // TODO: skip adding value to kcache reservation somehow, causes + // unnecessary group breaks and cache line locks + value *v = vv.back(); + if (alu.current_idx[i-1] && alu.current_idx[i-1] != v) { + PSC_DUMP( + sblog << "IDX" << i-1 << " already set to " << + *alu.current_idx[i-1] << ", trying to set " << *v << "\n"; + ); + return false; + } + + alu.current_idx[i-1] = v; + PSC_DUMP(sblog << "IDX" << i-1 << " set to " << *v << "\n";); + } + } + } + for (vvec::iterator I = vv.begin(), E = vv.end(); I != E; ++I) { value *v = *I; if (!v) @@ -1456,6 +1524,10 @@ void post_scheduler::dump_regmap() { sblog << " current_AR: " << *alu.current_ar << "\n"; if (alu.current_pr) sblog << " current_PR: " << *alu.current_pr << "\n"; + if (alu.current_idx[0]) + sblog << " current IDX0: " << *alu.current_idx[0] << "\n"; + if (alu.current_idx[1]) + sblog << " current IDX1: " << *alu.current_idx[1] << "\n"; } void post_scheduler::recolor_locals() { @@ -1545,6 +1617,13 @@ unsigned post_scheduler::try_add_instruction(node *n) { unsigned avail_slots = rt.avail_slots(); + // Cannot schedule in same clause as instructions using this index value + if (!n->dst.empty() && n->dst[0] && + (n->dst[0] == alu.current_idx[0] || n->dst[0] == alu.current_idx[1])) { + PSC_DUMP(sblog << " CF_IDX source: " << *n->dst[0] << "\n";); + return 0; + } + if (n->is_alu_packed()) { alu_packed_node *p = static_cast(n); unsigned slots = p->get_slot_mask(); @@ -1874,7 +1953,7 @@ alu_clause_tracker::alu_clause_tracker(shader &sh) grp0(sh), grp1(sh), group(), clause(), push_exec_mask(), - current_ar(), current_pr() {} + current_ar(), current_pr(), current_idx() {} void alu_clause_tracker::emit_group() { @@ -1931,6 +2010,8 @@ bool alu_clause_tracker::check_clause_limits() { // reserving slots to load AR and PR values unsigned reserve_slots = (current_ar ? 1 : 0) + (current_pr ? 1 : 0); + // ...and index registers + reserve_slots += (current_idx[0] != NULL) + (current_idx[1] != NULL); if (slot_count + slots > MAX_ALU_SLOTS - reserve_slots) return false; @@ -1996,13 +2077,15 @@ unsigned rp_kcache_tracker::get_lines(kc_lines& lines) { unsigned cnt = 0; for (unsigned i = 0; i < sel_count; ++i) { - unsigned line = rp[i]; + unsigned line = rp[i] & 0x1fffffffu; + unsigned index_mode = rp[i] >> 29; if (!line) return cnt; --line; line = (sel_count == 2) ? line >> 5 : line >> 6; + line |= index_mode << 29; if (lines.insert(line).second) ++cnt; @@ -2017,14 +2100,18 @@ bool alu_kcache_tracker::update_kc() { memcpy(old_kc, kc, sizeof(kc)); for (kc_lines::iterator I = lines.begin(), E = lines.end(); I != E; ++I) { - unsigned line = *I; + unsigned index_mode = *I >> 29; + unsigned line = *I & 0x1fffffffu; unsigned bank = line >> 8; + assert(index_mode <= KC_INDEX_INVALID); line &= 0xFF; - if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line)) - ++kc[c-1].mode; - else { + if (c && (bank == kc[c-1].bank) && (kc[c-1].addr + 1 == line) && + kc[c-1].index_mode == index_mode) + { + kc[c-1].mode = KC_LOCK_2; + } else { if (c == max_kcs) { memcpy(kc, old_kc, sizeof(kc)); return false; @@ -2034,6 +2121,7 @@ bool alu_kcache_tracker::update_kc() { kc[c].bank = bank; kc[c].addr = line; + kc[c].index_mode = index_mode; ++c; } } diff --git a/src/gallium/drivers/r600/sb/sb_sched.h b/src/gallium/drivers/r600/sb/sb_sched.h index 2ca714665a7..05b428ca884 100644 --- a/src/gallium/drivers/r600/sb/sb_sched.h +++ b/src/gallium/drivers/r600/sb/sb_sched.h @@ -66,6 +66,7 @@ public: class literal_tracker { literal lt[4]; unsigned uc[4]; + public: literal_tracker() : lt(), uc() {} @@ -219,6 +220,8 @@ public: // bottom-up) value *current_ar; value *current_pr; + // current values of CF_IDX registers that need preloading + value *current_idx[2]; alu_clause_tracker(shader &sh); @@ -256,6 +259,7 @@ class post_scheduler : public pass { val_set cleared_interf; + void emit_index_registers(); public: post_scheduler(shader &sh) : pass(sh), diff --git a/src/gallium/drivers/r600/sb/sb_shader.cpp b/src/gallium/drivers/r600/sb/sb_shader.cpp index f996c0786d1..87e28e98157 100644 --- a/src/gallium/drivers/r600/sb/sb_shader.cpp +++ b/src/gallium/drivers/r600/sb/sb_shader.cpp @@ -188,9 +188,9 @@ value* shader::create_temp_value() { return get_value(VLK_TEMP, id, 0); } -value* shader::get_kcache_value(unsigned bank, unsigned index, unsigned chan) { +value* shader::get_kcache_value(unsigned bank, unsigned index, unsigned chan, alu_kcache_index_mode index_mode) { return get_ro_value(kcache_values, VLK_KCACHE, - sel_chan((bank << 12) | index, chan)); + sel_chan(bank, index, chan, index_mode)); } void shader::add_input(unsigned gpr, bool preloaded, unsigned comp_mask) { diff --git a/src/gallium/drivers/r600/sb/sb_shader.h b/src/gallium/drivers/r600/sb/sb_shader.h index 7955bba9b67..70bea891b76 100644 --- a/src/gallium/drivers/r600/sb/sb_shader.h +++ b/src/gallium/drivers/r600/sb/sb_shader.h @@ -323,7 +323,7 @@ public: value* get_special_ro_value(unsigned sel); - value* get_kcache_value(unsigned bank, unsigned index, unsigned chan); + value* get_kcache_value(unsigned bank, unsigned index, unsigned chan, alu_kcache_index_mode index_mode); value* get_value_version(value* v, unsigned ver); -- 2.30.2