X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_spill.cpp;h=69d5cb23b23dee5e407682ab78f924001791e480;hb=b811b1d083f159c085f30a33a73472bb54c3427b;hp=9f687da4b98bbd6753ea25a7d8eac10005f2fd61;hpb=d97c0bdd5558e4e00ede38afac879606aff5f04b;p=mesa.git diff --git a/src/amd/compiler/aco_spill.cpp b/src/amd/compiler/aco_spill.cpp index 9f687da4b98..69d5cb23b23 100644 --- a/src/amd/compiler/aco_spill.cpp +++ b/src/amd/compiler/aco_spill.cpp @@ -28,6 +28,7 @@ #include "sid.h" #include +#include #include /* @@ -55,18 +56,19 @@ struct spill_ctx { std::stack loop_header; std::vector>> next_use_distances_start; std::vector>> next_use_distances_end; - std::vector>> interferences; + std::vector>> interferences; std::vector> affinities; std::vector is_reloaded; std::map remat; std::map remat_used; + unsigned wave_size; spill_ctx(const RegisterDemand target_pressure, Program* program, std::vector> register_demand) : target_pressure(target_pressure), program(program), - register_demand(register_demand), renames(program->blocks.size()), + register_demand(std::move(register_demand)), renames(program->blocks.size()), spills_entry(program->blocks.size()), spills_exit(program->blocks.size()), - processed(program->blocks.size(), false) {} + processed(program->blocks.size(), false), wave_size(program->wave_size) {} void add_affinity(uint32_t first, uint32_t second) { @@ -96,9 +98,19 @@ struct spill_ctx { } } + void add_interference(uint32_t first, uint32_t second) + { + if (interferences[first].first.type() != interferences[second].first.type()) + return; + + bool inserted = interferences[first].second.insert(second).second; + if (inserted) + interferences[second].second.insert(first); + } + uint32_t allocate_spill_id(RegClass rc) { - interferences.emplace_back(rc, std::set()); + interferences.emplace_back(rc, std::unordered_set()); is_reloaded.push_back(false); return next_spill_id++; } @@ -212,7 +224,7 @@ void next_uses_per_block(spill_ctx& ctx, unsigned block_idx, std::set& } -void compute_global_next_uses(spill_ctx& ctx, std::vector>& live_out) +void compute_global_next_uses(spill_ctx& ctx) { ctx.next_use_distances_start.resize(ctx.program->blocks.size()); ctx.next_use_distances_end.resize(ctx.program->blocks.size()); @@ -231,11 +243,13 @@ void compute_global_next_uses(spill_ctx& ctx, std::vector>& live_ bool should_rematerialize(aco_ptr& instr) { /* TODO: rematerialization is only supported for VOP1, SOP1 and PSEUDO */ - if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && instr->format != Format::PSEUDO) + if (instr->format != Format::VOP1 && instr->format != Format::SOP1 && instr->format != Format::PSEUDO && instr->format != Format::SOPK) return false; /* TODO: pseudo-instruction rematerialization is only supported for p_create_vector */ if (instr->format == Format::PSEUDO && instr->opcode != aco_opcode::p_create_vector) return false; + if (instr->format == Format::SOPK && instr->opcode != aco_opcode::s_movk_i32) + return false; for (const Operand& op : instr->operands) { /* TODO: rematerialization using temporaries isn't yet supported */ @@ -255,7 +269,7 @@ aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t std::map::iterator remat = ctx.remat.find(tmp); if (remat != ctx.remat.end()) { Instruction *instr = remat->second.instr; - assert((instr->format == Format::VOP1 || instr->format == Format::SOP1 || instr->format == Format::PSEUDO) && "unsupported"); + assert((instr->format == Format::VOP1 || instr->format == Format::SOP1 || instr->format == Format::PSEUDO || instr->format == Format::SOPK) && "unsupported"); assert((instr->format != Format::PSEUDO || instr->opcode == aco_opcode::p_create_vector) && "unsupported"); assert(instr->definitions.size() == 1 && "unsupported"); @@ -265,7 +279,10 @@ aco_ptr do_reload(spill_ctx& ctx, Temp tmp, Temp new_name, uint32_t } else if (instr->format == Format::SOP1) { res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); } else if (instr->format == Format::PSEUDO) { - res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + } else if (instr->format == Format::SOPK) { + res.reset(create_instruction(instr->opcode, instr->format, instr->operands.size(), instr->definitions.size())); + static_cast(res.get())->imm = static_cast(instr)->imm; } for (unsigned i = 0; i < instr->operands.size(); i++) { res->operands[i] = instr->operands[i]; @@ -597,6 +614,17 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id } } } else { + for (unsigned i = 0; i < idx; i++) { + aco_ptr& instr = block->instructions[i]; + assert(is_phi(instr)); + /* Killed phi definitions increase pressure in the predecessor but not + * the block they're in. Since the loops below are both to control + * pressure of the start of this block and the ends of it's + * predecessors, we need to count killed unspilled phi definitions here. */ + if (instr->definitions[0].isKill() && + !ctx.spills_entry[block_idx].count(instr->definitions[0].getTemp())) + reg_pressure += instr->definitions[0].getTemp(); + } idx--; } reg_pressure += ctx.register_demand[block_idx][idx] - spilled_registers; @@ -651,6 +679,18 @@ RegisterDemand init_live_in_vars(spill_ctx& ctx, Block* block, unsigned block_id } +RegisterDemand get_demand_before(spill_ctx& ctx, unsigned block_idx, unsigned idx) +{ + if (idx == 0) { + RegisterDemand demand = ctx.register_demand[block_idx][idx]; + aco_ptr& instr = ctx.program->blocks[block_idx].instructions[idx]; + aco_ptr instr_before(nullptr); + return get_demand_before(demand, instr, instr_before); + } else { + return ctx.register_demand[block_idx][idx - 1]; + } +} + void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) { /* no coupling code necessary */ @@ -659,12 +699,13 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) std::vector> instructions; /* branch block: TODO take other branch into consideration */ - if (block->linear_preds.size() == 1 && !(block->kind & block_kind_loop_exit)) { + if (block->linear_preds.size() == 1 && !(block->kind & (block_kind_loop_exit | block_kind_loop_header))) { assert(ctx.processed[block->linear_preds[0]]); assert(ctx.register_demand[block_idx].size() == block->instructions.size()); std::vector reg_demand; unsigned insert_idx = 0; unsigned pred_idx = block->linear_preds[0]; + RegisterDemand demand_before = get_demand_before(ctx, block_idx, 0); for (std::pair> live : ctx.next_use_distances_start[block_idx]) { if (!live.first.is_linear()) @@ -685,7 +726,7 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) Temp new_name = {ctx.program->allocateId(), live.first.regClass()}; aco_ptr reload = do_reload(ctx, live.first, new_name, ctx.spills_exit[pred_idx][live.first]); instructions.emplace_back(std::move(reload)); - reg_demand.push_back(RegisterDemand()); + reg_demand.push_back(demand_before); ctx.renames[block_idx][live.first] = new_name; } @@ -766,8 +807,7 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) for (std::pair pair : ctx.spills_exit[pred_idx]) { if (var == pair.first) continue; - ctx.interferences[def_spill_id].second.emplace(pair.second); - ctx.interferences[pair.second].second.emplace(def_spill_id); + ctx.add_interference(def_spill_id, pair.second); } /* check if variable is already spilled at predecessor */ @@ -827,8 +867,7 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) for (std::pair exit_spill : ctx.spills_exit[pred_idx]) { if (exit_spill.first == pair.first) continue; - ctx.interferences[exit_spill.second].second.emplace(pair.second); - ctx.interferences[pair.second].second.emplace(exit_spill.second); + ctx.add_interference(exit_spill.second, pair.second); } /* variable is in register at predecessor and has to be spilled */ @@ -983,8 +1022,12 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) idx++; } - ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(), ctx.register_demand[block->index].begin() + idx); - ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(), instructions.size(), RegisterDemand()); + if (!ctx.processed[block_idx]) { + assert(!(block->kind & block_kind_loop_header)); + RegisterDemand demand_before = get_demand_before(ctx, block_idx, idx); + ctx.register_demand[block->index].erase(ctx.register_demand[block->index].begin(), ctx.register_demand[block->index].begin() + idx); + ctx.register_demand[block->index].insert(ctx.register_demand[block->index].begin(), instructions.size(), demand_before); + } std::vector>::iterator start = std::next(block->instructions.begin(), idx); instructions.insert(instructions.end(), std::move_iterator>::iterator>(start), @@ -995,6 +1038,8 @@ void add_coupling_code(spill_ctx& ctx, Block* block, unsigned block_idx) void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, std::map ¤t_spills, RegisterDemand spilled_registers) { + assert(!ctx.processed[block_idx]); + std::vector> local_next_use_distance; std::vector> instructions; unsigned idx = 0; @@ -1046,18 +1091,7 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, if (block->register_demand.exceeds(ctx.target_pressure)) { RegisterDemand new_demand = ctx.register_demand[block_idx][idx]; - if (idx == 0) { - RegisterDemand demand_before = new_demand; - for (const Definition& def : instr->definitions) - demand_before -= def.getTemp(); - for (const Operand& op : instr->operands) { - if (op.isFirstKill()) - demand_before += op.getTemp(); - } - new_demand.update(demand_before); - } else { - new_demand.update(ctx.register_demand[block_idx][idx - 1]); - } + new_demand.update(get_demand_before(ctx, block_idx, idx)); assert(!local_next_use_distance.empty()); @@ -1098,14 +1132,10 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, uint32_t spill_id = ctx.allocate_spill_id(to_spill.regClass()); /* add interferences with currently spilled variables */ - for (std::pair pair : current_spills) { - ctx.interferences[spill_id].second.emplace(pair.second); - ctx.interferences[pair.second].second.emplace(spill_id); - } - for (std::pair> pair : reloads) { - ctx.interferences[spill_id].second.emplace(pair.second.second); - ctx.interferences[pair.second.second].second.emplace(spill_id); - } + for (std::pair pair : current_spills) + ctx.add_interference(spill_id, pair.second); + for (std::pair> pair : reloads) + ctx.add_interference(spill_id, pair.second.second); current_spills[to_spill] = spill_id; spilled_registers += to_spill; @@ -1139,16 +1169,14 @@ void process_block(spill_ctx& ctx, unsigned block_idx, Block* block, void spill_block(spill_ctx& ctx, unsigned block_idx) { Block* block = &ctx.program->blocks[block_idx]; - ctx.processed[block_idx] = true; /* determine set of variables which are spilled at the beginning of the block */ RegisterDemand spilled_registers = init_live_in_vars(ctx, block, block_idx); /* add interferences for spilled variables */ - for (std::pair x : ctx.spills_entry[block_idx]) { - for (std::pair y : ctx.spills_entry[block_idx]) - if (x.second != y.second) - ctx.interferences[x.second].second.emplace(y.second); + for (auto it = ctx.spills_entry[block_idx].begin(); it != ctx.spills_entry[block_idx].end(); ++it) { + for (auto it2 = std::next(it); it2 != ctx.spills_entry[block_idx].end(); ++it2) + ctx.add_interference(it->second, it2->second); } bool is_loop_header = block->loop_nest_depth && ctx.loop_header.top()->index == block_idx; @@ -1176,6 +1204,8 @@ void spill_block(spill_ctx& ctx, unsigned block_idx) else ctx.spills_exit[block_idx].insert(current_spills.begin(), current_spills.end()); + ctx.processed[block_idx] = true; + /* check if the next block leaves the current loop */ if (block->loop_nest_depth == 0 || ctx.program->blocks[block_idx + 1].loop_nest_depth >= block->loop_nest_depth) return; @@ -1285,160 +1315,142 @@ Temp load_scratch_resource(spill_ctx& ctx, Temp& scratch_offset, if (ctx.program->chip_class >= GFX10) { rsrc_conf |= S_008F0C_FORMAT(V_008F0C_IMG_FORMAT_32_FLOAT) | - S_008F0C_OOB_SELECT(3) | + S_008F0C_OOB_SELECT(V_008F0C_OOB_SELECT_RAW) | S_008F0C_RESOURCE_LEVEL(1); } else if (ctx.program->chip_class <= GFX7) { /* dfmt modifies stride on GFX8/GFX9 when ADD_TID_EN=1 */ rsrc_conf |= S_008F0C_NUM_FORMAT(V_008F0C_BUF_NUM_FORMAT_FLOAT) | S_008F0C_DATA_FORMAT(V_008F0C_BUF_DATA_FORMAT_32); } - /* older generations need element size = 16 bytes. element size removed in GFX9 */ + /* older generations need element size = 4 bytes. element size removed in GFX9 */ if (ctx.program->chip_class <= GFX8) - rsrc_conf |= S_008F0C_ELEMENT_SIZE(3); + rsrc_conf |= S_008F0C_ELEMENT_SIZE(1); return bld.pseudo(aco_opcode::p_create_vector, bld.def(s4), private_segment_buffer, Operand(-1u), Operand(rsrc_conf)); } -void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { - std::map sgpr_slot; - std::map vgpr_slot; - std::vector is_assigned(ctx.interferences.size()); +void add_interferences(spill_ctx& ctx, std::vector& is_assigned, + std::vector& slots, std::vector& slots_used, + unsigned id) +{ + for (unsigned other : ctx.interferences[id].second) { + if (!is_assigned[other]) + continue; - /* first, handle affinities: just merge all interferences into both spill ids */ - for (std::vector& vec : ctx.affinities) { - for (unsigned i = 0; i < vec.size(); i++) { - for (unsigned j = i + 1; j < vec.size(); j++) { - assert(vec[i] != vec[j]); - for (uint32_t id : ctx.interferences[vec[i]].second) - ctx.interferences[id].second.insert(vec[j]); - for (uint32_t id : ctx.interferences[vec[j]].second) - ctx.interferences[id].second.insert(vec[i]); - ctx.interferences[vec[i]].second.insert(ctx.interferences[vec[j]].second.begin(), ctx.interferences[vec[j]].second.end()); - ctx.interferences[vec[j]].second.insert(ctx.interferences[vec[i]].second.begin(), ctx.interferences[vec[i]].second.end()); + RegClass other_rc = ctx.interferences[other].first; + unsigned slot = slots[other]; + std::fill(slots_used.begin() + slot, slots_used.begin() + slot + other_rc.size(), true); + } +} - bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]]; - ctx.is_reloaded[vec[i]] = reloaded; - ctx.is_reloaded[vec[j]] = reloaded; +unsigned find_available_slot(std::vector& used, unsigned wave_size, + unsigned size, bool is_sgpr, unsigned *num_slots) +{ + unsigned wave_size_minus_one = wave_size - 1; + unsigned slot = 0; + + while (true) { + bool available = true; + for (unsigned i = 0; i < size; i++) { + if (slot + i < used.size() && used[slot + i]) { + available = false; + break; } } + if (!available) { + slot++; + continue; + } + + if (is_sgpr && ((slot & wave_size_minus_one) > wave_size - size)) { + slot = align(slot, wave_size); + continue; + } + + std::fill(used.begin(), used.end(), false); + + if (slot + size > used.size()) + used.resize(slot + size); + + return slot; } - for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++) - for (ASSERTED uint32_t id : ctx.interferences[i].second) - assert(i != id); +} - /* for each spill slot, assign as many spill ids as possible */ - std::vector> spill_slot_interferences; - unsigned slot_idx = 0; - bool done = false; - - /* assign sgpr spill slots */ - while (!done) { - done = true; - for (unsigned id = 0; id < ctx.interferences.size(); id++) { - if (is_assigned[id] || !ctx.is_reloaded[id]) - continue; - if (ctx.interferences[id].first.type() != RegType::sgpr) - continue; +void assign_spill_slots_helper(spill_ctx& ctx, RegType type, + std::vector& is_assigned, + std::vector& slots, + unsigned *num_slots) +{ + std::vector slots_used(*num_slots); - /* check interferences */ - bool interferes = false; - for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) { - if (i == spill_slot_interferences.size()) - spill_slot_interferences.emplace_back(std::set()); - if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end() || i / 64 != slot_idx / 64) { - interferes = true; - break; - } - } - if (interferes) { - done = false; + /* assign slots for ids with affinities first */ + for (std::vector& vec : ctx.affinities) { + if (ctx.interferences[vec[0]].first.type() != type) + continue; + + for (unsigned id : vec) { + if (!ctx.is_reloaded[id]) continue; - } - /* we found a spill id which can be assigned to current spill slot */ - sgpr_slot[id] = slot_idx; - is_assigned[id] = true; - for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) - spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end()); - - /* add all affinities: there are no additional interferences */ - for (std::vector& vec : ctx.affinities) { - bool found_affinity = false; - for (uint32_t entry : vec) { - if (entry == id) { - found_affinity = true; - break; - } - } - if (!found_affinity) - continue; - for (uint32_t entry : vec) { - sgpr_slot[entry] = slot_idx; - is_assigned[entry] = true; - } + add_interferences(ctx, is_assigned, slots, slots_used, id); + } + + unsigned slot = find_available_slot(slots_used, ctx.wave_size, + ctx.interferences[vec[0]].first.size(), + type == RegType::sgpr, num_slots); + + for (unsigned id : vec) { + assert(!is_assigned[id]); + + if (ctx.is_reloaded[id]) { + slots[id] = slot; + is_assigned[id] = true; } } - slot_idx++; } - unsigned sgpr_spill_slots = spill_slot_interferences.size(); - spill_slot_interferences.clear(); - slot_idx = 0; - done = false; + /* assign slots for ids without affinities */ + for (unsigned id = 0; id < ctx.interferences.size(); id++) { + if (is_assigned[id] || !ctx.is_reloaded[id] || ctx.interferences[id].first.type() != type) + continue; - /* assign vgpr spill slots */ - while (!done) { - done = true; - for (unsigned id = 0; id < ctx.interferences.size(); id++) { - if (is_assigned[id] || !ctx.is_reloaded[id]) - continue; - if (ctx.interferences[id].first.type() != RegType::vgpr) - continue; + add_interferences(ctx, is_assigned, slots, slots_used, id); - /* check interferences */ - bool interferes = false; - for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) { - if (i == spill_slot_interferences.size()) - spill_slot_interferences.emplace_back(std::set()); - /* check for interference and ensure that vector regs are stored next to each other */ - if (spill_slot_interferences[i].find(id) != spill_slot_interferences[i].end()) { - interferes = true; - break; - } - } - if (interferes) { - done = false; - continue; - } + unsigned slot = find_available_slot(slots_used, ctx.wave_size, + ctx.interferences[id].first.size(), + type == RegType::sgpr, num_slots); - /* we found a spill id which can be assigned to current spill slot */ - vgpr_slot[id] = slot_idx; - is_assigned[id] = true; - for (unsigned i = slot_idx; i < slot_idx + ctx.interferences[id].first.size(); i++) - spill_slot_interferences[i].insert(ctx.interferences[id].second.begin(), ctx.interferences[id].second.end()); - - /* add all affinities: there are no additional interferences */ - for (std::vector& vec : ctx.affinities) { - bool found_affinity = false; - for (uint32_t entry : vec) { - if (entry == id) { - found_affinity = true; - break; - } - } - if (!found_affinity) - continue; - for (uint32_t entry : vec) { - vgpr_slot[entry] = slot_idx; - is_assigned[entry] = true; - } + slots[id] = slot; + is_assigned[id] = true; + } + + *num_slots = slots_used.size(); +} + +void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { + std::vector slots(ctx.interferences.size()); + std::vector is_assigned(ctx.interferences.size()); + + /* first, handle affinities: just merge all interferences into both spill ids */ + for (std::vector& vec : ctx.affinities) { + for (unsigned i = 0; i < vec.size(); i++) { + for (unsigned j = i + 1; j < vec.size(); j++) { + assert(vec[i] != vec[j]); + bool reloaded = ctx.is_reloaded[vec[i]] || ctx.is_reloaded[vec[j]]; + ctx.is_reloaded[vec[i]] = reloaded; + ctx.is_reloaded[vec[j]] = reloaded; } } - slot_idx++; } + for (ASSERTED uint32_t i = 0; i < ctx.interferences.size(); i++) + for (ASSERTED uint32_t id : ctx.interferences[i].second) + assert(i != id); - unsigned vgpr_spill_slots = spill_slot_interferences.size(); + /* for each spill slot, assign as many spill ids as possible */ + unsigned sgpr_spill_slots = 0, vgpr_spill_slots = 0; + assign_spill_slots_helper(ctx, RegType::sgpr, is_assigned, slots, &sgpr_spill_slots); + assign_spill_slots_helper(ctx, RegType::vgpr, is_assigned, slots, &vgpr_spill_slots); for (unsigned id = 0; id < is_assigned.size(); id++) assert(is_assigned[id] || !ctx.is_reloaded[id]); @@ -1451,16 +1463,13 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { continue; assert(ctx.is_reloaded[vec[i]] == ctx.is_reloaded[vec[j]]); assert(ctx.interferences[vec[i]].first.type() == ctx.interferences[vec[j]].first.type()); - if (ctx.interferences[vec[i]].first.type() == RegType::sgpr) - assert(sgpr_slot[vec[i]] == sgpr_slot[vec[j]]); - else - assert(vgpr_slot[vec[i]] == vgpr_slot[vec[j]]); + assert(slots[vec[i]] == slots[vec[j]]); } } } /* hope, we didn't mess up */ - std::vector vgpr_spill_temps((sgpr_spill_slots + 63) / 64); + std::vector vgpr_spill_temps((sgpr_spill_slots + ctx.wave_size - 1) / ctx.wave_size); assert(vgpr_spill_temps.size() <= spills_to_vgpr); /* replace pseudo instructions with actual hardware instructions */ @@ -1504,8 +1513,8 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { bool can_destroy = true; for (std::pair pair : ctx.spills_exit[block.linear_preds[0]]) { - if (sgpr_slot.find(pair.second) != sgpr_slot.end() && - sgpr_slot[pair.second] / 64 == i) { + if (ctx.interferences[pair.second].first.type() == RegType::sgpr && + slots[pair.second] / ctx.wave_size == i) { can_destroy = false; break; } @@ -1526,10 +1535,12 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { if (!ctx.is_reloaded[spill_id]) { /* never reloaded, so don't spill */ - } else if (vgpr_slot.find(spill_id) != vgpr_slot.end()) { + } else if (!is_assigned[spill_id]) { + unreachable("No spill slot assigned for spill id"); + } else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) { /* spill vgpr */ ctx.program->config->spilled_vgprs += (*it)->operands[0].size(); - uint32_t spill_slot = vgpr_slot[spill_id]; + uint32_t spill_slot = slots[spill_id]; bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096; unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; @@ -1544,46 +1555,33 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } unsigned offset = base_offset + spill_slot * 4; - aco_opcode opcode; + aco_opcode opcode = aco_opcode::buffer_store_dword; assert((*it)->operands[0].isTemp()); Temp temp = (*it)->operands[0].getTemp(); assert(temp.type() == RegType::vgpr && !temp.is_linear()); - switch (temp.size()) { - case 1: opcode = aco_opcode::buffer_store_dword; break; - case 2: opcode = aco_opcode::buffer_store_dwordx2; break; - case 6: temp = bld.tmp(v3); /* fallthrough */ - case 3: opcode = aco_opcode::buffer_store_dwordx3; break; - case 8: temp = bld.tmp(v4); /* fallthrough */ - case 4: opcode = aco_opcode::buffer_store_dwordx4; break; - default: { + if (temp.size() > 1) { Instruction* split{create_instruction(aco_opcode::p_split_vector, Format::PSEUDO, 1, temp.size())}; split->operands[0] = Operand(temp); for (unsigned i = 0; i < temp.size(); i++) split->definitions[i] = bld.def(v1); bld.insert(split); - opcode = aco_opcode::buffer_store_dword; - for (unsigned i = 0; i < temp.size(); i++) - bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false); - continue; - } - } - - if ((*it)->operands[0].size() > 4) { - Temp temp2 = bld.pseudo(aco_opcode::p_split_vector, bld.def(temp.regClass()), Definition(temp), (*it)->operands[0]); - bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp2, offset, false); - offset += temp.size() * 4; + for (unsigned i = 0; i < temp.size(); i++) { + Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, split->definitions[i].getTemp(), offset + i * 4, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); + } + } else { + Instruction *instr = bld.mubuf(opcode, scratch_rsrc, Operand(v1), scratch_offset, temp, offset, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); } - bld.mubuf(opcode, Operand(), scratch_rsrc, scratch_offset, temp, offset, false); - - } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) { + } else { ctx.program->config->spilled_sgprs += (*it)->operands[0].size(); - uint32_t spill_slot = sgpr_slot[spill_id]; + uint32_t spill_slot = slots[spill_id]; /* check if the linear vgpr already exists */ - if (vgpr_spill_temps[spill_slot / 64] == Temp()) { + if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) { Temp linear_vgpr = {ctx.program->allocateId(), v1.as_linear()}; - vgpr_spill_temps[spill_slot / 64] = linear_vgpr; + vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr; aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; create->definitions[0] = Definition(linear_vgpr); /* find the right place to insert this definition */ @@ -1600,21 +1598,21 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { /* spill sgpr: just add the vgpr temp to operands */ Pseudo_instruction* spill = create_instruction(aco_opcode::p_spill, Format::PSEUDO, 3, 0); - spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / 64]); - spill->operands[1] = Operand(spill_slot % 64); + spill->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]); + spill->operands[1] = Operand(spill_slot % ctx.wave_size); spill->operands[2] = (*it)->operands[0]; instructions.emplace_back(aco_ptr(spill)); - } else { - unreachable("No spill slot assigned for spill id"); } } else if ((*it)->opcode == aco_opcode::p_reload) { uint32_t spill_id = (*it)->operands[0].constantValue(); assert(ctx.is_reloaded[spill_id]); - if (vgpr_slot.find(spill_id) != vgpr_slot.end()) { + if (!is_assigned[spill_id]) { + unreachable("No spill slot assigned for spill id"); + } else if (ctx.interferences[spill_id].first.type() == RegType::vgpr) { /* reload vgpr */ - uint32_t spill_slot = vgpr_slot[spill_id]; + uint32_t spill_slot = slots[spill_id]; bool add_offset_to_sgpr = ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size + vgpr_spill_slots * 4 > 4096; unsigned base_offset = add_offset_to_sgpr ? 0 : ctx.program->config->scratch_bytes_per_wave / ctx.program->wave_size; @@ -1629,43 +1627,30 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { } unsigned offset = base_offset + spill_slot * 4; - aco_opcode opcode; + aco_opcode opcode = aco_opcode::buffer_load_dword; Definition def = (*it)->definitions[0]; - switch (def.size()) { - case 1: opcode = aco_opcode::buffer_load_dword; break; - case 2: opcode = aco_opcode::buffer_load_dwordx2; break; - case 6: def = bld.def(v3); /* fallthrough */ - case 3: opcode = aco_opcode::buffer_load_dwordx3; break; - case 8: def = bld.def(v4); /* fallthrough */ - case 4: opcode = aco_opcode::buffer_load_dwordx4; break; - default: { + if (def.size() > 1) { Instruction* vec{create_instruction(aco_opcode::p_create_vector, Format::PSEUDO, def.size(), 1)}; vec->definitions[0] = def; - opcode = aco_opcode::buffer_load_dword; for (unsigned i = 0; i < def.size(); i++) { Temp tmp = bld.tmp(v1); vec->operands[i] = Operand(tmp); - bld.mubuf(opcode, Definition(tmp), Operand(), scratch_rsrc, scratch_offset, offset + i * 4, false); + Instruction *instr = bld.mubuf(opcode, Definition(tmp), scratch_rsrc, Operand(v1), scratch_offset, offset + i * 4, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); } bld.insert(vec); - continue; - } + } else { + Instruction *instr = bld.mubuf(opcode, def, scratch_rsrc, Operand(v1), scratch_offset, offset, false, true); + static_cast(instr)->sync = memory_sync_info(storage_vgpr_spill, semantic_private); } - - bld.mubuf(opcode, def, Operand(), scratch_rsrc, scratch_offset, offset, false); - if ((*it)->definitions[0].size() > 4) { - Temp temp2 = bld.mubuf(opcode, bld.def(def.regClass()), Operand(), scratch_rsrc, scratch_offset, offset + def.size() * 4, false); - bld.pseudo(aco_opcode::p_create_vector, (*it)->definitions[0], def.getTemp(), temp2); - } - - } else if (sgpr_slot.find(spill_id) != sgpr_slot.end()) { - uint32_t spill_slot = sgpr_slot[spill_id]; - reload_in_loop[spill_slot / 64] = block.loop_nest_depth > 0; + } else { + uint32_t spill_slot = slots[spill_id]; + reload_in_loop[spill_slot / ctx.wave_size] = block.loop_nest_depth > 0; /* check if the linear vgpr already exists */ - if (vgpr_spill_temps[spill_slot / 64] == Temp()) { + if (vgpr_spill_temps[spill_slot / ctx.wave_size] == Temp()) { Temp linear_vgpr = {ctx.program->allocateId(), v1.as_linear()}; - vgpr_spill_temps[spill_slot / 64] = linear_vgpr; + vgpr_spill_temps[spill_slot / ctx.wave_size] = linear_vgpr; aco_ptr create{create_instruction(aco_opcode::p_start_linear_vgpr, Format::PSEUDO, 0, 1)}; create->definitions[0] = Definition(linear_vgpr); /* find the right place to insert this definition */ @@ -1682,12 +1667,10 @@ void assign_spill_slots(spill_ctx& ctx, unsigned spills_to_vgpr) { /* reload sgpr: just add the vgpr temp to operands */ Pseudo_instruction* reload = create_instruction(aco_opcode::p_reload, Format::PSEUDO, 2, 1); - reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / 64]); - reload->operands[1] = Operand(spill_slot % 64); + reload->operands[0] = Operand(vgpr_spill_temps[spill_slot / ctx.wave_size]); + reload->operands[1] = Operand(spill_slot % ctx.wave_size); reload->definitions[0] = (*it)->definitions[0]; instructions.emplace_back(aco_ptr(reload)); - } else { - unreachable("No spill slot assigned for spill id"); } } else if (!ctx.remat_used.count(it->get()) || ctx.remat_used[it->get()]) { instructions.emplace_back(std::move(*it)); @@ -1770,18 +1753,17 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt /* calculate target register demand */ RegisterDemand register_target = program->max_reg_demand; if (register_target.sgpr > program->sgpr_limit) - register_target.vgpr += (register_target.sgpr - program->sgpr_limit + 63 + 32) / 64; + register_target.vgpr += (register_target.sgpr - program->sgpr_limit + program->wave_size - 1 + 32) / program->wave_size; register_target.sgpr = program->sgpr_limit; if (register_target.vgpr > program->vgpr_limit) register_target.sgpr = program->sgpr_limit - 5; - register_target.vgpr = program->vgpr_limit - (register_target.vgpr - program->max_reg_demand.vgpr); - - int spills_to_vgpr = (program->max_reg_demand.sgpr - register_target.sgpr + 63 + 32) / 64; + int spills_to_vgpr = (program->max_reg_demand.sgpr - register_target.sgpr + program->wave_size - 1 + 32) / program->wave_size; + register_target.vgpr = program->vgpr_limit - spills_to_vgpr; /* initialize ctx */ spill_ctx ctx(register_target, program, live_vars.register_demand); - compute_global_next_uses(ctx, live_vars.live_out); + compute_global_next_uses(ctx); get_rematerialize_info(ctx); /* create spills and reloads */ @@ -1794,7 +1776,7 @@ void spill(Program* program, live& live_vars, const struct radv_nir_compiler_opt /* update live variable information */ live_vars = live_var_analysis(program, options); - assert(program->num_waves >= 0); + assert(program->num_waves > 0); } }