std::unordered_map<unsigned, Temp> orig_names;
std::unordered_map<unsigned, phi_info> phi_map;
std::unordered_map<unsigned, unsigned> affinities;
+ std::unordered_map<unsigned, Instruction*> vectors;
+ aco_ptr<Instruction> pseudo_dummy;
unsigned max_used_sgpr = 0;
unsigned max_used_vgpr = 0;
std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
renames(program->blocks.size()),
incomplete_phis(program->blocks.size()),
filled(program->blocks.size()),
- sealed(program->blocks.size()) {}
+ sealed(program->blocks.size())
+ {
+ pseudo_dummy.reset(create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
+ }
+};
+
+/* Returns true if the instruction can address individual bytes/words of a
+ * register: only SDWA-capable instructions and pseudo instructions (e.g.
+ * parallelcopies such as ctx.pseudo_dummy, Format::PSEUDO) support sub-dword
+ * register access; all other instructions operate on whole dwords. */
+bool instr_can_access_subdword(aco_ptr<Instruction>& instr)
+{
+ return instr->isSDWA() || instr->format == Format::PSEUDO;
+}
+
+/* Search parameters for placing one definition in the register file:
+ * the register-index bounds [lb, ub), the size in registers, and the
+ * alignment stride — all derived from the register class and (for
+ * sub-dword classes) from the defining instruction's capabilities.
+ * Bundling these lets get_reg_simple()/get_reg_impl() take a single
+ * DefInfo instead of five loose parameters. */
+struct DefInfo {
+ uint16_t lb;
+ uint16_t ub;
+ uint8_t size;
+ uint8_t stride;
+ RegClass rc;
+
+ DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc) : rc(rc) {
+ size = rc.size();
+ stride = 1;
+
+ /* VGPRs occupy file indices [256, 256 + max vgpr demand); SGPRs occupy
+ * [0, max sgpr demand) and require 2-register alignment for 2-dword
+ * values and 4-register alignment for >= 4-dword values. */
+ if (rc.type() == RegType::vgpr) {
+ lb = 256;
+ ub = 256 + ctx.program->max_reg_demand.vgpr;
+ } else {
+ lb = 0;
+ ub = ctx.program->max_reg_demand.sgpr;
+ if (size == 2)
+ stride = 2;
+ else if (size >= 4)
+ stride = 4;
+ }
+
+ if (rc.is_subdword()) {
+ /* stride in bytes */
+ if(!instr_can_access_subdword(instr))
+ stride = 4;
+ else if (rc.bytes() % 4 == 0)
+ stride = 4;
+ else if (rc.bytes() % 2 == 0)
+ stride = 2;
+ }
+ }
};
class RegisterFile {
}
}
-bool instr_can_access_subdword(aco_ptr<Instruction>& instr)
-{
- return instr->isSDWA() || instr->format == Format::PSEUDO;
-}
-
std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
RegisterFile& reg_file,
- uint32_t lb, uint32_t ub,
- uint32_t size, uint32_t stride,
- RegClass rc)
+ DefInfo info)
{
+ uint32_t lb = info.lb;
+ uint32_t ub = info.ub;
+ uint32_t size = info.size;
+ uint32_t stride = info.stride;
+ RegClass rc = info.rc;
+
if (rc.is_subdword()) {
for (std::pair<uint32_t, std::array<uint32_t, 4>> entry : reg_file.subdword_regs) {
assert(reg_file[entry.first] == 0xF0000000);
/* best fit algorithm: find the smallest gap to fit in the variable */
if (stride == 1) {
+
+ if (rc.type() == RegType::vgpr && (size == 4 || size == 8)) {
+ info.stride = 4;
+ std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
+ if (res.second)
+ return res;
+ }
+
unsigned best_pos = 0xFFFF;
unsigned gap_size = 0xFFFF;
unsigned next_pos = 0xFFFF;
for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
unsigned id = it->second;
assignment& var = ctx.assignments[id];
- uint32_t size = var.rc.size();
- uint32_t stride = 1;
- if (var.rc.type() == RegType::sgpr) {
- if (size == 2)
- stride = 2;
- if (size > 3)
- stride = 4;
- }
+ DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc);
+ uint32_t size = info.size;
/* check if this is a dead operand, then we can re-use the space from the definition */
bool is_dead_operand = false;
}
}
} else {
- res = get_reg_simple(ctx, reg_file, def_reg_lo, def_reg_hi + 1, size, stride, var.rc);
+ info.lb = def_reg_lo;
+ info.ub = def_reg_hi + 1;
+ res = get_reg_simple(ctx, reg_file, info);
}
} else {
- res = get_reg_simple(ctx, reg_file, lb, def_reg_lo, size, stride, var.rc);
+ info.lb = lb;
+ info.ub = def_reg_lo;
+ res = get_reg_simple(ctx, reg_file, info);
if (!res.second) {
- unsigned lb = (def_reg_hi + stride) & ~(stride - 1);
- res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, var.rc);
+ info.lb = (def_reg_hi + info.stride) & ~(info.stride - 1);
+ info.ub = ub;
+ res = get_reg_simple(ctx, reg_file, info);
}
}
/* we use a sliding window to find potential positions */
unsigned reg_lo = lb;
unsigned reg_hi = lb + size - 1;
+ unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
(reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
RegisterFile& reg_file,
std::vector<std::pair<Operand, Definition>>& parallelcopies,
- uint32_t lb, uint32_t ub,
- uint32_t size, uint32_t stride,
- RegClass rc,
+ DefInfo info,
aco_ptr<Instruction>& instr)
{
+ uint32_t lb = info.lb;
+ uint32_t ub = info.ub;
+ uint32_t size = info.size;
+ uint32_t stride = info.stride;
+ RegClass rc = info.rc;
+
/* check how many free regs we have */
unsigned regs_free = reg_file.count_zero(PhysReg{lb}, ub-lb);
aco_ptr<Instruction>& instr,
PhysReg reg)
{
+ if (rc.is_subdword() && reg.byte() && !instr_can_access_subdword(instr))
+ return false;
+
uint32_t size = rc.size();
uint32_t stride = 1;
uint32_t lb, ub;
ub = ctx.program->max_reg_demand.sgpr;
}
- if (rc.is_subdword() && reg.byte() && !instr_can_access_subdword(instr))
- return false;
-
uint32_t reg_lo = reg.reg();
uint32_t reg_hi = reg + (size - 1);
return true;
}
-std::pair<PhysReg, bool> get_reg_vec(ra_ctx& ctx,
- RegisterFile& reg_file,
- RegClass rc)
-{
- uint32_t size = rc.size();
- uint32_t stride = 1;
- uint32_t lb, ub;
- if (rc.type() == RegType::vgpr) {
- lb = 256;
- ub = 256 + ctx.program->max_reg_demand.vgpr;
- } else {
- lb = 0;
- ub = ctx.program->max_reg_demand.sgpr;
- if (size == 2)
- stride = 2;
- else if (size >= 4)
- stride = 4;
- }
- return get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
-}
-
PhysReg get_reg(ra_ctx& ctx,
RegisterFile& reg_file,
Temp temp,
std::vector<std::pair<Operand, Definition>>& parallelcopies,
aco_ptr<Instruction>& instr)
{
- RegClass rc = temp.regClass();
- uint32_t size = rc.size();
- uint32_t stride = 1;
- uint32_t lb, ub;
- if (rc.type() == RegType::vgpr) {
- lb = 256;
- ub = 256 + ctx.program->max_reg_demand.vgpr;
- } else {
- lb = 0;
- ub = ctx.program->max_reg_demand.sgpr;
- if (size == 2)
- stride = 2;
- else if (size >= 4)
- stride = 4;
+ if (ctx.affinities.find(temp.id()) != ctx.affinities.end() &&
+ ctx.assignments[ctx.affinities[temp.id()]].assigned) {
+ PhysReg reg = ctx.assignments[ctx.affinities[temp.id()]].reg;
+ if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
+ return reg;
}
- if (rc.is_subdword()) {
- /* stride in bytes */
- if(!instr_can_access_subdword(instr))
- stride = 4;
- else if (rc.bytes() % 4 == 0)
- stride = 4;
- else if (rc.bytes() % 2 == 0)
- stride = 2;
+ if (ctx.vectors.find(temp.id()) != ctx.vectors.end()) {
+ Instruction* vec = ctx.vectors[temp.id()];
+ unsigned byte_offset = 0;
+ for (const Operand& op : vec->operands) {
+ if (op.isTemp() && op.tempId() == temp.id())
+ break;
+ else
+ byte_offset += op.bytes();
+ }
+ unsigned k = 0;
+ for (const Operand& op : vec->operands) {
+ if (op.isTemp() &&
+ op.tempId() != temp.id() &&
+ op.getTemp().type() == temp.type() &&
+ ctx.assignments[op.tempId()].assigned) {
+ PhysReg reg = ctx.assignments[op.tempId()].reg;
+ reg.reg_b += (byte_offset - k);
+ if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
+ return reg;
+ }
+ k += op.bytes();
+ }
+
+ DefInfo info(ctx, ctx.pseudo_dummy, vec->definitions[0].regClass());
+ std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
+ PhysReg reg = res.first;
+ if (res.second) {
+ reg.reg_b += byte_offset;
+ /* make sure to only use byte offset if the instruction supports it */
+ if (get_reg_specified(ctx, reg_file, temp.regClass(), parallelcopies, instr, reg))
+ return reg;
+ }
}
- std::pair<PhysReg, bool> res = {{}, false};
+ DefInfo info(ctx, instr, temp.regClass());
+
/* try to find space without live-range splits */
- if (rc.type() == RegType::vgpr && (size == 4 || size == 8))
- res = get_reg_simple(ctx, reg_file, lb, ub, size, 4, rc);
- if (!res.second)
- res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
+ std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
+
if (res.second)
return res.first;
/* try to find space with live-range splits */
- res = get_reg_impl(ctx, reg_file, parallelcopies, lb, ub, size, stride, rc, instr);
+ res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
if (res.second)
return res.first;
/* We should only fail here because keeping under the limit would require
* too many moves. */
- assert(reg_file.count_zero(PhysReg{lb}, ub-lb) >= size);
+ assert(reg_file.count_zero(PhysReg{info.lb}, info.ub-info.lb) >= info.size);
uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
- if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
+ if (info.rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
return get_reg(ctx, reg_file, temp, parallelcopies, instr);
- } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
+ } else if (info.rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
return get_reg(ctx, reg_file, temp, parallelcopies, instr);
}
pc_op.setFixed(operand.physReg());
/* find free reg */
- PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, instr);
+ PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, ctx.pseudo_dummy);
Definition pc_def = Definition(PhysReg{reg}, pc_op.regClass());
register_file.clear(pc_op);
parallelcopy.emplace_back(pc_op, pc_def);
void register_allocation(Program *program, std::vector<TempSet>& live_out_per_block)
{
ra_ctx ctx(program);
-
- std::unordered_map<unsigned, Instruction*> vectors;
std::vector<std::vector<Temp>> phi_ressources;
std::unordered_map<unsigned, unsigned> temp_to_phi_ressources;
/* add vector affinities */
if (instr->opcode == aco_opcode::p_create_vector) {
for (const Operand& op : instr->operands) {
- if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type())
- vectors[op.tempId()] = instr.get();
+ if (op.isTemp() && op.isFirstKill() && op.getTemp().type() == instr->definitions[0].getTemp().type())
+ ctx.vectors[op.tempId()] = instr.get();
}
}
else if (instr->opcode == aco_opcode::p_split_vector) {
PhysReg reg = instr->operands[0].physReg();
reg.reg_b += i * definition.bytes();
- if (!get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
- reg = get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr);
- definition.setFixed(reg);
+ if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
+ definition.setFixed(reg);
} else if (instr->opcode == aco_opcode::p_wqm) {
PhysReg reg;
if (instr->operands[0].isKillBeforeDef() && instr->operands[0].getTemp().type() == definition.getTemp().type()) {
reg = instr->operands[0].physReg();
+ definition.setFixed(reg);
assert(register_file[reg.reg()] == 0);
- } else {
- reg = get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr);
}
- definition.setFixed(reg);
} else if (instr->opcode == aco_opcode::p_extract_vector) {
PhysReg reg;
if (instr->operands[0].isKillBeforeDef() &&
reg = instr->operands[0].physReg();
reg.reg_b += definition.bytes() * instr->operands[1].constantValue();
assert(!register_file.test(reg, definition.bytes()));
- } else {
- reg = get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr);
+ definition.setFixed(reg);
}
- definition.setFixed(reg);
} else if (instr->opcode == aco_opcode::p_create_vector) {
PhysReg reg = get_reg_create_vector(ctx, register_file, definition.getTemp(),
parallelcopy, instr);
definition.setFixed(reg);
- } else if (ctx.affinities.find(definition.tempId()) != ctx.affinities.end() &&
- ctx.assignments[ctx.affinities[definition.tempId()]].assigned) {
- PhysReg reg = ctx.assignments[ctx.affinities[definition.tempId()]].reg;
- if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
- definition.setFixed(reg);
- else
- definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr));
+ }
- } else if (vectors.find(definition.tempId()) != vectors.end()) {
- Instruction* vec = vectors[definition.tempId()];
- unsigned byte_offset = 0;
- for (const Operand& op : vec->operands) {
- if (op.isTemp() && op.tempId() == definition.tempId())
- break;
- else
- byte_offset += op.bytes();
+ if (!definition.isFixed()) {
+ Temp tmp = definition.getTemp();
+ /* subdword instructions before RDNA write full registers */
+ if (tmp.regClass().is_subdword() &&
+ !instr_can_access_subdword(instr) &&
+ ctx.program->chip_class <= GFX9) {
+ assert(tmp.bytes() <= 4);
+ tmp = Temp(definition.tempId(), v1);
}
- unsigned k = 0;
- for (const Operand& op : vec->operands) {
- if (op.isTemp() &&
- op.tempId() != definition.tempId() &&
- op.getTemp().type() == definition.getTemp().type() &&
- ctx.assignments[op.tempId()].assigned) {
- PhysReg reg = ctx.assignments[op.tempId()].reg;
- reg.reg_b += (byte_offset - k);
- if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg)) {
- definition.setFixed(reg);
- break;
- }
- }
- k += op.bytes();
- }
- if (!definition.isFixed()) {
- std::pair<PhysReg, bool> res = get_reg_vec(ctx, register_file, vec->definitions[0].regClass());
- PhysReg reg = res.first;
- if (res.second) {
- reg.reg_b += byte_offset;
- /* make sure to only use byte offset if the instruction supports it */
- if (vec->definitions[0].regClass().is_subdword() && reg.byte() && !instr_can_access_subdword(instr))
- reg = get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr);
- } else {
- reg = get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr);
- }
- definition.setFixed(reg);
- }
- } else
- definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr));
+ definition.setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
+ }
assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
(definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));