std::unordered_map<unsigned, phi_info> phi_map;
std::unordered_map<unsigned, unsigned> affinities;
std::unordered_map<unsigned, Instruction*> vectors;
+ aco_ptr<Instruction> pseudo_dummy;
unsigned max_used_sgpr = 0;
unsigned max_used_vgpr = 0;
std::bitset<64> defs_done; /* see MAX_ARGS in aco_instruction_selection_setup.cpp */
renames(program->blocks.size()),
incomplete_phis(program->blocks.size()),
filled(program->blocks.size()),
- sealed(program->blocks.size()) {}
+ sealed(program->blocks.size())
+ {
+ pseudo_dummy.reset(create_instruction<Instruction>(aco_opcode::p_parallelcopy, Format::PSEUDO, 0, 0));
+ }
+};
+
+/* Whether an instruction may address individual bytes of a register:
+ * only SDWA instructions and pseudo instructions qualify here. */
+bool instr_can_access_subdword(aco_ptr<Instruction>& instr)
+{
+ return instr->isSDWA() || instr->format == Format::PSEUDO;
+}
+
+/* Placement constraints for one definition: the register-file search
+ * range [lb, ub), the size of the variable and the stride at which
+ * candidate positions are tried (registers, or bytes for subdword). */
+struct DefInfo {
+ uint16_t lb;
+ uint16_t ub;
+ uint8_t size;
+ uint8_t stride;
+ RegClass rc;
+
+ DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc) : rc(rc) {
+ size = rc.size();
+ stride = 1;
+
+ if (rc.type() == RegType::vgpr) {
+ /* VGPRs live at offset 256 in the register file */
+ lb = 256;
+ ub = 256 + ctx.program->max_reg_demand.vgpr;
+ } else {
+ lb = 0;
+ ub = ctx.program->max_reg_demand.sgpr;
+ /* multi-register SGPR defs are tried at aligned positions:
+ * stride 2 for pairs, stride 4 for anything larger */
+ if (size == 2)
+ stride = 2;
+ else if (size >= 4)
+ stride = 4;
+ }
+
+ if (rc.is_subdword()) {
+ /* stride in bytes */
+ if(!instr_can_access_subdword(instr))
+ stride = 4;
+ else if (rc.bytes() % 4 == 0)
+ stride = 4;
+ else if (rc.bytes() % 2 == 0)
+ stride = 2;
+ }
+ }
};
class RegisterFile {
return false;
}
- void block(PhysReg start, unsigned num_bytes) {
- if (start.byte() || num_bytes % 4)
- fill_subdword(start, num_bytes, 0xFFFFFFFF);
+ /* Mark the registers covered by rc (starting at start) as blocked by
+ * filling them with the 0xFFFFFFFF sentinel. */
+ void block(PhysReg start, RegClass rc) {
+ if (rc.is_subdword())
+ fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
else
- fill(start, num_bytes / 4, 0xFFFFFFFF);
+ fill(start, rc.size(), 0xFFFFFFFF);
}
bool is_blocked(PhysReg start) {
}
}
-bool instr_can_access_subdword(aco_ptr<Instruction>& instr)
-{
- return instr->isSDWA() || instr->format == Format::PSEUDO;
-}
-
std::pair<PhysReg, bool> get_reg_simple(ra_ctx& ctx,
RegisterFile& reg_file,
- uint32_t lb, uint32_t ub,
- uint32_t size, uint32_t stride,
- RegClass rc)
+ DefInfo info)
{
+ uint32_t lb = info.lb;
+ uint32_t ub = info.ub;
+ uint32_t size = info.size;
+ uint32_t stride = info.stride;
+ RegClass rc = info.rc;
+
if (rc.is_subdword()) {
for (std::pair<uint32_t, std::array<uint32_t, 4>> entry : reg_file.subdword_regs) {
assert(reg_file[entry.first] == 0xF0000000);
reg_found &= entry.second[i + j] == 0;
/* check neighboring reg if needed */
- reg_found &= (i <= 4 - rc.bytes() || reg_file[entry.first + 1] == 0);
+ reg_found &= ((int)i <= 4 - (int)rc.bytes() || reg_file[entry.first + 1] == 0);
if (reg_found) {
PhysReg res{entry.first};
res.reg_b += i;
}
stride = 1; /* stride in full registers */
+ rc = info.rc = RegClass(RegType::vgpr, size);
}
- /* best fit algorithm: find the smallest gap to fit in the variable */
if (stride == 1) {
+
+ for (unsigned stride = 8; stride > 1; stride /= 2) {
+ if (size % stride)
+ continue;
+ info.stride = stride;
+ std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
+ if (res.second)
+ return res;
+ }
+
+ /* best fit algorithm: find the smallest gap to fit in the variable */
unsigned best_pos = 0xFFFF;
unsigned gap_size = 0xFFFF;
- unsigned next_pos = 0xFFFF;
+ unsigned last_pos = 0xFFFF;
for (unsigned current_reg = lb; current_reg < ub; current_reg++) {
- if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) {
- if (next_pos == 0xFFFF)
- continue;
- /* check if the variable fits */
- if (next_pos + size > current_reg) {
- next_pos = 0xFFFF;
+ if (reg_file[current_reg] == 0 && !ctx.war_hint[current_reg]) {
+ if (last_pos == 0xFFFF)
+ last_pos = current_reg;
+
+ /* stop searching after max_used_gpr */
+ if (current_reg == ctx.max_used_sgpr + 1 || current_reg == 256 + ctx.max_used_vgpr + 1)
+ break;
+ else
continue;
- }
+ }
- /* check if the tested gap is smaller */
- if (current_reg - next_pos < gap_size) {
- best_pos = next_pos;
- gap_size = current_reg - next_pos;
- }
- next_pos = 0xFFFF;
+ if (last_pos == 0xFFFF)
continue;
+
+ /* early return on exact matches */
+ if (last_pos + size == current_reg) {
+ adjust_max_used_regs(ctx, rc, last_pos);
+ return {PhysReg{last_pos}, true};
}
- if (next_pos == 0xFFFF)
- next_pos = current_reg;
+ /* check if it fits and the gap size is smaller */
+ if (last_pos + size < current_reg && current_reg - last_pos < gap_size) {
+ best_pos = last_pos;
+ gap_size = current_reg - last_pos;
+ }
+ last_pos = 0xFFFF;
}
/* final check */
- if (next_pos != 0xFFFF &&
- next_pos + size <= ub &&
- ub - next_pos < gap_size) {
- best_pos = next_pos;
- gap_size = ub - next_pos;
+ if (last_pos + size <= ub && ub - last_pos < gap_size) {
+ best_pos = last_pos;
+ gap_size = ub - last_pos;
}
- if (best_pos != 0xFFFF) {
- adjust_max_used_regs(ctx, rc, best_pos);
- return {PhysReg{best_pos}, true};
+
+ if (best_pos == 0xFFFF)
+ return {{}, false};
+
+ /* find best position within gap by leaving a good stride for other variables*/
+ unsigned buffer = gap_size - size;
+ if (buffer > 1) {
+ if (((best_pos + size) % 8 != 0 && (best_pos + buffer) % 8 == 0) ||
+ ((best_pos + size) % 4 != 0 && (best_pos + buffer) % 4 == 0) ||
+ ((best_pos + size) % 2 != 0 && (best_pos + buffer) % 2 == 0))
+ best_pos = best_pos + buffer;
}
- return {{}, false};
+
+ adjust_max_used_regs(ctx, rc, best_pos);
+ return {PhysReg{best_pos}, true};
}
bool found = false;
for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
unsigned id = it->second;
assignment& var = ctx.assignments[id];
- uint32_t size = var.rc.size();
- uint32_t stride = 1;
- if (var.rc.type() == RegType::sgpr) {
- if (size == 2)
- stride = 2;
- if (size > 3)
- stride = 4;
- }
+ DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc);
+ uint32_t size = info.size;
/* check if this is a dead operand, then we can re-use the space from the definition */
bool is_dead_operand = false;
}
}
} else {
- res = get_reg_simple(ctx, reg_file, def_reg_lo, def_reg_hi + 1, size, stride, var.rc);
+ info.lb = def_reg_lo;
+ info.ub = def_reg_hi + 1;
+ res = get_reg_simple(ctx, reg_file, info);
}
} else {
- res = get_reg_simple(ctx, reg_file, lb, def_reg_lo, size, stride, var.rc);
+ info.lb = lb;
+ info.ub = def_reg_lo;
+ res = get_reg_simple(ctx, reg_file, info);
if (!res.second) {
- unsigned lb = (def_reg_hi + stride) & ~(stride - 1);
- res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, var.rc);
+ info.lb = (def_reg_hi + info.stride) & ~(info.stride - 1);
+ info.ub = ub;
+ res = get_reg_simple(ctx, reg_file, info);
}
}
if (res.second) {
/* mark the area as blocked */
- reg_file.block(res.first, var.rc.bytes());
+ reg_file.block(res.first, var.rc);
/* create parallelcopy pair (without definition id) */
Temp tmp = Temp(id, var.rc);
/* we use a sliding window to find potential positions */
unsigned reg_lo = lb;
unsigned reg_hi = lb + size - 1;
+ unsigned stride = var.rc.is_subdword() ? 1 : info.stride;
for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
if (!is_dead_operand && ((reg_lo >= def_reg_lo && reg_lo <= def_reg_hi) ||
(reg_hi >= def_reg_lo && reg_hi <= def_reg_hi)))
std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, PhysReg{reg_lo}, size);
/* mark the area as blocked */
- reg_file.block(PhysReg{reg_lo}, size * 4);
+ reg_file.block(PhysReg{reg_lo}, var.rc);
if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
return false;
std::pair<PhysReg, bool> get_reg_impl(ra_ctx& ctx,
RegisterFile& reg_file,
std::vector<std::pair<Operand, Definition>>& parallelcopies,
- uint32_t lb, uint32_t ub,
- uint32_t size, uint32_t stride,
- RegClass rc,
+ DefInfo info,
aco_ptr<Instruction>& instr)
{
+ uint32_t lb = info.lb;
+ uint32_t ub = info.ub;
+ uint32_t size = info.size;
+ uint32_t stride = info.stride;
+ RegClass rc = info.rc;
+
/* check how many free regs we have */
unsigned regs_free = reg_file.count_zero(PhysReg{lb}, ub-lb);
instr->operands[j].physReg() < ub) {
assert(instr->operands[j].isFixed());
assert(!reg_file.test(instr->operands[j].physReg(), instr->operands[j].bytes()));
- reg_file.block(instr->operands[j].physReg(), instr->operands[j].bytes());
+ reg_file.block(instr->operands[j].physReg(), instr->operands[j].regClass());
killed_ops += instr->operands[j].getTemp().size();
}
}
{
if (rc.is_subdword() && reg.byte() && !instr_can_access_subdword(instr))
return false;
+ if (!rc.is_subdword() && reg.byte())
+ return false;
uint32_t size = rc.size();
uint32_t stride = 1;
return true;
}
-std::pair<PhysReg, bool> get_reg_vec(ra_ctx& ctx,
- RegisterFile& reg_file,
- RegClass rc)
-{
- uint32_t size = rc.size();
- uint32_t stride = 1;
- uint32_t lb, ub;
- if (rc.type() == RegType::vgpr) {
- lb = 256;
- ub = 256 + ctx.program->max_reg_demand.vgpr;
- } else {
- lb = 0;
- ub = ctx.program->max_reg_demand.sgpr;
- if (size == 2)
- stride = 2;
- else if (size >= 4)
- stride = 4;
- }
- return get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
-}
-
PhysReg get_reg(ra_ctx& ctx,
RegisterFile& reg_file,
Temp temp,
k += op.bytes();
}
- std::pair<PhysReg, bool> res = get_reg_vec(ctx, reg_file, vec->definitions[0].regClass());
+ DefInfo info(ctx, ctx.pseudo_dummy, vec->definitions[0].regClass());
+ std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
PhysReg reg = res.first;
if (res.second) {
reg.reg_b += byte_offset;
}
}
- RegClass rc = temp.regClass();
- uint32_t size = rc.size();
- uint32_t stride = 1;
- uint32_t lb, ub;
- if (rc.type() == RegType::vgpr) {
- lb = 256;
- ub = 256 + ctx.program->max_reg_demand.vgpr;
- } else {
- lb = 0;
- ub = ctx.program->max_reg_demand.sgpr;
- if (size == 2)
- stride = 2;
- else if (size >= 4)
- stride = 4;
- }
-
- if (rc.is_subdword()) {
- /* stride in bytes */
- if(!instr_can_access_subdword(instr))
- stride = 4;
- else if (rc.bytes() % 4 == 0)
- stride = 4;
- else if (rc.bytes() % 2 == 0)
- stride = 2;
- }
+ DefInfo info(ctx, instr, temp.regClass());
- std::pair<PhysReg, bool> res = {{}, false};
/* try to find space without live-range splits */
- if (rc.type() == RegType::vgpr && (size == 4 || size == 8))
- res = get_reg_simple(ctx, reg_file, lb, ub, size, 4, rc);
- if (!res.second)
- res = get_reg_simple(ctx, reg_file, lb, ub, size, stride, rc);
+ std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
+
if (res.second)
return res.first;
/* try to find space with live-range splits */
- res = get_reg_impl(ctx, reg_file, parallelcopies, lb, ub, size, stride, rc, instr);
+ res = get_reg_impl(ctx, reg_file, parallelcopies, info, instr);
if (res.second)
return res.first;
/* We should only fail here because keeping under the limit would require
* too many moves. */
- assert(reg_file.count_zero(PhysReg{lb}, ub-lb) >= size);
+ assert(reg_file.count_zero(PhysReg{info.lb}, info.ub-info.lb) >= info.size);
uint16_t max_addressible_sgpr = ctx.program->sgpr_limit;
uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
- if (rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
+ if (info.rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
return get_reg(ctx, reg_file, temp, parallelcopies, instr);
- } else if (rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
+ } else if (info.rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
return get_reg(ctx, reg_file, temp, parallelcopies, instr);
}
//FIXME: if nothing helps, shift-rotate the registers to make space
- unreachable("did not find a register");
+ fprintf(stderr, "ACO: failed to allocate registers during shader compilation\n");
+ abort();
}
PhysReg get_reg_create_vector(ra_ctx& ctx,
pc_op.setFixed(operand.physReg());
/* find free reg */
- PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, instr);
+ PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, ctx.pseudo_dummy);
Definition pc_def = Definition(PhysReg{reg}, pc_op.regClass());
register_file.clear(pc_op);
parallelcopy.emplace_back(pc_op, pc_def);
/* add vector affinities */
if (instr->opcode == aco_opcode::p_create_vector) {
for (const Operand& op : instr->operands) {
- if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type())
+ if (op.isTemp() && op.isFirstKill() && op.getTemp().type() == instr->definitions[0].getTemp().type())
ctx.vectors[op.tempId()] = instr.get();
}
}
if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
phi_ressources[it->second][0] = def.getTemp();
/* try to coalesce phi affinities with parallelcopies */
- if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) {
- Operand op = instr->operands[i];
- if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
- phi_ressources[it->second].emplace_back(op.getTemp());
- temp_to_phi_ressources[op.tempId()] = it->second;
- }
+ Operand op = Operand();
+ if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy)
+ op = instr->operands[i];
+ else if (instr->opcode == aco_opcode::v_mad_f32 && !instr->usesModifiers())
+ op = instr->operands[2];
+
+ if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
+ phi_ressources[it->second].emplace_back(op.getTemp());
+ temp_to_phi_ressources[op.tempId()] = it->second;
}
}
}
instr->operands[2].isKillBeforeDef() &&
instr->operands[2].getTemp().type() == RegType::vgpr &&
instr->operands[1].isTemp() &&
- instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */
- VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
- bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
- vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
- vop3->clamp || vop3->omod || vop3->opsel);
- if (can_use_mac) {
- instr->format = Format::VOP2;
- instr->opcode = aco_opcode::v_mac_f32;
- }
+ instr->operands[1].getTemp().type() == RegType::vgpr &&
+ !instr->usesModifiers()) {
+ instr->format = Format::VOP2;
+ instr->opcode = aco_opcode::v_mac_f32;
}
/* handle definitions which must have the same register as an operand */
definition.setFixed(definition.physReg());
else if (instr->opcode == aco_opcode::p_split_vector) {
PhysReg reg = instr->operands[0].physReg();
- reg.reg_b += i * definition.bytes();
+ for (unsigned j = 0; j < i; j++)
+ reg.reg_b += instr->definitions[j].bytes();
if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
definition.setFixed(reg);
} else if (instr->opcode == aco_opcode::p_wqm) {
definition.setFixed(reg);
}
- if (!definition.isFixed())
- definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr));
+ if (!definition.isFixed()) {
+ Temp tmp = definition.getTemp();
+ /* subdword instructions before RDNA write full registers */
+ if (tmp.regClass().is_subdword() &&
+ !instr_can_access_subdword(instr) &&
+ ctx.program->chip_class <= GFX9) {
+ assert(tmp.bytes() <= 4);
+ tmp = Temp(definition.tempId(), v1);
+ }
+ definition.setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
+ }
assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
(definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
}
for (const Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKill())
- register_file.block(op.physReg(), op.bytes());
+ register_file.block(op.physReg(), op.regClass());
}
handle_pseudo(ctx, register_file, pc.get());
register_file.clear(def);
for (const Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKill())
- register_file.block(op.physReg(), op.bytes());
+ register_file.block(op.physReg(), op.regClass());
}
Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1};
ctx.assignments.emplace_back();