return false;
}
- void block(PhysReg start, unsigned num_bytes) {
- if (start.byte() || num_bytes % 4)
- fill_subdword(start, num_bytes, 0xFFFFFFFF);
+ void block(PhysReg start, RegClass rc) {
+ if (rc.is_subdword())
+ fill_subdword(start, rc.bytes(), 0xFFFFFFFF);
else
- fill(start, num_bytes / 4, 0xFFFFFFFF);
+ fill(start, rc.size(), 0xFFFFFFFF);
}
bool is_blocked(PhysReg start) {
reg_found &= entry.second[i + j] == 0;
/* check neighboring reg if needed */
- reg_found &= (i <= 4 - rc.bytes() || reg_file[entry.first + 1] == 0);
+ reg_found &= ((int)i <= 4 - (int)rc.bytes() || reg_file[entry.first + 1] == 0);
if (reg_found) {
PhysReg res{entry.first};
res.reg_b += i;
}
stride = 1; /* stride in full registers */
+ rc = info.rc = RegClass(RegType::vgpr, size);
}
- /* best fit algorithm: find the smallest gap to fit in the variable */
if (stride == 1) {
- if (rc.type() == RegType::vgpr && (size == 4 || size == 8)) {
- info.stride = 4;
+ for (unsigned stride = 8; stride > 1; stride /= 2) {
+ if (size % stride)
+ continue;
+ info.stride = stride;
std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
if (res.second)
return res;
}
+ /* best fit algorithm: find the smallest gap to fit in the variable */
unsigned best_pos = 0xFFFF;
unsigned gap_size = 0xFFFF;
- unsigned next_pos = 0xFFFF;
+ unsigned last_pos = 0xFFFF;
for (unsigned current_reg = lb; current_reg < ub; current_reg++) {
- if (reg_file[current_reg] != 0 || ctx.war_hint[current_reg]) {
- if (next_pos == 0xFFFF)
- continue;
- /* check if the variable fits */
- if (next_pos + size > current_reg) {
- next_pos = 0xFFFF;
+ if (reg_file[current_reg] == 0 && !ctx.war_hint[current_reg]) {
+ if (last_pos == 0xFFFF)
+ last_pos = current_reg;
+
+ /* stop searching after max_used_gpr */
+ if (current_reg == ctx.max_used_sgpr + 1 || current_reg == 256 + ctx.max_used_vgpr + 1)
+ break;
+ else
continue;
- }
+ }
- /* check if the tested gap is smaller */
- if (current_reg - next_pos < gap_size) {
- best_pos = next_pos;
- gap_size = current_reg - next_pos;
- }
- next_pos = 0xFFFF;
+ if (last_pos == 0xFFFF)
continue;
+
+ /* early return on exact matches */
+ if (last_pos + size == current_reg) {
+ adjust_max_used_regs(ctx, rc, last_pos);
+ return {PhysReg{last_pos}, true};
}
- if (next_pos == 0xFFFF)
- next_pos = current_reg;
+ /* check if it fits and the gap size is smaller */
+ if (last_pos + size < current_reg && current_reg - last_pos < gap_size) {
+ best_pos = last_pos;
+ gap_size = current_reg - last_pos;
+ }
+ last_pos = 0xFFFF;
}
/* final check */
- if (next_pos != 0xFFFF &&
- next_pos + size <= ub &&
- ub - next_pos < gap_size) {
- best_pos = next_pos;
- gap_size = ub - next_pos;
+ if (last_pos + size <= ub && ub - last_pos < gap_size) {
+ best_pos = last_pos;
+ gap_size = ub - last_pos;
}
- if (best_pos != 0xFFFF) {
- adjust_max_used_regs(ctx, rc, best_pos);
- return {PhysReg{best_pos}, true};
+
+ if (best_pos == 0xFFFF)
+ return {{}, false};
+
+ /* find best position within gap by leaving a good stride for other variables */
+ unsigned buffer = gap_size - size;
+ if (buffer > 1) {
+ if (((best_pos + size) % 8 != 0 && (best_pos + buffer) % 8 == 0) ||
+ ((best_pos + size) % 4 != 0 && (best_pos + buffer) % 4 == 0) ||
+ ((best_pos + size) % 2 != 0 && (best_pos + buffer) % 2 == 0))
+ best_pos = best_pos + buffer;
}
- return {{}, false};
+
+ adjust_max_used_regs(ctx, rc, best_pos);
+ return {PhysReg{best_pos}, true};
}
bool found = false;
if (res.second) {
/* mark the area as blocked */
- reg_file.block(res.first, var.rc.bytes());
+ reg_file.block(res.first, var.rc);
/* create parallelcopy pair (without definition id) */
Temp tmp = Temp(id, var.rc);
std::set<std::pair<unsigned, unsigned>> new_vars = collect_vars(ctx, reg_file, PhysReg{reg_lo}, size);
/* mark the area as blocked */
- reg_file.block(PhysReg{reg_lo}, size * 4);
+ reg_file.block(PhysReg{reg_lo}, var.rc);
if (!get_regs_for_copies(ctx, reg_file, parallelcopies, new_vars, lb, ub, instr, def_reg_lo, def_reg_hi))
return false;
instr->operands[j].physReg() < ub) {
assert(instr->operands[j].isFixed());
assert(!reg_file.test(instr->operands[j].physReg(), instr->operands[j].bytes()));
- reg_file.block(instr->operands[j].physReg(), instr->operands[j].bytes());
+ reg_file.block(instr->operands[j].physReg(), instr->operands[j].regClass());
killed_ops += instr->operands[j].getTemp().size();
}
}
{
if (rc.is_subdword() && reg.byte() && !instr_can_access_subdword(instr))
return false;
+ if (!rc.is_subdword() && reg.byte())
+ return false;
uint32_t size = rc.size();
uint32_t stride = 1;
//FIXME: if nothing helps, shift-rotate the registers to make space
- unreachable("did not find a register");
+ fprintf(stderr, "ACO: failed to allocate registers during shader compilation\n");
+ abort();
}
PhysReg get_reg_create_vector(ra_ctx& ctx,
/* add vector affinities */
if (instr->opcode == aco_opcode::p_create_vector) {
for (const Operand& op : instr->operands) {
- if (op.isTemp() && op.getTemp().type() == instr->definitions[0].getTemp().type())
+ if (op.isTemp() && op.isFirstKill() && op.getTemp().type() == instr->definitions[0].getTemp().type())
ctx.vectors[op.tempId()] = instr.get();
}
}
if (it != temp_to_phi_ressources.end() && def.regClass() == phi_ressources[it->second][0].regClass()) {
phi_ressources[it->second][0] = def.getTemp();
/* try to coalesce phi affinities with parallelcopies */
- if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) {
- Operand op = instr->operands[i];
- if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
- phi_ressources[it->second].emplace_back(op.getTemp());
- temp_to_phi_ressources[op.tempId()] = it->second;
- }
+ Operand op = Operand();
+ if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy)
+ op = instr->operands[i];
+ else if (instr->opcode == aco_opcode::v_mad_f32 && !instr->usesModifiers())
+ op = instr->operands[2];
+
+ if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
+ phi_ressources[it->second].emplace_back(op.getTemp());
+ temp_to_phi_ressources[op.tempId()] = it->second;
}
}
}
instr->operands[2].isKillBeforeDef() &&
instr->operands[2].getTemp().type() == RegType::vgpr &&
instr->operands[1].isTemp() &&
- instr->operands[1].getTemp().type() == RegType::vgpr) { /* TODO: swap src0 and src1 in this case */
- VOP3A_instruction* vop3 = static_cast<VOP3A_instruction*>(instr.get());
- bool can_use_mac = !(vop3->abs[0] || vop3->abs[1] || vop3->abs[2] ||
- vop3->neg[0] || vop3->neg[1] || vop3->neg[2] ||
- vop3->clamp || vop3->omod || vop3->opsel);
- if (can_use_mac) {
- instr->format = Format::VOP2;
- instr->opcode = aco_opcode::v_mac_f32;
- }
+ instr->operands[1].getTemp().type() == RegType::vgpr &&
+ !instr->usesModifiers()) {
+ instr->format = Format::VOP2;
+ instr->opcode = aco_opcode::v_mac_f32;
}
/* handle definitions which must have the same register as an operand */
definition.setFixed(definition.physReg());
else if (instr->opcode == aco_opcode::p_split_vector) {
PhysReg reg = instr->operands[0].physReg();
- reg.reg_b += i * definition.bytes();
+ for (unsigned j = 0; j < i; j++)
+ reg.reg_b += instr->definitions[j].bytes();
if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
definition.setFixed(reg);
} else if (instr->opcode == aco_opcode::p_wqm) {
definition.setFixed(reg);
}
- if (!definition.isFixed())
- definition.setFixed(get_reg(ctx, register_file, definition.getTemp(), parallelcopy, instr));
+ if (!definition.isFixed()) {
+ Temp tmp = definition.getTemp();
+ /* subdword instructions before RDNA write full registers */
+ if (tmp.regClass().is_subdword() &&
+ !instr_can_access_subdword(instr) &&
+ ctx.program->chip_class <= GFX9) {
+ assert(tmp.bytes() <= 4);
+ tmp = Temp(definition.tempId(), v1);
+ }
+ definition.setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
+ }
assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
(definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
}
for (const Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKill())
- register_file.block(op.physReg(), op.bytes());
+ register_file.block(op.physReg(), op.regClass());
}
handle_pseudo(ctx, register_file, pc.get());
register_file.clear(def);
for (const Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKill())
- register_file.block(op.physReg(), op.bytes());
+ register_file.block(op.physReg(), op.regClass());
}
Temp tmp = {program->allocateId(), can_sgpr ? s1 : v1};
ctx.assignments.emplace_back();