return false;
}
+ bool is_empty_or_blocked(PhysReg start) { /* true when the entry recorded for the byte addressed by `start` is 0 or the all-ones value (presumably "empty" and "blocked" respectively -- confirm against fill/block helpers) */
+ if (regs[start] == 0xF0000000) { /* 0xF0000000 in regs: the entry is looked up per byte in subdword_regs instead */
+ return subdword_regs[start][start.byte()] + 1 <= 1; /* assuming unsigned 32-bit entries, value + 1 wraps, so this holds only for 0 and 0xFFFFFFFF */
+ }
+ return regs[start] + 1 <= 1; /* same wrap-around test on the whole-dword entry */
+ }
+
void clear(PhysReg start, RegClass rc) {
if (rc.is_subdword())
fill_subdword(start, rc.bytes(), 0);
clear(def.physReg(), def.regClass());
}
+ unsigned get_id(PhysReg reg) { /* returns the value recorded for the byte addressed by `reg` (presumably the id of the temporary occupying it -- confirm) */
+ return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg]; /* 0xF0000000 in regs redirects the lookup to the per-byte subdword_regs entry; otherwise the dword entry itself is returned */
+ }
+
private:
void fill(PhysReg start, unsigned size, uint32_t val) {
for (unsigned i = 0; i < size; i++)
unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx, RegClass rc)
{
+ /* v_readfirstlane_b32 cannot use SDWA */
+ if (instr->opcode == aco_opcode::p_as_uniform)
+ return 4;
if (instr->format == Format::PSEUDO && chip >= GFX8)
return rc.bytes() % 2 == 0 ? 2 : 1;
if (can_use_SDWA(chip, instr)) {
return std::make_pair(rc.bytes(), rc.bytes());
} else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, 1)) {
- return std::make_pair(2u, chip >= GFX10 ? 2u : 4u);
+ return std::make_pair(2u, bytes_written);
}
switch (instr->opcode) {
void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
std::vector<std::pair<Operand, Definition>>& parallelcopies,
- aco_ptr<Instruction>& instr)
+ aco_ptr<Instruction>& instr, bool rename_not_killed_ops)
{
/* allocate id's and rename operands: this is done transparently here */
for (std::pair<Operand, Definition>& copy : parallelcopies) {
reg_file.fill(copy.second);
/* check if we moved an operand */
- for (Operand& op : instr->operands) {
+ bool first = true;
+ for (unsigned i = 0; i < instr->operands.size(); i++) {
+ Operand& op = instr->operands[i];
if (!op.isTemp())
continue;
if (op.tempId() == copy.first.tempId()) {
- bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKillBeforeDef();
+ bool omit_renaming = !rename_not_killed_ops && !op.isKillBeforeDef();
for (std::pair<Operand, Definition>& pc : parallelcopies) {
PhysReg def_reg = pc.second.physReg();
omit_renaming &= def_reg > copy.first.physReg() ?
(copy.first.physReg() + copy.first.size() <= def_reg.reg()) :
(def_reg + pc.second.size() <= copy.first.physReg().reg());
}
- if (omit_renaming)
+ if (omit_renaming) {
+ if (first)
+ op.setFirstKill(true);
+ else
+ op.setKill(true);
+ first = false;
continue;
+ }
op.setTemp(copy.second.getTemp());
op.setFixed(copy.second.physReg());
}
RegClass rc = info.rc;
if (stride == 1) {
-
+ info.rc = RegClass(rc.type(), size);
for (unsigned stride = 8; stride > 1; stride /= 2) {
if (size % stride)
continue;
PhysReg reg(def_reg_lo);
for (unsigned i = 0; i < instr->operands.size(); i++) {
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
- assert(!reg_file.test(reg, var.rc.bytes()));
- res = {reg, !var.rc.is_subdword() || (reg.byte() % info.stride == 0)};
+ res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && !reg_file.test(reg, var.rc.bytes())};
break;
}
reg.reg_b += instr->operands[i].bytes();
}
+ if (!res.second)
+ res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};
} else {
info.lb = def_reg_lo;
info.ub = def_reg_hi + 1;
}
} else {
info.lb = lb;
- info.ub = def_reg_lo;
+ info.ub = MIN2(def_reg_lo, ub);
res = get_reg_simple(ctx, reg_file, info);
- if (!res.second) {
+ if (!res.second && def_reg_hi < ub) {
info.lb = (def_reg_hi + info.stride) & ~(info.stride - 1);
info.ub = ub;
res = get_reg_simple(ctx, reg_file, info);
unsigned reg_hi = lb + size - 1;
for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
/* first check the edges: this is what we have to fix to allow for num_moves > size */
- if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
+ if (reg_lo > lb && !reg_file.is_empty_or_blocked(PhysReg(reg_lo)) &&
+ reg_file.get_id(PhysReg(reg_lo)) == reg_file.get_id(PhysReg(reg_lo).advance(-1)))
continue;
- if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
+ if (reg_hi < ub - 1 && !reg_file.is_empty_or_blocked(PhysReg(reg_hi).advance(3)) &&
+ reg_file.get_id(PhysReg(reg_hi).advance(3)) == reg_file.get_id(PhysReg(reg_hi).advance(4)))
continue;
/* second, check that we have at most k=num_moves elements in the window
/* we set the definition regs == 0. the actual caller is responsible for correct setting */
reg_file.clear(PhysReg{best_pos}, rc);
- update_renames(ctx, reg_file, parallelcopies, instr);
+ update_renames(ctx, reg_file, parallelcopies, instr, instr->opcode != aco_opcode::p_create_vector);
/* remove killed operands from reg_file once again */
for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
//FIXME: if nothing helps, shift-rotate the registers to make space
- fprintf(stderr, "ACO: failed to allocate registers during shader compilation\n");
+ aco_err(ctx.program, "Failed to allocate registers during shader compilation.");
abort();
}
// TODO: this can be improved */
if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
continue;
- if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
+ if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file.get_id(PhysReg(reg_lo)) == reg_file.get_id(PhysReg(reg_lo).advance(-1)))
continue;
- if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
+ if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file.get_id(PhysReg(reg_hi).advance(3)) == reg_file.get_id(PhysReg(reg_hi).advance(4)))
continue;
/* count variables to be moved and check war_hint */
/* collect variables to be moved */
std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
- /* GFX9+: move killed operands which aren't yet at the correct position
- * Moving all killed operands generally leads to more register swaps.
- * This is only done on GFX9+ because of the cheap v_swap instruction.
- */
- if (ctx.program->chip_class >= GFX9) {
- for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
- if (instr->operands[i].isTemp() &&
- instr->operands[i].isFirstKillBeforeDef() &&
- instr->operands[i].getTemp().type() == rc.type() &&
- instr->operands[i].physReg().reg_b != best_pos * 4 + offset) {
- vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
- reg_file.clear(instr->operands[i]);
- }
+ for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
+ if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||
+ instr->operands[i].getTemp().type() != rc.type())
+ continue;
+ bool correct_pos = instr->operands[i].physReg().reg_b == best_pos * 4 + offset;
+ /* GFX9+: move killed operands which aren't yet at the correct position
+ * Moving all killed operands generally leads to more register swaps.
+ * This is only done on GFX9+ because of the cheap v_swap instruction.
+ */
+ if (ctx.program->chip_class >= GFX9 && !correct_pos) {
+ vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
+ reg_file.clear(instr->operands[i]);
+ /* fill operands which are in the correct position to avoid overwriting */
+ } else if (correct_pos) {
+ reg_file.fill(instr->operands[i]);
}
}
ASSERTED bool success = false;
success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
assert(success);
- update_renames(ctx, reg_file, parallelcopies, instr);
+ update_renames(ctx, reg_file, parallelcopies, instr, false);
adjust_max_used_regs(ctx, rc, best_pos);
/* remove killed operands from reg_file once again */
Definition pc_def = Definition(dst, pc_op.regClass());
register_file.clear(pc_op);
parallelcopy.emplace_back(pc_op, pc_def);
- update_renames(ctx, register_file, parallelcopy, instr);
+ update_renames(ctx, register_file, parallelcopy, instr, true);
}
Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
Operand op = Operand();
if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy)
op = instr->operands[i];
- else if (instr->opcode == aco_opcode::v_mad_f32 && !instr->usesModifiers())
+ else if ((instr->opcode == aco_opcode::v_mad_f32 ||
+ (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
+ instr->opcode == aco_opcode::v_mad_f16 ||
+ instr->opcode == aco_opcode::v_mad_legacy_f16 ||
+ (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) && !instr->usesModifiers())
op = instr->operands[2];
if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
}
/* try to optimize v_mad_f32 -> v_mac_f32 */
- if (instr->opcode == aco_opcode::v_mad_f32 &&
+ if ((instr->opcode == aco_opcode::v_mad_f32 ||
+ (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
+ instr->opcode == aco_opcode::v_mad_f16 ||
+ instr->opcode == aco_opcode::v_mad_legacy_f16 ||
+ (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) &&
instr->operands[2].isTemp() &&
instr->operands[2].isKillBeforeDef() &&
instr->operands[2].getTemp().type() == RegType::vgpr &&
instr->operands[1].isTemp() &&
instr->operands[1].getTemp().type() == RegType::vgpr &&
- !instr->usesModifiers()) {
+ !instr->usesModifiers() &&
+ instr->operands[0].physReg().byte() == 0 &&
+ instr->operands[1].physReg().byte() == 0 &&
+ instr->operands[2].physReg().byte() == 0) {
unsigned def_id = instr->definitions[0].tempId();
auto it = ctx.affinities.find(def_id);
if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned ||
instr->operands[2].physReg() == ctx.assignments[it->second].reg ||
register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) {
instr->format = Format::VOP2;
- instr->opcode = aco_opcode::v_mac_f32;
+ switch (instr->opcode) {
+ case aco_opcode::v_mad_f32:
+ instr->opcode = aco_opcode::v_mac_f32;
+ break;
+ case aco_opcode::v_fma_f32:
+ instr->opcode = aco_opcode::v_fmac_f32;
+ break;
+ case aco_opcode::v_mad_f16:
+ case aco_opcode::v_mad_legacy_f16:
+ instr->opcode = aco_opcode::v_mac_f16;
+ break;
+ case aco_opcode::v_fma_f16:
+ instr->opcode = aco_opcode::v_fmac_f16;
+ break;
+ default:
+ break;
+ }
}
}
/* handle definitions which must have the same register as an operand */
if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
instr->opcode == aco_opcode::v_mac_f32 ||
+ instr->opcode == aco_opcode::v_fmac_f32 ||
+ instr->opcode == aco_opcode::v_mac_f16 ||
+ instr->opcode == aco_opcode::v_fmac_f16 ||
instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
instr->definitions[0].setFixed(instr->operands[2].physReg());
adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
/* check if the target register is blocked */
- if (register_file[definition.physReg().reg()] != 0) {
- /* create parallelcopy pair to move blocking var */
- Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].rc};
- Operand pc_op = Operand(tmp);
- pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg()]].reg);
- RegClass rc = pc_op.regClass();
- tmp = Temp{program->allocateId(), rc};
- Definition pc_def = Definition(tmp);
-
- /* re-enable the killed operands, so that we don't move the blocking var there */
+ if (register_file.test(definition.physReg(), definition.bytes())) {
+ /* create parallelcopy pair to move blocking vars */
+ std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, register_file, definition.physReg(), definition.size());
+
+ /* re-enable the killed operands, so that we don't move the blocking vars there */
for (const Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKillBeforeDef())
register_file.fill(op);
}
- /* find a new register for the blocking variable */
- PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, instr);
+ ASSERTED bool success = false;
+ DefInfo info(ctx, instr, definition.regClass(), -1);
+ success = get_regs_for_copies(ctx, register_file, parallelcopy,
+ vars, info.lb, info.ub, instr,
+ definition.physReg(),
+ definition.physReg() + definition.size() - 1);
+ assert(success);
+
+ update_renames(ctx, register_file, parallelcopy, instr, false);
+
/* once again, disable killed operands */
for (const Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKillBeforeDef())
if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
register_file.fill(instr->definitions[k]);
}
- pc_def.setFixed(reg);
-
- /* finish assignment of parallelcopy */
- ctx.assignments.emplace_back(reg, pc_def.regClass());
- assert(ctx.assignments.size() == ctx.program->peekAllocationId());
- parallelcopy.emplace_back(pc_op, pc_def);
-
- /* add changes to reg_file */
- register_file.clear(pc_op);
- register_file.fill(pc_def);
}
ctx.defs_done.set(i);
if (!sgpr_operands_alias_defs) {
unsigned reg = parallelcopy[i].first.physReg().reg();
unsigned size = parallelcopy[i].first.getTemp().size();
- sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u);
+ sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size);
reg = parallelcopy[i].second.physReg().reg();
size = parallelcopy[i].second.getTemp().size();
- if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u))
+ if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size))
sgpr_operands_alias_defs = true;
}
}