return false;
}
+ bool is_empty_or_blocked(PhysReg start) { /* true when the id stored for start is 0, or wraps to 0 when incremented (presumably 0 = empty and all-ones = blocked; confirm against the fill/block helpers) */
+ if (regs[start] == 0xF0000000) { /* 0xF0000000 in regs: the id is looked up per byte in subdword_regs instead */
+ return subdword_regs[start][start.byte()] + 1 <= 1;
+ }
+ return regs[start] + 1 <= 1; /* a single compare accepts both values; relies on unsigned wraparound -- assumes regs holds 32-bit unsigned ids, TODO confirm */
+ }
+
void clear(PhysReg start, RegClass rc) {
if (rc.is_subdword())
fill_subdword(start, rc.bytes(), 0);
clear(def.physReg(), def.regClass());
}
+ unsigned get_id(PhysReg reg) { /* returns the id recorded for reg: the per-byte subdword_regs entry when regs[reg] equals the 0xF0000000 marker, otherwise regs[reg] itself */
+ return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
+ }
+
private:
void fill(PhysReg start, unsigned size, uint32_t val) {
for (unsigned i = 0; i < size; i++)
unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx, RegClass rc)
{
+ /* v_readfirstlane_b32 cannot use SDWA */
+ if (instr->opcode == aco_opcode::p_as_uniform)
+ return 4;
if (instr->format == Format::PSEUDO && chip >= GFX8)
return rc.bytes() % 2 == 0 ? 2 : 1;
if (can_use_SDWA(chip, instr)) {
return std::make_pair(rc.bytes(), rc.bytes());
} else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, 1)) {
- return std::make_pair(2u, chip >= GFX10 ? 2u : 4u);
+ return std::make_pair(2u, bytes_written);
}
switch (instr->opcode) {
RegClass rc = info.rc;
if (stride == 1) {
-
+ info.rc = RegClass(rc.type(), size);
for (unsigned stride = 8; stride > 1; stride /= 2) {
if (size % stride)
continue;
PhysReg reg(def_reg_lo);
for (unsigned i = 0; i < instr->operands.size(); i++) {
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
- assert(!reg_file.test(reg, var.rc.bytes()));
- res = {reg, !var.rc.is_subdword() || (reg.byte() % info.stride == 0)};
+ res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && !reg_file.test(reg, var.rc.bytes())};
break;
}
reg.reg_b += instr->operands[i].bytes();
}
+ if (!res.second)
+ res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};
} else {
info.lb = def_reg_lo;
info.ub = def_reg_hi + 1;
unsigned reg_hi = lb + size - 1;
for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
/* first check the edges: this is what we have to fix to allow for num_moves > size */
- if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
+ if (reg_lo > lb && !reg_file.is_empty_or_blocked(PhysReg(reg_lo)) &&
+ reg_file.get_id(PhysReg(reg_lo)) == reg_file.get_id(PhysReg(reg_lo).advance(-1)))
continue;
- if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
+ if (reg_hi < ub - 1 && !reg_file.is_empty_or_blocked(PhysReg(reg_hi).advance(3)) &&
+ reg_file.get_id(PhysReg(reg_hi).advance(3)) == reg_file.get_id(PhysReg(reg_hi).advance(4)))
continue;
/* second, check that we have at most k=num_moves elements in the window
// TODO: this can be improved */
if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
continue;
- if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
+ if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file.get_id(PhysReg(reg_lo)) == reg_file.get_id(PhysReg(reg_lo).advance(-1)))
continue;
- if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
+ if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file.get_id(PhysReg(reg_hi).advance(3)) == reg_file.get_id(PhysReg(reg_hi).advance(4)))
continue;
/* count variables to be moved and check war_hint */
/* collect variables to be moved */
std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
- /* GFX9+: move killed operands which aren't yet at the correct position
- * Moving all killed operands generally leads to more register swaps.
- * This is only done on GFX9+ because of the cheap v_swap instruction.
- */
- if (ctx.program->chip_class >= GFX9) {
- for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
- if (instr->operands[i].isTemp() &&
- instr->operands[i].isFirstKillBeforeDef() &&
- instr->operands[i].getTemp().type() == rc.type() &&
- instr->operands[i].physReg().reg_b != best_pos * 4 + offset) {
- vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
- reg_file.clear(instr->operands[i]);
- }
+ for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
+ if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||
+ instr->operands[i].getTemp().type() != rc.type())
+ continue;
+ bool correct_pos = instr->operands[i].physReg().reg_b == best_pos * 4 + offset;
+ /* GFX9+: move killed operands which aren't yet at the correct position
+ * Moving all killed operands generally leads to more register swaps.
+ * This is only done on GFX9+ because of the cheap v_swap instruction.
+ */
+ if (ctx.program->chip_class >= GFX9 && !correct_pos) {
+ vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
+ reg_file.clear(instr->operands[i]);
+ /* fill operands which are in the correct position to avoid overwriting */
+ } else if (correct_pos) {
+ reg_file.fill(instr->operands[i]);
}
}
ASSERTED bool success = false;
adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
/* check if the target register is blocked */
- if (register_file[definition.physReg().reg()] != 0) {
+ if (register_file.test(definition.physReg(), definition.bytes())) {
/* create parallelcopy pair to move blocking vars */
std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, register_file, definition.physReg(), definition.size());