namespace aco {
namespace {
+unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx, RegClass rc);
+void add_subdword_operand(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, RegClass rc);
+std::pair<unsigned, unsigned> get_subdword_definition_info(Program *program, const aco_ptr<Instruction>& instr, RegClass rc);
+void add_subdword_definition(Program *program, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg, bool is_partial);
+
struct assignment {
PhysReg reg;
RegClass rc;
}
};
-bool instr_can_access_subdword(ra_ctx& ctx, aco_ptr<Instruction>& instr)
-{
- if (ctx.program->chip_class < GFX8)
- return false;
- return instr->isSDWA() || instr->format == Format::PSEUDO;
-}
-
struct DefInfo {
uint16_t lb;
uint16_t ub;
uint8_t stride;
RegClass rc;
- DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc) : rc(rc) {
+ DefInfo(ra_ctx& ctx, aco_ptr<Instruction>& instr, RegClass rc_, int operand) : rc(rc_) {
size = rc.size();
stride = 1;
stride = 4;
}
- if (rc.is_subdword()) {
+ if (rc.is_subdword() && operand >= 0) {
/* stride in bytes */
- if(!instr_can_access_subdword(ctx, instr))
- stride = 4;
- else if (rc.bytes() % 4 == 0)
- stride = 4;
- else if (rc.bytes() % 2 == 0)
- stride = 2;
+ stride = get_subdword_operand_stride(ctx.program->chip_class, instr, operand, rc);
+ } else if (rc.is_subdword()) {
+ std::pair<unsigned, unsigned> info = get_subdword_definition_info(ctx.program, instr, rc);
+ stride = info.first;
+ if (info.second > rc.bytes()) {
+ rc = RegClass::get(rc.type(), info.second);
+ size = rc.size();
+ /* we might still be able to put the definition in the high half,
+ * but that's only useful for affinities and this information isn't
+ * used for them */
+ stride = align(stride, info.second);
+ if (!rc.is_subdword())
+ stride = DIV_ROUND_UP(stride, 4);
+ }
+ assert(stride > 0);
}
}
};
return false;
}
+ /* Returns true iff the register at `start` holds no temporary: the entry is
+  * either 0 (empty) or 0xFFFFFFFF (presumably the "blocked" marker — confirm).
+  * Unsigned wraparound makes `val + 1 <= 1` true for exactly those two values. */
+ bool is_empty_or_blocked(PhysReg start) {
+ /* 0xF0000000 marks a dword that is sub-divided into byte allocations;
+  * in that case, test the individual byte's entry instead */
+ if (regs[start] == 0xF0000000) {
+ return subdword_regs[start][start.byte()] + 1 <= 1;
+ }
+ return regs[start] + 1 <= 1;
+ }
+
void clear(PhysReg start, RegClass rc) {
if (rc.is_subdword())
fill_subdword(start, rc.bytes(), 0);
clear(def.physReg(), def.regClass());
}
+ /* Returns the temp-id stored at `reg`. For dwords carrying the 0xF0000000
+  * sub-dword sentinel, the id lives in the per-byte table, indexed by the
+  * register's byte offset. */
+ unsigned get_id(PhysReg reg) {
+ return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg];
+ }
+
private:
void fill(PhysReg start, unsigned size, uint32_t val) {
for (unsigned i = 0; i < size; i++)
#endif
+/* Returns the stride (in bytes) at which operand `idx` of `instr` may be
+ * placed within a dword, given the operand's sub-dword register class `rc`.
+ * A stride of 4 means the operand must start at a dword boundary. */
+unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr<Instruction>& instr, unsigned idx, RegClass rc)
+{
+ /* pseudo instructions can read from any byte on GFX8+; word-sized
+  * operands still need word alignment */
+ if (instr->format == Format::PSEUDO && chip >= GFX8)
+ return rc.bytes() % 2 == 0 ? 2 : 1;
+
+ if (instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
+ /* any byte works: the opcode can be switched to v_cvt_f32_ubyte1/2/3
+  * (see add_subdword_operand) */
+ return 1;
+ } else if (can_use_SDWA(chip, instr)) {
+ return rc.bytes() % 2 == 0 ? 2 : 1;
+ } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, idx, 1)) {
+ /* opsel can select the high half of a 16-bit operand */
+ return 2;
+ }
+
+ /* stores with a *_d16_hi opcode variant can take their data operand from
+  * the high half of a register on newer chips */
+ switch (instr->opcode) {
+ case aco_opcode::ds_write_b8:
+ case aco_opcode::ds_write_b16:
+ return chip >= GFX8 ? 2 : 4;
+ case aco_opcode::buffer_store_byte:
+ case aco_opcode::buffer_store_short:
+ case aco_opcode::flat_store_byte:
+ case aco_opcode::flat_store_short:
+ case aco_opcode::scratch_store_byte:
+ case aco_opcode::scratch_store_short:
+ case aco_opcode::global_store_byte:
+ case aco_opcode::global_store_short:
+ return chip >= GFX9 ? 2 : 4;
+ default:
+ break;
+ }
+
+ /* no sub-dword access possible: require dword alignment */
+ return 4;
+}
+
+/* Rewrites `instr` (opcode and/or encoding) so that operand `idx` can be
+ * read from byte offset `byte` of its assigned register. Must only be used
+ * with placements that get_subdword_operand_stride() permits. */
+void add_subdword_operand(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, unsigned byte, RegClass rc)
+{
+ /* pseudo instructions handle any byte offset; byte 0 needs no rewrite */
+ if (instr->format == Format::PSEUDO || byte == 0)
+ return;
+
+ assert(rc.bytes() <= 2);
+
+ if (!instr->usesModifiers() && instr->opcode == aco_opcode::v_cvt_f32_ubyte0) {
+ /* select the source byte by switching to the matching opcode variant;
+  * case 0 is unreachable (byte == 0 returns early above) but kept for
+  * completeness */
+ switch (byte) {
+ case 0:
+ instr->opcode = aco_opcode::v_cvt_f32_ubyte0;
+ break;
+ case 1:
+ instr->opcode = aco_opcode::v_cvt_f32_ubyte1;
+ break;
+ case 2:
+ instr->opcode = aco_opcode::v_cvt_f32_ubyte2;
+ break;
+ case 3:
+ instr->opcode = aco_opcode::v_cvt_f32_ubyte3;
+ break;
+ }
+ return;
+ } else if (can_use_SDWA(chip, instr)) {
+ /* NOTE(review): assumes convert_to_SDWA() derives the byte selection
+  * from the operand's fixed register — confirm */
+ convert_to_SDWA(chip, instr);
+ return;
+ } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, idx, byte / 2)) {
+ /* set the opsel bit for this operand to read the high half */
+ VOP3A_instruction *vop3 = static_cast<VOP3A_instruction *>(instr.get());
+ vop3->opsel |= (byte / 2) << idx;
+ return;
+ }
+
+ /* stores: switch to the *_d16_hi variant that reads data from the
+  * register's high half */
+ if (chip >= GFX8 && instr->opcode == aco_opcode::ds_write_b8 && byte == 2) {
+ instr->opcode = aco_opcode::ds_write_b8_d16_hi;
+ return;
+ }
+ if (chip >= GFX8 && instr->opcode == aco_opcode::ds_write_b16 && byte == 2) {
+ instr->opcode = aco_opcode::ds_write_b16_d16_hi;
+ return;
+ }
+
+ if (chip >= GFX9 && byte == 2) {
+ if (instr->opcode == aco_opcode::buffer_store_byte)
+ instr->opcode = aco_opcode::buffer_store_byte_d16_hi;
+ else if (instr->opcode == aco_opcode::buffer_store_short)
+ instr->opcode = aco_opcode::buffer_store_short_d16_hi;
+ else if (instr->opcode == aco_opcode::flat_store_byte)
+ instr->opcode = aco_opcode::flat_store_byte_d16_hi;
+ else if (instr->opcode == aco_opcode::flat_store_short)
+ instr->opcode = aco_opcode::flat_store_short_d16_hi;
+ else if (instr->opcode == aco_opcode::scratch_store_byte)
+ instr->opcode = aco_opcode::scratch_store_byte_d16_hi;
+ else if (instr->opcode == aco_opcode::scratch_store_short)
+ instr->opcode = aco_opcode::scratch_store_short_d16_hi;
+ else if (instr->opcode == aco_opcode::global_store_byte)
+ instr->opcode = aco_opcode::global_store_byte_d16_hi;
+ else if (instr->opcode == aco_opcode::global_store_short)
+ instr->opcode = aco_opcode::global_store_short_d16_hi;
+ else
+ unreachable("Something went wrong: Impossible register assignment.");
+ }
+}
+
+/* minimum_stride, bytes_written */
+/* Returns, for `instr`'s sub-dword definition of class `rc`:
+ *   first  - the byte alignment the definition may be placed at, and
+ *   second - how many bytes the instruction actually clobbers there. */
+std::pair<unsigned, unsigned> get_subdword_definition_info(Program *program, const aco_ptr<Instruction>& instr, RegClass rc)
+{
+ chip_class chip = program->chip_class;
+
+ /* pseudo instructions write exactly rc.bytes() at any word-compatible
+  * offset on GFX8+; otherwise they need full dwords */
+ if (instr->format == Format::PSEUDO && chip >= GFX8)
+ return std::make_pair(rc.bytes() % 2 == 0 ? 2 : 1, rc.bytes());
+ else if (instr->format == Format::PSEUDO)
+ return std::make_pair(4, rc.size() * 4u);
+
+ /* the code below assumes pre-GFX10 instructions clobber the whole dword */
+ unsigned bytes_written = chip >= GFX10 ? rc.bytes() : 4u;
+ switch (instr->opcode) {
+ case aco_opcode::v_mad_f16:
+ case aco_opcode::v_mad_u16:
+ case aco_opcode::v_mad_i16:
+ case aco_opcode::v_fma_f16:
+ case aco_opcode::v_div_fixup_f16:
+ case aco_opcode::v_interp_p2_f16:
+ /* these 16-bit ops preserve the other half of the dword on GFX9+ */
+ bytes_written = chip >= GFX9 ? rc.bytes() : 4u;
+ break;
+ default:
+ break;
+ }
+ /* never report less than the ISA table says the opcode writes */
+ bytes_written = MAX2(bytes_written, instr_info.definition_size[(int)instr->opcode] / 8u);
+
+ if (can_use_SDWA(chip, instr)) {
+ /* SDWA can write any byte range without clobbering the rest */
+ return std::make_pair(rc.bytes(), rc.bytes());
+ } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, 1)) {
+ /* opsel allows a high-half destination, but the clobber size stands */
+ return std::make_pair(2u, bytes_written);
+ }
+
+ switch (instr->opcode) {
+ case aco_opcode::buffer_load_ubyte_d16:
+ case aco_opcode::buffer_load_short_d16:
+ case aco_opcode::flat_load_ubyte_d16:
+ case aco_opcode::flat_load_short_d16:
+ case aco_opcode::scratch_load_ubyte_d16:
+ case aco_opcode::scratch_load_short_d16:
+ case aco_opcode::global_load_ubyte_d16:
+ case aco_opcode::global_load_short_d16:
+ case aco_opcode::ds_read_u8_d16:
+ case aco_opcode::ds_read_u16_d16:
+ /* d16 loads preserve the untouched half on GFX9+, unless SRAM ECC
+  * is enabled, in which case the whole dword is written */
+ if (chip >= GFX9 && !program->sram_ecc_enabled)
+ return std::make_pair(2u, 2u);
+ else
+ return std::make_pair(2u, 4u);
+ default:
+ break;
+ }
+
+ return std::make_pair(4u, bytes_written);
+}
+
+/* Fixes definition `idx` of `instr` to `reg` and rewrites the instruction
+ * (SDWA conversion, opsel bit, or a *_d16_hi opcode) as needed to write at
+ * the sub-dword offset. `is_partial` indicates the allocation does not
+ * cover the whole dword. */
+void add_subdword_definition(Program *program, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg, bool is_partial)
+{
+ RegClass rc = instr->definitions[idx].regClass();
+ chip_class chip = program->chip_class;
+
+ instr->definitions[idx].setFixed(reg);
+
+ if (instr->format == Format::PSEUDO) {
+ /* pseudo instructions need no encoding change */
+ return;
+ } else if (can_use_SDWA(chip, instr)) {
+ /* convert when writing an offset byte, or (pre-GFX10) when the rest
+  * of the dword must be preserved */
+ if (reg.byte() || (is_partial && chip < GFX10))
+ convert_to_SDWA(chip, instr);
+ return;
+ } else if (reg.byte() && rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, reg.byte() / 2)) {
+ VOP3A_instruction *vop3 = static_cast<VOP3A_instruction *>(instr.get());
+ if (reg.byte() == 2)
+ vop3->opsel |= (1 << 3); /* dst in high half */
+ return;
+ }
+
+ /* loads: switch to the *_d16_hi variant that writes the high half */
+ if (reg.byte() == 2) {
+ if (instr->opcode == aco_opcode::buffer_load_ubyte_d16)
+ instr->opcode = aco_opcode::buffer_load_ubyte_d16_hi;
+ else if (instr->opcode == aco_opcode::buffer_load_short_d16)
+ instr->opcode = aco_opcode::buffer_load_short_d16_hi;
+ else if (instr->opcode == aco_opcode::flat_load_ubyte_d16)
+ instr->opcode = aco_opcode::flat_load_ubyte_d16_hi;
+ else if (instr->opcode == aco_opcode::flat_load_short_d16)
+ instr->opcode = aco_opcode::flat_load_short_d16_hi;
+ else if (instr->opcode == aco_opcode::scratch_load_ubyte_d16)
+ instr->opcode = aco_opcode::scratch_load_ubyte_d16_hi;
+ else if (instr->opcode == aco_opcode::scratch_load_short_d16)
+ instr->opcode = aco_opcode::scratch_load_short_d16_hi;
+ else if (instr->opcode == aco_opcode::global_load_ubyte_d16)
+ instr->opcode = aco_opcode::global_load_ubyte_d16_hi;
+ else if (instr->opcode == aco_opcode::global_load_short_d16)
+ instr->opcode = aco_opcode::global_load_short_d16_hi;
+ else if (instr->opcode == aco_opcode::ds_read_u8_d16)
+ instr->opcode = aco_opcode::ds_read_u8_d16_hi;
+ else if (instr->opcode == aco_opcode::ds_read_u16_d16)
+ instr->opcode = aco_opcode::ds_read_u16_d16_hi;
+ else
+ unreachable("Something went wrong: Impossible register assignment.");
+ }
+}
+
void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg)
{
unsigned max_addressible_sgpr = ctx.program->sgpr_limit;
void update_renames(ra_ctx& ctx, RegisterFile& reg_file,
std::vector<std::pair<Operand, Definition>>& parallelcopies,
- aco_ptr<Instruction>& instr)
+ aco_ptr<Instruction>& instr, bool rename_not_killed_ops)
{
/* allocate id's and rename operands: this is done transparently here */
for (std::pair<Operand, Definition>& copy : parallelcopies) {
reg_file.fill(copy.second);
/* check if we moved an operand */
- for (Operand& op : instr->operands) {
+ bool first = true;
+ for (unsigned i = 0; i < instr->operands.size(); i++) {
+ Operand& op = instr->operands[i];
if (!op.isTemp())
continue;
if (op.tempId() == copy.first.tempId()) {
- bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKillBeforeDef();
+ bool omit_renaming = !rename_not_killed_ops && !op.isKillBeforeDef();
for (std::pair<Operand, Definition>& pc : parallelcopies) {
PhysReg def_reg = pc.second.physReg();
omit_renaming &= def_reg > copy.first.physReg() ?
(copy.first.physReg() + copy.first.size() <= def_reg.reg()) :
(def_reg + pc.second.size() <= copy.first.physReg().reg());
}
- if (omit_renaming)
+ if (omit_renaming) {
+ if (first)
+ op.setFirstKill(true);
+ else
+ op.setKill(true);
+ first = false;
continue;
+ }
op.setTemp(copy.second.getTemp());
op.setFixed(copy.second.physReg());
}
uint32_t lb = info.lb;
uint32_t ub = info.ub;
uint32_t size = info.size;
- uint32_t stride = info.stride;
+ uint32_t stride = info.rc.is_subdword() ? DIV_ROUND_UP(info.stride, 4) : info.stride;
RegClass rc = info.rc;
- if (rc.is_subdword()) {
- for (std::pair<uint32_t, std::array<uint32_t, 4>> entry : reg_file.subdword_regs) {
- assert(reg_file[entry.first] == 0xF0000000);
- if (lb > entry.first || entry.first >= ub)
- continue;
-
- for (unsigned i = 0; i < 4; i+= stride) {
- if (entry.second[i] != 0)
- continue;
-
- bool reg_found = true;
- for (unsigned j = 1; reg_found && i + j < 4 && j < rc.bytes(); j++)
- reg_found &= entry.second[i + j] == 0;
-
- /* check neighboring reg if needed */
- reg_found &= ((int)i <= 4 - (int)rc.bytes() || reg_file[entry.first + 1] == 0);
- if (reg_found) {
- PhysReg res{entry.first};
- res.reg_b += i;
- adjust_max_used_regs(ctx, rc, entry.first);
- return {res, true};
- }
- }
- }
-
- stride = 1; /* stride in full registers */
- rc = info.rc = RegClass(RegType::vgpr, size);
- }
-
if (stride == 1) {
-
+ info.rc = RegClass(rc.type(), size);
for (unsigned stride = 8; stride > 1; stride /= 2) {
if (size % stride)
continue;
reg_lo += stride;
}
+ /* do this late because using the upper bytes of a register can require
+ * larger instruction encodings or copies
+ * TODO: don't do this in situations where it doesn't benefit */
+ if (rc.is_subdword()) {
+ for (std::pair<uint32_t, std::array<uint32_t, 4>> entry : reg_file.subdword_regs) {
+ assert(reg_file[entry.first] == 0xF0000000);
+ if (lb > entry.first || entry.first >= ub)
+ continue;
+
+ for (unsigned i = 0; i < 4; i+= info.stride) {
+ if (entry.second[i] != 0)
+ continue;
+
+ bool reg_found = true;
+ for (unsigned j = 1; reg_found && i + j < 4 && j < rc.bytes(); j++)
+ reg_found &= entry.second[i + j] == 0;
+
+ /* check neighboring reg if needed */
+ reg_found &= ((int)i <= 4 - (int)rc.bytes() || reg_file[entry.first + 1] == 0);
+ if (reg_found) {
+ PhysReg res{entry.first};
+ res.reg_b += i;
+ adjust_max_used_regs(ctx, rc, entry.first);
+ return {res, true};
+ }
+ }
+ }
+ }
+
return {{}, false};
}
for (std::set<std::pair<unsigned, unsigned>>::const_reverse_iterator it = vars.rbegin(); it != vars.rend(); ++it) {
unsigned id = it->second;
assignment& var = ctx.assignments[id];
- DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc);
+ DefInfo info = DefInfo(ctx, ctx.pseudo_dummy, var.rc, -1);
uint32_t size = info.size;
- /* check if this is a dead operand, then we can re-use the space from the definition */
+ /* check if this is a dead operand, then we can re-use the space from the definition
+ * also use the correct stride for sub-dword operands */
bool is_dead_operand = false;
- for (unsigned i = 0; !is_phi(instr) && !is_dead_operand && (i < instr->operands.size()); i++) {
- if (instr->operands[i].isTemp() && instr->operands[i].isKillBeforeDef() && instr->operands[i].tempId() == id)
- is_dead_operand = true;
+ for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
+ if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
+ if (instr->operands[i].isKillBeforeDef())
+ is_dead_operand = true;
+ info = DefInfo(ctx, instr, var.rc, i);
+ break;
+ }
}
std::pair<PhysReg, bool> res;
if (is_dead_operand) {
if (instr->opcode == aco_opcode::p_create_vector) {
- for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
+ PhysReg reg(def_reg_lo);
+ for (unsigned i = 0; i < instr->operands.size(); i++) {
if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) {
- PhysReg reg(def_reg_lo);
- reg.reg_b += offset;
- assert(!reg_file.test(reg, var.rc.bytes()));
- res = {reg, reg.byte() == 0 || instr_can_access_subdword(ctx, instr)};
+ res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && !reg_file.test(reg, var.rc.bytes())};
break;
}
+ reg.reg_b += instr->operands[i].bytes();
}
+ if (!res.second)
+ res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())};
} else {
info.lb = def_reg_lo;
info.ub = def_reg_hi + 1;
}
} else {
info.lb = lb;
- info.ub = def_reg_lo;
+ info.ub = MIN2(def_reg_lo, ub);
res = get_reg_simple(ctx, reg_file, info);
- if (!res.second) {
+ if (!res.second && def_reg_hi < ub) {
info.lb = (def_reg_hi + info.stride) & ~(info.stride - 1);
info.ub = ub;
res = get_reg_simple(ctx, reg_file, info);
unsigned reg_hi = lb + size - 1;
for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) {
/* first check the edges: this is what we have to fix to allow for num_moves > size */
- if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
+ if (reg_lo > lb && !reg_file.is_empty_or_blocked(PhysReg(reg_lo)) &&
+ reg_file.get_id(PhysReg(reg_lo)) == reg_file.get_id(PhysReg(reg_lo).advance(-1)))
continue;
- if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
+ if (reg_hi < ub - 1 && !reg_file.is_empty_or_blocked(PhysReg(reg_hi).advance(3)) &&
+ reg_file.get_id(PhysReg(reg_hi).advance(3)) == reg_file.get_id(PhysReg(reg_hi).advance(4)))
continue;
/* second, check that we have at most k=num_moves elements in the window
/* now, we figured the placement for our definition */
std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
- if (instr->opcode == aco_opcode::p_create_vector && ctx.program->chip_class >= GFX9) {
- /* move killed operands which aren't yet at the correct position */
- for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].size(), i++) {
- if (instr->operands[i].isTemp() && instr->operands[i].isFirstKillBeforeDef() &&
- instr->operands[i].getTemp().type() == rc.type()) {
-
- if (instr->operands[i].physReg() != best_pos + offset) {
- vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
- reg_file.clear(instr->operands[i]);
+ if (instr->opcode == aco_opcode::p_create_vector) {
+ /* move killed operands which aren't yet at the correct position (GFX9+)
+ * or which are in the definition space */
+ PhysReg reg = PhysReg{best_pos};
+ for (Operand& op : instr->operands) {
+ if (op.isTemp() && op.isFirstKillBeforeDef() &&
+ op.getTemp().type() == rc.type()) {
+ if (op.physReg() != reg &&
+ (ctx.program->chip_class >= GFX9 ||
+ (op.physReg().advance(op.bytes()) > PhysReg{best_pos} &&
+ op.physReg() < PhysReg{best_pos + size}))) {
+ vars.emplace(op.bytes(), op.tempId());
+ reg_file.clear(op);
} else {
- reg_file.fill(instr->operands[i]);
+ reg_file.fill(op);
}
}
+ reg.reg_b += op.bytes();
}
- } else {
- /* re-enable the killed operands */
- for (unsigned j = 0; !is_phi(instr) && j < instr->operands.size(); j++) {
- if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill())
- reg_file.fill(instr->operands[j]);
+ } else if (!is_phi(instr)) {
+ /* re-enable killed operands */
+ for (Operand& op : instr->operands) {
+ if (op.isTemp() && op.isFirstKillBeforeDef())
+ reg_file.fill(op);
}
}
/* we set the definition regs == 0. the actual caller is responsible for correct setting */
reg_file.clear(PhysReg{best_pos}, rc);
- update_renames(ctx, reg_file, parallelcopies, instr);
+ update_renames(ctx, reg_file, parallelcopies, instr, instr->opcode != aco_opcode::p_create_vector);
/* remove killed operands from reg_file once again */
for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) {
aco_ptr<Instruction>& instr,
PhysReg reg)
{
- if (rc.is_subdword() && reg.byte() && !instr_can_access_subdword(ctx, instr))
+ std::pair<unsigned, unsigned> sdw_def_info;
+ if (rc.is_subdword())
+ sdw_def_info = get_subdword_definition_info(ctx.program, instr, rc);
+
+ if (rc.is_subdword() && reg.byte() % sdw_def_info.first)
return false;
if (!rc.is_subdword() && reg.byte())
return false;
if (reg_lo < lb || reg_hi >= ub || reg_lo > reg_hi)
return false;
- if (reg_file.test(reg, rc.bytes()))
- return false;
+ if (rc.is_subdword()) {
+ PhysReg test_reg;
+ test_reg.reg_b = reg.reg_b & ~(sdw_def_info.second - 1);
+ if (reg_file.test(test_reg, sdw_def_info.second))
+ return false;
+ } else {
+ if (reg_file.test(reg, rc.bytes()))
+ return false;
+ }
adjust_max_used_regs(ctx, rc, reg_lo);
return true;
RegisterFile& reg_file,
Temp temp,
std::vector<std::pair<Operand, Definition>>& parallelcopies,
- aco_ptr<Instruction>& instr)
+ aco_ptr<Instruction>& instr,
+ int operand_index=-1)
{
auto split_vec = ctx.split_vectors.find(temp.id());
if (split_vec != ctx.split_vectors.end()) {
k += op.bytes();
}
- DefInfo info(ctx, ctx.pseudo_dummy, vec->definitions[0].regClass());
+ DefInfo info(ctx, ctx.pseudo_dummy, vec->definitions[0].regClass(), -1);
std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
PhysReg reg = res.first;
if (res.second) {
}
}
- DefInfo info(ctx, instr, temp.regClass());
+ DefInfo info(ctx, instr, temp.regClass(), operand_index);
/* try to find space without live-range splits */
std::pair<PhysReg, bool> res = get_reg_simple(ctx, reg_file, info);
uint16_t max_addressible_vgpr = ctx.program->vgpr_limit;
if (info.rc.type() == RegType::vgpr && ctx.program->max_reg_demand.vgpr < max_addressible_vgpr) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr + 1, ctx.program->max_reg_demand.sgpr));
- return get_reg(ctx, reg_file, temp, parallelcopies, instr);
+ return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
} else if (info.rc.type() == RegType::sgpr && ctx.program->max_reg_demand.sgpr < max_addressible_sgpr) {
update_vgpr_sgpr_demand(ctx.program, RegisterDemand(ctx.program->max_reg_demand.vgpr, ctx.program->max_reg_demand.sgpr + 1));
- return get_reg(ctx, reg_file, temp, parallelcopies, instr);
+ return get_reg(ctx, reg_file, temp, parallelcopies, instr, operand_index);
}
//FIXME: if nothing helps, shift-rotate the registers to make space
// TODO: this can be improved */
if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0)
continue;
- if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1])
+ if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file.get_id(PhysReg(reg_lo)) == reg_file.get_id(PhysReg(reg_lo).advance(-1)))
continue;
- if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1])
+ if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file.get_id(PhysReg(reg_hi).advance(3)) == reg_file.get_id(PhysReg(reg_hi).advance(4)))
continue;
/* count variables to be moved and check war_hint */
if (num_moves >= bytes)
return get_reg(ctx, reg_file, temp, parallelcopies, instr);
+ /* re-enable killed operands which are in the wrong position */
+ for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
+ if (instr->operands[i].isTemp() &&
+ instr->operands[i].isFirstKillBeforeDef() &&
+ instr->operands[i].physReg().reg_b != best_pos * 4 + offset)
+ reg_file.fill(instr->operands[i]);
+ }
+
/* collect variables to be moved */
std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size);
- /* GFX9+: move killed operands which aren't yet at the correct position
- * Moving all killed operands generally leads to more register swaps.
- * This is only done on GFX9+ because of the cheap v_swap instruction.
- */
- if (ctx.program->chip_class >= GFX9) {
- for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
- if (instr->operands[i].isTemp() &&
- instr->operands[i].isFirstKillBeforeDef() &&
- instr->operands[i].getTemp().type() == rc.type() &&
- instr->operands[i].physReg().reg_b != best_pos * 4 + offset) {
- vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
- }
- }
- } else {
- /* re-enable the killed operands */
- for (unsigned j = 0; j < instr->operands.size(); j++) {
- if (instr->operands[j].isTemp() && instr->operands[j].isFirstKill())
- reg_file.fill(instr->operands[j]);
+ for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) {
+ if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() ||
+ instr->operands[i].getTemp().type() != rc.type())
+ continue;
+ bool correct_pos = instr->operands[i].physReg().reg_b == best_pos * 4 + offset;
+ /* GFX9+: move killed operands which aren't yet at the correct position
+ * Moving all killed operands generally leads to more register swaps.
+ * This is only done on GFX9+ because of the cheap v_swap instruction.
+ */
+ if (ctx.program->chip_class >= GFX9 && !correct_pos) {
+ vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId());
+ reg_file.clear(instr->operands[i]);
+ /* fill operands which are in the correct position to avoid overwriting */
+ } else if (correct_pos) {
+ reg_file.fill(instr->operands[i]);
}
}
ASSERTED bool success = false;
success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1);
assert(success);
- update_renames(ctx, reg_file, parallelcopies, instr);
+ update_renames(ctx, reg_file, parallelcopies, instr, false);
adjust_max_used_regs(ctx, rc, best_pos);
/* remove killed operands from reg_file once again */
}
/* if all operands are constant, no need to care either */
bool reads_sgpr = false;
+ bool reads_subdword = false;
for (Operand& op : instr->operands) {
if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
reads_sgpr = true;
break;
}
+ if (op.isTemp() && op.regClass().is_subdword())
+ reads_subdword = true;
}
- if (!(writes_sgpr && reads_sgpr))
+ bool needs_scratch_reg = (writes_sgpr && reads_sgpr) ||
+ (ctx.program->chip_class <= GFX7 && reads_subdword);
+ if (!needs_scratch_reg)
return;
Pseudo_instruction *pi = (Pseudo_instruction *)instr;
reg = ctx.max_used_sgpr + 1;
for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
;
- assert(reg < ctx.program->max_reg_demand.sgpr);
+ if (reg == ctx.program->max_reg_demand.sgpr) {
+ assert(reads_subdword && reg_file[m0] == 0);
+ reg = m0;
+ }
}
adjust_max_used_regs(ctx, s1, reg);
}
}
-bool operand_can_use_reg(ra_ctx& ctx, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg)
+bool operand_can_use_reg(chip_class chip, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg, RegClass rc)
{
if (instr->operands[idx].isFixed())
return instr->operands[idx].physReg() == reg;
- if (reg.byte() && !instr_can_access_subdword(ctx, instr))
- return false;
+ if (reg.byte()) {
+ unsigned stride = get_subdword_operand_stride(chip, instr, idx, rc);
+ if (reg.byte() % stride)
+ return false;
+ }
switch (instr->format) {
case Format::SMEM:
void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file,
std::vector<std::pair<Operand, Definition>>& parallelcopy,
- aco_ptr<Instruction>& instr, Operand& operand)
+ aco_ptr<Instruction>& instr, Operand& operand, unsigned operand_index)
{
/* check if the operand is fixed */
PhysReg dst;
dst = operand.physReg();
} else {
- dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr);
+ dst = get_reg(ctx, register_file, operand.getTemp(), parallelcopy, instr, operand_index);
}
Operand pc_op = operand;
Definition pc_def = Definition(dst, pc_op.regClass());
register_file.clear(pc_op);
parallelcopy.emplace_back(pc_op, pc_def);
- update_renames(ctx, register_file, parallelcopy, instr);
+ update_renames(ctx, register_file, parallelcopy, instr, true);
}
Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx)
Operand op = Operand();
if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy)
op = instr->operands[i];
- else if (instr->opcode == aco_opcode::v_mad_f32 && !instr->usesModifiers())
+ else if ((instr->opcode == aco_opcode::v_mad_f32 ||
+ (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
+ instr->opcode == aco_opcode::v_mad_f16 ||
+ instr->opcode == aco_opcode::v_mad_legacy_f16 ||
+ (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) && !instr->usesModifiers())
op = instr->operands[2];
if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) {
assert(ctx.assignments[operand.tempId()].assigned);
PhysReg reg = ctx.assignments[operand.tempId()].reg;
- if (operand_can_use_reg(ctx, instr, i, reg))
+ if (operand_can_use_reg(program->chip_class, instr, i, reg, operand.regClass()))
operand.setFixed(reg);
else
- get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand);
+ get_reg_for_operand(ctx, register_file, parallelcopy, instr, operand, i);
if (instr->format == Format::EXP ||
(instr->isVMEM() && i == 3 && ctx.program->chip_class == GFX6) ||
}
/* try to optimize v_mad_f32 -> v_mac_f32 */
- if (instr->opcode == aco_opcode::v_mad_f32 &&
+ if ((instr->opcode == aco_opcode::v_mad_f32 ||
+ (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) ||
+ instr->opcode == aco_opcode::v_mad_f16 ||
+ instr->opcode == aco_opcode::v_mad_legacy_f16 ||
+ (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) &&
instr->operands[2].isTemp() &&
instr->operands[2].isKillBeforeDef() &&
instr->operands[2].getTemp().type() == RegType::vgpr &&
instr->operands[1].isTemp() &&
instr->operands[1].getTemp().type() == RegType::vgpr &&
- !instr->usesModifiers()) {
+ !instr->usesModifiers() &&
+ instr->operands[0].physReg().byte() == 0 &&
+ instr->operands[1].physReg().byte() == 0 &&
+ instr->operands[2].physReg().byte() == 0) {
unsigned def_id = instr->definitions[0].tempId();
auto it = ctx.affinities.find(def_id);
if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned ||
instr->operands[2].physReg() == ctx.assignments[it->second].reg ||
register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) {
instr->format = Format::VOP2;
- instr->opcode = aco_opcode::v_mac_f32;
+ switch (instr->opcode) {
+ case aco_opcode::v_mad_f32:
+ instr->opcode = aco_opcode::v_mac_f32;
+ break;
+ case aco_opcode::v_fma_f32:
+ instr->opcode = aco_opcode::v_fmac_f32;
+ break;
+ case aco_opcode::v_mad_f16:
+ case aco_opcode::v_mad_legacy_f16:
+ instr->opcode = aco_opcode::v_mac_f16;
+ break;
+ case aco_opcode::v_fma_f16:
+ instr->opcode = aco_opcode::v_fmac_f16;
+ break;
+ default:
+ break;
+ }
}
}
/* handle definitions which must have the same register as an operand */
if (instr->opcode == aco_opcode::v_interp_p2_f32 ||
instr->opcode == aco_opcode::v_mac_f32 ||
+ instr->opcode == aco_opcode::v_fmac_f32 ||
+ instr->opcode == aco_opcode::v_mac_f16 ||
+ instr->opcode == aco_opcode::v_fmac_f16 ||
instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64) {
instr->definitions[0].setFixed(instr->operands[2].physReg());
adjust_max_used_regs(ctx, definition.regClass(), definition.physReg());
/* check if the target register is blocked */
- if (register_file[definition.physReg().reg()] != 0) {
- /* create parallelcopy pair to move blocking var */
- Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].rc};
- Operand pc_op = Operand(tmp);
- pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg()]].reg);
- RegClass rc = pc_op.regClass();
- tmp = Temp{program->allocateId(), rc};
- Definition pc_def = Definition(tmp);
-
- /* re-enable the killed operands, so that we don't move the blocking var there */
+ if (register_file.test(definition.physReg(), definition.bytes())) {
+ /* create parallelcopy pair to move blocking vars */
+ std::set<std::pair<unsigned, unsigned>> vars = collect_vars(ctx, register_file, definition.physReg(), definition.size());
+
+ /* re-enable the killed operands, so that we don't move the blocking vars there */
for (const Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKillBeforeDef())
register_file.fill(op);
}
- /* find a new register for the blocking variable */
- PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, instr);
+ ASSERTED bool success = false;
+ DefInfo info(ctx, instr, definition.regClass(), -1);
+ success = get_regs_for_copies(ctx, register_file, parallelcopy,
+ vars, info.lb, info.ub, instr,
+ definition.physReg(),
+ definition.physReg() + definition.size() - 1);
+ assert(success);
+
+ update_renames(ctx, register_file, parallelcopy, instr, false);
+
/* once again, disable killed operands */
for (const Operand& op : instr->operands) {
if (op.isTemp() && op.isFirstKillBeforeDef())
if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill())
register_file.fill(instr->definitions[k]);
}
- pc_def.setFixed(reg);
-
- /* finish assignment of parallelcopy */
- ctx.assignments.emplace_back(reg, pc_def.regClass());
- assert(ctx.assignments.size() == ctx.program->peekAllocationId());
- parallelcopy.emplace_back(pc_op, pc_def);
-
- /* add changes to reg_file */
- register_file.clear(pc_op);
- register_file.fill(pc_def);
}
ctx.defs_done.set(i);
/* handle all other definitions */
for (unsigned i = 0; i < instr->definitions.size(); ++i) {
- auto& definition = instr->definitions[i];
+ Definition *definition = &instr->definitions[i];
- if (definition.isFixed() || !definition.isTemp())
+ if (definition->isFixed() || !definition->isTemp())
continue;
/* find free reg */
- if (definition.hasHint() && register_file[definition.physReg().reg()] == 0)
- definition.setFixed(definition.physReg());
+ if (definition->hasHint() && register_file[definition->physReg().reg()] == 0)
+ definition->setFixed(definition->physReg());
else if (instr->opcode == aco_opcode::p_split_vector) {
PhysReg reg = instr->operands[0].physReg();
for (unsigned j = 0; j < i; j++)
reg.reg_b += instr->definitions[j].bytes();
- if (get_reg_specified(ctx, register_file, definition.regClass(), parallelcopy, instr, reg))
- definition.setFixed(reg);
+ if (get_reg_specified(ctx, register_file, definition->regClass(), parallelcopy, instr, reg))
+ definition->setFixed(reg);
} else if (instr->opcode == aco_opcode::p_wqm || instr->opcode == aco_opcode::p_parallelcopy) {
PhysReg reg = instr->operands[i].physReg();
if (instr->operands[i].isTemp() &&
- instr->operands[i].getTemp().type() == definition.getTemp().type() &&
- !register_file.test(reg, definition.bytes()))
- definition.setFixed(reg);
+ instr->operands[i].getTemp().type() == definition->getTemp().type() &&
+ !register_file.test(reg, definition->bytes()))
+ definition->setFixed(reg);
} else if (instr->opcode == aco_opcode::p_extract_vector) {
PhysReg reg;
if (instr->operands[0].isKillBeforeDef() &&
- instr->operands[0].getTemp().type() == definition.getTemp().type()) {
+ instr->operands[0].getTemp().type() == definition->getTemp().type()) {
reg = instr->operands[0].physReg();
- reg.reg_b += definition.bytes() * instr->operands[1].constantValue();
- assert(!register_file.test(reg, definition.bytes()));
- definition.setFixed(reg);
+ reg.reg_b += definition->bytes() * instr->operands[1].constantValue();
+ assert(!register_file.test(reg, definition->bytes()));
+ definition->setFixed(reg);
}
} else if (instr->opcode == aco_opcode::p_create_vector) {
- PhysReg reg = get_reg_create_vector(ctx, register_file, definition.getTemp(),
+ PhysReg reg = get_reg_create_vector(ctx, register_file, definition->getTemp(),
parallelcopy, instr);
- definition.setFixed(reg);
+ definition->setFixed(reg);
}
- if (!definition.isFixed()) {
- Temp tmp = definition.getTemp();
- if (tmp.regClass().is_subdword() &&
- !instr_can_access_subdword(ctx, instr)) {
- assert(tmp.bytes() <= 4);
- tmp = Temp(definition.tempId(), v1);
+ if (!definition->isFixed()) {
+ Temp tmp = definition->getTemp();
+ if (definition->regClass().is_subdword() && definition->bytes() < 4) {
+ PhysReg reg = get_reg(ctx, register_file, tmp, parallelcopy, instr);
+ bool partial = !(tmp.bytes() <= 4 && reg.byte() == 0 && !register_file.test(reg, 4));
+ add_subdword_definition(program, instr, i, reg, partial);
+ definition = &instr->definitions[i]; /* add_subdword_definition can invalidate the reference */
+ } else {
+ definition->setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
}
- definition.setFixed(get_reg(ctx, register_file, tmp, parallelcopy, instr));
}
- assert(definition.isFixed() && ((definition.getTemp().type() == RegType::vgpr && definition.physReg() >= 256) ||
- (definition.getTemp().type() != RegType::vgpr && definition.physReg() < 256)));
+ assert(definition->isFixed() && ((definition->getTemp().type() == RegType::vgpr && definition->physReg() >= 256) ||
+ (definition->getTemp().type() != RegType::vgpr && definition->physReg() < 256)));
ctx.defs_done.set(i);
/* set live if it has a kill point */
- if (!definition.isKill())
- live.emplace(definition.getTemp());
+ if (!definition->isKill())
+ live.emplace(definition->getTemp());
- ctx.assignments[definition.tempId()] = {definition.physReg(), definition.regClass()};
- register_file.fill(definition);
+ ctx.assignments[definition->tempId()] = {definition->physReg(), definition->regClass()};
+ register_file.fill(*definition);
}
handle_pseudo(ctx, register_file, instr.get());
- /* kill definitions and late-kill operands */
+ /* kill definitions and late-kill operands and ensure that sub-dword operands can actually be read */
for (const Definition& def : instr->definitions) {
if (def.isTemp() && def.isKill())
register_file.clear(def);
}
- for (const Operand& op : instr->operands) {
+ for (unsigned i = 0; i < instr->operands.size(); i++) {
+ const Operand& op = instr->operands[i];
if (op.isTemp() && op.isFirstKill() && op.isLateKill())
register_file.clear(op);
+ if (op.isTemp() && op.physReg().byte() != 0)
+ add_subdword_operand(program->chip_class, instr, i, op.physReg().byte(), op.regClass());
}
/* emit parallelcopy */
}
std::copy(tmp->definitions.begin(), tmp->definitions.end(), instr->definitions.begin());
}
+
instructions.emplace_back(std::move(*it));
} /* end for Instr */