X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Famd%2Fcompiler%2Faco_register_allocation.cpp;h=d5746e5a6369a7520874fcc54a55d817b63f3985;hb=HEAD;hp=74b52301ad395516730d65b6970054fc75a2aff7;hpb=edf863d1d29f7afbca2d53dca963e8fa0362b8a6;p=mesa.git diff --git a/src/amd/compiler/aco_register_allocation.cpp b/src/amd/compiler/aco_register_allocation.cpp index 74b52301ad3..d5746e5a636 100644 --- a/src/amd/compiler/aco_register_allocation.cpp +++ b/src/amd/compiler/aco_register_allocation.cpp @@ -185,6 +185,13 @@ public: return false; } + bool is_empty_or_blocked(PhysReg start) { + if (regs[start] == 0xF0000000) { + return subdword_regs[start][start.byte()] + 1 <= 1; + } + return regs[start] + 1 <= 1; + } + void clear(PhysReg start, RegClass rc) { if (rc.is_subdword()) fill_subdword(start, rc.bytes(), 0); @@ -214,6 +221,10 @@ public: clear(def.physReg(), def.regClass()); } + unsigned get_id(PhysReg reg) { + return regs[reg] == 0xF0000000 ? subdword_regs[reg][reg.byte()] : regs[reg]; + } + private: void fill(PhysReg start, unsigned size, uint32_t val) { for (unsigned i = 0; i < size; i++) @@ -307,6 +318,9 @@ void print_regs(ra_ctx& ctx, bool vgprs, RegisterFile& reg_file) unsigned get_subdword_operand_stride(chip_class chip, const aco_ptr& instr, unsigned idx, RegClass rc) { + /* v_readfirstlane_b32 cannot use SDWA */ + if (instr->opcode == aco_opcode::p_as_uniform) + return 4; if (instr->format == Format::PSEUDO && chip >= GFX8) return rc.bytes() % 2 == 0 ? 2 : 1; @@ -429,7 +443,7 @@ std::pair get_subdword_definition_info(Program *program, con if (can_use_SDWA(chip, instr)) { return std::make_pair(rc.bytes(), rc.bytes()); } else if (rc.bytes() == 2 && can_use_opsel(chip, instr->opcode, -1, 1)) { - return std::make_pair(2u, chip >= GFX10 ? 2u : 4u); + return std::make_pair(2u, bytes_written); } switch (instr->opcode) { @@ -517,7 +531,7 @@ void adjust_max_used_regs(ra_ctx& ctx, RegClass rc, unsigned reg) void update_renames(ra_ctx& ctx, RegisterFile& reg_file, std::vector>& parallelcopies, - aco_ptr& instr) + aco_ptr& instr, bool rename_not_killed_ops) { /* allocate id's and rename operands: this is done transparently here */ for (std::pair& copy : parallelcopies) { @@ -541,19 +555,27 @@ void update_renames(ra_ctx& ctx, RegisterFile& reg_file, reg_file.fill(copy.second); /* check if we moved an operand */ - for (Operand& op : instr->operands) { + bool first = true; + for (unsigned i = 0; i < instr->operands.size(); i++) { + Operand& op = instr->operands[i]; if (!op.isTemp()) continue; if (op.tempId() == copy.first.tempId()) { - bool omit_renaming = instr->opcode == aco_opcode::p_create_vector && !op.isKillBeforeDef(); + bool omit_renaming = !rename_not_killed_ops && !op.isKillBeforeDef(); for (std::pair& pc : parallelcopies) { PhysReg def_reg = pc.second.physReg(); omit_renaming &= def_reg > copy.first.physReg() ? (copy.first.physReg() + copy.first.size() <= def_reg.reg()) : (def_reg + pc.second.size() <= copy.first.physReg().reg()); } - if (omit_renaming) + if (omit_renaming) { + if (first) + op.setFirstKill(true); + else + op.setKill(true); + first = false; continue; + } op.setTemp(copy.second.getTemp()); op.setFixed(copy.second.physReg()); } @@ -572,7 +594,7 @@ std::pair get_reg_simple(ra_ctx& ctx, RegClass rc = info.rc; if (stride == 1) { - + info.rc = RegClass(rc.type(), size); for (unsigned stride = 8; stride > 1; stride /= 2) { if (size % stride) continue; @@ -758,12 +780,13 @@ bool get_regs_for_copies(ra_ctx& ctx, PhysReg reg(def_reg_lo); for (unsigned i = 0; i < instr->operands.size(); i++) { if (instr->operands[i].isTemp() && instr->operands[i].tempId() == id) { - assert(!reg_file.test(reg, var.rc.bytes())); - res = {reg, !var.rc.is_subdword() || (reg.byte() % info.stride == 0)}; + res = {reg, (!var.rc.is_subdword() || (reg.byte() % info.stride == 0)) && !reg_file.test(reg, var.rc.bytes())}; break; } reg.reg_b += instr->operands[i].bytes(); } + if (!res.second) + res = {var.reg, !reg_file.test(var.reg, var.rc.bytes())}; } else { info.lb = def_reg_lo; info.ub = def_reg_hi + 1; @@ -771,9 +794,9 @@ bool get_regs_for_copies(ra_ctx& ctx, } } else { info.lb = lb; - info.ub = def_reg_lo; + info.ub = MIN2(def_reg_lo, ub); res = get_reg_simple(ctx, reg_file, info); - if (!res.second) { + if (!res.second && def_reg_hi < ub) { info.lb = (def_reg_hi + info.stride) & ~(info.stride - 1); info.ub = ub; res = get_reg_simple(ctx, reg_file, info); @@ -934,9 +957,11 @@ std::pair get_reg_impl(ra_ctx& ctx, unsigned reg_hi = lb + size - 1; for (reg_lo = lb, reg_hi = lb + size - 1; reg_hi < ub; reg_lo += stride, reg_hi += stride) { /* first check the edges: this is what we have to fix to allow for num_moves > size */ - if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1]) + if (reg_lo > lb && !reg_file.is_empty_or_blocked(PhysReg(reg_lo)) && + reg_file.get_id(PhysReg(reg_lo)) == reg_file.get_id(PhysReg(reg_lo).advance(-1))) continue; - if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1]) + if (reg_hi < ub - 1 && !reg_file.is_empty_or_blocked(PhysReg(reg_hi).advance(3)) && + reg_file.get_id(PhysReg(reg_hi).advance(3)) == reg_file.get_id(PhysReg(reg_hi).advance(4))) continue; /* second, check that we have at most k=num_moves elements in the window @@ -1065,7 +1090,7 @@ std::pair get_reg_impl(ra_ctx& ctx, /* we set the definition regs == 0. the actual caller is responsible for correct setting */ reg_file.clear(PhysReg{best_pos}, rc); - update_renames(ctx, reg_file, parallelcopies, instr); + update_renames(ctx, reg_file, parallelcopies, instr, instr->opcode != aco_opcode::p_create_vector); /* remove killed operands from reg_file once again */ for (unsigned i = 0; !is_phi(instr) && i < instr->operands.size(); i++) { @@ -1234,7 +1259,7 @@ PhysReg get_reg(ra_ctx& ctx, //FIXME: if nothing helps, shift-rotate the registers to make space - fprintf(stderr, "ACO: failed to allocate registers during shader compilation\n"); + aco_err(ctx.program, "Failed to allocate registers during shader compilation."); abort(); } @@ -1292,9 +1317,9 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, // TODO: this can be improved */ if (reg_lo < lb || reg_hi >= ub || reg_lo % stride != 0) continue; - if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file[reg_lo] == reg_file[reg_lo - 1]) + if (reg_lo > lb && reg_file[reg_lo] != 0 && reg_file.get_id(PhysReg(reg_lo)) == reg_file.get_id(PhysReg(reg_lo).advance(-1))) continue; - if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file[reg_hi] == reg_file[reg_hi + 1]) + if (reg_hi < ub - 1 && reg_file[reg_hi] != 0 && reg_file.get_id(PhysReg(reg_hi).advance(3)) == reg_file.get_id(PhysReg(reg_hi).advance(4))) continue; /* count variables to be moved and check war_hint */ @@ -1352,26 +1377,28 @@ PhysReg get_reg_create_vector(ra_ctx& ctx, /* collect variables to be moved */ std::set> vars = collect_vars(ctx, reg_file, PhysReg{best_pos}, size); - /* GFX9+: move killed operands which aren't yet at the correct position - * Moving all killed operands generally leads to more register swaps. - * This is only done on GFX9+ because of the cheap v_swap instruction. - */ - if (ctx.program->chip_class >= GFX9) { - for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) { - if (instr->operands[i].isTemp() && - instr->operands[i].isFirstKillBeforeDef() && - instr->operands[i].getTemp().type() == rc.type() && - instr->operands[i].physReg().reg_b != best_pos * 4 + offset) { - vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId()); - reg_file.clear(instr->operands[i]); - } + for (unsigned i = 0, offset = 0; i < instr->operands.size(); offset += instr->operands[i].bytes(), i++) { + if (!instr->operands[i].isTemp() || !instr->operands[i].isFirstKillBeforeDef() || + instr->operands[i].getTemp().type() != rc.type()) + continue; + bool correct_pos = instr->operands[i].physReg().reg_b == best_pos * 4 + offset; + /* GFX9+: move killed operands which aren't yet at the correct position + * Moving all killed operands generally leads to more register swaps. + * This is only done on GFX9+ because of the cheap v_swap instruction. + */ + if (ctx.program->chip_class >= GFX9 && !correct_pos) { + vars.emplace(instr->operands[i].bytes(), instr->operands[i].tempId()); + reg_file.clear(instr->operands[i]); + /* fill operands which are in the correct position to avoid overwriting */ + } else if (correct_pos) { + reg_file.fill(instr->operands[i]); } } ASSERTED bool success = false; success = get_regs_for_copies(ctx, reg_file, parallelcopies, vars, lb, ub, instr, best_pos, best_pos + size - 1); assert(success); - update_renames(ctx, reg_file, parallelcopies, instr); + update_renames(ctx, reg_file, parallelcopies, instr, false); adjust_max_used_regs(ctx, rc, best_pos); /* remove killed operands from reg_file once again */ @@ -1510,7 +1537,7 @@ void get_reg_for_operand(ra_ctx& ctx, RegisterFile& register_file, Definition pc_def = Definition(dst, pc_op.regClass()); register_file.clear(pc_op); parallelcopy.emplace_back(pc_op, pc_def); - update_renames(ctx, register_file, parallelcopy, instr); + update_renames(ctx, register_file, parallelcopy, instr, true); } Temp read_variable(ra_ctx& ctx, Temp val, unsigned block_idx) @@ -1726,7 +1753,11 @@ void register_allocation(Program *program, std::vector& live_out_per_bl Operand op = Operand(); if (!def.isFixed() && instr->opcode == aco_opcode::p_parallelcopy) op = instr->operands[i]; - else if (instr->opcode == aco_opcode::v_mad_f32 && !instr->usesModifiers()) + else if ((instr->opcode == aco_opcode::v_mad_f32 || + (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) || + instr->opcode == aco_opcode::v_mad_f16 || + instr->opcode == aco_opcode::v_mad_legacy_f16 || + (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) && !instr->usesModifiers()) op = instr->operands[2]; if (op.isTemp() && op.isFirstKillBeforeDef() && def.regClass() == op.regClass()) { @@ -2001,26 +2032,52 @@ void register_allocation(Program *program, std::vector& live_out_per_bl } /* try to optimize v_mad_f32 -> v_mac_f32 */ - if (instr->opcode == aco_opcode::v_mad_f32 && + if ((instr->opcode == aco_opcode::v_mad_f32 || + (instr->opcode == aco_opcode::v_fma_f32 && program->chip_class >= GFX10) || + instr->opcode == aco_opcode::v_mad_f16 || + instr->opcode == aco_opcode::v_mad_legacy_f16 || + (instr->opcode == aco_opcode::v_fma_f16 && program->chip_class >= GFX10)) && instr->operands[2].isTemp() && instr->operands[2].isKillBeforeDef() && instr->operands[2].getTemp().type() == RegType::vgpr && instr->operands[1].isTemp() && instr->operands[1].getTemp().type() == RegType::vgpr && - !instr->usesModifiers()) { + !instr->usesModifiers() && + instr->operands[0].physReg().byte() == 0 && + instr->operands[1].physReg().byte() == 0 && + instr->operands[2].physReg().byte() == 0) { unsigned def_id = instr->definitions[0].tempId(); auto it = ctx.affinities.find(def_id); if (it == ctx.affinities.end() || !ctx.assignments[it->second].assigned || instr->operands[2].physReg() == ctx.assignments[it->second].reg || register_file.test(ctx.assignments[it->second].reg, instr->operands[2].bytes())) { instr->format = Format::VOP2; - instr->opcode = aco_opcode::v_mac_f32; + switch (instr->opcode) { + case aco_opcode::v_mad_f32: + instr->opcode = aco_opcode::v_mac_f32; + break; + case aco_opcode::v_fma_f32: + instr->opcode = aco_opcode::v_fmac_f32; + break; + case aco_opcode::v_mad_f16: + case aco_opcode::v_mad_legacy_f16: + instr->opcode = aco_opcode::v_mac_f16; + break; + case aco_opcode::v_fma_f16: + instr->opcode = aco_opcode::v_fmac_f16; + break; + default: + break; + } } } /* handle definitions which must have the same register as an operand */ if (instr->opcode == aco_opcode::v_interp_p2_f32 || instr->opcode == aco_opcode::v_mac_f32 || + instr->opcode == aco_opcode::v_fmac_f32 || + instr->opcode == aco_opcode::v_mac_f16 || + instr->opcode == aco_opcode::v_fmac_f16 || instr->opcode == aco_opcode::v_writelane_b32 || instr->opcode == aco_opcode::v_writelane_b32_e64) { instr->definitions[0].setFixed(instr->operands[2].physReg()); @@ -2047,23 +2104,26 @@ void register_allocation(Program *program, std::vector& live_out_per_bl adjust_max_used_regs(ctx, definition.regClass(), definition.physReg()); /* check if the target register is blocked */ - if (register_file[definition.physReg().reg()] != 0) { - /* create parallelcopy pair to move blocking var */ - Temp tmp = {register_file[definition.physReg()], ctx.assignments[register_file[definition.physReg()]].rc}; - Operand pc_op = Operand(tmp); - pc_op.setFixed(ctx.assignments[register_file[definition.physReg().reg()]].reg); - RegClass rc = pc_op.regClass(); - tmp = Temp{program->allocateId(), rc}; - Definition pc_def = Definition(tmp); - - /* re-enable the killed operands, so that we don't move the blocking var there */ + if (register_file.test(definition.physReg(), definition.bytes())) { + /* create parallelcopy pair to move blocking vars */ + std::set> vars = collect_vars(ctx, register_file, definition.physReg(), definition.size()); + + /* re-enable the killed operands, so that we don't move the blocking vars there */ for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKillBeforeDef()) register_file.fill(op); } - /* find a new register for the blocking variable */ - PhysReg reg = get_reg(ctx, register_file, pc_op.getTemp(), parallelcopy, instr); + ASSERTED bool success = false; + DefInfo info(ctx, instr, definition.regClass(), -1); + success = get_regs_for_copies(ctx, register_file, parallelcopy, + vars, info.lb, info.ub, instr, + definition.physReg(), + definition.physReg() + definition.size() - 1); + assert(success); + + update_renames(ctx, register_file, parallelcopy, instr, false); + /* once again, disable killed operands */ for (const Operand& op : instr->operands) { if (op.isTemp() && op.isFirstKillBeforeDef()) @@ -2073,16 +2133,6 @@ void register_allocation(Program *program, std::vector& live_out_per_bl if (instr->definitions[k].isTemp() && ctx.defs_done.test(k) && !instr->definitions[k].isKill()) register_file.fill(instr->definitions[k]); } - pc_def.setFixed(reg); - - /* finish assignment of parallelcopy */ - ctx.assignments.emplace_back(reg, pc_def.regClass()); - assert(ctx.assignments.size() == ctx.program->peekAllocationId()); - parallelcopy.emplace_back(pc_op, pc_def); - - /* add changes to reg_file */ - register_file.clear(pc_op); - register_file.fill(pc_def); } ctx.defs_done.set(i); @@ -2185,11 +2235,11 @@ void register_allocation(Program *program, std::vector& live_out_per_bl if (!sgpr_operands_alias_defs) { unsigned reg = parallelcopy[i].first.physReg().reg(); unsigned size = parallelcopy[i].first.getTemp().size(); - sgpr_operands[reg / 64u] |= ((1u << size) - 1) << (reg % 64u); + sgpr_operands[reg / 64u] |= u_bit_consecutive64(reg % 64u, size); reg = parallelcopy[i].second.physReg().reg(); size = parallelcopy[i].second.getTemp().size(); - if (sgpr_operands[reg / 64u] & ((1u << size) - 1) << (reg % 64u)) + if (sgpr_operands[reg / 64u] & u_bit_consecutive64(reg % 64u, size)) sgpr_operands_alias_defs = true; } }