From 2ab45f41e08a3892138a1e9b20552621b4e18682 Mon Sep 17 00:00:00 2001
From: Rhys Perry
Date: Mon, 6 Apr 2020 17:13:52 +0100
Subject: [PATCH] aco: implement sub-dword swaps
MIME-Version: 1.0
Content-Type: text/plain; charset=utf8
Content-Transfer-Encoding: 8bit

Signed-off-by: Rhys Perry
Reviewed-by: Daniel Schürmann
Part-of:
---
 src/amd/compiler/aco_builder_h.py          |   2 +
 src/amd/compiler/aco_lower_to_hw_instr.cpp | 449 ++++++++++++++-------
 src/amd/compiler/aco_opcodes.py            |   9 +
 3 files changed, 320 insertions(+), 140 deletions(-)

diff --git a/src/amd/compiler/aco_builder_h.py b/src/amd/compiler/aco_builder_h.py
index 9e4e64101b8..097743658b3 100644
--- a/src/amd/compiler/aco_builder_h.py
+++ b/src/amd/compiler/aco_builder_h.py
@@ -490,6 +490,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
    ("reduction", [Format.PSEUDO_REDUCTION], 'Pseudo_reduction_instruction', [(3, 2), (3, 4)]),
    ("vop1", [Format.VOP1], 'VOP1_instruction', [(1, 1), (2, 2)]),
    ("vop2", [Format.VOP2], 'VOP2_instruction', itertools.product([1, 2], [2, 3])),
+   ("vop2_sdwa", [Format.VOP2, Format.SDWA], 'SDWA_instruction', itertools.product([1, 2], [2, 3])),
    ("vopc", [Format.VOPC], 'VOPC_instruction', itertools.product([1, 2], [2])),
    ("vop3", [Format.VOP3A], 'VOP3A_instruction', [(1, 3), (1, 2), (1, 1), (2, 2)]),
    ("vintrp", [Format.VINTRP], 'Interp_instruction', [(1, 2), (1, 3)]),
@@ -527,6 +528,7 @@ formats = [("pseudo", [Format.PSEUDO], 'Pseudo_instruction', list(itertools.prod
       % for dest, field_name in zip(f.get_builder_field_dests(), f.get_builder_field_names()):
       instr->${dest} = ${field_name};
       % endfor
+      ${f.get_builder_initialization(num_operands)}
 % endfor
       return insert(instr);
    }
diff --git a/src/amd/compiler/aco_lower_to_hw_instr.cpp b/src/amd/compiler/aco_lower_to_hw_instr.cpp
index 4be1bef2a27..4a9cc9c9c62 100644
--- a/src/amd/compiler/aco_lower_to_hw_instr.cpp
+++ b/src/amd/compiler/aco_lower_to_hw_instr.cpp
@@ -703,6 +703,181 @@ struct copy_operation {
    };
 };
 
+void split_copy(unsigned offset, Definition *def, Operand *op, const copy_operation& src, bool ignore_uses, unsigned max_size)
+{
+   PhysReg def_reg = src.def.physReg();
+   PhysReg op_reg = src.op.physReg();
+   def_reg.reg_b += offset;
+   op_reg.reg_b += offset;
+
+   max_size = MIN2(max_size, src.def.regClass().type() == RegType::vgpr ? 4 : 8);
+
+   /* make sure the size is a power of two and reg % bytes == 0 */
+   unsigned bytes = 1;
+   for (; bytes <= max_size; bytes *= 2) {
+      unsigned next = bytes * 2u;
+      bool can_increase = def_reg.reg_b % next == 0 &&
+                          offset + next <= src.bytes && next <= max_size;
+      if (!src.op.isConstant() && can_increase)
+         can_increase = op_reg.reg_b % next == 0;
+      for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
+         can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
+      if (!can_increase)
+         break;
+   }
+
+   RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u) :
+                      RegClass(src.def.regClass().type(), bytes).as_subdword();
+   *def = Definition(src.def.tempId(), def_reg, def_cls);
+   if (src.op.isConstant()) {
+      assert(offset == 0 || (offset == 4 && src.op.bytes() == 8));
+      if (src.op.bytes() == 8 && bytes == 4)
+         *op = Operand(uint32_t(src.op.constantValue64() >> (offset * 8u)));
+      else
+         *op = src.op;
+   } else {
+      RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) :
+                        RegClass(src.op.regClass().type(), bytes).as_subdword();
+      *op = Operand(op_reg, op_cls);
+      op->setTemp(Temp(src.op.tempId(), op_cls));
+   }
+}
+
+uint32_t get_intersection_mask(int a_start, int a_size,
+                               int b_start, int b_size)
+{
+   int intersection_start = MAX2(b_start - a_start, 0);
+   int intersection_end = MAX2(b_start + b_size - a_start, 0);
+   if (intersection_start >= a_size || intersection_end == 0)
+      return 0;
+
+   uint32_t mask = u_bit_consecutive(0, a_size);
+   return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
+}
+
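
Note: split_copy() above returns the largest power-of-two-sized chunk (at most max_size bytes, and at most 4 for VGPRs) that keeps both registers aligned to the chunk size, stays inside the copy, and, unless ignore_uses is set, has a uniform use pattern. A minimal standalone model of just the size selection, with the constant and use handling left out (chunk_bytes is a hypothetical helper, not part of the patch):

   /* Chunk size in bytes for a copy of `total` bytes at `offset`, where
    * `reg_b` is the byte misalignment shared by source and destination. */
   unsigned chunk_bytes(unsigned reg_b, unsigned offset, unsigned total, unsigned max_size)
   {
      unsigned bytes = 1;
      for (unsigned next = 2; next <= max_size; next *= 2) {
         if ((reg_b + offset) % next != 0 || offset + next > total)
            break;
         bytes = next;
      }
      return bytes;
   }

For a 3-byte copy starting at byte 1 of a VGPR, chunk_bytes(1, 0, 3, 4) == 1 and then chunk_bytes(1, 1, 3, 4) == 2: the copy is emitted as a byte copy followed by a short copy.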
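Note: get_intersection_mask() computes, relative to range A, a per-byte mask of the bytes shared with range B; u_bit_consecutive(start, n) is n consecutive set bits starting at bit `start`. Worked through for a 4-byte range at reg_b 0 against a 4-byte range at reg_b 2:

   /* a_start = 0, a_size = 4, b_start = 2, b_size = 4:
    * intersection_start = MAX2(2 - 0, 0)     = 2
    * intersection_end   = MAX2(2 + 4 - 0, 0) = 6
    * mask               = u_bit_consecutive(0, 4)        = 0b001111
    * return value       = u_bit_consecutive(2, 6 - 2) & mask
    *                    = 0b111100 & 0b001111             = 0b001100
    * i.e. bytes 2 and 3 of range A overlap range B. */
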
+bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc)
+{
+   bool did_copy = false;
+   for (unsigned offset = 0; offset < copy.bytes;) {
+      if (copy.uses[offset]) {
+         offset++;
+         continue;
+      }
+
+      Definition def;
+      Operand op;
+      split_copy(offset, &def, &op, copy, false, 8);
+
+      if (def.physReg() == scc) {
+         bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand(0u));
+         *preserve_scc = true;
+      } else if (def.bytes() == 8 && def.getTemp().type() == RegType::sgpr) {
+         bld.sop1(aco_opcode::s_mov_b64, def, Operand(op.physReg(), s2));
+      } else {
+         bld.copy(def, op);
+      }
+
+      ctx->program->statistics[statistic_copies]++;
+
+      did_copy = true;
+      offset += def.bytes();
+   }
+   return did_copy;
+}
+
+void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool preserve_scc, Pseudo_instruction *pi)
+{
+   unsigned offset = 0;
+
+   if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
+       (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
+      /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte swap */
+      PhysReg op = copy.op.physReg();
+      PhysReg def = copy.def.physReg();
+      op.reg_b &= ~0x3;
+      def.reg_b &= ~0x3;
+
+      copy_operation tmp;
+      tmp.op = Operand(op, v1);
+      tmp.def = Definition(def, v1);
+      tmp.bytes = 4;
+      memset(tmp.uses, 1, 4);
+      do_swap(ctx, bld, tmp, preserve_scc, pi);
+
+      op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
+      def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
+      tmp.op = Operand(op, v1b);
+      tmp.def = Definition(def, v1b);
+      tmp.bytes = 1;
+      tmp.uses[0] = 1;
+      do_swap(ctx, bld, tmp, preserve_scc, pi);
+
+      offset = copy.bytes;
+   }
+
+   for (; offset < copy.bytes;) {
+      Definition def;
+      Operand op;
+      split_copy(offset, &def, &op, copy, true, 4);
+
+      assert(op.regClass() == def.regClass());
+      Operand def_as_op = Operand(def.physReg(), def.regClass());
+      Definition op_as_def = Definition(op.physReg(), op.regClass());
+      if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) {
+         bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
+         ctx->program->statistics[statistic_copies]++;
+      } else if (def.regClass() == v1) {
+         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
+         bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
+         bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
+         ctx->program->statistics[statistic_copies] += 3;
+      } else if (op.physReg() == scc || def.physReg() == scc) {
+         /* we need to swap scc and another sgpr */
+         assert(!preserve_scc);
+
+         PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
+
+         bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
+         bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u));
+         bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
+         ctx->program->statistics[statistic_copies] += 3;
+      } else if (def.regClass() == s1) {
+         if (preserve_scc) {
+            bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
+            bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
+            bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
+         } else {
+            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
+            bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
+            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
+         }
+         ctx->program->statistics[statistic_copies] += 3;
+      } else if (ctx->program->chip_class >= GFX9 && def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
+         aco_ptr<VOP3P_instruction> vop3p{create_instruction<VOP3P_instruction>(aco_opcode::v_pk_add_u16, Format::VOP3P, 2, 1)};
+         vop3p->operands[0] = Operand(PhysReg{op.physReg().reg()}, v1);
+         vop3p->operands[1] = Operand(0u);
+         vop3p->definitions[0] = Definition(PhysReg{op.physReg().reg()}, v1);
+         vop3p->opsel_lo = 0x1;
+         vop3p->opsel_hi = 0x2;
+         bld.insert(std::move(vop3p));
+      } else {
+         assert(def.regClass().is_subdword());
+         bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
+         bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
+         bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
+         ctx->program->statistics[statistic_copies] += 3;
+      }
+
+      offset += def.bytes();
+   }
+
+   /* fixup in case we swapped bytes we shouldn't have */
+   copy_operation tmp_copy = copy;
+   tmp_copy.op.setFixed(copy.def.physReg());
+   tmp_copy.def.setFixed(copy.op.physReg());
+   do_copy(ctx, bld, tmp_copy, &preserve_scc);
+}
+
 void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi)
 {
    Builder bld(ctx->program, &ctx->instructions);
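
Note: the v_xor_b32/s_xor_b32 triples in do_swap() are the classic XOR swap, which exchanges two registers without a scratch register; the vop2_sdwa variant applies the same identity to sub-dword operands, with the SDWA byte/word selects limiting which bytes are read and written. The identity in plain C++:

   /* XOR swap: exchanges a and b without a temporary. */
   void xor_swap(uint32_t &a, uint32_t &b)
   {
      a ^= b; /* a = a0 ^ b0             */
      b ^= a; /* b = b0 ^ (a0 ^ b0) = a0 */
      a ^= b; /* a = (a0 ^ b0) ^ a0 = b0 */
   }

Three XORs cost three instructions, which is why the single-instruction v_swap_b32 is preferred on GFX9+, and why the 3-byte case at the top of do_swap() widens to one dword swap plus a one-byte fixup swap.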
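Note: the v_pk_add_u16 case handles a 2-byte swap whose source and destination are the two halves of one VGPR with a single packed add of zero. As far as I can tell from the VOP3P opsel encoding, opsel_lo = 0x1 routes src0's high half into the low result half and opsel_hi = 0x2 leaves src0's low half for the high result half, so the instruction reduces to:

   /* What the v_pk_add_u16 above computes: swap the 16-bit halves of v. */
   uint32_t swap_halves(uint32_t v)
   {
      return (v >> 16) | (v << 16);
   }
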
@@ -761,114 +936,107 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
 
    it = copy_map.begin();
    while (it != copy_map.end()) {
-      /* split cross half-reg copies: SDWA can only access bytes and shorts */
-      if (it->second.def.regClass().is_subdword()) {
-         PhysReg def_reg = it->second.def.physReg();
-         PhysReg op_reg = it->second.op.physReg();
-         unsigned new_bytes = 0;
-         if (it->second.bytes > 1 && (def_reg.byte() % 2 || op_reg.byte() % 2)) {
-            new_bytes = 1;
-         } else if (it->second.bytes > 2 && (def_reg.byte() || op_reg.byte())) {
-            new_bytes = 2;
-         } else if (it->second.bytes == 3) {
-            new_bytes = 2;
-         } else if (it->second.bytes > 4) {
-            assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
-            new_bytes = 4;
+      /* try to coalesce 32-bit sgpr copies to 64-bit copies */
+      if (it->second.is_used == 0 &&
+          it->second.def.getTemp().type() == RegType::sgpr && it->second.bytes == 4 &&
+          !it->second.op.isConstant() && it->first % 2 == it->second.op.physReg() % 2) {
+
+         PhysReg other_def_reg = PhysReg{it->first % 2 ? it->first - 1 : it->first + 1};
+         PhysReg other_op_reg = PhysReg{it->first % 2 ? it->second.op.physReg() - 1 : it->second.op.physReg() + 1};
+         std::map<PhysReg, copy_operation>::iterator other = copy_map.find(other_def_reg);
+
+         if (other != copy_map.end() && !other->second.is_used && other->second.bytes == 4 &&
+             other->second.op.physReg() == other_op_reg && !other->second.op.isConstant()) {
+            std::map<PhysReg, copy_operation>::iterator to_erase = it->first % 2 ? it : other;
+            it = it->first % 2 ? other : it;
+            copy_map.erase(to_erase);
+            it->second.bytes = 8;
          }
-         if (new_bytes) {
-            RegClass rc = RegClass(RegType::vgpr, it->second.bytes - new_bytes).as_subdword();
-            def_reg.reg_b += new_bytes;
-            op_reg.reg_b += new_bytes;
-            copy_operation copy = {Operand(op_reg, rc), Definition(def_reg, rc), it->second.bytes - new_bytes};
-            copy.is_used = it->second.is_used >> (8 * new_bytes);
-            copy_map[def_reg] = copy;
-            rc = RegClass(RegType::vgpr, new_bytes).as_subdword();
-            it->second.op = Operand(it->second.op.physReg(), rc);
-            it->second.def = Definition(it->second.def.physReg(), rc);
-            it->second.is_used = it->second.is_used & ((1 << (8 * new_bytes)) - 1);
-            it->second.bytes = new_bytes;
-         }
-
-         /* convert dword moves to normal regclass */
-         if (it->second.bytes == 4) {
-            it->second.op = Operand(it->second.op.physReg(), v1);
-            it->second.def = Definition(it->second.def.physReg(), v1);
-         }
-      }
-
-      /* split multi-reg copies */
-      if (it->second.bytes > 4 && !it->second.op.isConstant()) {
-         assert(!it->second.def.regClass().is_subdword());
-         RegClass rc = RegClass(it->second.def.regClass().type(), it->second.def.size() - 1);
-         Definition hi_def = Definition(PhysReg{it->first + 1}, rc);
-         rc = RegClass(it->second.op.regClass().type(), it->second.op.size() - 1);
-         Operand hi_op = Operand(PhysReg{it->second.op.physReg() + 1}, rc);
-         copy_operation copy = {hi_op, hi_def, it->second.bytes - 4};
-         copy.is_used = it->second.is_used >> 32;
-         copy_map[hi_def.physReg()] = copy;
-         assert(it->second.op.physReg().byte() == 0 && it->second.def.physReg().byte() == 0);
-         it->second.op = Operand(it->second.op.physReg(), it->second.op.regClass().type() == RegType::sgpr ? s1 : v1);
-         it->second.def = Definition(it->second.def.physReg(), it->second.def.regClass().type() == RegType::sgpr ? s1 : v1);
-         it->second.is_used = it->second.is_used & 0xFFFFFFFF;
-         it->second.bytes = 4;
       }
+      // TODO: try to coalesce subdword copies
 
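Note: the coalescing step above merges two adjacent 4-byte sgpr copies into one 8-byte copy, which do_copy() later emits as a single s_mov_b64; this requires the definitions and the operands to each form an aligned 64-bit pair. The pairing condition as a hypothetical standalone check (not part of the patch):

   /* Copies def0 <- op0 and def1 <- op1 (32-bit sgpr register numbers) can
    * merge into one s_mov_b64 when both sides form aligned, adjacent pairs. */
   bool can_merge_sgpr_pair(unsigned def0, unsigned op0, unsigned def1, unsigned op1)
   {
      return def0 % 2 == 0 && def1 == def0 + 1 &&
             op0 % 2 == 0 && op1 == op0 + 1;
   }

In the map-based code this shows up as the it->first % 2 == it->second.op.physReg() % 2 check plus the lookup of the neighbouring definition's entry; the odd-numbered entry is erased and the even one grows to 8 bytes.
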
-      /* the target reg is not used as operand for any other copy */
-      if (it->second.is_used == 0) {
-
-         /* try to coalesce 32-bit sgpr copies to 64-bit copies */
-         if (it->second.def.getTemp().type() == RegType::sgpr && it->second.bytes == 4 &&
-             !it->second.op.isConstant() && it->first % 2 == it->second.op.physReg() % 2) {
-
-            PhysReg other_def_reg = PhysReg{it->first % 2 ? it->first - 1 : it->first + 1};
-            PhysReg other_op_reg = PhysReg{it->first % 2 ? it->second.op.physReg() - 1 : it->second.op.physReg() + 1};
-            std::map<PhysReg, copy_operation>::iterator other = copy_map.find(other_def_reg);
-
-            if (other != copy_map.end() && !other->second.is_used && other->second.bytes == 4 &&
-                other->second.op.physReg() == other_op_reg && !other->second.op.isConstant()) {
-               std::map<PhysReg, copy_operation>::iterator to_erase = it->first % 2 ? it : other;
-               it = it->first % 2 ? other : it;
-               copy_map.erase(to_erase);
-               it->second.bytes = 8;
-            }
-         }
-         // TODO: try to coalesce subdword copies
-
-         if (it->second.def.physReg() == scc) {
-            bld.sopc(aco_opcode::s_cmp_lg_i32, it->second.def, it->second.op, Operand(0u));
-            preserve_scc = true;
-         } else if (it->second.bytes == 8 && it->second.def.getTemp().type() == RegType::sgpr) {
-            bld.sop1(aco_opcode::s_mov_b64, it->second.def, Operand(it->second.op.physReg(), s2));
-         } else if (it->second.bytes == 8 && it->second.op.isConstant()) {
-            uint64_t val = it->second.op.constantValue64();
-            bld.vop1(aco_opcode::v_mov_b32, it->second.def, Operand((uint32_t)val));
-            bld.vop1(aco_opcode::v_mov_b32, Definition(PhysReg{it->second.def.physReg() + 1}, v1),
-                     Operand((uint32_t)(val >> 32)));
-            ctx->program->statistics[statistic_copies]++;
-         } else {
-            bld.copy(it->second.def, it->second.op);
+      /* find portions where the target reg is not used as operand for any other copy */
+      if (it->second.is_used) {
+         if (it->second.op.isConstant()) {
+            /* we have to skip constants until is_used=0 */
+            ++it;
+            continue;
          }
 
-         /* reduce the number of uses of the operand reg by one */
-         if (!it->second.op.isConstant()) {
+         unsigned has_zero_use_bytes = 0;
+         for (unsigned i = 0; i < it->second.bytes; i++)
+            has_zero_use_bytes |= (it->second.uses[i] == 0) << i;
+
+         if (has_zero_use_bytes) {
+            /* Skipping partial copying and doing a v_swap_b32 and then fixup
+             * copies is usually beneficial for sub-dword copies, but if doing
+             * a partial copy allows further copies, it should be done instead. */
+            bool partial_copy = (has_zero_use_bytes == 0xf) || (has_zero_use_bytes == 0xf0);
             for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
+               if (partial_copy)
+                  break;
                for (uint16_t i = 0; i < copy.second.bytes; i++) {
                   /* distance might underflow */
                   unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
-                  if (distance < it->second.bytes)
-                     copy.second.uses[i] -= 1;
+                  if (distance < it->second.bytes && copy.second.uses[i] == 1 &&
+                      !it->second.uses[distance])
+                     partial_copy = true;
                }
             }
+
+            if (!partial_copy) {
+               ++it;
+               continue;
+            }
+         } else {
+            /* full target reg is used: register swapping needed */
+            ++it;
+            continue;
+         }
+      }
+
+      bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc);
+
+      /* reduce the number of uses of the operand reg by one */
+      if (did_copy && !it->second.op.isConstant()) {
+         for (std::pair<const PhysReg, copy_operation>& copy : copy_map) {
+            for (uint16_t i = 0; i < copy.second.bytes; i++) {
+               /* distance might underflow */
+               unsigned distance = copy.first.reg_b + i - it->second.op.physReg().reg_b;
+               if (distance < it->second.bytes && !it->second.uses[distance])
+                  copy.second.uses[i] -= 1;
+            }
+         }
+      }
 
+      if (it->second.is_used == 0) {
+         /* the target reg is not used as operand for any other copy, so we
+          * copied to all of it */
          copy_map.erase(it);
          it = copy_map.begin();
-         ctx->program->statistics[statistic_copies]++;
-         continue;
       } else {
-         /* the target reg is used as operand, check the next entry */
-         ++it;
+         /* we only performed some portions of this copy, so split it to only
+          * leave the portions that still need to be done */
+         copy_operation original = it->second; /* the map insertion below can overwrite this */
+         copy_map.erase(it);
+         for (unsigned offset = 0; offset < original.bytes;) {
+            if (original.uses[offset] == 0) {
+               offset++;
+               continue;
+            }
+            Definition def;
+            Operand op;
+            split_copy(offset, &def, &op, original, false, 8);
+
+            copy_operation copy = {op, def, def.bytes()};
+            for (unsigned i = 0; i < copy.bytes; i++)
+               copy.uses[i] = original.uses[i + offset];
+            copy_map[def.physReg()] = copy;
+
+            offset += def.bytes();
+         }
+
+         it = copy_map.begin();
       }
    }
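
Note: has_zero_use_bytes gathers, one bit per byte, which bytes of this copy's destination are not read by any other pending copy; only those bytes may be written immediately. For example:

   /* 4-byte copy whose uses[] = {1, 1, 0, 0}: */
   uint8_t uses[4] = {1, 1, 0, 0};
   unsigned has_zero_use_bytes = 0;
   for (unsigned i = 0; i < 4; i++)
      has_zero_use_bytes |= (uses[i] == 0) << i;
   /* has_zero_use_bytes == 0b1100: only bytes 2 and 3 are free. Since this
    * is neither 0xf nor 0xf0 (a whole free dword), the partial copy is only
    * done if writing the free bytes would remove the last use blocking some
    * other pending copy; otherwise one v_swap_b32 plus fixup is preferred. */
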
@@ -876,68 +1044,69 @@ void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context*
       return;
 
    /* all target regs are needed as operand somewhere which means, all entries are part of a cycle */
-   for (it = copy_map.begin(); it != copy_map.end(); ++it) {
-      assert(it->second.op.isFixed());
-      if (it->first == it->second.op.physReg())
-         continue;
+   unsigned largest = 0;
+   for (const std::pair<const PhysReg, copy_operation>& op : copy_map)
+      largest = MAX2(largest, op.second.bytes);
+
+   while (!copy_map.empty()) {
+
+      /* Perform larger swaps first, so that we don't have to split the uses of
+       * registers we swap (we don't have to because of alignment restrictions) and
+       * larger swaps can make other swaps unnecessary. */
+      auto it = copy_map.begin();
+      for (auto it2 = copy_map.begin(); it2 != copy_map.end(); ++it2) {
+         if (it2->second.bytes > it->second.bytes) {
+            it = it2;
+            if (it->second.bytes == largest)
+               break;
+         }
+      }
 
       /* should already be done */
       assert(!it->second.op.isConstant());
+      assert(it->second.op.isFixed());
+      assert(it->second.def.regClass() == it->second.op.regClass());
+
+      if (it->first == it->second.op.physReg()) {
+         copy_map.erase(it);
+         continue;
+      }
+
       if (preserve_scc && it->second.def.getTemp().type() == RegType::sgpr)
          assert(!(it->second.def.physReg() == pi->scratch_sgpr));
 
       /* to resolve the cycle, we have to swap the src reg with the dst reg */
       copy_operation swap = it->second;
-      assert(swap.op.regClass() == swap.def.regClass());
-      Operand def_as_op = Operand(swap.def.physReg(), swap.def.regClass());
-      Definition op_as_def = Definition(swap.op.physReg(), swap.op.regClass());
-      if (chip_class >= GFX9 && swap.def.regClass() == v1) {
-         bld.vop1(aco_opcode::v_swap_b32, swap.def, op_as_def, swap.op, def_as_op);
-         ctx->program->statistics[statistic_copies]++;
-      } else if (swap.def.regClass() == v1) {
-         bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op);
-         bld.vop2(aco_opcode::v_xor_b32, swap.def, swap.op, def_as_op);
-         bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op);
-         ctx->program->statistics[statistic_copies] += 3;
-      } else if (swap.op.physReg() == scc || swap.def.physReg() == scc) {
-         /* we need to swap scc and another sgpr */
-         assert(!preserve_scc);
+      do_swap(ctx, bld, swap, preserve_scc, pi);
 
-         PhysReg other = swap.op.physReg() == scc ? swap.def.physReg() : swap.op.physReg();
-
-         bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
-         bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u));
-         bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
-         ctx->program->statistics[statistic_copies] += 3;
-      } else if (swap.def.regClass() == s1) {
-         if (preserve_scc) {
-            bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), swap.op);
-            bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
-            bld.sop1(aco_opcode::s_mov_b32, swap.def, Operand(pi->scratch_sgpr, s1));
-         } else {
-            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op);
-            bld.sop2(aco_opcode::s_xor_b32, swap.def, Definition(scc, s1), swap.op, def_as_op);
-            bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), swap.op, def_as_op);
-         }
-         ctx->program->statistics[statistic_copies] += 3;
-      } else {
-         bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op);
-         bld.vop2(aco_opcode::v_xor_b32, swap.def, swap.op, def_as_op);
-         bld.vop2(aco_opcode::v_xor_b32, op_as_def, swap.op, def_as_op);
-         ctx->program->statistics[statistic_copies] += 3;
-         assert(swap.def.regClass().is_subdword());
-         assert(false && "Subdword swaps not yet implemented.");
-      }
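
Note: resolving the cycle with the largest entries first means a swap never has to split a use: any other copy reading part of the swapped range is at most as wide (the alignment restrictions guarantee this), so its operand can simply be re-pointed into the swap's source, which is what the loop below does with get_intersection_mask(). Worked through with assumed registers:

   /* swap.def = v0, swap.op = v2, swap.bytes = 8 (reg_b 0 and 8), and some
    * pending copy reads its 4-byte operand from v1 (reg_b 4):
    *   imask   = get_intersection_mask(0, 8, 4, 4) = 0b11110000 (bytes 4-7)
    *   new_reg = reg_b 8 + (4 - 0) = reg_b 12, i.e. v3
    * so that copy's operand is re-pointed to v3, where v1's old contents
    * live after the swap. */
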
+      /* remove from map */
+      copy_map.erase(it);
 
-      /* change the operand reg of the target's use */
-      assert(swap.is_used == 0x01010101lu); // each 1 use per byte
-      target = it;
-      for (++target; target != copy_map.end(); ++target) {
-         if (target->second.op.physReg() == it->first) {
+      /* change the operand reg of the target's use and split uses if needed */
+      target = copy_map.begin();
+      uint32_t bytes_left = u_bit_consecutive(0, swap.bytes);
+      for (; target != copy_map.end(); ++target) {
+         if (target->second.op.physReg() == swap.def.physReg() && swap.bytes == target->second.bytes) {
            target->second.op.setFixed(swap.op.physReg());
            break;
          }
+
+         uint32_t imask = get_intersection_mask(swap.def.physReg().reg_b, swap.bytes,
+                                                target->second.op.physReg().reg_b, target->second.bytes);
+
+         if (!imask)
+            continue;
+
+         assert(target->second.bytes < swap.bytes);
+
+         PhysReg new_reg = swap.op.physReg();
+         new_reg.reg_b += target->second.op.physReg().reg_b - swap.def.physReg().reg_b;
+         target->second.op.setFixed(new_reg);
+
+         bytes_left &= ~imask;
+         if (!bytes_left)
+            break;
       }
    }
 }
diff --git a/src/amd/compiler/aco_opcodes.py b/src/amd/compiler/aco_opcodes.py
index 3fb755f0c7c..bb777eb0d51 100644
--- a/src/amd/compiler/aco_opcodes.py
+++ b/src/amd/compiler/aco_opcodes.py
@@ -149,6 +149,15 @@ class Format(Enum):
    def get_builder_field_decls(self):
       return [('%s %s=%s' % (f[0], f[1], f[2]) if f[2] != None else '%s %s' % (f[0], f[1])) for f in self.get_builder_fields()]
 
+   def get_builder_initialization(self, num_operands):
+      res = ''
+      if self == Format.SDWA:
+         for i in range(min(num_operands, 2)):
+            res += f'instr->sel[{i}] = op{i}.op.bytes() == 2 ? sdwa_uword : (op{i}.op.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'
+         res += 'instr->dst_sel = def0.bytes() == 2 ? sdwa_uword : (def0.bytes() == 1 ? sdwa_ubyte : sdwa_udword);\n'
+         res += 'instr->dst_preserve = true;'
+      return res
+
 class Opcode(object):
    """Class that represents all the information we have
    about the opcode
-- 
2.30.2
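
Note: for reference, with num_operands == 2 the get_builder_initialization() hook above makes the generated vop2_sdwa builder in aco_builder_h.py end with the following (reconstructed from the f-string, not copied from a generated header):

   instr->sel[0] = op0.op.bytes() == 2 ? sdwa_uword : (op0.op.bytes() == 1 ? sdwa_ubyte : sdwa_udword);
   instr->sel[1] = op1.op.bytes() == 2 ? sdwa_uword : (op1.op.bytes() == 1 ? sdwa_ubyte : sdwa_udword);
   instr->dst_sel = def0.bytes() == 2 ? sdwa_uword : (def0.bytes() == 1 ? sdwa_ubyte : sdwa_udword);
   instr->dst_preserve = true;

so a v_xor_b32 built with bld.vop2_sdwa() on 1- or 2-byte register classes automatically reads and writes only the bytes the sub-dword definition covers.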