+void split_copy(unsigned offset, Definition *def, Operand *op, const copy_operation& src, bool ignore_uses, unsigned max_size)
+{
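+ /* Find the largest chunk of the copy, starting at byte `offset`, that can be
+  * handled by a single instruction: the chunk's size must be a power of two,
+  * both registers must be aligned to it, and (unless ignore_uses) all of its
+  * bytes must agree on whether they are still needed as a copy source. */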
+ PhysReg def_reg = src.def.physReg();
+ PhysReg op_reg = src.op.physReg();
+ def_reg.reg_b += offset;
+ op_reg.reg_b += offset;
+
+ max_size = MIN2(max_size, src.def.regClass().type() == RegType::vgpr ? 4 : 8);
+
+ /* maximize the chunk size while it remains a power of two and reg % bytes == 0 */
+ unsigned bytes = 1;
+ for (; bytes <= max_size; bytes *= 2) {
+ unsigned next = bytes * 2u;
+ bool can_increase = def_reg.reg_b % next == 0 &&
+ offset + next <= src.bytes && next <= max_size;
+ if (!src.op.isConstant() && can_increase)
+ can_increase = op_reg.reg_b % next == 0;
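+ /* every byte of the chunk must share the first byte's "still used" status,
+  * so do_copy() can emit or skip the chunk as a whole */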
+ for (unsigned i = 0; !ignore_uses && can_increase && (i < bytes); i++)
+ can_increase = (src.uses[offset + bytes + i] == 0) == (src.uses[offset] == 0);
+ if (!can_increase)
+ break;
+ }
+
+ RegClass def_cls = bytes % 4 == 0 ? RegClass(src.def.regClass().type(), bytes / 4u) :
+ RegClass(src.def.regClass().type(), bytes).as_subdword();
+ *def = Definition(src.def.tempId(), def_reg, def_cls);
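+ /* for constants, slice the matching byte range out of the 64-bit value */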
+ if (src.op.isConstant()) {
+ assert(bytes >= 1 && bytes <= 8);
+ if (bytes == 8)
+ *op = Operand(src.op.constantValue64() >> (offset * 8u));
+ else if (bytes == 4)
+ *op = Operand(uint32_t(src.op.constantValue64() >> (offset * 8u)));
+ else if (bytes == 2)
+ *op = Operand(uint16_t(src.op.constantValue64() >> (offset * 8u)));
+ else if (bytes == 1)
+ *op = Operand(uint8_t(src.op.constantValue64() >> (offset * 8u)));
+ } else {
+ RegClass op_cls = bytes % 4 == 0 ? RegClass(src.op.regClass().type(), bytes / 4u) :
+ RegClass(src.op.regClass().type(), bytes).as_subdword();
+ *op = Operand(op_reg, op_cls);
+ op->setTemp(Temp(src.op.tempId(), op_cls));
+ }
+}
+
+uint32_t get_intersection_mask(int a_start, int a_size,
+ int b_start, int b_size)
+{
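+ /* Returns a mask over the bytes of [a_start, a_start + a_size) marking those
+  * that overlap [b_start, b_start + b_size). For example, a_start=0, a_size=4,
+  * b_start=2, b_size=4 yields 0b1100. */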
+ int intersection_start = MAX2(b_start - a_start, 0);
+ int intersection_end = MAX2(b_start + b_size - a_start, 0);
+ if (intersection_start >= a_size || intersection_end == 0)
+ return 0;
+
+ uint32_t mask = u_bit_consecutive(0, a_size);
+ return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
+}
+
+bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr)
+{
+ bool did_copy = false;
+ for (unsigned offset = 0; offset < copy.bytes;) {
+ if (copy.uses[offset]) {
+ offset++;
+ continue;
+ }
+
+ Definition def;
+ Operand op;
+ split_copy(offset, &def, &op, copy, false, 8);
+
+ if (def.physReg() == scc) {
+ bld.sopc(aco_opcode::s_cmp_lg_i32, def, op, Operand(0u));
+ *preserve_scc = true;
+ } else if (def.bytes() == 8 && def.getTemp().type() == RegType::sgpr) {
+ bld.sop1(aco_opcode::s_mov_b64, def, Operand(op.physReg(), s2));
+ } else if (def.regClass().is_subdword() && ctx->program->chip_class < GFX8) {
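+ /* GFX6/7 lack SDWA, so sub-dword writes have to be emulated with shifts
+  * and masks on the containing dword */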
+ if (op.physReg().byte()) {
+ assert(def.physReg().byte() == 0);
+ bld.vop2(aco_opcode::v_lshrrev_b32, def, Operand(op.physReg().byte() * 8), op);
+ } else if (def.physReg().byte() == 2) {
+ assert(op.physReg().byte() == 0);
+ /* preserve the target's lower half */
+ def = Definition(def.physReg().advance(-2), v1);
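+ /* clear the upper halves first (v_cvt_pk_u16_u32 clamps its sources to
+  * 0xFFFF), then pack them into the (low, high) halves of the dword */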
+ bld.vop2(aco_opcode::v_and_b32, Definition(op.physReg(), v1), Operand(0xFFFFu), op);
+ if (def.physReg().reg() != op.physReg().reg())
+ bld.vop2(aco_opcode::v_and_b32, def, Operand(0xFFFFu), Operand(def.physReg(), v2b));
+ bld.vop2(aco_opcode::v_cvt_pk_u16_u32, def, Operand(def.physReg(), v2b), op);
+ } else if (def.physReg().byte()) {
+ unsigned bits = def.physReg().byte() * 8;
+ assert(op.physReg().byte() == 0);
+ def = Definition(def.physReg().advance(-def.physReg().byte()), v1);
+ bld.vop2(aco_opcode::v_and_b32, def, Operand((1 << bits) - 1u), Operand(def.physReg(), op.regClass()));
+ if (def.physReg().reg() == op.physReg().reg()) {
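+ /* source and destination share a dword: after the mask above, multiplying
+  * by ((1 << bits) + 1) computes x | (x << bits), which keeps the preserved
+  * low bits and shifts the source at byte 0 up into the destination bytes */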
+ if (bits < 24) {
+ bld.vop2(aco_opcode::v_mul_u32_u24, def, Operand((1 << bits) + 1u), op);
+ } else {
+ bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u));
+ bld.vop3(aco_opcode::v_mul_lo_u32, def, Operand(scratch_sgpr, s1), op);
+ }
+ } else {
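+ /* different dwords: shift the source up, OR it into the destination,
+  * then shift the source register back to restore it */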
+ bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
+ bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op);
+ bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
+ }
+ } else {
+ bld.vop1(aco_opcode::v_mov_b32, def, op);
+ }
+ } else {
+ bld.copy(def, op);
+ }
+
+ did_copy = true;
+ offset += def.bytes();
+ }
+ return did_copy;
+}
+
+void do_swap(lower_context *ctx, Builder& bld, const copy_operation& copy, bool preserve_scc, Pseudo_instruction *pi)
+{
+ unsigned offset = 0;
+
+ if (copy.bytes == 3 && (copy.def.physReg().reg_b % 4 <= 1) &&
+ (copy.def.physReg().reg_b % 4) == (copy.op.physReg().reg_b % 4)) {
+ /* instead of doing a 2-byte and 1-byte swap, do a 4-byte swap and then fixup with a 1-byte swap */
+ PhysReg op = copy.op.physReg();
+ PhysReg def = copy.def.physReg();
+ op.reg_b &= ~0x3;
+ def.reg_b &= ~0x3;
+
+ copy_operation tmp;
+ tmp.op = Operand(op, v1);
+ tmp.def = Definition(def, v1);
+ tmp.bytes = 4;
+ memset(tmp.uses, 1, 4);
+ do_swap(ctx, bld, tmp, preserve_scc, pi);
+
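+ /* the dword swap also exchanged one extra byte: byte 3 if the copy starts
+  * at byte 0, byte 0 if it starts at byte 1; swap that byte back */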
+ op.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
+ def.reg_b += copy.def.physReg().reg_b % 4 == 0 ? 3 : 0;
+ tmp.op = Operand(op, v1b);
+ tmp.def = Definition(def, v1b);
+ tmp.bytes = 1;
+ tmp.uses[0] = 1;
+ do_swap(ctx, bld, tmp, preserve_scc, pi);
+
+ offset = copy.bytes;
+ }
+
+ for (; offset < copy.bytes;) {
+ Definition def;
+ Operand op;
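+ /* uses are ignored, so the chunk only has to be sized and aligned; bytes
+  * swapped too eagerly are restored by the fixup copy at the end */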
+ split_copy(offset, &def, &op, copy, true, 8);
+
+ assert(op.regClass() == def.regClass());
+ Operand def_as_op = Operand(def.physReg(), def.regClass());
+ Definition op_as_def = Definition(op.physReg(), op.regClass());
+ if (ctx->program->chip_class >= GFX9 && def.regClass() == v1) {
+ bld.vop1(aco_opcode::v_swap_b32, def, op_as_def, op, def_as_op);
+ } else if (def.regClass() == v1) {
+ assert(def.physReg().byte() == 0 && op.physReg().byte() == 0);
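+ /* pre-GFX9 fallback: the three-xor trick swaps two VGPRs without a scratch register */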
+ bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
+ bld.vop2(aco_opcode::v_xor_b32, def, op, def_as_op);
+ bld.vop2(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
+ } else if (op.physReg() == scc || def.physReg() == scc) {
+ /* we need to swap scc and another sgpr */
+ assert(!preserve_scc);
+
+ PhysReg other = op.physReg() == scc ? def.physReg() : op.physReg();
+
+ bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
+ bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(other, s1), Operand(0u));
+ bld.sop1(aco_opcode::s_mov_b32, Definition(other, s1), Operand(pi->scratch_sgpr, s1));
+ } else if (def.regClass() == s1) {
+ if (preserve_scc) {
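+ /* s_xor_b32 would clobber scc, so swap through the scratch register instead */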
+ bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), op);
+ bld.sop1(aco_opcode::s_mov_b32, op_as_def, def_as_op);
+ bld.sop1(aco_opcode::s_mov_b32, def, Operand(pi->scratch_sgpr, s1));
+ } else {
+ bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
+ bld.sop2(aco_opcode::s_xor_b32, def, Definition(scc, s1), op, def_as_op);
+ bld.sop2(aco_opcode::s_xor_b32, op_as_def, Definition(scc, s1), op, def_as_op);
+ }
+ } else if (def.regClass() == s2) {
+ if (preserve_scc)
+ bld.sop1(aco_opcode::s_mov_b32, Definition(pi->scratch_sgpr, s1), Operand(scc, s1));
+ bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
+ bld.sop2(aco_opcode::s_xor_b64, def, Definition(scc, s1), op, def_as_op);
+ bld.sop2(aco_opcode::s_xor_b64, op_as_def, Definition(scc, s1), op, def_as_op);
+ if (preserve_scc)
+ bld.sopc(aco_opcode::s_cmp_lg_i32, Definition(scc, s1), Operand(pi->scratch_sgpr, s1), Operand(0u));
+ } else if (ctx->program->chip_class >= GFX9 && def.bytes() == 2 && def.physReg().reg() == op.physReg().reg()) {
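+ /* both halves live in the same register: v_pk_add_u16 with 0, with opsel
+  * selecting the opposite half of src0 for each result half, swaps them in place */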
+ aco_ptr<VOP3P_instruction> vop3p{create_instruction<VOP3P_instruction>(aco_opcode::v_pk_add_u16, Format::VOP3P, 2, 1)};
+ vop3p->operands[0] = Operand(PhysReg{op.physReg().reg()}, v1);
+ vop3p->operands[1] = Operand(0u);
+ vop3p->definitions[0] = Definition(PhysReg{op.physReg().reg()}, v1);
+ vop3p->opsel_lo = 0x1;
+ vop3p->opsel_hi = 0x2;
+ bld.insert(std::move(vop3p));
+ } else {
+ assert(def.regClass().is_subdword());
+ bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
+ bld.vop2_sdwa(aco_opcode::v_xor_b32, def, op, def_as_op);
+ bld.vop2_sdwa(aco_opcode::v_xor_b32, op_as_def, op, def_as_op);
+ }
+
+ offset += def.bytes();
+ }
+
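+ /* GFX6/7 only ever swap whole dwords, so no bytes outside the copy were touched */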
+ if (ctx->program->chip_class <= GFX7)
+ return;
+
+ /* fixup in case we swapped bytes we shouldn't have */
+ copy_operation tmp_copy = copy;
+ tmp_copy.op.setFixed(copy.def.physReg());
+ tmp_copy.def.setFixed(copy.op.physReg());
+ do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
+}
+
+void do_pack_2x16(lower_context *ctx, Builder& bld, Definition def, Operand lo, Operand hi)
+{
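+ /* writes lo into def's low 16 bits and hi into its high 16 bits */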
+ if (ctx->program->chip_class >= GFX9) {
+ Instruction* instr = bld.vop3(aco_opcode::v_pack_b32_f16, def, lo, hi);
+ /* opsel: 0 = select low half, 1 = select high half. [0] = src0, [1] = src1 */
+ static_cast<VOP3A_instruction*>(instr)->opsel = hi.physReg().byte() | (lo.physReg().byte() >> 1);
+ } else if (ctx->program->chip_class >= GFX8) {
+ // TODO: optimize with v_mov_b32 / v_lshlrev_b32
+ PhysReg reg = def.physReg();
+ bld.copy(Definition(reg, v2b), lo);
+ reg.reg_b += 2;
+ bld.copy(Definition(reg, v2b), hi);
+ } else {
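+ /* GFX6/7: mask both sources to 16 bits so the u16 pack cannot clamp them */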
+ assert(lo.physReg().byte() == 0 && hi.physReg().byte() == 0);
+ bld.vop2(aco_opcode::v_and_b32, Definition(lo.physReg(), v1), Operand(0xFFFFu), lo);
+ bld.vop2(aco_opcode::v_and_b32, Definition(hi.physReg(), v1), Operand(0xFFFFu), hi);
+ bld.vop2(aco_opcode::v_cvt_pk_u16_u32, def, lo, hi);
+ }
+}
+