return u_bit_consecutive(intersection_start, intersection_end - intersection_start) & mask;
}
-bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc)
+bool do_copy(lower_context* ctx, Builder& bld, const copy_operation& copy, bool *preserve_scc, PhysReg scratch_sgpr)
{
bool did_copy = false;
for (unsigned offset = 0; offset < copy.bytes;) {
assert(op.physReg().byte() == 0);
def = Definition(def.physReg().advance(-def.physReg().byte()), v1);
bld.vop2(aco_opcode::v_and_b32, def, Operand((1 << bits) - 1u), Operand(def.physReg(), op.regClass()));
- bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
- bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op);
- bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
+ if (def.physReg().reg() == op.physReg().reg()) {
+ if (bits < 24) {
+ bld.vop2(aco_opcode::v_mul_u32_u24, def, Operand((1 << bits) + 1u), op);
+ } else {
+ bld.sop1(aco_opcode::s_mov_b32, Definition(scratch_sgpr, s1), Operand((1 << bits) + 1u));
+ bld.vop3(aco_opcode::v_mul_lo_u32, def, Operand(scratch_sgpr, s1), op);
+ }
+ } else {
+ bld.vop2(aco_opcode::v_lshlrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
+ bld.vop2(aco_opcode::v_or_b32, def, Operand(def.physReg(), op.regClass()), op);
+ bld.vop2(aco_opcode::v_lshrrev_b32, Definition(op.physReg(), def.regClass()), Operand(bits), op);
+ }
} else {
bld.vop1(aco_opcode::v_mov_b32, def, op);
}
copy_operation tmp_copy = copy;
tmp_copy.op.setFixed(copy.def.physReg());
tmp_copy.def.setFixed(copy.op.physReg());
- do_copy(ctx, bld, tmp_copy, &preserve_scc);
+ do_copy(ctx, bld, tmp_copy, &preserve_scc, pi->scratch_sgpr);
}
void handle_operands(std::map<PhysReg, copy_operation>& copy_map, lower_context* ctx, chip_class chip_class, Pseudo_instruction *pi)
}
}
- bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc);
+ bool did_copy = do_copy(ctx, bld, it->second, &preserve_scc, pi->scratch_sgpr);
std::pair<PhysReg, copy_operation> copy = *it;
}
/* if all operands are constant, no need to care either */
bool reads_sgpr = false;
+ bool reads_subdword = false;
for (Operand& op : instr->operands) {
if (op.isTemp() && op.getTemp().type() == RegType::sgpr) {
reads_sgpr = true;
break;
}
+ if (op.isTemp() && op.regClass().is_subdword())
+ reads_subdword = true;
}
- if (!(writes_sgpr && reads_sgpr))
+ bool needs_scratch_reg = (writes_sgpr && reads_sgpr) ||
+ (ctx.program->chip_class <= GFX7 && reads_subdword);
+ if (!needs_scratch_reg)
return;
Pseudo_instruction *pi = (Pseudo_instruction *)instr;
reg = ctx.max_used_sgpr + 1;
for (; reg < ctx.program->max_reg_demand.sgpr && reg_file[reg]; reg++)
;
- assert(reg < ctx.program->max_reg_demand.sgpr);
+ if (reg == ctx.program->max_reg_demand.sgpr) {
+ assert(reads_subdword && reg_file[m0] == 0);
+ reg = m0;
+ }
}
adjust_max_used_regs(ctx, s1, reg);