From f83e99e97fb600336cb9c7de9f9a15c12926eb47 Mon Sep 17 00:00:00 2001 From: Jacob Lifshay Date: Fri, 13 Oct 2023 15:13:15 -0700 Subject: [PATCH] WIP divmod: finished writing out asm knuth's algorithm d, still buggy --- src/openpower/test/bigint/powmod.py | 249 ++++++++++++++++++++++++---- 1 file changed, 215 insertions(+), 34 deletions(-) diff --git a/src/openpower/test/bigint/powmod.py b/src/openpower/test/bigint/powmod.py index ea4eb29d..fd406f78 100644 --- a/src/openpower/test/bigint/powmod.py +++ b/src/openpower/test/bigint/powmod.py @@ -206,7 +206,7 @@ class _DivModRegsRegexLogger: for k, v in changes.items(): if v is None: - del self.__tracked[k] + self.__tracked.pop(k, None) else: if isinstance(v, (tuple, list)): start_gpr, size = v @@ -339,9 +339,9 @@ class DivModKnuthAlgorithmD: "t_single": 8, "s_scalar": 10, "t_for_uv_shift": 0, - "n_for_unnorm": 32, + "n_for_unnorm": 16, "t_for_unnorm": 3, - "s_for_unnorm": 34, + "s_for_unnorm": 18, "qhat": 12, "rhat_lo": 14, "rhat_hi": 15, @@ -350,6 +350,9 @@ class DivModKnuthAlgorithmD: "j": 11, "qhat_denom": 18, "qhat_num_hi": 16, + "qhat_prod_lo": 15, + "qhat_prod_hi": 18, + "sub_len": 3, } self.num_size = num_size @@ -521,20 +524,34 @@ class DivModKnuthAlgorithmD: do_log(locals(), qhat_num_hi=None, qhat_denom=None) while rhat_hi == 0: - if qhat * vn[n - 2] > (rhat_lo << self.word_size) + un[j + n - 2]: - on_corner_case("qhat adjustment") - qhat -= 1 - do_log(locals()) - carry = (rhat_lo + vn[n - 1]) >= 2 ** self.word_size - rhat_lo += vn[n - 1] - rhat_lo %= 2 ** self.word_size - do_log(locals()) - rhat_hi = carry - do_log(locals()) - else: + index = n - 2 + do_log(locals()) + qhat_prod_lo = (qhat * vn[index]) % 2 ** self.word_size + do_log(locals(), qhat_prod_lo="qhat_prod_lo", rhat_hi=None) + qhat_prod_hi = (qhat * vn[index]) >> self.word_size + do_log(locals(), qhat_prod_hi="qhat_prod_hi") + if qhat_prod_hi < rhat_lo: break + index = j + n - 2 + do_log(locals()) + if qhat_prod_hi == rhat_lo: + if qhat_prod_lo <= un[index]: + break + on_corner_case("qhat adjustment") + do_log(locals(), index=None, + qhat_prod_lo=None, qhat_prod_hi=None) + qhat -= 1 + do_log(locals(), index="index") + index = n - 1 + do_log(locals()) + carry = (rhat_lo + vn[index]) >= 2 ** self.word_size + rhat_lo = (rhat_lo + vn[index]) % 2 ** self.word_size + do_log(locals()) + rhat_hi = carry + do_log(locals(), rhat_hi="rhat_hi") - do_log(locals(), rhat_lo=None, rhat_hi=None, index=None) + do_log(locals(), rhat_lo=None, rhat_hi=None, index=None, + qhat_prod_lo=None, qhat_prod_hi=None) # Step D4: multiply and subtract @@ -546,11 +563,18 @@ class DivModKnuthAlgorithmD: product[i] = t % 2 ** self.word_size t >>= self.word_size do_log(locals()) - product[n] = t - do_log(locals(), t=None) + index = n + do_log(locals(), index="index") + product[index] = t + do_log(locals(), t=None, index=None) t = 1 - for i in range(n + 1): + do_log(locals()) + sub_len = n + 1 + do_log(locals(), sub_len="sub_len") + VL = sub_len + do_log(locals(), sub_len=None) + for i in range(VL): # subfe not_product = ~product[i] % 2 ** self.word_size t += not_product + un[j + i] @@ -581,12 +605,16 @@ class DivModKnuthAlgorithmD: do_log(locals()) q[j] = qhat - do_log(locals()) + do_log(locals(), index=None) # Step D8: un-normalize - do_log(locals(), s="s_for_unnorm", vn=None, m=None, j=None) + + # move s and n + do_log(locals(), s="s_for_unnorm", n="n_for_unnorm", + vn=None, m=None, j=None) + r = [0] * self.r_size # remainder - do_log(locals(), r=("r", self.r_size), n="n_for_unnorm") + do_log(locals(), r=("r", self.r_size)) # r = un >> s t = 0 do_log(locals(), t="t_for_unnorm") @@ -628,6 +656,9 @@ class DivModKnuthAlgorithmD: j = self.regs["j"] qhat_num_hi = self.regs["qhat_num_hi"] qhat_denom = self.regs["qhat_denom"] + qhat_prod_lo = self.regs["qhat_prod_lo"] + qhat_prod_hi = self.regs["qhat_prod_hi"] + sub_len = self.regs["sub_len"] num_size = self.num_size denom_size = self.denom_size q_size = self.q_size @@ -670,7 +701,7 @@ class DivModKnuthAlgorithmD: yield f"ori 0, 0, {svshape_low}" yield f"mtspr {SVSHAPE0}, 0 # mtspr SVSHAPE0, 0" yield f"svremap 0o01, 0, 0, 0, 0, 0, 0" # enable SVSHAPE0 for RA - yield f"sv.cmpi/ff=ne *0, 1, *{u}, 0" + yield f"sv.cmpli/ff=ne *0, 1, *{u}, 0" yield f"setvl {m}, 0, 1, 0, 0, 0 # getvl {m}" # m = VL yield f"subfic {m}, {m}, {num_size}" # m = num_size - m @@ -686,12 +717,12 @@ class DivModKnuthAlgorithmD: yield f"ori 0, 0, {svshape_low}" yield f"mtspr {SVSHAPE0}, 0 # mtspr SVSHAPE0, 0" yield f"svremap 0o01, 0, 0, 0, 0, 0, 0" # enable SVSHAPE0 for RA - yield f"sv.cmpi/ff=ne *0, 1, *{v}, 0" + yield f"sv.cmpli/ff=ne *0, 1, *{v}, 0" yield f"setvl {n_scalar}, 0, 1, 0, 0, 0 # getvl {n_scalar}" # n = VL # n = denom_size - n yield f"subfic {n_scalar}, {n_scalar}, {denom_size}" - yield f"cmpi 0, 1, {n_scalar}, 1 # cmpdi {n_scalar}, 1" + yield f"cmpli 0, 1, {n_scalar}, 1 # cmpldi {n_scalar}, 1" yield "bc 4, 2, divmod_skip_sw_divisor # bne divmod_skip_sw_divisor" # Knuth's algorithm D requires the divisor to have length >= 2 @@ -710,7 +741,7 @@ class DivModKnuthAlgorithmD: yield "b divmod_return" yield "divmod_skip_sw_divisor:" - yield f"cmp 0, 1, {m}, {n_scalar} # cmpd {m}, {n_scalar}" + yield f"cmpl 0, 1, {m}, {n_scalar} # cmpld {m}, {n_scalar}" yield "bc 4, 0, divmod_skip_copy_r # bge divmod_skip_copy_r" # if m < n: @@ -727,19 +758,19 @@ class DivModKnuthAlgorithmD: # calculate amount to shift by -- count leading zeros yield f"addi {index}, {n_scalar}, -1" # index = n - 1 assert index == 3, "index must be r3" - yield f"setvl. 0, 0, {denom_size}, 0, 1, 1" # VL = denom_size + yield f"setvl 0, 0, {denom_size}, 0, 1, 1" # VL = denom_size yield f"sv.cntlzd/m=1<> self.word_size + yield f"mulhdu {rhat_hi}, {qhat}, {qhat_denom}" + # borrow = un[index] < rhat_lo + # rhat_lo = (un[index] - rhat_lo) % 2 ** self.word_size + assert index == 3, "index must be r3" + yield f"sv.subfc/m=1<> self.word_size + yield f"sv.mulhdu/m=1<= 2 ** self.word_size + # rhat_lo = (rhat_lo + vn[index]) % 2 ** self.word_size + yield f"setvl 0, 0, {vn_size}, 0, 1, 1" # VL = vn_size + assert index == 3, "index must be r3" + yield f"sv.addc/m=1<> 16 + offset_field = svshape.fsi['offset'] + assert 2 ** (len(offset_field) - 1) >= q_size, \ + "max needed offset won't fit in SVSHAPE" + mask_start_le = len(svshape) - offset_field.br[0] - 1 + mask_start = 64 - mask_start_le - 1 + last = len(offset_field) - 1 + shift_amount = len(svshape) - offset_field.br[last] - 1 + # insert j in offset field + yield f"rldic 0, {j}, {shift_amount}, {mask_start}" + # or in all the other bits + if svshape_high != 0: + yield f"oris 0, 0, {svshape_high}" + yield f"ori 0, 0, {svshape_low}" + yield f"mtspr {SVSHAPE0}, 0 # mtspr SVSHAPE0, 0" + yield f"svremap 0o12, 0, 0, 0, 0, 0, 0" # enable SVSHAPE0 for RB & RT + # un[j:] -= product + yield f"sv.subfe *{un}, *{product}, *{un}" + # need_fixup = not CA + + # Step D5: test remainder + + yield f"mcrxrx 0" # move CA to CR0.eq + # if need_fixup: + yield "bc 4, 2, divmod_skip_fixup # bne divmod_skip_fixup" + + # Step D6: add back + + yield f"addi {qhat}, {qhat}, -1" # qhat -= 1 + yield "addic 0, 0, 0" # t = 0 (t is CA) + yield f"setvl 0, {n_scalar}, {vn_size}, 0, 1, 1" # VL = n + yield f"svremap 0o11, 0, 0, 0, 0, 0, 0" # enable SVSHAPE0 for RA & RT + # un[j:] += vn + yield f"sv.adde *{un}, *{un}, *{vn}" + yield f"svremap 0o11, 0, 0, 0, 0, 0, 0" # enable SVSHAPE0 for RA & RT + # un[j + n] += t + yield f"sv.addze *{un}, *{un}" + + yield "divmod_skip_fixup:" + yield f"setvl 0, 0, {q_size}, 0, 1, 1" # VL = q_size + yield f"svremap 0o10, 0, 0, 0, 0, 0, 0" # enable SVSHAPE0 for RT + # q[j] = qhat + yield f"sv.or {q}, {qhat}, {qhat}" # Step D2 and Step D7: loop yield f"addic. {j}, {j}, -1" # j -= 1 yield f"bc 4, 0, divmod_loop # bge divmod_loop" - # FIXME: finish + # Step D8: un-normalize + + # move s and n + yield f"or {s_for_unnorm}, {s_scalar}, {s_scalar}" + yield f"or {n_for_unnorm}, {n_scalar}, {n_scalar}" + + # r = [0] * self.r_size # remainder + yield f"setvl 0, 0, {r_size}, 0, 1, 1" # VL = r_size + yield f"sv.addi *{r}, 0, 0" + + # r = un >> s + yield f"addi {t_for_unnorm}, 0, 0" # t = 0 + yield f"setvl 0, {n_for_unnorm}, {r_size}, 0, 1, 1" # VL = n + yield f"sv.dsrd/mrr *{r}, *{un}, {s_for_unnorm}, {t_for_unnorm}" yield "divmod_return:" yield "addi 1, 1, 176" # teardown stack frame @@ -968,8 +1149,8 @@ class PowModCases(TestAccumulatorBase): if n << 64 < n: skip = False if skip: - # FIXME: only part of the algorithm is implemented, - # so we skip the cases that we expect to fail + # FIXME: only part of the algorithm works, + # so we skip the cases that we know fail continue q, r = divmod(n, d) with self.subTest(n=f"{n:#_x}", d=f"{d:#_x}", -- 2.30.2