From: Luke Kenneth Casson Leighton Date: Tue, 27 Jul 2021 15:40:38 +0000 (+0100) Subject: adding reduced COS table DCT test X-Git-Tag: xlen-bcd~200 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=40d4fe41d14633db571428a74afcc1866bd5a7cb;p=openpower-isa.git adding reduced COS table DCT test --- diff --git a/openpower/isa/simplev.mdwn b/openpower/isa/simplev.mdwn index fd0c242a..7d70691b 100644 --- a/openpower/isa/simplev.mdwn +++ b/openpower/isa/simplev.mdwn @@ -164,8 +164,10 @@ Pseudo-code: # set up FRB and FRS SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim SVSHAPE0[30:31] <- 0b01 # DCT/FFT mode - #if (SVRM = 0b0100) then - SVSHAPE0[6:11] <- 0b000001 # DCT Inner Butterfly mode + if (SVRM = 0b0100) then + SVSHAPE0[6:11] <- 0b000011 # DCT Inner Butterfly mode 4 + else + SVSHAPE0[6:11] <- 0b000001 # DCT Inner Butterfly mode 2 SVSHAPE0[18:20] <- 0b001 # DCT Inner Butterfly sub-mode SVSHAPE0[21:23] <- 0b001 # "inverse" on outer loop # copy diff --git a/src/openpower/decoder/isa/remap_dct_yield.py b/src/openpower/decoder/isa/remap_dct_yield.py index 344d3404..3b2bf64c 100644 --- a/src/openpower/decoder/isa/remap_dct_yield.py +++ b/src/openpower/decoder/isa/remap_dct_yield.py @@ -108,7 +108,7 @@ def iterate_dct_inner_butterfly_indices(SVSHAPE): # get indices to iterate over, in the required order n = SVSHAPE.lims[0] mode = SVSHAPE.lims[1] - #print ("inner butterfly", mode) + print ("inner butterfly", mode) # creating lists of indices to iterate over in each dimension # has to be done dynamically, because it depends on the size # first, the size-based loop (which can be done statically) @@ -145,6 +145,8 @@ def iterate_dct_inner_butterfly_indices(SVSHAPE): # start an infinite (wrapping) loop skip = 0 + k = 0 + k_start = 0 while True: for size in x_r: # loop over 3rd order dimension (size) x_end = size == x_r[-1] @@ -171,22 +173,30 @@ def iterate_dct_inner_butterfly_indices(SVSHAPE): jr = j_r[:hz2] #print ("xform jr", jr) # loop over 1st order dimension + k = k_start for 
ci, (jl, jh) in enumerate(zip(j, jr)): z_end = jl == j[-1] # now depending on MODE return the index. inner butterfly if SVSHAPE.skip == 0b00: # in [0b00, 0b10]: result = ri[ji[jl]] # lower half elif SVSHAPE.skip == 0b01: # in [0b01, 0b11]: - result = ri[ji[jh]] # upper half, reverse order - elif SVSHAPE.skip == 0b10: # - result = ci # coefficient helper - elif SVSHAPE.skip == 0b11: # - result = size # coefficient helper + result = ri[ji[jh]] # upper half + elif mode == 4: + # COS table pre-generated mode + if SVSHAPE.skip == 0b10: # + result = k # cos table offset + else: # mode 2 + # COS table generated on-demand ("Vertical-First") mode + if SVSHAPE.skip == 0b10: # + result = ci # coefficient helper + elif SVSHAPE.skip == 0b11: # + result = size # coefficient helper loopends = (z_end | ((y_end and z_end)<<1) | ((y_end and x_end and z_end)<<2)) yield result + SVSHAPE.offset, loopends + k += 1 # now in-place swap if inplace_mode: @@ -196,6 +206,9 @@ def iterate_dct_inner_butterfly_indices(SVSHAPE): tmp1, tmp2 = ji[jlh], ji[jh] ji[jlh], ji[jh] = tmp2, tmp1 + # new k_start point for cos tables( runs inside x_r loop NOT i loop) + k_start += halfsize + # python "yield" can be iterated. use this to make it clear how # the indices are generated by using natural-looking nested loops diff --git a/src/openpower/decoder/isa/test_caller_svp64_dct.py b/src/openpower/decoder/isa/test_caller_svp64_dct.py index 4b7c62b5..49fc8df7 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_dct.py +++ b/src/openpower/decoder/isa/test_caller_svp64_dct.py @@ -457,7 +457,8 @@ class DCTTestCase(FHDLTestCase): def test_sv_remap_dct_cos_precompute_inner_8(self): """pre-computes a DCT COS table, using the shorter costable - indices schedule + indices schedule. turns out, some COS values are repeated + in each layer of the DCT butterfly. the simpler (scalar) version is in test_caller_transcendentals.py (test_fp_coss_cvt), this is the SVP64 variant. 
TODO: really @@ -530,6 +531,75 @@ class DCTTestCase(FHDLTestCase): "err", err) self.assertTrue(err < 1e-6) + def test_sv_remap_fpmadds_dct_8_mode_4(self): + """>>> lst = ["svremap 31, 1, 0, 2, 0, 1, 1", + "svshape 8, 1, 1, 4, 0", + "sv.fdmadds 0.v, 0.v, 0.v, 8.v", + "svshape 8, 1, 1, 3, 0", + "sv.fadds 0.v, 0.v, 0.v" + ] + runs a full in-place 8-long O(N log2 N) DCT, both + inner and outer butterfly "REMAP" schedules. + uses shorter tables: FRC also needs to be on a Schedule + """ + lst = SVP64Asm( ["svremap 31, 1, 0, 2, 0, 1, 1", + "svshape 8, 1, 1, 4, 0", + "sv.fdmadds 0.v, 0.v, 0.v, 8.v", + "svshape 8, 1, 1, 3, 0", + "sv.fadds 0.v, 0.v, 0.v" + ]) + lst = list(lst) + + # array and coefficients to test + avi = [7.0, -9.8, 3.0, -32.3, 2.1, 3.6, 0.7, -0.2] + n = len(avi) + levels = n.bit_length() - 1 + ri = list(range(n)) + ri = [ri[reverse_bits(i, levels)] for i in range(n)] + av = halfrev2(avi, False) + av = [av[ri[i]] for i in range(n)] + ctable = [] + size = n + while size >= 2: + halfsize = size // 2 + for ci in range(halfsize): + ctable.append(math.cos((ci + 0.5) * math.pi / size) * 2.0) + size //= 2 + + # store in regfile + fprs = [0] * 32 + for i, a in enumerate(av): + fprs[i+0] = fp64toselectable(a) + for i, c in enumerate(ctable): + fprs[i+8] = fp64toselectable(1.0 / c) # invert + + with Program(lst, bigendian=False) as program: + sim = self.run_tst_program(program, initial_fprs=fprs) + print ("spr svshape0", sim.spr['SVSHAPE0']) + print (" xdimsz", sim.spr['SVSHAPE0'].xdimsz) + print (" ydimsz", sim.spr['SVSHAPE0'].ydimsz) + print (" zdimsz", sim.spr['SVSHAPE0'].zdimsz) + print ("spr svshape1", sim.spr['SVSHAPE1']) + print ("spr svshape2", sim.spr['SVSHAPE2']) + print ("spr svshape3", sim.spr['SVSHAPE3']) + + # outer iterative sum + res = transform2(avi) + + for i, expected in enumerate(res): + print ("i", i, float(sim.fpr(i)), "expected", expected) + for i, expected in enumerate(res): + # convert to Power single + expected = 
DOUBLE2SINGLE(fp64toselectable(expected)) + expected = float(expected) + actual = float(sim.fpr(i)) + # approximate error calculation, good enough test + # reason: we are comparing FMAC against FMUL-plus-FADD-or-FSUB + # and the rounding is different + err = abs((actual - expected) / expected) + print ("err", i, err) + self.assertTrue(err < 1e-5) + def run_tst_program(self, prog, initial_regs=None, svstate=None, initial_mem=None,