From cbedd169c2d0848fb5bedb076ae401946067ec91 Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Sun, 1 Aug 2021 18:37:45 +0100 Subject: [PATCH] bit of a big update, remove all bit-reversed LD operations, replace with LD-with-shift, and fix LDST, DCT and FFT unit tests to use new bitrev-with-half-swap REMAP modes --- openpower/isa/simplev.mdwn | 7 +- openpower/isa/svfixedload.mdwn | 22 ++--- openpower/isa/svfpload.mdwn | 16 ++-- src/openpower/consts.py | 2 +- src/openpower/decoder/isa/caller.py | 30 ++---- src/openpower/decoder/isa/remap_dct_yield.py | 10 +- src/openpower/decoder/isa/svshape.py | 2 +- .../decoder/isa/test_caller_svp64_dct.py | 4 +- .../decoder/isa/test_caller_svp64_fft.py | 17 +++- .../decoder/isa/test_caller_svp64_ldst.py | 93 +++++++------------ src/openpower/decoder/power_decoder2.py | 8 +- src/openpower/decoder/power_enums.py | 2 +- src/openpower/decoder/power_svp64_rm.py | 6 +- src/openpower/sv/trans/svp64.py | 40 ++++---- 14 files changed, 114 insertions(+), 145 deletions(-) diff --git a/openpower/isa/simplev.mdwn b/openpower/isa/simplev.mdwn index 0ad1da1b..91a53841 100644 --- a/openpower/isa/simplev.mdwn +++ b/openpower/isa/simplev.mdwn @@ -245,13 +245,16 @@ Pseudo-code: SVSHAPE1[28:29] <- 0b10 # ci schedule SVSHAPE2[28:29] <- 0b11 # size schedule # set schedule up for iDCT / DCT inverse of half-swapped ordering - if (SVRM = 0b0110) | (SVRM = 0b1110) then + if (SVRM = 0b0110) | (SVRM = 0b1110) | (SVRM = 0b1111) then vlen[0:6] <- (0b00 || SVxd) + 0b0000001 # set up template in SVSHAPE0 SVSHAPE0[0:5] <- (0b0 || SVxd) # xdim if (SVRM = 0b1110) then SVSHAPE0[18:20] <- 0b001 # DCT opposite half-swap - SVSHAPE0[30:31] <- 0b01 # DCT/FFT mode + if (SVRM = 0b1111) then + SVSHAPE0[30:31] <- 0b01 # FFT mode + else + SVSHAPE0[30:31] <- 0b11 # DCT mode SVSHAPE0[6:11] <- 0b000101 # DCT "half-swap" mode # set VL, MVL and Vertical-First SVSTATE[0:6] <- vlen diff --git a/openpower/isa/svfixedload.mdwn b/openpower/isa/svfixedload.mdwn index 
f8d9de9c..d97eca95 100644 --- a/openpower/isa/svfixedload.mdwn +++ b/openpower/isa/svfixedload.mdwn @@ -12,7 +12,7 @@ Pseudo-code: b <- (RA|0) n <- (RC)[58:63] - EA <- b + SHL64(EXTS(SVD), n) + EA <- b + SHL64(srcstep * EXTS(SVD), n) RT <- [0]*56 || MEM(EA, 1) Special Registers Altered: @@ -28,7 +28,7 @@ SVD-Form Pseudo-code: n <- (RC)[58:63] - EA <- (RA) + SHL64(EXTS(SVD), n) + EA <- (RA) + SHL64(srcstep * EXTS(SVD), n) RT <- [0] * 56 || MEM(EA, 1) RA <- EA @@ -46,7 +46,7 @@ Pseudo-code: b <- (RA|0) n <- (RC)[58:63] - EA <- b + SHL64(EXTS(SVD), n) + EA <- b + SHL64(srcstep * EXTS(SVD), n) RT <- [0] * 48 || MEM(EA, 2) Special Registers Altered: @@ -62,7 +62,7 @@ SVD-Form Pseudo-code: n <- (RC)[58:63] - EA <- (RA) + SHL64(EXTS(SVD), n) + EA <- (RA) + SHL64(srcstep * EXTS(SVD), n) RT <- [0] * 48 || MEM(EA, 2) RA <- EA @@ -80,7 +80,7 @@ Pseudo-code: b <- (RA|0) n <- (RC)[58:63] - EA <- b + SHL64(EXTS(SVD), n) + EA <- b + SHL64(srcstep * EXTS(SVD), n) RT <- EXTS(MEM(EA, 2)) Special Registers Altered: @@ -96,7 +96,7 @@ SVD-Form Pseudo-code: n <- (RC)[58:63] - EA <- (RA) + SHL64(EXTS(SVD), n) + EA <- (RA) + SHL64(srcstep * EXTS(SVD), n) RT <- EXTS(MEM(EA, 2)) RA <- EA @@ -114,7 +114,7 @@ Pseudo-code: b <- (RA|0) n <- (RC)[58:63] - EA <- b + SHL64(EXTS(SVD), n) + EA <- b + SHL64(srcstep * EXTS(SVD), n) RT <- [0] * 32 || MEM(EA, 4) Special Registers Altered: @@ -130,7 +130,7 @@ SVD-Form Pseudo-code: n <- (RC)[58:63] - EA <- (RA) + SHL64(EXTS(SVD), n) + EA <- (RA) + SHL64(srcstep * EXTS(SVD), n) RT <- [0]*32 || MEM(EA, 4) RA <- EA @@ -148,7 +148,7 @@ Pseudo-code: b <- (RA|0) n <- (RC)[58:63] - EA <- b + SHL64(bitrev(EXTS(SVDS || 0b00), n) + EA <- b + SHL64(srcstep * EXTS(SVDS || 0b00), n) RT <- EXTS(MEM(EA, 4)) Special Registers Altered: @@ -165,7 +165,7 @@ Pseudo-code: b <- (RA|0) n <- (RC)[58:63] - EA <- b + SHL64(EXTS(SVDS || 0b00), n) + EA <- b + SHL64(srcstep * EXTS(SVDS || 0b00), n) RT <- MEM(EA, 8) Special Registers Altered: @@ -181,7 +181,7 @@ SVDS-Form Pseudo-code: 
n <- (RC)[58:63] - EA <- (RA) + SHL64(EXTS(SVDS || 0b00), n) + EA <- (RA) + SHL64(srcstep * EXTS(SVDS || 0b00), n) RT <- MEM(EA, 8) RA <- EA diff --git a/openpower/isa/svfpload.mdwn b/openpower/isa/svfpload.mdwn index 533ec829..d8856c50 100644 --- a/openpower/isa/svfpload.mdwn +++ b/openpower/isa/svfpload.mdwn @@ -6,13 +6,13 @@ SVD-Form -* lfsbr FRT,SVD(RA),RC +* lfssh FRT,SVD(RA),RC Pseudo-code: b <- (RA|0) n <- (RC)[58:63] - EA <- b + SHL64(bitrev(srcstep, VL) * EXTS(SVD), n) + EA <- b + SHL64(srcstep * EXTS(SVD), n) FRT <- DOUBLE(MEM(EA, 4)) Special Registers Altered: @@ -23,12 +23,12 @@ Special Registers Altered: SVD-Form -* lfsubr FRT,SVD(RA),RC +* lfsush FRT,SVD(RA),RC Pseudo-code: n <- (RC)[58:63] - EA <- (RA) + SHL64(bitrev(srcstep, VL) * EXTS(SVD), n) + EA <- (RA) + SHL64(srcstep * EXTS(SVD), n) FRT <- DOUBLE(MEM(EA, 4)) RA <- EA @@ -40,13 +40,13 @@ Special Registers Altered: SVD-Form -* lfdbr FRT,SVD(RA),RC +* lfdsh FRT,SVD(RA),RC Pseudo-code: b <- (RA|0) n <- (RC)[58:63] - EA <- b + SHL64(bitrev(srcstep, VL) * EXTS(SVD), n) + EA <- b + SHL64(srcstep * EXTS(SVD), n) FRT <- MEM(EA, 8) Special Registers Altered: @@ -57,12 +57,12 @@ Special Registers Altered: SVD-Form -* lfdubr FRT,SVD(RA),RC +* lfdush FRT,SVD(RA),RC Pseudo-code: n <- (RC)[58:63] - EA <- (RA) + SHL64(bitrev(srcstep, VL) * EXTS(SVD), n) + EA <- (RA) + SHL64(srcstep * EXTS(SVD), n) FRT <- MEM(EA, 8) RA <- EA diff --git a/src/openpower/consts.py b/src/openpower/consts.py index 5c60bd97..747c9d12 100644 --- a/src/openpower/consts.py +++ b/src/openpower/consts.py @@ -222,7 +222,7 @@ class SVP64MODEb: # mode bits MOD2_MSB = 0 MOD2_LSB = 1 - LDST_BITREV = 2 # set =1 for bitreverse mode + LDST_SHIFT = 2 # set =1 for shift mode # when predicate not set: 0=ignore/skip 1=zero DZ = 3 # for destination SZ = 4 # for source diff --git a/src/openpower/decoder/isa/caller.py b/src/openpower/decoder/isa/caller.py index 8150785b..413494ca 100644 --- a/src/openpower/decoder/isa/caller.py +++ 
b/src/openpower/decoder/isa/caller.py @@ -28,7 +28,7 @@ from openpower.decoder.power_enums import (spr_dict, spr_byname, XER_bits, from openpower.decoder.power_enums import SVPtype -from openpower.decoder.helpers import (exts, gtu, ltu, undefined, bitrev) +from openpower.decoder.helpers import (exts, gtu, ltu, undefined) from openpower.consts import PIb, MSRb # big-endian (PowerISA versions) from openpower.consts import SVP64CROffs from openpower.decoder.power_svp64 import SVP64RM, decode_extra @@ -931,8 +931,6 @@ class ISACaller: yield self.dec2.dec.raw_opcode_in.eq(ins & 0xffffffff) # v3.0B suffix yield self.dec2.sv_rm.eq(sv_rm) # svp64 prefix yield Settle() - # store this for use in get_src_dststeps() - self.ldstmode = yield self.dec2.rm_dec.ldstmode def execute_one(self): """execute one instruction @@ -1318,13 +1316,13 @@ class ISACaller: replace_d = False # update / replace constant in pseudocode if self.is_svp64_mode: ldstmode = yield self.dec2.rm_dec.ldstmode - # bitreverse mode reads SVD (or SVDS - TODO) + # shift mode reads SVD (or SVDS - TODO) # *BUT*... 
because this is "overloading" of LD operations, # it gets *STORED* into D (or DS, TODO) - if ldstmode == SVP64LDSTmode.BITREVERSE.value: + if ldstmode == SVP64LDSTmode.SHIFT.value: imm = yield self.dec2.dec.fields.FormSVD.SVD[0:11] imm = exts(imm, 11) # sign-extend to integer - log ("bitrev SVD", imm) + log ("shift SVD", imm) replace_d = True else: if info.form == 'DS': @@ -1348,11 +1346,11 @@ class ISACaller: offsmul = dststep log("D-field dst", imm, offsmul) # bit-reverse mode, rev already done through get_src_dst_steps() - if ldstmode == SVP64LDSTmode.BITREVERSE.value: + if ldstmode == SVP64LDSTmode.SHIFT.value: # manually look up RC, sigh RC = yield self.dec2.dec.RC[0:5] RC = self.gpr(RC) - log ("LD-BITREVERSE:", "VL", vl, + log ("LD-SHIFT:", "VL", vl, "RC", RC.value, "imm", imm, "offs", bin(offsmul), ) @@ -1665,21 +1663,9 @@ class ISACaller: log (" new dststep", dststep) def get_src_dststeps(self): - """gets srcstep and dststep but performs bit-reversal on srcstep if - required. use this ONLY to perform calculations, do NOT update - SVSTATE with the bit-reversed value of srcstep - - ARGH, had to store self.ldstmode and VL due to yield issues + """gets srcstep and dststep """ - srcstep, dststep = self.new_srcstep, self.new_dststep - if self.is_svp64_mode: - if self.ldstmode == SVP64LDSTmode.BITREVERSE.value: - vl = self.svstate.vl - log ("SRCSTEP-BITREVERSE:", "VL", vl, "srcstep", srcstep, - "rev", bin(bitrev(srcstep, vl))) - srcstep = bitrev(srcstep, vl) - - return (srcstep, dststep) + return self.new_srcstep, self.new_dststep def update_new_svstate_steps(self): # note, do not get the bit-reversed srcstep here! 
diff --git a/src/openpower/decoder/isa/remap_dct_yield.py b/src/openpower/decoder/isa/remap_dct_yield.py index e97f2ec6..c2758444 100644 --- a/src/openpower/decoder/isa/remap_dct_yield.py +++ b/src/openpower/decoder/isa/remap_dct_yield.py @@ -57,13 +57,15 @@ def iterate_dct_inner_halfswap_loadstore(SVSHAPE): ji = list(range(n)) levels = n.bit_length() - 1 - if SVSHAPE.submode2 == 0b001: + ri = [reverse_bits(i, levels) for i in range(n)] + + if SVSHAPE.mode == 0b01: # FFT, bitrev only + ji = [ji[ri[i]] for i in range(n)] + elif SVSHAPE.submode2 == 0b001: + ji = [ji[ri[i]] for i in range(n)] ji = halfrev2(ji, True) else: ji = halfrev2(ji, False) - - if False: # swap: TODO, add extra bit-reverse mode - ri = [reverse_bits(i, levels) for i in range(n)] ji = [ji[ri[i]] for i in range(n)] # invert order if requested diff --git a/src/openpower/decoder/isa/svshape.py b/src/openpower/decoder/isa/svshape.py index e445f583..4d4cb1d2 100644 --- a/src/openpower/decoder/isa/svshape.py +++ b/src/openpower/decoder/isa/svshape.py @@ -128,7 +128,7 @@ class SVSHAPE(SelectableInt): iterate_fn = iterate_dct_outer_butterfly_indices elif self.ydimsz in [5, 13]: iterate_fn = iterate_dct_inner_costable_indices - elif self.ydimsz == 6: + elif self.ydimsz in [6, 14, 15]: iterate_fn = iterate_dct_inner_halfswap_loadstore # create a **NEW** iterator each time this is called return iterate_fn(deepcopy(self)) diff --git a/src/openpower/decoder/isa/test_caller_svp64_dct.py b/src/openpower/decoder/isa/test_caller_svp64_dct.py index ccdae236..f19b1d3e 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_dct.py +++ b/src/openpower/decoder/isa/test_caller_svp64_dct.py @@ -921,7 +921,7 @@ class DCTTestCase(FHDLTestCase): """>>> lst = [# LOAD bit-reversed with half-swap "svshape 8, 1, 1, 6, 0", "svremap 1, 0, 0, 0, 0, 0, 0, 1", - "sv.lfsbr 0.v, 4(1), 2", + "sv.lfssh 0.v, 4(1), 2", # Inner butterfly, twin +/- MUL-ADD-SUB "svremap 31, 1, 0, 2, 0, 1, 1", "svshape 8, 1, 1, 4, 0", @@ -939,7 +939,7 @@ 
class DCTTestCase(FHDLTestCase): lst = SVP64Asm( ["addi 1, 0, 0x000", "svshape 8, 1, 1, 6, 0", "svremap 1, 0, 0, 0, 0, 0, 0, 1", - "sv.lfsbr 0.v, 4(1), 2", + "sv.lfssh 0.v, 4(1), 2", "svremap 31, 1, 0, 2, 0, 1, 1", "svshape 8, 1, 1, 4, 0", "sv.fdmadds 0.v, 0.v, 0.v, 8.v", diff --git a/src/openpower/decoder/isa/test_caller_svp64_fft.py b/src/openpower/decoder/isa/test_caller_svp64_fft.py index 28aca452..25970efd 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_fft.py +++ b/src/openpower/decoder/isa/test_caller_svp64_fft.py @@ -530,10 +530,16 @@ class FFTTestCase(FHDLTestCase): however it turns out that they can be *merged*, and for the first one (sv.fmadds/sv.fmsubs) the scalar arguments (RT, RB) - *ignore* their REMAPs (by definition), and for the second - one (sv.ffads) exactly the right REMAPs are also ignored! + *ignore* their REMAPs (by definition, because you can't REMAP + scalar operands), and for the second one (sv.ffads) exactly the + right REMAPs are also ignored! + therefore we can merge: + "svremap 5, 1, 0, 2, 0, 0, 1", + "svremap 26, 0, 0, 0, 0, 1, 1", + into: "svremap 31, 1, 0, 2, 0, 1, 1", + and save one instruction. 
""" lst = SVP64Asm( [ # set triple butterfly mode with persistent "REMAP" @@ -682,7 +688,7 @@ class FFTTestCase(FHDLTestCase): def test_sv_remap_fpmadds_fft_ldst(self): """>>>lst = ["setvl 0, 0, 8, 0, 1, 1", - "sv.lfsbr 0.v, 4(0), 20", # bit-reversed + "sv.lfssh 0.v, 4(0), 20", # bit-reversed "svshape 8, 1, 1, 1, 0", "svremap 31, 1, 0, 2, 0, 1, 0", "sv.ffmadds 0.v, 0.v, 0.v, 8.v" @@ -690,8 +696,9 @@ class FFTTestCase(FHDLTestCase): runs a full in-place O(N log2 N) butterfly schedule for Discrete Fourier Transform, using bit-reversed LD/ST """ - lst = SVP64Asm( ["setvl 0, 0, 8, 0, 1, 1", - "sv.lfsbr 0.v, 4(0), 20", # bit-reversed + lst = SVP64Asm( ["svshape 8, 1, 1, 15, 0", + "svremap 1, 0, 0, 0, 0, 0, 0, 0", + "sv.lfssh 0.v, 4(0), 20", # shifted "svshape 8, 1, 1, 1, 0", "svremap 31, 1, 0, 2, 0, 1, 0", "sv.ffmadds 0.v, 0.v, 0.v, 8.v" diff --git a/src/openpower/decoder/isa/test_caller_svp64_ldst.py b/src/openpower/decoder/isa/test_caller_svp64_ldst.py index eee4d51f..c76fc874 100644 --- a/src/openpower/decoder/isa/test_caller_svp64_ldst.py +++ b/src/openpower/decoder/isa/test_caller_svp64_ldst.py @@ -118,7 +118,7 @@ class DecoderTestCase(FHDLTestCase): self.assertEqual(sim.gpr(12), SelectableInt(0x1234, 64)) self.assertEqual(sim.gpr(13), SelectableInt(0x1235, 64)) - def test_sv_load_store_bitreverse(self): + def test_sv_load_store_shifted(self): """>>> lst = ["addi 1, 0, 0x0010", "addi 2, 0, 0x0004", "addi 3, 0, 0x0002", @@ -127,21 +127,11 @@ class DecoderTestCase(FHDLTestCase): "addi 7, 0, 0x303", "addi 8, 0, 0x404", "sv.stw 5.v, 0(1)", - "sv.lwzbr 12.v, 4(1), 2"] + "sv.lwzsh 12.v, 4(1), 2"] - note: bitreverse mode is... odd. 
it's the butterfly generator - from Cooley-Tukey FFT: - https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms - - bitreverse LD is computed as: + shifted LD is computed as: for i in range(VL): - EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC - - bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11 - produces 0 2 1 3 in binary 0b00 0b10 0b01 0b11 - - and thus creates the butterfly needed for one iteration of FFT. - the RC (shift) is to be able to offset the LDs by Radix-2 spans + EA = (RA|0) + (EXTS(D) * LDSTsize * i) << RC """ lst = SVP64Asm(["addi 1, 0, 0x0010", "addi 2, 0, 0x0000", @@ -150,7 +140,7 @@ class DecoderTestCase(FHDLTestCase): "addi 7, 0, 0x303", "addi 8, 0, 0x404", "sv.stw 5.v, 0(1)", # scalar r1 + 0 + wordlen*offs - "sv.lwzbr 12.v, 4(1), 2"]) # bit-reversed + "sv.lwzsh 12.v, 4(1), 2"]) # shifted lst = list(lst) # SVSTATE (in this case, VL=4) @@ -173,21 +163,21 @@ class DecoderTestCase(FHDLTestCase): self.assertEqual(sim.gpr(7), SelectableInt(0x303, 64)) self.assertEqual(sim.gpr(8), SelectableInt(0x404, 64)) # r1=0x10, RC=0, offs=4: contents of memory expected at: - # element 0: EA = r1 + bitrev(0b00)*4 => 0x10 + 0b00*4 => 0x10 - # element 1: EA = r1 + bitrev(0b01)*4 => 0x10 + 0b10*4 => 0x18 - # element 2: EA = r1 + bitrev(0b10)*4 => 0x10 + 0b01*4 => 0x14 - # element 3: EA = r1 + bitrev(0b11)*4 => 0x10 + 0b10*4 => 0x1c + # element 0: EA = r1 + 0b00*4 => 0x10 + 0b00*4 => 0x10 + # element 1: EA = r1 + 0b01*4 => 0x10 + 0b01*4 => 0x14 + # element 2: EA = r1 + 0b10*4 => 0x10 + 0b10*4 => 0x18 + # element 3: EA = r1 + 0b11*4 => 0x10 + 0b11*4 => 0x1c # therefore loaded from (bit-reversed indexing): # r9 => mem[0x10] which was stored from r5 # r10 => mem[0x18] which was stored from r6 # r11 => mem[0x18] which was stored from r7 # r12 => mem[0x1c] which was stored from r8 self.assertEqual(sim.gpr(12), SelectableInt(0x101, 64)) - self.assertEqual(sim.gpr(13),
SelectableInt(0x303, 64)) - self.assertEqual(sim.gpr(14), SelectableInt(0x202, 64)) + self.assertEqual(sim.gpr(13), SelectableInt(0x202, 64)) + self.assertEqual(sim.gpr(14), SelectableInt(0x303, 64)) self.assertEqual(sim.gpr(15), SelectableInt(0x404, 64)) - def test_sv_load_store_bitreverse_fp(self): + def test_sv_load_store_shifted_fp(self): """>>> lst = ["addi 1, 0, 0x0010", "addi 2, 0, 0x0004", "addi 3, 0, 0x0002", @@ -198,19 +188,9 @@ class DecoderTestCase(FHDLTestCase): "sv.std 5.v, 0(1)", "sv.lfdbr 12.v, 4(1), 2"] - note: bitreverse mode is... odd. it's the butterfly generator - from Cooley-Tukey FFT: - https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms - - bitreverse LD is computed as: + shifted LD is computed as: for i in range(VL): - EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC - - bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11 - produces 0 2 1 3 in binary 0b00 0b10 0b01 0b11 - - and thus creates the butterfly needed for one iteration of FFT. 
- the RC (shift) is to be able to offset the LDs by Radix-2 spans + EA = (RA|0) + (EXTS(D) * LDSTsize * i) << RC """ lst = SVP64Asm(["addi 1, 0, 0x0010", "addi 2, 0, 0x0000", @@ -219,7 +199,7 @@ class DecoderTestCase(FHDLTestCase): "addi 7, 0, 0x303", "addi 8, 0, 0x404", "sv.std 5.v, 0(1)", # scalar r1 + 0 + wordlen*offs - "sv.lfdbr 12.v, 8(1), 2"]) # bit-reversed + "sv.lfdsh 12.v, 8(1), 2"]) # shifted lst = list(lst) # SVSTATE (in this case, VL=4) @@ -258,35 +238,26 @@ class DecoderTestCase(FHDLTestCase): # r11 => mem[0x18] which was stored from r7 # r12 => mem[0x1c] which was stored from r8 self.assertEqual(sim.fpr(12), SelectableInt(0x101, 64)) - self.assertEqual(sim.fpr(13), SelectableInt(0x303, 64)) - self.assertEqual(sim.fpr(14), SelectableInt(0x202, 64)) + self.assertEqual(sim.fpr(13), SelectableInt(0x202, 64)) + self.assertEqual(sim.fpr(14), SelectableInt(0x303, 64)) self.assertEqual(sim.fpr(15), SelectableInt(0x404, 64)) - def test_sv_load_store_bitreverse2(self): + def test_sv_load_store_shifted2(self): """>>> lst = ["addi 1, 0, 0x0010", "addi 2, 0, 0x0004", "addi 3, 0, 0x0002", "sv.stfs 4.v, 0(1)", - "sv.lfsbr 12.v, 4(1), 2"] - - note: bitreverse mode is... odd. it's the butterfly generator - from Cooley-Tukey FFT: - https://en.wikipedia.org/wiki/Cooley%E2%80%93Tukey_FFT_algorithm#Data_reordering,_bit_reversal,_and_in-place_algorithms + "sv.lfssh 12.v, 4(1), 2"] - bitreverse LD is computed as: + shifted LD is computed as: for i in range(VL): - EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC - - bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11 - produces 0 2 1 3 in binary 0b00 0b10 0b01 0b11 + EA = (RA|0) + (EXTS(D) * LDSTsize * i) << RC - and thus creates the butterfly needed for one iteration of FFT. 
- the RC (shift) is to be able to offset the LDs by Radix-2 spans """ lst = SVP64Asm(["addi 1, 0, 0x0010", "addi 2, 0, 0x0000", "sv.stfs 4.v, 0(1)", # scalar r1 + 0 + wordlen*offs - "sv.lfsbr 12.v, 4(1), 2"]) # bit-reversed + "sv.lfssh 12.v, 4(1), 2"]) # shifted (by zero, but hey) lst = list(lst) # SVSTATE (in this case, VL=4) @@ -306,8 +277,8 @@ class DecoderTestCase(FHDLTestCase): # expected results, remember that bit-reversed load has been done expected_fprs = deepcopy(fprs) expected_fprs[12] = fprs[4] # 0b00 -> 0b00 - expected_fprs[13] = fprs[6] # 0b01 -> 0b10 - expected_fprs[14] = fprs[5] # 0b10 -> 0b01 + expected_fprs[13] = fprs[5] # 0b01 -> 0b01 + expected_fprs[14] = fprs[6] # 0b10 -> 0b10 expected_fprs[15] = fprs[7] # 0b11 -> 0b11 with Program(lst, bigendian=False) as program: @@ -362,7 +333,7 @@ class DecoderTestCase(FHDLTestCase): "svshape 3, 3, 4, 0, 0", "svremap 1, 1, 2, 0, 0, 0, 0, 1", "sv.lwz 20.v, 0(1)", - #"sv.lwzbr 12.v, 4(1), 2", # bit-reversed + #"sv.lwzsh 12.v, 4(1), 2", # bit-reversed ]) lst = list(lst) @@ -419,11 +390,11 @@ class DecoderTestCase(FHDLTestCase): "sv.stw 5.v, 0(1)", "svshape 8, 1, 1, 6, 0", "svremap 31, 1, 2, 3, 0, 0, 0, 0", - "sv.lwzbr 12.v, 4(1), 2"] + "sv.lwzsh 12.v, 4(1), 2"] - bitreverse LD is computed as: + shifted LD is computed as: for i in range(VL): - EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC + EA = (RA|0) + (EXTS(D) * LDSTsize * i) << RC bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11 produces 0 2 1 3 in binary 0b00 0b10 0b01 0b11 @@ -448,7 +419,7 @@ class DecoderTestCase(FHDLTestCase): "svshape 8, 1, 1, 6, 0", "svremap 1, 0, 0, 0, 0, 0, 0, 1", #"setvl 0, 0, 8, 0, 1, 1", - "sv.lwzbr 12.v, 4(1), 2", # bit-reversed + "sv.lwzsh 12.v, 4(1), 2", # bit-reversed #"sv.lwz 12.v, 0(1)" ]) lst = list(lst) @@ -506,11 +477,11 @@ class DecoderTestCase(FHDLTestCase): "sv.stw 5.v, 0(1)", "svshape 8, 1, 1, 6, 0", "svremap 31, 1, 2, 3, 0, 0, 0, 0", - "sv.lwzbr 12.v, 4(1), 2"] + "sv.lwzsh 12.v, 4(1), 2"]
bitreverse LD is computed as: for i in range(VL): - EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC + EA = (RA|0) + (EXTS(D) * LDSTsize * i) << RC bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11 produces 0 2 1 3 in binary 0b00 0b10 0b01 0b11 @@ -535,7 +506,7 @@ class DecoderTestCase(FHDLTestCase): "svshape 8, 1, 1, 14, 0", "svremap 16, 0, 0, 0, 0, 0, 0, 1", #"setvl 0, 0, 8, 0, 1, 1", - "sv.lwzbr 12.v, 4(1), 2", # bit-reversed + "sv.lwzsh 12.v, 4(1), 2", # bit-reversed #"sv.lwz 12.v, 0(1)" ]) lst = list(lst) diff --git a/src/openpower/decoder/power_decoder2.py b/src/openpower/decoder/power_decoder2.py index 8bc3ec6b..b20a014a 100644 --- a/src/openpower/decoder/power_decoder2.py +++ b/src/openpower/decoder/power_decoder2.py @@ -1008,7 +1008,7 @@ class PowerDecodeSubset(Elaboratable): with m.If(self.is_svp64_mode & is_major_ld): # straight-up: "it's a LD". this gives enough info # for SVP64 RM Mode decoding to detect LD/ST, and - # consequently detect the BITREVERSE mode. sigh + # consequently detect the SHIFT mode. sigh comb += rm_dec.fn_in.eq(Function.LDST) with m.Else(): comb += rm_dec.fn_in.eq(fn) # decode needs to know Fn type @@ -1021,9 +1021,9 @@ class PowerDecodeSubset(Elaboratable): # main PowerDecoder2 determines if different SVP64 modes enabled if not self.final: - # if bit-reverse mode requested - bitrev = rm_dec.ldstmode == SVP64LDSTmode.BITREVERSE - comb += self.use_svp64_ldst_dec.eq(bitrev) + # if shift mode requested + shiftmode = rm_dec.ldstmode == SVP64LDSTmode.SHIFT + comb += self.use_svp64_ldst_dec.eq(shiftmode) # detect if SVP64 FFT mode enabled (really bad hack), # exclude fcfids and others # XXX this is a REALLY bad hack, REALLY has to be done better. 
diff --git a/src/openpower/decoder/power_enums.py b/src/openpower/decoder/power_enums.py index 4f999f73..aeca8efe 100644 --- a/src/openpower/decoder/power_enums.py +++ b/src/openpower/decoder/power_enums.py @@ -216,7 +216,7 @@ class SVP64LDSTmode(Enum): INDEXED = 1 ELSTRIDE = 2 UNITSTRIDE = 3 - BITREVERSE = 4 + SHIFT = 4 # supported instructions: make sure to keep up-to-date with CSV files diff --git a/src/openpower/decoder/power_svp64_rm.py b/src/openpower/decoder/power_svp64_rm.py index c052924d..fe78f65a 100644 --- a/src/openpower/decoder/power_svp64_rm.py +++ b/src/openpower/decoder/power_svp64_rm.py @@ -182,9 +182,9 @@ class SVP64RMModeDecode(Elaboratable): with m.If(self.rc_in): comb += els.eq(mode[SVP64MODE.ELS_FFIRST_PRED]) - # Bit-reversed Mode - with m.If(mode[SVP64MODE.LDST_BITREV]): - comb += self.ldstmode.eq(SVP64LDSTmode.BITREVERSE) + # Shifted Mode + with m.If(mode[SVP64MODE.LDST_SHIFT]): + comb += self.ldstmode.eq(SVP64LDSTmode.SHIFT) # RA is vectorised with m.Elif(self.ldst_ra_vec): comb += self.ldstmode.eq(SVP64LDSTmode.INDEXED) diff --git a/src/openpower/sv/trans/svp64.py b/src/openpower/sv/trans/svp64.py index deed174a..2898a3d9 100644 --- a/src/openpower/sv/trans/svp64.py +++ b/src/openpower/sv/trans/svp64.py @@ -290,17 +290,17 @@ class SVP64Asm: v30b_op = v30b_op[:-1] # sigh again, have to recognised LD/ST bit-reverse instructions - # this has to be "processed" to fit into a v3.0B without the "br" - # e.g. ldbr is actually ld - ldst_bitreverse = v30b_op.startswith("l") and v30b_op.endswith("br") + # this has to be "processed" to fit into a v3.0B without the "sh" + # e.g. 
ldsh is actually ld + ldst_shift = v30b_op.startswith("l") and v30b_op.endswith("sh") if v30b_op not in isa.instr: raise Exception("opcode %s of '%s' not supported" % \ (v30b_op, insn)) - if ldst_bitreverse: + if ldst_shift: # okaay we need to process the fields and make this: - # ldbr RT, SVD(RA), RC - 11 bits for SVD, 5 for RC + # ldsh RT, SVD(RA), RC - 11 bits for SVD, 5 for RC # into this: # ld RT, D(RA) - 16 bits # likewise same for SVDS (9 bits for SVDS, 5 for RC, 14 bits for DS) @@ -329,9 +329,9 @@ class SVP64Asm: newfields[1] = "%d(%s)" % (immed, RA) fields = newfields - # and strip off "br" from end, and add "br" to opmodes, instead + # and strip off "sh" from end, and add "sh" to opmodes, instead v30b_op = v30b_op[:-2] - opmodes.append("br") + opmodes.append("sh") log ("rewritten", v30b_op, opmodes, fields) if v30b_op not in svp64.instrs: @@ -630,9 +630,9 @@ class SVP64Asm: smmode, smask = decode_predicate(encmode[3:]) mmode = smmode has_smask = True - # bitreverse LD/ST - elif encmode.startswith("br"): - ldst_bitreverse = True + # shifted LD/ST + elif encmode.startswith("sh"): + ldst_shift = True # vec2/3/4 elif encmode.startswith("vec"): subvl = decode_subvl(encmode[3:]) @@ -730,10 +730,10 @@ class SVP64Asm: assert has_pmask or mask_m_specified, \ "dest zeroing requires a dest predicate" - # check LDST bitreverse, only available in "normal" mode - if is_ldst and ldst_bitreverse: + # check LDST shifted, only available in "normal" mode + if is_ldst and ldst_shift: assert sv_mode is None, \ - "LD bit-reverse cannot have modes (%s) applied" % sv_mode + "LD shift cannot have modes (%s) applied" % sv_mode ###################################### # "normal" mode @@ -743,9 +743,9 @@ class SVP64Asm: if is_ldst: # TODO: for now, LD/ST-indexed is ignored. 
mode |= ldst_elstride << SVP64MODE.ELS_NORMAL # element-strided - # bitreverse mode - if ldst_bitreverse: - mode |= 1 << SVP64MODE.LDST_BITREV + # shifted mode + if ldst_shift: + mode |= 1 << SVP64MODE.LDST_SHIFT else: # TODO, reduce and subvector mode # 00 1 dz CRM reduce mode (mapreduce), SUBVL=1 @@ -1089,8 +1089,8 @@ if __name__ == '__main__': lst = [ 'sv.addi win2.v, win.v, -1', 'sv.add./mrr 5.v, 2.v, 1.v', - #'sv.lhzbr 5.v, 11(9.v), 15', - #'sv.lwzbr 5.v, 11(9.v), 15', + #'sv.lhzsh 5.v, 11(9.v), 15', + #'sv.lwzsh 5.v, 11(9.v), 15', 'sv.ffmadds 6.v, 2.v, 4.v, 6.v', ] lst = [ @@ -1101,8 +1101,8 @@ if __name__ == '__main__': 'svshape 8, 1, 1, 1, 1', ] lst = [ - #'sv.lfsbr 4.v, 11(8.v), 15', - #'sv.lwzbr 4.v, 11(8.v), 15', + #'sv.lfssh 4.v, 11(8.v), 15', + #'sv.lwzsh 4.v, 11(8.v), 15', #'sv.svstep. 2.v, 4, 0', #'sv.fcfids. 48.v, 64.v', 'sv.fcoss. 80.v, 0.v', -- 2.30.2