argh, have LD-bitreverse select the offset from RA REMAP schedule
author Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Wed, 28 Jul 2021 13:24:21 +0000 (14:24 +0100)
committer Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Wed, 28 Jul 2021 13:24:21 +0000 (14:24 +0100)
openpower/isa/simplev.mdwn
src/openpower/decoder/isa/caller.py
src/openpower/decoder/isa/remap_dct_yield.py
src/openpower/decoder/isa/test_caller_svp64_ldst.py

index 1d670ea05fbafc20fa811998bd89fa18c96b90de..a48c405ecce47435c9f0143de18043db87fe901e 100644 (file)
@@ -235,7 +235,7 @@ Pseudo-code:
         SVSHAPE2[28:29] <- 0b11           # size schedule
     # set schedule up for DCT inverse of half-swapped ordering
     if (SVRM = 0b0110) then
-        vlen[0:6] <- (0b00 || SVxd)
+        vlen[0:6] <- (0b00 || SVxd) + 0b0000001
         # set up template in SVSHAPE0
         SVSHAPE0[0:5] <- (0b0 || SVxd)   # xdim
         SVSHAPE0[30:31] <- 0b01          # DCT/FFT mode
index 02a3abdccc93ce71e4c15f3ef50063f869f06040..8150785ba00d8210f54bbc0952468adcad01515d 100644 (file)
@@ -931,6 +931,8 @@ class ISACaller:
         yield self.dec2.dec.raw_opcode_in.eq(ins & 0xffffffff) # v3.0B suffix
         yield self.dec2.sv_rm.eq(sv_rm)                        # svp64 prefix
         yield Settle()
+        # store this for use in get_src_dststeps()
+        self.ldstmode = yield self.dec2.rm_dec.ldstmode
 
     def execute_one(self):
         """execute one instruction
@@ -1031,7 +1033,7 @@ class ISACaller:
         in the class for later use.  this to avoid problems with yield
         """
         # go through all iterators in lock-step, advance to next remap_idx
-        srcstep, dststep = self.new_srcstep, self.new_dststep
+        srcstep, dststep = self.get_src_dststeps()
         # get four SVSHAPEs. here we are hard-coding
         SVSHAPE0 = self.spr['SVSHAPE0']
         SVSHAPE1 = self.spr['SVSHAPE1']
@@ -1206,7 +1208,7 @@ class ISACaller:
                 self.update_nia()
                 self.update_pc_next()
                 return
-            srcstep, dststep = self.new_srcstep, self.new_dststep
+            srcstep, dststep = self.get_src_dststeps()
             pred_dst_zero = self.pred_dst_zero
             pred_src_zero = self.pred_src_zero
             vl = self.svstate.vl
@@ -1335,19 +1337,26 @@ class ISACaller:
             op = yield self.dec2.e.do.insn_type
             offsmul = 0
             if op == MicrOp.OP_LOAD.value:
-                offsmul = srcstep
-                log("D-field src", imm, offsmul)
+                if remap_active:
+                    offsmul = yield self.dec2.in1_step
+                    log("D-field REMAP src", imm, offsmul)
+                else:
+                    offsmul = srcstep
+                    log("D-field src", imm, offsmul)
             elif op == MicrOp.OP_STORE.value:
+                # XXX NOTE! no bit-reversed STORE! this should not ever be used
                 offsmul = dststep
                 log("D-field dst", imm, offsmul)
-            # bit-reverse mode
+            # bit-reverse mode, rev already done through get_src_dststeps()
             if ldstmode == SVP64LDSTmode.BITREVERSE.value:
                 # manually look up RC, sigh
                 RC = yield self.dec2.dec.RC[0:5]
                 RC = self.gpr(RC)
-                log ("RC", RC.value, "imm", imm, "offs", bin(offsmul),
-                     "rev", bin(bitrev(offsmul, vl)))
-                imm = SelectableInt((imm * bitrev(offsmul, vl)) << RC.value, 32)
+                log ("LD-BITREVERSE:", "VL", vl,
+                      "RC", RC.value, "imm", imm,
+                     "offs", bin(offsmul),
+                     )
+                imm = SelectableInt((imm * offsmul) << RC.value, 32)
             # Unit-Strided LD/ST adds offset*width to immediate
             elif ldstmode == SVP64LDSTmode.UNITSTRIDE.value:
                 ldst_len = yield self.dec2.e.do.data_len
@@ -1523,6 +1532,8 @@ class ISACaller:
                     yield from self.svstate_post_inc()
                 else:
                     log ("SVSTATE_NEXT: post-inc")
+                # use actual src/dst-step here to check end, do NOT
+                # use bit-reversed version
                 srcstep, dststep = self.new_srcstep, self.new_dststep
                 remaps = self.get_remap_indices()
                 remap_idxs = self.remap_idxs
@@ -1653,7 +1664,25 @@ class ISACaller:
         log ("    new srcstep", srcstep)
         log ("    new dststep", dststep)
 
+    def get_src_dststeps(self):
+        """gets srcstep and dststep but performs bit-reversal on srcstep if
+        required.  use this ONLY to perform calculations, do NOT update
+        SVSTATE with the bit-reversed value of srcstep
+
+        ARGH, had to store self.ldstmode and VL due to yield issues
+        """
+        srcstep, dststep = self.new_srcstep, self.new_dststep
+        if self.is_svp64_mode:
+            if self.ldstmode == SVP64LDSTmode.BITREVERSE.value:
+                vl = self.svstate.vl
+                log ("SRCSTEP-BITREVERSE:", "VL", vl, "srcstep", srcstep,
+                     "rev", bin(bitrev(srcstep, vl)))
+                srcstep = bitrev(srcstep, vl)
+
+        return (srcstep, dststep)
+
     def update_new_svstate_steps(self):
+        # note, do not get the bit-reversed srcstep here!
         srcstep, dststep = self.new_srcstep, self.new_dststep
 
         # update SVSTATE with new srcstep
index a6bc3a82fa53dab7740e525037ca847fa2db130b..606a881e179c3aa2e81ba6b05e91703d66ee34a4 100644 (file)
@@ -53,13 +53,21 @@ def iterate_dct_inner_halfswap_loadstore(SVSHAPE):
     # *indices* are referenced (two levels of indirection at the moment)
     # pre-reverse the data-swap list so that it *ends up* in the order 0123..
     ji = list(range(n))
-    ji = halfrev2(ji, True)
+
+    levels = n.bit_length() - 1
+    ji = halfrev2(ji, False)
+    if False: # swap: TODO, add extra bit-reverse mode
+        ri = [reverse_bits(i, levels) for i in range(n)]
+        ji = [ji[ri[i]] for i in range(n)]
+
 
     # invert order if requested
     if SVSHAPE.invxyz[0]:
         ji.reverse()
 
-    yield from ji
+    for i, jl in enumerate(ji):
+        y_end = jl == ji[-1]
+        yield jl, (0b111 if y_end else 0b000)
 
 
 # python "yield" can be iterated. use this to make it clear how
@@ -635,6 +643,31 @@ def demo():
     print ("outer butterfly")
     pprint_schedule_outer(schedule, n)
 
+    # for DCT half-swap LDs
+    # j schedule
+    SVSHAPE0 = SVSHAPE()
+    SVSHAPE0.lims = [xdim, 0b000101, zdim]
+    SVSHAPE0.mode = 0b01
+    SVSHAPE0.submode2 = 0
+    SVSHAPE0.skip = 0
+    SVSHAPE0.offset = 0       # experiment with different offset, here
+    SVSHAPE0.invxyz = [0,0,0] # inversion if desired
+
+    # expected results
+    levels = n.bit_length() - 1
+    avi = list(range(n))
+    ri = [reverse_bits(i, levels) for i in range(n)]
+    av = halfrev2(avi, False)
+    av = [av[ri[i]] for i in range(n)]
+
+
+    i0 = iterate_dct_inner_halfswap_loadstore(SVSHAPE0)
+    for idx, (jl) in enumerate(i0):
+        print ("inverse half-swap ld", idx, jl, av[idx])
+        if jl[1] == 0b111: # end
+            break
+
+
 # run the demo
 if __name__ == '__main__':
     demo()
index bc74f7b9c23b78bf54b137488487c6ae8d0cca8c..85b046403d6d91171f5cfe4835e556c8c71d5eaa 100644 (file)
@@ -14,6 +14,8 @@ from openpower.decoder.isa.test_caller import Register, run_tst
 from openpower.sv.trans.svp64 import SVP64Asm
 from openpower.consts import SVP64CROffs
 from openpower.decoder.helpers import fp64toselectable
+from openpower.decoder.isa.remap_dct_yield import (halfrev2, reverse_bits,
+                                                  )
 from copy import deepcopy
 
 
@@ -247,7 +249,7 @@ class DecoderTestCase(FHDLTestCase):
             #                       (24, 0x040400000303)])
             self._check_fpregs(sim, expected_fprs)
 
-    def test_sv_load_store_bitreverse_remap(self):
+    def test_sv_load_store_bitreverse_remap_matrix(self):
         """>>> lst = ["addi 1, 0, 0x0010",
                         "addi 2, 0, 0x0004",
                         "addi 3, 0, 0x0002",
@@ -256,7 +258,7 @@ class DecoderTestCase(FHDLTestCase):
                         "addi 7, 0, 0x303",
                         "addi 8, 0, 0x404",
                         "sv.stw 5.v, 0(1)",
-                        "svshape 4, 4, 4, 0, 0",
+                        "svshape 4, 4, 2, 0, 0",
                         "svremap 31, 1, 2, 3, 0, 0, 0, 0",
                         "sv.lwzbr 12.v, 4(1), 2"]
 
@@ -273,6 +275,9 @@ class DecoderTestCase(FHDLTestCase):
 
         and thus creates the butterfly needed for one iteration of FFT.
         the RC (shift) is to be able to offset the LDs by Radix-2 spans
+
+        in this case however it is REMAPed via a Matrix Multiply Schedule,
+        which is set up as 4x2.
         """
         lst = SVP64Asm(["addi 1, 0, 0x0010",
                         "addi 2, 0, 0x0000",
@@ -331,6 +336,98 @@ class DecoderTestCase(FHDLTestCase):
             self.assertEqual(sim.gpr(18), SelectableInt(0x404, 64))
             self.assertEqual(sim.gpr(19), SelectableInt(0x808, 64))
 
+    def test_sv_load_store_bitreverse_remap_halfswap(self):
+        """>>> lst = ["addi 1, 0, 0x0010",
+                        "addi 2, 0, 0x0000",
+                        "addi 4, 0, 0x001",
+                        "addi 5, 0, 0x102",
+                        "addi 6, 0, 0x203",
+                        "addi 7, 0, 0x304",
+                        "addi 8, 0, 0x405",
+                        "addi 9, 0, 0x506",
+                        "addi 10, 0, 0x607",
+                        "addi 11, 0, 0x708",
+                        "sv.stw 4.v, 0(1)",
+                        "svshape 8, 1, 1, 6, 0",
+                        "svremap 1, 0, 0, 0, 0, 0, 0, 1",
+                        "sv.lwzbr 12.v, 4(1), 2"]
+
+        bitreverse LD is computed as:
+        for i in range(VL):
+            EA = (RA|0) + (EXTS(D) * LDSTsize * bitreverse(i, VL)) << RC
+
+        bitreversal of 0 1 2 3 in binary 0b00 0b01 0b10 0b11
+        produces       0 2 1 3 in binary 0b00 0b10 0b01 0b11
+
+        and thus creates the butterfly needed for one iteration of FFT.
+        the RC (shift) is to be able to offset the LDs by Radix-2 spans
+
+        on top of the bit-reversal is a REMAP for half-swaps for DCT
+        in-place.
+        """
+        lst = SVP64Asm(["addi 1, 0, 0x0010",
+                        "addi 2, 0, 0x0000",
+                        "addi 4, 0, 0x001",
+                        "addi 5, 0, 0x102",
+                        "addi 6, 0, 0x203",
+                        "addi 7, 0, 0x304",
+                        "addi 8, 0, 0x405",
+                        "addi 9, 0, 0x506",
+                        "addi 10, 0, 0x607",
+                        "addi 11, 0, 0x708",
+                        "sv.stw 4.v, 0(1)",  # scalar r1 + 0 + wordlen*offs
+                        "svshape 8, 1, 1, 6, 0",
+                        "svremap 1, 0, 0, 0, 0, 0, 0, 1",
+                        #"setvl 0, 0, 8, 0, 1, 1",
+                        "sv.lwzbr 12.v, 4(1), 2",
+                        #"sv.lwz 12.v, 0(1)"  # bit-reversed
+                        ]) 
+        lst = list(lst)
+
+        # SVSTATE (in this case, VL=8)
+        svstate = SVP64State()
+        svstate.vl = 8 # VL
+        svstate.maxvl = 8 # MAXVL
+        print ("SVSTATE", bin(svstate.asint()))
+
+        regs = [0] * 64
+
+        avi = [0x001, 0x102, 0x203, 0x304, 0x405, 0x506, 0x607, 0x708]
+        n = len(avi)
+        levels = n.bit_length() - 1
+        ri = list(range(n))
+        ri = [ri[reverse_bits(i, levels)] for i in range(n)]
+        av = halfrev2(avi, False)
+        av = [av[ri[i]] for i in range(n)]
+
+        with Program(lst, bigendian=False) as program:
+            sim = self.run_tst_program(program, svstate=svstate,
+                                                initial_regs=regs)
+            mem = sim.mem.dump(printout=False)
+            print ("Mem")
+            print (mem)
+
+            self.assertEqual(mem, [(16, 0x010200000001),
+                                   (24, 0x030400000203),
+                                   (32, 0x050600000405),
+                                   (40, 0x070800000607)])
+            # from STs
+            for i in range(len(avi)):
+                print ("st gpr", i, sim.gpr(i+4), hex(avi[i]))
+                self.assertEqual(sim.gpr(i+4), avi[i])
+            self.assertEqual(sim.gpr(5), SelectableInt(0x102, 64))
+            self.assertEqual(sim.gpr(6), SelectableInt(0x203, 64))
+            self.assertEqual(sim.gpr(7), SelectableInt(0x304, 64))
+            self.assertEqual(sim.gpr(8), SelectableInt(0x405, 64))
+            self.assertEqual(sim.gpr(9), SelectableInt(0x506, 64))
+            self.assertEqual(sim.gpr(10), SelectableInt(0x607, 64))
+            self.assertEqual(sim.gpr(11), SelectableInt(0x708, 64))
+            # combination of bit-reversed load with a DCT half-swap REMAP
+            # schedule
+            for i in range(len(avi)):
+                print ("ld gpr", i, sim.gpr(i+12), hex(av[i]))
+                self.assertEqual(sim.gpr(i+12), av[i])
+
     def run_tst_program(self, prog, initial_regs=None,
                               svstate=None, initial_fprs=None):
         if initial_regs is None: