From 10c1b76d5fb1329faaaa4f61089a00420138673d Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton <lkcl@lkcl.net>
Date: Sun, 21 Nov 2021 21:10:47 +0000
Subject: [PATCH] complex.  TestRunner now does not work properly unless
 recognising that first, instructions can complete out-of-order from which
 they were issued, and second, that just because the Function Unit has said it
 accepted the instruction does not mean it has actually completed.

TestRunner therefore needed quite a bit of sorting out
---
 src/soc/debug/dmi.py               |   3 +-
 src/soc/simple/issuer.py           | 101 +++++++++++++++++------------
 src/soc/simple/test/test_runner.py |  14 ++++
 3 files changed, 76 insertions(+), 42 deletions(-)

diff --git a/src/soc/debug/dmi.py b/src/soc/debug/dmi.py
index 4d897699..8a6686df 100644
--- a/src/soc/debug/dmi.py
+++ b/src/soc/debug/dmi.py
@@ -98,6 +98,7 @@ class CoreDebug(Elaboratable):
         self.core_stop_o       = Signal()
         self.core_rst_o        = Signal()
         self.icache_rst_o      = Signal()
+        self.stopping_o = Signal(name="stopping")
 
         # Core status inputs
         self.terminate_i    = Signal()
@@ -135,7 +136,7 @@ class CoreDebug(Elaboratable):
         stat_reg = Signal(64)
 
         # Some internal latches
-        stopping     = Signal()
+        stopping     = self.stopping_o
         do_step      = Signal()
         do_reset     = Signal()
         do_icreset   = Signal()
diff --git a/src/soc/simple/issuer.py b/src/soc/simple/issuer.py
index 049cdb2d..671f113f 100644
--- a/src/soc/simple/issuer.py
+++ b/src/soc/simple/issuer.py
@@ -166,6 +166,10 @@ class TestIssuerInternal(Elaboratable):
         self.regreduce_en = (hasattr(pspec, "regreduce") and
                                             (pspec.regreduce == True))
 
+        # and if overlap requested
+        self.allow_overlap = (hasattr(pspec, "allow_overlap") and
+                                            (pspec.allow_overlap == True))
+
         # JTAG interface.  add this right at the start because if it's
         # added it *modifies* the pspec, by adding enable/disable signals
         # for parts of the rest of the core
@@ -275,12 +279,15 @@ class TestIssuerInternal(Elaboratable):
         # pulse to synchronize the simulator at instruction end
         self.insn_done = Signal()
 
+        # indicate any instruction still outstanding, in execution
+        self.any_busy = Signal()
+
         if self.svp64_en:
             # store copies of predicate masks
             self.srcmask = Signal(64)
             self.dstmask = Signal(64)
 
-    def fetch_fsm(self, m, core, pc, svstate, nia, is_svp64_mode,
+    def fetch_fsm(self, m, dbg, core, pc, svstate, nia, is_svp64_mode,
                         fetch_pc_o_ready, fetch_pc_i_valid,
                         fetch_insn_o_valid, fetch_insn_i_ready):
         """fetch FSM
@@ -301,7 +308,8 @@ class TestIssuerInternal(Elaboratable):
 
             # waiting (zzz)
             with m.State("IDLE"):
-                comb += fetch_pc_o_ready.eq(1)
+                with m.If(~dbg.stopping_o):
+                    comb += fetch_pc_o_ready.eq(1)
                 with m.If(fetch_pc_i_valid):
                     # instruction allowed to go: start by reading the PC
                     # capture the PC and also drop it into Insn Memory
@@ -321,47 +329,55 @@ class TestIssuerInternal(Elaboratable):
 
             # dummy pause to find out why simulation is not keeping up
             with m.State("INSN_READ"):
-                # one cycle later, msr/sv read arrives.  valid only once.
-                with m.If(~msr_read):
-                    sync += msr_read.eq(1) # yeah don't read it again
-                    sync += cur_state.msr.eq(self.state_r_msr.o_data)
-                with m.If(self.imem.f_busy_o): # zzz...
-                    # busy: stay in wait-read
-                    comb += self.imem.a_i_valid.eq(1)
-                    comb += self.imem.f_i_valid.eq(1)
+                if self.allow_overlap:
+                    stopping = dbg.stopping_o
+                else:
+                    stopping = Const(0)
+                with m.If(stopping):
+                    # stopping: jump back to idle
+                    m.next = "IDLE"
                 with m.Else():
-                    # not busy: instruction fetched
-                    insn = get_insn(self.imem.f_instr_o, cur_state.pc)
-                    if self.svp64_en:
-                        svp64 = self.svp64
-                        # decode the SVP64 prefix, if any
-                        comb += svp64.raw_opcode_in.eq(insn)
-                        comb += svp64.bigendian.eq(self.core_bigendian_i)
-                        # pass the decoded prefix (if any) to PowerDecoder2
-                        sync += pdecode2.sv_rm.eq(svp64.svp64_rm)
-                        sync += pdecode2.is_svp64_mode.eq(is_svp64_mode)
-                        # remember whether this is a prefixed instruction, so
-                        # the FSM can readily loop when VL==0
-                        sync += is_svp64_mode.eq(svp64.is_svp64_mode)
-                        # calculate the address of the following instruction
-                        insn_size = Mux(svp64.is_svp64_mode, 8, 4)
-                        sync += nia.eq(cur_state.pc + insn_size)
-                        with m.If(~svp64.is_svp64_mode):
-                            # with no prefix, store the instruction
-                            # and hand it directly to the next FSM
+                    # one cycle later, msr/sv read arrives.  valid only once.
+                    with m.If(~msr_read):
+                        sync += msr_read.eq(1) # yeah don't read it again
+                        sync += cur_state.msr.eq(self.state_r_msr.o_data)
+                    with m.If(self.imem.f_busy_o): # zzz...
+                        # busy: stay in wait-read
+                        comb += self.imem.a_i_valid.eq(1)
+                        comb += self.imem.f_i_valid.eq(1)
+                    with m.Else():
+                        # not busy: instruction fetched
+                        insn = get_insn(self.imem.f_instr_o, cur_state.pc)
+                        if self.svp64_en:
+                            svp64 = self.svp64
+                            # decode the SVP64 prefix, if any
+                            comb += svp64.raw_opcode_in.eq(insn)
+                            comb += svp64.bigendian.eq(self.core_bigendian_i)
+                            # pass the decoded prefix (if any) to PowerDecoder2
+                            sync += pdecode2.sv_rm.eq(svp64.svp64_rm)
+                            sync += pdecode2.is_svp64_mode.eq(is_svp64_mode)
+                            # remember whether this is a prefixed instruction, 
+                            # so the FSM can readily loop when VL==0
+                            sync += is_svp64_mode.eq(svp64.is_svp64_mode)
+                            # calculate the address of the following instruction
+                            insn_size = Mux(svp64.is_svp64_mode, 8, 4)
+                            sync += nia.eq(cur_state.pc + insn_size)
+                            with m.If(~svp64.is_svp64_mode):
+                                # with no prefix, store the instruction
+                                # and hand it directly to the next FSM
+                                sync += dec_opcode_i.eq(insn)
+                                m.next = "INSN_READY"
+                            with m.Else():
+                                # fetch the rest of the instruction from memory
+                                comb += self.imem.a_pc_i.eq(cur_state.pc + 4)
+                                comb += self.imem.a_i_valid.eq(1)
+                                comb += self.imem.f_i_valid.eq(1)
+                                m.next = "INSN_READ2"
+                        else:
+                            # not SVP64 - 32-bit only
+                            sync += nia.eq(cur_state.pc + 4)
                             sync += dec_opcode_i.eq(insn)
                             m.next = "INSN_READY"
-                        with m.Else():
-                            # fetch the rest of the instruction from memory
-                            comb += self.imem.a_pc_i.eq(cur_state.pc + 4)
-                            comb += self.imem.a_i_valid.eq(1)
-                            comb += self.imem.f_i_valid.eq(1)
-                            m.next = "INSN_READ2"
-                    else:
-                        # not SVP64 - 32-bit only
-                        sync += nia.eq(cur_state.pc + 4)
-                        sync += dec_opcode_i.eq(insn)
-                        m.next = "INSN_READY"
 
             with m.State("INSN_READ2"):
                 with m.If(self.imem.f_busy_o):  # zzz...
@@ -1013,6 +1029,9 @@ class TestIssuerInternal(Elaboratable):
         pc_changed = Signal() # note write to PC
         sv_changed = Signal() # note write to SVSTATE
 
+        # indicate to outside world if any FU is still executing
+        comb += self.any_busy.eq(core.n.o_data.any_busy_o) # any FU executing
+
         # read state either from incoming override or from regfile
         # TODO: really should be doing MSR in the same way
         pc = state_get(m, core_rst, self.pc_i,
@@ -1081,7 +1100,7 @@ class TestIssuerInternal(Elaboratable):
         # Issue is where the VL for-loop # lives.  the ready/valid
         # signalling is used to communicate between the four.
 
-        self.fetch_fsm(m, core, pc, svstate, nia, is_svp64_mode,
+        self.fetch_fsm(m, dbg, core, pc, svstate, nia, is_svp64_mode,
                        fetch_pc_o_ready, fetch_pc_i_valid,
                        fetch_insn_o_valid, fetch_insn_i_ready)
 
diff --git a/src/soc/simple/test/test_runner.py b/src/soc/simple/test/test_runner.py
index 74085731..03d4fe96 100644
--- a/src/soc/simple/test/test_runner.py
+++ b/src/soc/simple/test/test_runner.py
@@ -214,6 +214,10 @@ class HDLRunner(StateRunner):
             while not (yield self.issuer.insn_done):
                 yield
 
+            # okaaay long story: in overlap mode, PC is updated one cycle
+            # late.
+            #if self.dut.allow_overlap:
+            yield
             yield Settle()
 
             index = (yield self.issuer.cur_state.pc) // 4
@@ -239,6 +243,16 @@ class HDLRunner(StateRunner):
             if terminated:
                 break
 
+        # wait until all settled
+        #while (yield self.issuer.any_busy):
+        #    yield
+
+        if self.dut.allow_overlap:
+            # get last state, at end of run
+            state = yield from TestState("hdl", core, self.dut,
+                                        code)
+            hdl_states.append(state)
+
         return hdl_states
 
     def end_test(self):
-- 
2.30.2