from nmigen.compat.sim import run_simulation
 from random import random
-from openpower.test.wb_get import wb_get
+from openpower.test.wb_get import wb_get_classic
 from openpower.test import wb_get as wbget
 from openpower.exceptions import LDSTExceptionTuple
 
     wbget.stop = True
 
 
-def test_pi_ld_misalign(pi,addr,data_len,msr):
+def test_pi_ld_misalign(pi, addr, data_len, msr):
     for i in range(0,data_len):
         ld_data, exctype, exc = yield from pi_ld(pi, addr+i, data_len, msr=msr)
         yield
-        if i == 0:
-            assert exc is None # use "is None" not "== None"
-            print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
-        else:
-            assert exc.alignment == 1
+        assert exc is None # use "is None" not "== None"
+        print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
+
+
+def test_pi_st_ld_misalign(pi, addr, data_len, msr):
+    data = 0x0102030405060708
+    for i in range(0, data_len):
+        exctype, exc = yield from pi_st(pi, addr+i, data, data_len, msr=msr)
+        print (exctype, exc)
+        assert exc is None # use "is None" not "== None"
+        ld_data, exctype, exc = yield from pi_ld(pi, addr+i, data_len, msr=msr)
+        yield
+        assert exc is None # use "is None" not "== None"
+        print("MISALIGN: test_pi_ld_misalign returned",hex(ld_data))
+        assert ld_data == data
 
 
 def _test_loadstore1_misalign(dut, mem):
 
     yield from test_pi_ld_misalign(pi,0,8,msr)
 
+    yield from test_pi_st_ld_misalign(pi,0,8,msr)
+
     wbget.stop = True
 
 
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(_test_loadstore1_ifetch_iface(m, mem)))
-    # add two wb_get processes onto the *same* memory dictionary.
+    # add two wb_get_classic processes onto the *same* memory dictionary.
     # this shouuuld work.... cross-fingers...
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
-    sim.add_sync_process(wrap(wb_get(icache.ibus, mem))) # ibus not bus
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.ibus, mem))) # ibus not bus
     with sim.write_vcd('test_loadstore1_ifetch_iface.vcd',
                       traces=[m.debug_status]): # include extra debug
         sim.run()
 
     icache = m.submodules.ldst.icache
     sim.add_sync_process(wrap(_test_loadstore1_ifetch(m, mem)))
-    # add two wb_get processes onto the *same* memory dictionary.
+    # add two wb_get_classic processes onto the *same* memory dictionary.
     # this shouuuld work.... cross-fingers...
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
-    sim.add_sync_process(wrap(wb_get(icache.bus, mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.bus, mem)))
     with sim.write_vcd('test_loadstore1_ifetch.vcd',
                       traces=[m.debug_status]): # include extra debug
         sim.run()
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(_test_loadstore1(m, mem)))
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
     with sim.write_vcd('test_loadstore1.vcd'):
         sim.run()
 
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(_test_loadstore1_microwatt_mmu_bin_test2(m, mem)))
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
     with sim.write_vcd('test_microwatt_mmu_test2.vcd'):
         sim.run()
 
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(_test_loadstore1_microwatt_mmu_bin_test5(m, mem)))
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
     with sim.write_vcd('test_microwatt_mmu_test5.vcd'):
         sim.run()
 
 
     ###########1122334455667788
     mem[0] = 0x0102030405060708
+    mem[8] = 0xffffffffffffffff
 
     sim.add_sync_process(wrap(_test_loadstore1_misalign(m, mem)))
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
     with sim.write_vcd('test_loadstore1_misalign.vcd'):
         sim.run()
+    print ("mem", mem)
 
 
 def test_loadstore1_invalid():
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(_test_loadstore1_invalid(m, mem)))
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
     with sim.write_vcd('test_loadstore1_invalid.vcd'):
         sim.run()
 
 
     icache = m.submodules.ldst.icache
     sim.add_sync_process(wrap(_test_loadstore1_ifetch_invalid(m, mem)))
-    # add two wb_get processes onto the *same* memory dictionary.
+    # add two wb_get_classic processes onto the *same* memory dictionary.
     # this shouuuld work.... cross-fingers...
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
-    sim.add_sync_process(wrap(wb_get(icache.bus, mem)))
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.bus, mem)))
     with sim.write_vcd('test_loadstore1_ifetch_invalid.vcd',
                       traces=[m.debug_status]): # include extra debug
         sim.run()
     sim.add_clock(1e-6)
 
     sim.add_sync_process(wrap(_test_loadstore1_ifetch_multi(m, mem)))
-    # add two wb_get processes onto the *same* memory dictionary.
+    # add two wb_get_classic processes onto the *same* memory dictionary.
     # this shouuuld work.... cross-fingers...
-    sim.add_sync_process(wrap(wb_get(cmpi.wb_bus(), mem)))
-    sim.add_sync_process(wrap(wb_get(icache.ibus, mem))) # ibus not bus
+    sim.add_sync_process(wrap(wb_get_classic(cmpi.wb_bus(), mem)))
+    sim.add_sync_process(wrap(wb_get_classic(icache.ibus, mem))) # ibus not bus
     with sim.write_vcd('test_loadstore1_ifetch_multi.vcd',
                       traces=[m.debug_status]): # include extra debug
         sim.run()
 if __name__ == '__main__':
     #test_loadstore1()
     #test_loadstore1_microwatt_mmu_bin_test2()
-    test_loadstore1_microwatt_mmu_bin_test5()
+    #test_loadstore1_microwatt_mmu_bin_test5()
     #test_loadstore1_invalid()
     #test_loadstore1_ifetch() #FIXME
     #test_loadstore1_ifetch_invalid()
     #test_loadstore1_ifetch_unit_iface() # guess: should be working
     #test_loadstore1_ifetch_multi()
-    #test_loadstore1_misalign()
+    test_loadstore1_misalign()
 
         self.dcbz          = Signal()
         self.raddr          = Signal(64)
         self.maddr          = Signal(64)
-        self.store_data    = Signal(128)   # 128-bit to cope with
-        self.load_data     = Signal(128)   # misalignment
+        self.store_data    = Signal(64)   # first half (aligned)
+        self.store_data2   = Signal(64)   # second half (misaligned)
+        self.load_data     = Signal(128)   # 128 to cope with misalignment
         self.load_data_delay = Signal(128) # perform 2 LD/STs
         self.byte_sel      = Signal(16)    # also for misaligned, 16-bit
         self.alignstate    = Signal(Misalign) # progress of alignment request
         # put data into comb which is picked up in main elaborate()
         m.d.comb += self.d_w_valid.eq(1)
         m.d.comb += self.store_data.eq(data)
+        m.d.sync += self.store_data2.eq(data[64:128])
         st_ok = self.done # TODO indicates write data is valid
         m.d.comb += self.pi.store_done.data.eq(self.d_in.store_done)
         m.d.comb += self.pi.store_done.ok.eq(1)
         # fsm skeleton
         with m.Switch(self.state):
             with m.Case(State.IDLE):
+                sync += self.load_data_delay.eq(0) # clear out
                 with m.If((self.d_validblip | self.instr_fault) &
                           ~exc.happened):
                     comb += self.busy.eq(1)
                         with m.If(ldst_r.load):
                             m.d.comb += self.load_data[0:63].eq(d_in.data)
                             sync += self.load_data_delay[0:64].eq(d_in.data)
+                        with m.Else():
+                            m.d.sync += d_out.data.eq(self.store_data2)
                         # mmm kinda cheating, make a 2nd blip.
                         # use an aligned version of the address
                         addr_aligned, z3 = Signal(64), Const(0, 3)
         if hasattr(dbus, "stall"):
             comb += dcache.bus.stall.eq(dbus.stall)
 
-        # update out d data when flag set
+        # update out d data when flag set, for first half (second done in FSM)
         with m.If(self.d_w_valid):
-            with m.If(ldst_r.alignstate == Misalign.WAITSECOND):
-                m.d.sync += d_out.data.eq(self.store_data[64:128])
-            with m.Else():
-                m.d.sync += d_out.data.eq(self.store_data[0:64])
+            m.d.sync += d_out.data.eq(self.store_data)
         #with m.Else():
         #    m.d.sync += d_out.data.eq(0)
         # unit test passes with that change