use microwatt-specific PLRU due to bug in nmutil version
[soc.git] / src / soc / experiment / dcache.py
index da5c6add64e4d3893dd235acb507680d0ca8ace9..6d2d3cf2ae425a243722f4ce882fc1935a11807f 100644 (file)
@@ -13,6 +13,8 @@ Links:
 
 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
 * https://bugs.libre-soc.org/show_bug.cgi?id=469
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
 
 """
 
@@ -25,7 +27,7 @@ sys.setrecursionlimit(1000000)
 from enum import Enum, unique
 
 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
-                    Record)
+                    Record, Memory)
 from nmutil.util import Display
 from nmigen.lib.coding import Decoder
 
@@ -49,8 +51,8 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBIOMasterOut, WBIOSlaveOut)
 
 from soc.experiment.cache_ram import CacheRam
-#from soc.experiment.plru import PLRU
-from nmutil.plru import PLRU, PLRUs
+from soc.experiment.plru import PLRU, PLRUs
+#from nmutil.plru import PLRU, PLRUs
 
 # for test
 from soc.bus.sram import SRAM
@@ -134,15 +136,18 @@ TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 WAY_BITS = log2_int(NUM_WAYS)
 
 # Example of layout for 32 lines of 64 bytes:
-layout = """\
+layout = f"""\
+  DCache Layout:
+ |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
+  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
   ..  tag    |index|  line  |
   ..         |   row   |    |
-  ..         |     |---|    | ROW_LINE_BITS  (3)
-  ..         |     |--- - --| LINE_OFF_BITS (6)
-  ..         |         |- --| ROW_OFF_BITS  (3)
-  ..         |----- ---|    | ROW_BITS      (8)
-  ..         |-----|        | INDEX_BITS    (5)
-  .. --------|              | TAG_BITS      (45)
+  ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
+  ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
+  ..         |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
+  ..         |----- ---|    | ROW_BITS      ({ROW_BITS})
+  ..         |-----|        | INDEX_BITS    ({INDEX_BITS})
+  .. --------|              | TAG_BITS      ({TAG_BITS})
 """
 print (layout)
 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
@@ -155,9 +160,12 @@ print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 
 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
+print ("    TAG_WIDTH", TAG_WIDTH)
+print ("     NUM_WAYS", NUM_WAYS)
+print ("    NUM_LINES", NUM_LINES)
 
 def CacheTagArray():
-    tag_layout = [('valid', 1),
+    tag_layout = [('valid', NUM_WAYS),
                   ('tag', TAG_RAM_WIDTH),
                  ]
     return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
@@ -191,6 +199,7 @@ assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 
+
 def TLBHit(name):
     return Record([('valid', 1),
                    ('way', TLB_WAY_BITS)], name=name)
@@ -206,8 +215,9 @@ def TLBRecord(name):
                  ]
     return Record(tlb_layout, name=name)
 
-def TLBArray():
-    return Array(TLBRecord(name="tlb%d" % x) for x in range(TLB_SET_SIZE))
+def TLBValidArray():
+    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
+                        for x in range(TLB_SET_SIZE))
 
 def HitWaySet():
     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
@@ -431,59 +441,136 @@ class Reservation(RecordObject):
 
 class DTLBUpdate(Elaboratable):
     def __init__(self):
-        self.dtlb     = TLBArray()
         self.tlbie    = Signal()
         self.tlbwe    = Signal()
         self.doall    = Signal()
-        self.updated  = Signal()
-        self.v_updated  = Signal()
         self.tlb_hit     = TLBHit("tlb_hit")
         self.tlb_req_index = Signal(TLB_SET_BITS)
 
-        self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
-        self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
         self.repl_way        = Signal(TLB_WAY_BITS)
         self.eatag           = Signal(TLB_EA_TAG_BITS)
         self.pte_data        = Signal(TLB_PTE_BITS)
 
-        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
-
-        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
-        self.db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
-        self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        # read from dtlb array
+        self.tlb_read       = Signal()
+        self.tlb_read_index = Signal(TLB_SET_BITS)
+        self.tlb_way        = TLBRecord("o_tlb_way")
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
         sync = m.d.sync
 
-        tagset   = Signal(TLB_TAG_WAY_BITS)
-        pteset   = Signal(TLB_PTE_WAY_BITS)
-
-        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
-        comb += db_out.eq(self.dv)
+        # there are 3 parts to this:
+        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
+        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
+        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these are not
+        # held in a Memory because they must all be clearable at once
+        # (tlbie, doall).  in theory a Memory _could_ be used, by overriding
+        # the Reset Signal of the Memory, but that is not done here.
+
+        dtlb_valid = TLBValidArray()
+        tlb_req_index = self.tlb_req_index
+
+        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
+        print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
+        print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
+        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
+        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
+        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)
+
+        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
+        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
+        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
+        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
+                                    granularity=TLB_EA_TAG_BITS)
+
+        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
+        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
+        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
+                                    granularity=TLB_PTE_BITS)
+
+        # commented out for now, can be put in if Memory.reset can be
+        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
+        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
+        #m.submodules.rd_valid = rd_valid = validm.read_port()
+        #m.submodules.wr_valid = wr_valid = validm.write_port(
+                                    #granularity=1)
+
+        # connect up read and write addresses to Valid/PTE/TAG SRAMs
+        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
+        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
+        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
+        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
+        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
+        #m.d.comb += wr_valid.addr.eq(tlb_req_index)
+
+        updated  = Signal()
+        v_updated  = Signal()
+        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
+        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
+        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
+
+        comb += dv.eq(dtlb_valid[tlb_req_index])
+        comb += db_out.eq(dv)
 
         with m.If(self.tlbie & self.doall):
-            pass # clear all back in parent
+            # clear all valid bits at once
+            # XXX hmmm, validm _could_ use Memory reset here...
+            for i in range(TLB_SET_SIZE):
+                sync += dtlb_valid[i].eq(0)
         with m.Elif(self.tlbie):
+            # invalidate just the hit_way
             with m.If(self.tlb_hit.valid):
                 comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
-                comb += self.v_updated.eq(1)
-
+                comb += v_updated.eq(1)
         with m.Elif(self.tlbwe):
-
-            comb += tagset.eq(self.tlb_tag_way)
-            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
-            comb += tb_out.eq(tagset)
-
-            comb += pteset.eq(self.tlb_pte_way)
-            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
-            comb += pb_out.eq(pteset)
-
+            # write to the requested tag and PTE
+            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
+            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
+            # set valid bit
             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 
-            comb += self.updated.eq(1)
-            comb += self.v_updated.eq(1)
+            comb += updated.eq(1)
+            comb += v_updated.eq(1)
+
+        # above, sometimes valid is requested to be updated but data not
+        # therefore split them out, here.  note the granularity thing matches
+        # with the shift-up of the eatag/pte_data into the correct TLB way.
+        # thus it is not necessary to write the entire lot, just the portion
+        # being altered: hence writing the *old* copy of the row is not needed
+        with m.If(updated): # PTE and TAG to be written
+            comb += wr_pteway.data.eq(pb_out)
+            comb += wr_pteway.en.eq(1<<self.repl_way)
+            comb += wr_tagway.data.eq(tb_out)
+            comb += wr_tagway.en.eq(1<<self.repl_way)
+        with m.If(v_updated): # Valid to be written
+            sync += dtlb_valid[tlb_req_index].eq(db_out)
+            #comb += wr_valid.data.eq(db_out)
+            #comb += wr_valid.en.eq(1<<self.repl_way)
+
+        # select one TLB way, use a register here
+        r_tlb_way        = TLBRecord("r_tlb_way")
+        r_delay = Signal()
+        sync += r_delay.eq(self.tlb_read)
+        with m.If(self.tlb_read):
+            sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
+        with m.If(r_delay):
+            # on one clock delay, output the contents of the read port(s)
+            # comb += self.tlb_way.valid.eq(rd_valid.data)
+            comb += self.tlb_way.tag.eq(rd_tagway.data)
+            comb += self.tlb_way.pte.eq(rd_pteway.data)
+            # and also capture the (delayed) output...
+            #sync += r_tlb_way.valid.eq(rd_valid.data)
+            sync += r_tlb_way.tag.eq(rd_tagway.data)
+            sync += r_tlb_way.pte.eq(rd_pteway.data)
+        with m.Else():
+            # ... so that the register can output it when no read is requested
+            # it's rather overkill but better to be safe than sorry
+            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
+            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
+            #comb += self.tlb_way.eq(r_tlb_way)
 
         return m
 
@@ -492,8 +579,7 @@ class DCachePendingHit(Elaboratable):
 
     def __init__(self, tlb_way,
                       cache_i_validdx, cache_tag_set,
-                    req_addr,
-                    hit_set):
+                    req_addr):
 
         self.go          = Signal()
         self.virt_mode   = Signal()
@@ -508,7 +594,6 @@ class DCachePendingHit(Elaboratable):
         self.cache_i_validdx = cache_i_validdx
         self.cache_tag_set = cache_tag_set
         self.req_addr = req_addr
-        self.hit_set = hit_set
 
     def elaborate(self, platform):
         m = Module()
@@ -523,12 +608,13 @@ class DCachePendingHit(Elaboratable):
         cache_tag_set = self.cache_tag_set
         req_addr = self.req_addr
         tlb_hit = self.tlb_hit
-        hit_set = self.hit_set
         hit_way = self.hit_way
         rel_match = self.rel_match
         req_index = self.req_index
         reload_tag = self.reload_tag
 
+        hit_set     = Array(Signal(name="hit_set_%d" % i) \
+                                  for i in range(TLB_NUM_WAYS))
         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                     for i in range(TLB_NUM_WAYS))
         hit_way_set = HitWaySet()
@@ -542,14 +628,15 @@ class DCachePendingHit(Elaboratable):
         with m.If(virt_mode):
             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
-                s_hit       = Signal()
-                s_pte       = Signal(TLB_PTE_BITS)
-                s_ra        = Signal(REAL_ADDR_BITS)
+                s_hit       = Signal(name="s_hit%d" % j)
+                s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
+                s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
+                # read the PTE, calc the Real Address, get the tag
                 comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                 comb += s_tag.eq(get_tag(s_ra))
-
+                # for each way check the tag against the cache tag set
                 for i in range(NUM_WAYS): # way_t
                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                     comb += is_tag_hit.eq(go & cache_i_validdx[i] &
@@ -559,9 +646,8 @@ class DCachePendingHit(Elaboratable):
                         comb += hit_way_set[j].eq(i)
                         comb += s_hit.eq(1)
                 comb += hit_set[j].eq(s_hit)
-                with m.If(s_tag == reload_tag):
-                    comb += rel_matches[j].eq(1)
-            with m.If(tlb_hit.way):
+                comb += rel_matches[j].eq(s_tag == reload_tag)
+            with m.If(tlb_hit.valid):
                 comb += is_hit.eq(hit_set[tlb_hit.way])
                 comb += hit_way.eq(hit_way_set[tlb_hit.way])
                 comb += rel_match.eq(rel_matches[tlb_hit.way])
@@ -590,7 +676,7 @@ class DCache(Elaboratable):
       at the end of line (this requires dealing with requests coming in
       while not idle...)
     """
-    def __init__(self):
+    def __init__(self, pspec=None):
         self.d_in      = LoadStore1ToDCacheType("d_in")
         self.d_out     = DCacheToLoadStore1Type("d_out")
 
@@ -609,6 +695,10 @@ class DCache(Elaboratable):
 
         self.log_out   = Signal(20)
 
+        # test if microwatt compatibility is to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+
     def stage_0(self, m, r0, r1, r0_full):
         """Latch the request in r0.req as long as we're not stalling
         """
@@ -647,6 +737,7 @@ class DCache(Elaboratable):
             comb += r.doall.eq(0)
             comb += r.tlbld.eq(0)
             comb += r.mmu_req.eq(0)
+
         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
             sync += r0.eq(r)
             sync += r0_full.eq(r.req.valid)
@@ -663,7 +754,7 @@ class DCache(Elaboratable):
                                  r.req.virt_mode, r.req.addr,
                                  r.req.data, r.req.load)
 
-    def tlb_read(self, m, r0_stall, tlb_way, dtlb):
+    def tlb_read(self, m, r0_stall, tlb_way):
         """TLB
         Operates in the second cycle on the request latched in r0.req.
         TLB updates write the entry at the end of the second cycle.
@@ -672,7 +763,6 @@ class DCache(Elaboratable):
         sync = m.d.sync
         m_in, d_in = self.m_in, self.d_in
 
-        index    = Signal(TLB_SET_BITS)
         addrbits = Signal(TLB_SET_BITS)
 
         amin = TLB_LG_PGSZ
@@ -682,12 +772,13 @@ class DCache(Elaboratable):
             comb += addrbits.eq(m_in.addr[amin : amax])
         with m.Else():
             comb += addrbits.eq(d_in.addr[amin : amax])
-        comb += index.eq(addrbits)
 
         # If we have any op and the previous op isn't finished,
         # then keep the same output for next cycle.
-        with m.If(~r0_stall):
-            sync += tlb_way.eq(dtlb[index])
+        d = self.dtlb_update
+        comb += d.tlb_read_index.eq(addrbits)
+        comb += d.tlb_read.eq(~r0_stall)
+        comb += tlb_way.eq(d.tlb_way)
 
     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
         """Generate TLB PLRUs
@@ -698,7 +789,7 @@ class DCache(Elaboratable):
         if TLB_NUM_WAYS == 0:
             return
 
-        # Binary-to-Unary one-hot, enabled by tlb_hit valid
+        # suite of PLRUs with a selection and output mechanism
         tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
         m.submodules.tlb_plrus = tlb_plrus
         comb += tlb_plrus.way.eq(r1.tlb_hit.way)
@@ -767,8 +858,8 @@ class DCache(Elaboratable):
             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 
-    def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
-                    tlb_hit, tlb_plru_victim, tlb_way):
+    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
+                    tlb_hit, tlb_plru_victim):
 
         comb = m.d.comb
         sync = m.d.sync
@@ -781,24 +872,10 @@ class DCache(Elaboratable):
 
         d = self.dtlb_update
 
-        with m.If(tlbie & r0.doall):
-            # clear all valid bits at once
-            for i in range(TLB_SET_SIZE):
-                sync += dtlb[i].valid.eq(0)
-        with m.If(d.updated):
-            sync += dtlb[tlb_req_index].tag.eq(d.tb_out)
-            sync += dtlb[tlb_req_index].pte.eq(d.pb_out)
-        with m.If(d.v_updated):
-            sync += dtlb[tlb_req_index].valid.eq(d.db_out)
-
-        comb += d.dv.eq(dtlb[tlb_req_index].valid)
-
         comb += d.tlbie.eq(tlbie)
         comb += d.tlbwe.eq(tlbwe)
         comb += d.doall.eq(r0.doall)
         comb += d.tlb_hit.eq(tlb_hit)
-        comb += d.tlb_tag_way.eq(tlb_way.tag)
-        comb += d.tlb_pte_way.eq(tlb_way.pte)
         comb += d.tlb_req_index.eq(tlb_req_index)
 
         with m.If(tlb_hit.valid):
@@ -817,6 +894,7 @@ class DCache(Elaboratable):
         if TLB_NUM_WAYS == 0:
             return
 
+        # suite of PLRUs with a selection and output mechanism
         m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
         comb += plrus.way.eq(r1.hit_way)
         comb += plrus.valid.eq(r1.cache_hit)
@@ -860,8 +938,6 @@ class DCache(Elaboratable):
         opsel       = Signal(3)
         go          = Signal()
         nc          = Signal()
-        hit_set     = Array(Signal(name="hit_set_%d" % i) \
-                                  for i in range(TLB_NUM_WAYS))
         cache_i_validdx = Signal(NUM_WAYS)
 
         # Extract line, row and tag from request
@@ -878,8 +954,7 @@ class DCache(Elaboratable):
 
         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                             cache_i_validdx, cache_tag_set,
-                                            r0.req.addr,
-                                            hit_set)
+                                            r0.req.addr)
         comb += dc.tlb_hit.eq(tlb_hit)
         comb += dc.reload_tag.eq(r1.reload_tag)
         comb += dc.virt_mode.eq(r0.req.virt_mode)
@@ -1163,7 +1238,7 @@ class DCache(Elaboratable):
             d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
 
             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            m.submodules["cacheram_%d" % i] = way
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
@@ -1292,10 +1367,10 @@ class DCache(Elaboratable):
         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
 
         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
-            with m.If(~r0.mmu_req):
-                sync += r1.ls_valid.eq(1)
-            with m.Else():
+            with m.If(r0.mmu_req):
                 sync += r1.mmu_done.eq(1)
+            with m.Else():
+                sync += r1.ls_valid.eq(1)
 
         with m.If(r1.write_tag):
             # Store new tag in selected way
@@ -1408,10 +1483,10 @@ class DCache(Elaboratable):
                             sync += r1.full.eq(0)
                             sync += r1.slow_valid.eq(1)
 
-                            with m.If(~req.mmu_req):
-                                sync += r1.ls_valid.eq(1)
-                            with m.Else():
+                            with m.If(req.mmu_req):
                                 sync += r1.mmu_done.eq(1)
+                            with m.Else():
+                                sync += r1.ls_valid.eq(1)
 
                             with m.If(req.op == Op.OP_STORE_HIT):
                                 sync += r1.write_bram.eq(1)
@@ -1476,10 +1551,10 @@ class DCache(Elaboratable):
                                 (r1.store_row == get_row(req.real_addr))):
                         sync += r1.full.eq(0)
                         sync += r1.slow_valid.eq(1)
-                        with m.If(~r1.mmu_req):
-                            sync += r1.ls_valid.eq(1)
-                        with m.Else():
+                        with m.If(r1.mmu_req):
                             sync += r1.mmu_done.eq(1)
+                        with m.Else():
+                            sync += r1.ls_valid.eq(1)
                         sync += r1.forward_sel.eq(~0) # all 1s
                         sync += r1.use_forward1.eq(1)
 
@@ -1505,19 +1580,17 @@ class DCache(Elaboratable):
 
             with m.Case(State.STORE_WAIT_ACK):
                 st_stbs_done = Signal()
-                acks        = Signal(3)
                 adjust_acks = Signal(3)
 
                 comb += st_stbs_done.eq(~r1.wb.stb)
-                comb += acks.eq(r1.acks_pending)
 
                 with m.If(r1.inc_acks != r1.dec_acks):
                     with m.If(r1.inc_acks):
-                        comb += adjust_acks.eq(acks + 1)
+                        comb += adjust_acks.eq(r1.acks_pending + 1)
                     with m.Else():
-                        comb += adjust_acks.eq(acks - 1)
+                        comb += adjust_acks.eq(r1.acks_pending - 1)
                 with m.Else():
-                    comb += adjust_acks.eq(acks)
+                    comb += adjust_acks.eq(r1.acks_pending)
 
                 sync += r1.acks_pending.eq(adjust_acks)
 
@@ -1551,6 +1624,8 @@ class DCache(Elaboratable):
                         comb += st_stbs_done.eq(1)
 
                 # Got ack ? See if complete.
+                sync += Display("got ack %d %d stbs %d adjust_acks %d",
+                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
                 with m.If(bus.ack):
                     with m.If(st_stbs_done & (adjust_acks == 1)):
                         sync += r1.state.eq(State.IDLE)
@@ -1569,10 +1644,10 @@ class DCache(Elaboratable):
                     sync += r1.full.eq(0)
                     sync += r1.slow_valid.eq(1)
 
-                    with m.If(~r1.mmu_req):
-                        sync += r1.ls_valid.eq(1)
-                    with m.Else():
+                    with m.If(r1.mmu_req):
                         sync += r1.mmu_done.eq(1)
+                    with m.Else():
+                        sync += r1.ls_valid.eq(1)
 
                     sync += r1.forward_sel.eq(~0) # all 1s
                     sync += r1.use_forward1.eq(1)
@@ -1673,7 +1748,8 @@ class DCache(Elaboratable):
         # deal with litex not doing wishbone pipeline mode
         # XXX in wrong way.  FIFOs are needed in the SRAM test
         # so that stb/ack match up. same thing done in icache.py
-        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+        if not self.microwatt_compat:
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
 
         # Wire up wishbone request latch out of stage 1
         comb += self.bus.we.eq(r1.wb.we)
@@ -1685,18 +1761,16 @@ class DCache(Elaboratable):
 
         # create submodule TLBUpdate
         m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
-        dtlb = self.dtlb_update.dtlb
 
         # call sub-functions putting everything together, using shared
         # signals established above
         self.stage_0(m, r0, r1, r0_full)
-        self.tlb_read(m, r0_stall, tlb_way, dtlb)
+        self.tlb_read(m, r0_stall, tlb_way)
         self.tlb_search(m, tlb_req_index, r0, r0_valid,
                         tlb_way,
                         pte, tlb_hit, valid_ra, perm_attr, ra)
-        self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
-                        tlb_hit, tlb_plru_victim,
-                        tlb_way)
+        self.tlb_update(m, r0_valid, r0, tlb_req_index,
+                        tlb_hit, tlb_plru_victim)
         self.maybe_plrus(m, r1, plru_victim)
         self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)