double the number of lines in the L1 D/I-Cache to match microwatt
[soc.git] / src / soc / experiment / dcache.py
index 7cd75a79257413d99ab2161b3328f2408abd1452..a828e3c3c2137b05e3197ef54e2c69bc2a80496e 100644 (file)
@@ -2,16 +2,40 @@
 
 based on Anton Blanchard microwatt dcache.vhdl
 
 
 based on Anton Blanchard microwatt dcache.vhdl
 
+note that the microwatt dcache wishbone interface expects "stall".
+for simplicity at the moment this is hard-coded to cyc & ~ack.
+see WB4 spec, p84, section 5.2.1
+
+IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
+is raised.  sigh
+
+Links:
+
+* https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
+* https://bugs.libre-soc.org/show_bug.cgi?id=469
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
+
 """
 
 """
 
+import sys
+
+from nmutil.gtkw import write_gtkw
+
+sys.setrecursionlimit(1000000)
+
 from enum import Enum, unique
 
 from enum import Enum, unique
 
-from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
+from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
+                    Record, Memory)
 from nmutil.util import Display
 from nmutil.util import Display
+from nmigen.lib.coding import Decoder
 
 from copy import deepcopy
 from random import randint, seed
 
 
 from copy import deepcopy
 from random import randint, seed
 
+from nmigen_soc.wishbone.bus import Interface
+
 from nmigen.cli import main
 from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
 from nmigen.cli import main
 from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
@@ -27,8 +51,8 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBIOMasterOut, WBIOSlaveOut)
 
 from soc.experiment.cache_ram import CacheRam
                                 WBIOMasterOut, WBIOSlaveOut)
 
 from soc.experiment.cache_ram import CacheRam
-#from soc.experiment.plru import PLRU
-from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
+#from nmutil.plru import PLRU, PLRUs
 
 # for test
 from soc.bus.sram import SRAM
 
 # for test
 from soc.bus.sram import SRAM
@@ -44,7 +68,7 @@ from nmutil.util import wrap
 
 # TODO: make these parameters of DCache at some point
 LINE_SIZE = 64    # Line size in bytes
 
 # TODO: make these parameters of DCache at some point
 LINE_SIZE = 64    # Line size in bytes
-NUM_LINES = 16    # Number of lines in a set
+NUM_LINES = 32    # Number of lines in a set
 NUM_WAYS = 4      # Number of ways
 TLB_SET_SIZE = 64 # L1 DTLB entries per set
 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
 NUM_WAYS = 4      # Number of ways
 TLB_SET_SIZE = 64 # L1 DTLB entries per set
 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
@@ -54,7 +78,7 @@ LOG_LENGTH = 0    # Non-zero to enable log data collection
 # BRAM organisation: We never access more than
 #     -- WB_DATA_BITS at a time so to save
 #     -- resources we make the array only that wide, and
 # BRAM organisation: We never access more than
 #     -- WB_DATA_BITS at a time so to save
 #     -- resources we make the array only that wide, and
-#     -- use consecutive indices for to make a cache "line"
+#     -- use consecutive indices to make a cache "line"
 #     --
 #     -- ROW_SIZE is the width in bytes of the BRAM
 #     -- (based on WB, so 64-bits)
 #     --
 #     -- ROW_SIZE is the width in bytes of the BRAM
 #     -- (based on WB, so 64-bits)
@@ -112,15 +136,18 @@ TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 WAY_BITS = log2_int(NUM_WAYS)
 
 # Example of layout for 32 lines of 64 bytes:
 WAY_BITS = log2_int(NUM_WAYS)
 
 # Example of layout for 32 lines of 64 bytes:
-layout = """\
+layout = f"""\
+  DCache Layout:
+ |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
+  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
   ..  tag    |index|  line  |
   ..         |   row   |    |
   ..  tag    |index|  line  |
   ..         |   row   |    |
-  ..         |     |---|    | ROW_LINE_BITS  (3)
-  ..         |     |--- - --| LINE_OFF_BITS (6)
-  ..         |         |- --| ROW_OFF_BITS  (3)
-  ..         |----- ---|    | ROW_BITS      (8)
-  ..         |-----|        | INDEX_BITS    (5)
-  .. --------|              | TAG_BITS      (45)
+  ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
+  ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
+  ..         |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
+  ..         |----- ---|    | ROW_BITS      ({ROW_BITS})
+  ..         |-----|        | INDEX_BITS    ({INDEX_BITS})
+  .. --------|              | TAG_BITS      ({TAG_BITS})
 """
 print (layout)
 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 """
 print (layout)
 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
@@ -133,14 +160,15 @@ print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 
 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 
 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
+print ("    TAG_WIDTH", TAG_WIDTH)
+print ("     NUM_WAYS", NUM_WAYS)
+print ("    NUM_LINES", NUM_LINES)
 
 def CacheTagArray():
 
 def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
-                        for x in range(NUM_LINES))
-
-def CacheValidBitsArray():
-    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
-                        for x in range(NUM_LINES))
+    tag_layout = [('valid', NUM_WAYS),
+                  ('tag', TAG_RAM_WIDTH),
+                 ]
+    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
 
 def RowPerLineValidArray():
     return Array(Signal(name="rows_valid%d" % x) \
 
 def RowPerLineValidArray():
     return Array(Signal(name="rows_valid%d" % x) \
@@ -172,21 +200,24 @@ assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 
 
 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 
 
-def TLBValidBitsArray():
-    return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
-                for x in range(TLB_SET_SIZE))
+def TLBHit(name):
+    return Record([('valid', 1),
+                   ('way', TLB_WAY_BITS)], name=name)
 
 def TLBTagEAArray():
     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))
 
 
 def TLBTagEAArray():
     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))
 
-def TLBTagsArray():
-    return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
-                for x in range (TLB_SET_SIZE))
+def TLBRecord(name):
+    tlb_layout = [('valid', TLB_NUM_WAYS),
+                  ('tag', TLB_TAG_WAY_BITS),
+                  ('pte', TLB_PTE_WAY_BITS)
+                 ]
+    return Record(tlb_layout, name=name)
 
 
-def TLBPtesArray():
-    return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
-                for x in range(TLB_SET_SIZE))
+def TLBValidArray():
+    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
+                        for x in range(TLB_SET_SIZE))
 
 def HitWaySet():
     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 
 def HitWaySet():
     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
@@ -321,10 +352,11 @@ class RegStage0(RecordObject):
     def __init__(self, name=None):
         super().__init__(name=name)
         self.req     = LoadStore1ToDCacheType(name="lsmem")
     def __init__(self, name=None):
         super().__init__(name=name)
         self.req     = LoadStore1ToDCacheType(name="lsmem")
-        self.tlbie   = Signal()
-        self.doall   = Signal()
-        self.tlbld   = Signal()
+        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
+        self.doall   = Signal() # with tlbie, indicates flush whole TLB
+        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
         self.mmu_req = Signal() # indicates source of request
         self.mmu_req = Signal() # indicates source of request
+        self.d_valid = Signal() # indicates req.data is valid now
 
 
 class MemAccessRequest(RecordObject):
 
 
 class MemAccessRequest(RecordObject):
@@ -358,9 +390,8 @@ class RegStage1(RecordObject):
         self.cache_hit        = Signal()
 
         # TLB hit state
         self.cache_hit        = Signal()
 
         # TLB hit state
-        self.tlb_hit          = Signal()
-        self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
-        self.tlb_hit_index    = Signal(TLB_WAY_BITS)
+        self.tlb_hit          = TLBHit("tlb_hit")
+        self.tlb_hit_index    = Signal(TLB_SET_BITS)
 
         # 2-stage data buffer for data forwarded from writes to reads
         self.forward_data1    = Signal(64)
 
         # 2-stage data buffer for data forwarded from writes to reads
         self.forward_data1    = Signal(64)
@@ -378,7 +409,6 @@ class RegStage1(RecordObject):
         self.write_bram       = Signal()
         self.write_tag        = Signal()
         self.slow_valid       = Signal()
         self.write_bram       = Signal()
         self.write_tag        = Signal()
         self.slow_valid       = Signal()
-        self.real_adr         = Signal(REAL_ADDR_BITS)
         self.wb               = WBMasterOut("wb")
         self.reload_tag       = Signal(TAG_BITS)
         self.store_way        = Signal(WAY_BITS)
         self.wb               = WBMasterOut("wb")
         self.reload_tag       = Signal(TAG_BITS)
         self.store_way        = Signal(WAY_BITS)
@@ -414,83 +444,156 @@ class DTLBUpdate(Elaboratable):
         self.tlbie    = Signal()
         self.tlbwe    = Signal()
         self.doall    = Signal()
         self.tlbie    = Signal()
         self.tlbwe    = Signal()
         self.doall    = Signal()
-        self.updated  = Signal()
-        self.v_updated  = Signal()
-        self.tlb_hit    = Signal()
+        self.tlb_hit     = TLBHit("tlb_hit")
         self.tlb_req_index = Signal(TLB_SET_BITS)
 
         self.tlb_req_index = Signal(TLB_SET_BITS)
 
-        self.tlb_hit_way     = Signal(TLB_WAY_BITS)
-        self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
-        self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
         self.repl_way        = Signal(TLB_WAY_BITS)
         self.eatag           = Signal(TLB_EA_TAG_BITS)
         self.pte_data        = Signal(TLB_PTE_BITS)
 
         self.repl_way        = Signal(TLB_WAY_BITS)
         self.eatag           = Signal(TLB_EA_TAG_BITS)
         self.pte_data        = Signal(TLB_PTE_BITS)
 
-        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
-
-        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
-        self.pb_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
-        self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        # read from dtlb array
+        self.tlb_read       = Signal()
+        self.tlb_read_index = Signal(TLB_SET_BITS)
+        self.tlb_way        = TLBRecord("o_tlb_way")
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
         sync = m.d.sync
 
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
         sync = m.d.sync
 
-        tagset   = Signal(TLB_TAG_WAY_BITS)
-        pteset   = Signal(TLB_PTE_WAY_BITS)
-
-        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
-        comb += db_out.eq(self.dv)
+        # there are 3 parts to this:
+        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
+        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
+        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
+        # be a Memory because they can all be cleared (tlbie, doall), i mean,
+        # we _could_, in theory, by overriding the Reset Signal of the Memory,
+        # hmmm....
+
+        dtlb_valid = TLBValidArray()
+        tlb_req_index = self.tlb_req_index
+
+        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
+        print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
+        print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
+        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
+        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
+        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)
+
+        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
+        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
+        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
+        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
+                                    granularity=TLB_EA_TAG_BITS)
+
+        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
+        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
+        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
+                                    granularity=TLB_PTE_BITS)
+
+        # commented out for now, can be put in if Memory.reset can be
+        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
+        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
+        #m.submodules.rd_valid = rd_valid = validm.read_port()
+        #m.submodules.wr_valid = wr_valid = validm.write_port(
+                                    #granularity=1)
+
+        # connect up read and write addresses to Valid/PTE/TAG SRAMs
+        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
+        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
+        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
+        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
+        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
+        #m.d.comb += wr_valid.addr.eq(tlb_req_index)
+
+        updated  = Signal()
+        v_updated  = Signal()
+        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
+        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
+        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
+
+        comb += dv.eq(dtlb_valid[tlb_req_index])
+        comb += db_out.eq(dv)
 
         with m.If(self.tlbie & self.doall):
 
         with m.If(self.tlbie & self.doall):
-            pass # clear all back in parent
+            # clear all valid bits at once
+            # XXX hmmm, validm _could_ use Memory reset here...
+            for i in range(TLB_SET_SIZE):
+                sync += dtlb_valid[i].eq(0)
         with m.Elif(self.tlbie):
         with m.Elif(self.tlbie):
-            with m.If(self.tlb_hit):
-                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
-                comb += self.v_updated.eq(1)
-
+            # invalidate just the hit_way
+            with m.If(self.tlb_hit.valid):
+                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
+                comb += v_updated.eq(1)
         with m.Elif(self.tlbwe):
         with m.Elif(self.tlbwe):
-
-            comb += tagset.eq(self.tlb_tag_way)
-            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
-            comb += tb_out.eq(tagset)
-
-            comb += pteset.eq(self.tlb_pte_way)
-            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
-            comb += pb_out.eq(pteset)
-
+            # write to the requested tag and PTE
+            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
+            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
+            # set valid bit
             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 
             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 
-            comb += self.updated.eq(1)
-            comb += self.v_updated.eq(1)
+            comb += updated.eq(1)
+            comb += v_updated.eq(1)
+
+        # above, sometimes valid is requested to be updated but data not
+        # therefore split them out, here.  note the granularity thing matches
+        # with the shift-up of the eatag/pte_data into the correct TLB way.
+        # thus is it not necessary to write the entire lot, just the portion
+        # being altered: hence writing the *old* copy of the row is not needed
+        with m.If(updated): # PTE and TAG to be written
+            comb += wr_pteway.data.eq(pb_out)
+            comb += wr_pteway.en.eq(1<<self.repl_way)
+            comb += wr_tagway.data.eq(tb_out)
+            comb += wr_tagway.en.eq(1<<self.repl_way)
+        with m.If(v_updated): # Valid to be written
+            sync += dtlb_valid[tlb_req_index].eq(db_out)
+            #comb += wr_valid.data.eq(db_out)
+            #comb += wr_valid.en.eq(1<<self.repl_way)
+
+        # select one TLB way, use a register here
+        r_tlb_way        = TLBRecord("r_tlb_way")
+        r_delay = Signal()
+        sync += r_delay.eq(self.tlb_read)
+        with m.If(self.tlb_read):
+            sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
+        with m.If(r_delay):
+            # on one clock delay, output the contents of the read port(s)
+            # comb += self.tlb_way.valid.eq(rd_valid.data)
+            comb += self.tlb_way.tag.eq(rd_tagway.data)
+            comb += self.tlb_way.pte.eq(rd_pteway.data)
+            # and also capture the (delayed) output...
+            #sync += r_tlb_way.valid.eq(rd_valid.data)
+            sync += r_tlb_way.tag.eq(rd_tagway.data)
+            sync += r_tlb_way.pte.eq(rd_pteway.data)
+        with m.Else():
+            # ... so that the register can output it when no read is requested
+            # it's rather overkill but better to be safe than sorry
+            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
+            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
+            #comb += self.tlb_way.eq(r_tlb_way)
 
         return m
 
 
 class DCachePendingHit(Elaboratable):
 
 
         return m
 
 
 class DCachePendingHit(Elaboratable):
 
-    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
-                      cache_valid_idx, cache_tag_set,
-                    req_addr,
-                    hit_set):
+    def __init__(self, tlb_way,
+                      cache_i_validdx, cache_tag_set,
+                    req_addr):
 
         self.go          = Signal()
         self.virt_mode   = Signal()
         self.is_hit      = Signal()
 
         self.go          = Signal()
         self.virt_mode   = Signal()
         self.is_hit      = Signal()
-        self.tlb_hit     = Signal()
+        self.tlb_hit      = TLBHit("tlb_hit")
         self.hit_way     = Signal(WAY_BITS)
         self.rel_match   = Signal()
         self.req_index   = Signal(INDEX_BITS)
         self.reload_tag  = Signal(TAG_BITS)
 
         self.hit_way     = Signal(WAY_BITS)
         self.rel_match   = Signal()
         self.req_index   = Signal(INDEX_BITS)
         self.reload_tag  = Signal(TAG_BITS)
 
-        self.tlb_hit_way = tlb_hit_way
-        self.tlb_pte_way = tlb_pte_way
-        self.tlb_valid_way = tlb_valid_way
-        self.cache_valid_idx = cache_valid_idx
+        self.tlb_way = tlb_way
+        self.cache_i_validdx = cache_i_validdx
         self.cache_tag_set = cache_tag_set
         self.req_addr = req_addr
         self.cache_tag_set = cache_tag_set
         self.req_addr = req_addr
-        self.hit_set = hit_set
 
     def elaborate(self, platform):
         m = Module()
 
     def elaborate(self, platform):
         m = Module()
@@ -500,19 +603,18 @@ class DCachePendingHit(Elaboratable):
         go = self.go
         virt_mode = self.virt_mode
         is_hit = self.is_hit
         go = self.go
         virt_mode = self.virt_mode
         is_hit = self.is_hit
-        tlb_pte_way = self.tlb_pte_way
-        tlb_valid_way = self.tlb_valid_way
-        cache_valid_idx = self.cache_valid_idx
+        tlb_way = self.tlb_way
+        cache_i_validdx = self.cache_i_validdx
         cache_tag_set = self.cache_tag_set
         req_addr = self.req_addr
         cache_tag_set = self.cache_tag_set
         req_addr = self.req_addr
-        tlb_hit_way = self.tlb_hit_way
         tlb_hit = self.tlb_hit
         tlb_hit = self.tlb_hit
-        hit_set = self.hit_set
         hit_way = self.hit_way
         rel_match = self.rel_match
         req_index = self.req_index
         reload_tag = self.reload_tag
 
         hit_way = self.hit_way
         rel_match = self.rel_match
         req_index = self.req_index
         reload_tag = self.reload_tag
 
+        hit_set     = Array(Signal(name="hit_set_%d" % i) \
+                                  for i in range(TLB_NUM_WAYS))
         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                     for i in range(TLB_NUM_WAYS))
         hit_way_set = HitWaySet()
         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                     for i in range(TLB_NUM_WAYS))
         hit_way_set = HitWaySet()
@@ -526,35 +628,35 @@ class DCachePendingHit(Elaboratable):
         with m.If(virt_mode):
             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
         with m.If(virt_mode):
             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
-                s_hit       = Signal()
-                s_pte       = Signal(TLB_PTE_BITS)
-                s_ra        = Signal(REAL_ADDR_BITS)
-                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
+                s_hit       = Signal(name="s_hit%d" % j)
+                s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
+                s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
+                # read the PTE, calc the Real Address, get tge tag
+                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                 comb += s_tag.eq(get_tag(s_ra))
                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                 comb += s_tag.eq(get_tag(s_ra))
-
+                # for each way check tge tag against the cache tag set
                 for i in range(NUM_WAYS): # way_t
                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                 for i in range(NUM_WAYS): # way_t
                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
-                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
+                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                   (read_tag(i, cache_tag_set) == s_tag)
                                   (read_tag(i, cache_tag_set) == s_tag)
-                                  & tlb_valid_way[j])
+                                  & (tlb_way.valid[j]))
                     with m.If(is_tag_hit):
                         comb += hit_way_set[j].eq(i)
                         comb += s_hit.eq(1)
                 comb += hit_set[j].eq(s_hit)
                     with m.If(is_tag_hit):
                         comb += hit_way_set[j].eq(i)
                         comb += s_hit.eq(1)
                 comb += hit_set[j].eq(s_hit)
-                with m.If(s_tag == reload_tag):
-                    comb += rel_matches[j].eq(1)
-            with m.If(tlb_hit):
-                comb += is_hit.eq(hit_set[tlb_hit_way])
-                comb += hit_way.eq(hit_way_set[tlb_hit_way])
-                comb += rel_match.eq(rel_matches[tlb_hit_way])
+                comb += rel_matches[j].eq(s_tag == reload_tag)
+            with m.If(tlb_hit.valid):
+                comb += is_hit.eq(hit_set[tlb_hit.way])
+                comb += hit_way.eq(hit_way_set[tlb_hit.way])
+                comb += rel_match.eq(rel_matches[tlb_hit.way])
         with m.Else():
             s_tag       = Signal(TAG_BITS)
             comb += s_tag.eq(get_tag(req_addr))
             for i in range(NUM_WAYS): # way_t
                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
         with m.Else():
             s_tag       = Signal(TAG_BITS)
             comb += s_tag.eq(get_tag(req_addr))
             for i in range(NUM_WAYS): # way_t
                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
-                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
+                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                           (read_tag(i, cache_tag_set) == s_tag))
                 with m.If(is_tag_hit):
                     comb += hit_way.eq(i)
                           (read_tag(i, cache_tag_set) == s_tag))
                 with m.If(is_tag_hit):
                     comb += hit_way.eq(i)
@@ -567,13 +669,14 @@ class DCachePendingHit(Elaboratable):
 
 class DCache(Elaboratable):
     """Set associative dcache write-through
 
 class DCache(Elaboratable):
     """Set associative dcache write-through
+
     TODO (in no specific order):
     * See list in icache.vhdl
     * Complete load misses on the cycle when WB data comes instead of
       at the end of line (this requires dealing with requests coming in
       while not idle...)
     """
     TODO (in no specific order):
     * See list in icache.vhdl
     * Complete load misses on the cycle when WB data comes instead of
       at the end of line (this requires dealing with requests coming in
       while not idle...)
     """
-    def __init__(self):
+    def __init__(self, pspec=None):
         self.d_in      = LoadStore1ToDCacheType("d_in")
         self.d_out     = DCacheToLoadStore1Type("d_out")
 
         self.d_in      = LoadStore1ToDCacheType("d_in")
         self.d_out     = DCacheToLoadStore1Type("d_out")
 
@@ -582,11 +685,20 @@ class DCache(Elaboratable):
 
         self.stall_out = Signal()
 
 
         self.stall_out = Signal()
 
-        self.wb_out    = WBMasterOut()
-        self.wb_in     = WBSlaveOut()
+        # standard naming (wired to non-standard for compatibility)
+        self.bus = Interface(addr_width=32,
+                            data_width=64,
+                            granularity=8,
+                            features={'stall'},
+                            alignment=0,
+                            name="dcache")
 
         self.log_out   = Signal(20)
 
 
         self.log_out   = Signal(20)
 
+        # test if microwatt compatibility is to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+
     def stage_0(self, m, r0, r1, r0_full):
         """Latch the request in r0.req as long as we're not stalling
         """
     def stage_0(self, m, r0, r1, r0_full):
         """Latch the request in r0.req as long as we're not stalling
         """
@@ -602,7 +714,7 @@ class DCache(Elaboratable):
 
         with m.If(m_in.valid):
             comb += r.req.valid.eq(1)
 
         with m.If(m_in.valid):
             comb += r.req.valid.eq(1)
-            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
+            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
             comb += r.req.dcbz.eq(0)
             comb += r.req.nc.eq(0)
             comb += r.req.reserve.eq(0)
             comb += r.req.dcbz.eq(0)
             comb += r.req.nc.eq(0)
             comb += r.req.reserve.eq(0)
@@ -615,19 +727,34 @@ class DCache(Elaboratable):
             comb += r.doall.eq(m_in.doall)
             comb += r.tlbld.eq(m_in.tlbld)
             comb += r.mmu_req.eq(1)
             comb += r.doall.eq(m_in.doall)
             comb += r.tlbld.eq(m_in.tlbld)
             comb += r.mmu_req.eq(1)
+            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
+                                 m_in.addr, m_in.pte, r.req.load)
+
         with m.Else():
             comb += r.req.eq(d_in)
         with m.Else():
             comb += r.req.eq(d_in)
+            comb += r.req.data.eq(0)
             comb += r.tlbie.eq(0)
             comb += r.doall.eq(0)
             comb += r.tlbld.eq(0)
             comb += r.mmu_req.eq(0)
             comb += r.tlbie.eq(0)
             comb += r.doall.eq(0)
             comb += r.tlbld.eq(0)
             comb += r.mmu_req.eq(0)
-        with m.If(~(r1.full & r0_full)):
+
+        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
             sync += r0.eq(r)
             sync += r0_full.eq(r.req.valid)
             sync += r0.eq(r)
             sync += r0_full.eq(r.req.valid)
-
-    def tlb_read(self, m, r0_stall, tlb_valid_way,
-                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
-                 dtlb_tags, dtlb_ptes):
+            # Sample data the cycle after a request comes in from loadstore1.
+            # If another request has come in already then the data will get
+            # put directly into req.data below.
+            with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
+                     ~r0.mmu_req):
+                sync += r0.req.data.eq(d_in.data)
+                sync += r0.d_valid.eq(1)
+        with m.If(d_in.valid):
+            m.d.sync += Display("    DCACHE req cache "
+                                "virt %d addr %x data %x ld %d",
+                                 r.req.virt_mode, r.req.addr,
+                                 r.req.data, r.req.load)
+
+    def tlb_read(self, m, r0_stall, tlb_way):
         """TLB
         Operates in the second cycle on the request latched in r0.req.
         TLB updates write the entry at the end of the second cycle.
         """TLB
         Operates in the second cycle on the request latched in r0.req.
         TLB updates write the entry at the end of the second cycle.
@@ -636,7 +763,6 @@ class DCache(Elaboratable):
         sync = m.d.sync
         m_in, d_in = self.m_in, self.d_in
 
         sync = m.d.sync
         m_in, d_in = self.m_in, self.d_in
 
-        index    = Signal(TLB_SET_BITS)
         addrbits = Signal(TLB_SET_BITS)
 
         amin = TLB_LG_PGSZ
         addrbits = Signal(TLB_SET_BITS)
 
         amin = TLB_LG_PGSZ
@@ -646,16 +772,15 @@ class DCache(Elaboratable):
             comb += addrbits.eq(m_in.addr[amin : amax])
         with m.Else():
             comb += addrbits.eq(d_in.addr[amin : amax])
             comb += addrbits.eq(m_in.addr[amin : amax])
         with m.Else():
             comb += addrbits.eq(d_in.addr[amin : amax])
-        comb += index.eq(addrbits)
 
         # If we have any op and the previous op isn't finished,
         # then keep the same output for next cycle.
 
         # If we have any op and the previous op isn't finished,
         # then keep the same output for next cycle.
-        with m.If(~r0_stall):
-            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
-            sync += tlb_tag_way.eq(dtlb_tags[index])
-            sync += tlb_pte_way.eq(dtlb_ptes[index])
+        d = self.dtlb_update
+        comb += d.tlb_read_index.eq(addrbits)
+        comb += d.tlb_read.eq(~r0_stall)
+        comb += tlb_way.eq(d.tlb_way)
 
 
-    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
+    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
         """Generate TLB PLRUs
         """
         comb = m.d.comb
         """Generate TLB PLRUs
         """
         comb = m.d.comb
@@ -663,20 +788,19 @@ class DCache(Elaboratable):
 
         if TLB_NUM_WAYS == 0:
             return
 
         if TLB_NUM_WAYS == 0:
             return
-        for i in range(TLB_SET_SIZE):
-            # TLB PLRU interface
-            tlb_plru        = PLRU(TLB_WAY_BITS)
-            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
-            tlb_plru_acc_en = Signal()
 
 
-            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
-            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
-            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
-            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
+        # suite of PLRUs with a selection and output mechanism
+        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
+        m.submodules.tlb_plrus = tlb_plrus
+        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
+        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
+        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
+        comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
+        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 
     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 
     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
-                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
-                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
+                   tlb_way,
+                   pte, tlb_hit, valid_ra, perm_attr, ra):
 
         comb = m.d.comb
 
 
         comb = m.d.comb
 
@@ -689,19 +813,20 @@ class DCache(Elaboratable):
         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 
         for i in range(TLB_NUM_WAYS):
         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 
         for i in range(TLB_NUM_WAYS):
-            is_tag_hit = Signal()
-            comb += is_tag_hit.eq(tlb_valid_way[i]
-                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
+            is_tag_hit = Signal(name="is_tag_hit%d" % i)
+            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
+            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
+            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
             with m.If(is_tag_hit):
                 comb += hitway.eq(i)
                 comb += hit.eq(1)
 
             with m.If(is_tag_hit):
                 comb += hitway.eq(i)
                 comb += hit.eq(1)
 
-        comb += tlb_hit.eq(hit & r0_valid)
-        comb += tlb_hit_way.eq(hitway)
+        comb += tlb_hit.valid.eq(hit & r0_valid)
+        comb += tlb_hit.way.eq(hitway)
 
 
-        with m.If(tlb_hit):
-            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
-        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
+        with m.If(tlb_hit.valid):
+            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
+        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 
         with m.If(r0.req.virt_mode):
             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 
         with m.If(r0.req.virt_mode):
             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
@@ -723,11 +848,18 @@ class DCache(Elaboratable):
             comb += perm_attr.rd_perm.eq(1)
             comb += perm_attr.wr_perm.eq(1)
 
             comb += perm_attr.rd_perm.eq(1)
             comb += perm_attr.wr_perm.eq(1)
 
-    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
-                    tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
-                    dtlb_tags, tlb_pte_way, dtlb_ptes):
+        with m.If(valid_ra):
+            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
+                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
+            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
+            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
+            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
+            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
+            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
+            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 
 
-        dtlb_valids = TLBValidBitsArray()
+    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
+                    tlb_hit, tlb_plru_victim):
 
         comb = m.d.comb
         sync = m.d.sync
 
         comb = m.d.comb
         sync = m.d.sync
@@ -738,32 +870,18 @@ class DCache(Elaboratable):
         comb += tlbie.eq(r0_valid & r0.tlbie)
         comb += tlbwe.eq(r0_valid & r0.tlbld)
 
         comb += tlbie.eq(r0_valid & r0.tlbie)
         comb += tlbwe.eq(r0_valid & r0.tlbld)
 
-        m.submodules.tlb_update = d = DTLBUpdate()
-        with m.If(tlbie & r0.doall):
-            # clear all valid bits at once
-            for i in range(TLB_SET_SIZE):
-                sync += dtlb_valid_bits[i].eq(0)
-        with m.If(d.updated):
-            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
-            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
-        with m.If(d.v_updated):
-            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
-
-        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
+        d = self.dtlb_update
 
         comb += d.tlbie.eq(tlbie)
         comb += d.tlbwe.eq(tlbwe)
         comb += d.doall.eq(r0.doall)
         comb += d.tlb_hit.eq(tlb_hit)
 
         comb += d.tlbie.eq(tlbie)
         comb += d.tlbwe.eq(tlbwe)
         comb += d.doall.eq(r0.doall)
         comb += d.tlb_hit.eq(tlb_hit)
-        comb += d.tlb_hit_way.eq(tlb_hit_way)
-        comb += d.tlb_tag_way.eq(tlb_tag_way)
-        comb += d.tlb_pte_way.eq(tlb_pte_way)
         comb += d.tlb_req_index.eq(tlb_req_index)
 
         comb += d.tlb_req_index.eq(tlb_req_index)
 
-        with m.If(tlb_hit):
-            comb += d.repl_way.eq(tlb_hit_way)
+        with m.If(tlb_hit.valid):
+            comb += d.repl_way.eq(tlb_hit.way)
         with m.Else():
         with m.Else():
-            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
+            comb += d.repl_way.eq(tlb_plru_victim)
         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
         comb += d.pte_data.eq(r0.req.data)
 
         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
         comb += d.pte_data.eq(r0.req.data)
 
@@ -776,16 +894,13 @@ class DCache(Elaboratable):
         if TLB_NUM_WAYS == 0:
             return
 
         if TLB_NUM_WAYS == 0:
             return
 
-        for i in range(NUM_LINES):
-            # PLRU interface
-            plru        = PLRU(WAY_BITS)
-            setattr(m.submodules, "plru%d" % i, plru)
-            plru_acc_en = Signal()
-
-            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
-            comb += plru.acc_en.eq(plru_acc_en)
-            comb += plru.acc_i.eq(r1.hit_way)
-            comb += plru_victim[i].eq(plru.lru_o)
+        # suite of PLRUs with a selection and output mechanism
+        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
+        comb += plrus.way.eq(r1.hit_way)
+        comb += plrus.valid.eq(r1.cache_hit)
+        comb += plrus.index.eq(r1.hit_index)
+        comb += plrus.isel.eq(r1.store_index) # select victim
+        comb += plru_victim.eq(plrus.o_index) # selected victim
 
     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
         """Cache tag RAM read port
 
     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
         """Cache tag RAM read port
@@ -802,21 +917,19 @@ class DCache(Elaboratable):
             comb += index.eq(get_index(m_in.addr))
         with m.Else():
             comb += index.eq(get_index(d_in.addr))
             comb += index.eq(get_index(m_in.addr))
         with m.Else():
             comb += index.eq(get_index(d_in.addr))
-        sync += cache_tag_set.eq(cache_tags[index])
+        sync += cache_tag_set.eq(cache_tags[index].tag)
 
     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 
     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
-                       r0_valid, r1, cache_valids, replace_way,
+                       r0_valid, r1, cache_tags, replace_way,
                        use_forward1_next, use_forward2_next,
                        req_hit_way, plru_victim, rc_ok, perm_attr,
                        valid_ra, perm_ok, access_ok, req_op, req_go,
                        use_forward1_next, use_forward2_next,
                        req_hit_way, plru_victim, rc_ok, perm_attr,
                        valid_ra, perm_ok, access_ok, req_op, req_go,
-                       tlb_pte_way,
-                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+                       tlb_hit, tlb_way, cache_tag_set,
                        cancel_store, req_same_tag, r0_stall, early_req_row):
         """Cache request parsing and hit detection
         """
 
         comb = m.d.comb
                        cancel_store, req_same_tag, r0_stall, early_req_row):
         """Cache request parsing and hit detection
         """
 
         comb = m.d.comb
-        sync = m.d.sync
         m_in, d_in = self.m_in, self.d_in
 
         is_hit      = Signal()
         m_in, d_in = self.m_in, self.d_in
 
         is_hit      = Signal()
@@ -825,9 +938,7 @@ class DCache(Elaboratable):
         opsel       = Signal(3)
         go          = Signal()
         nc          = Signal()
         opsel       = Signal(3)
         go          = Signal()
         nc          = Signal()
-        hit_set     = Array(Signal(name="hit_set_%d" % i) \
-                                  for i in range(TLB_NUM_WAYS))
-        cache_valid_idx = Signal(NUM_WAYS)
+        cache_i_validdx = Signal(NUM_WAYS)
 
         # Extract line, row and tag from request
         comb += req_index.eq(get_index(r0.req.addr))
 
         # Extract line, row and tag from request
         comb += req_index.eq(get_index(r0.req.addr))
@@ -839,19 +950,17 @@ class DCache(Elaboratable):
                     r0.req.addr, ra, req_index, req_tag, req_row)
 
         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
                     r0.req.addr, ra, req_index, req_tag, req_row)
 
         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
-        comb += cache_valid_idx.eq(cache_valids[req_index])
-
-        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
-                                tlb_valid_way, tlb_hit_way,
-                                cache_valid_idx, cache_tag_set,
-                                r0.req.addr,
-                                hit_set)
+        comb += cache_i_validdx.eq(cache_tags[req_index].valid)
 
 
+        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
+                                            cache_i_validdx, cache_tag_set,
+                                            r0.req.addr)
         comb += dc.tlb_hit.eq(tlb_hit)
         comb += dc.reload_tag.eq(r1.reload_tag)
         comb += dc.virt_mode.eq(r0.req.virt_mode)
         comb += dc.go.eq(go)
         comb += dc.req_index.eq(req_index)
         comb += dc.tlb_hit.eq(tlb_hit)
         comb += dc.reload_tag.eq(r1.reload_tag)
         comb += dc.virt_mode.eq(r0.req.virt_mode)
         comb += dc.go.eq(go)
         comb += dc.req_index.eq(req_index)
+
         comb += is_hit.eq(dc.is_hit)
         comb += hit_way.eq(dc.hit_way)
         comb += req_same_tag.eq(dc.rel_match)
         comb += is_hit.eq(dc.is_hit)
         comb += hit_way.eq(dc.hit_way)
         comb += req_same_tag.eq(dc.rel_match)
@@ -888,7 +997,7 @@ class DCache(Elaboratable):
 
         # The way to replace on a miss
         with m.If(r1.write_tag):
 
         # The way to replace on a miss
         with m.If(r1.write_tag):
-            comb += replace_way.eq(plru_victim[r1.store_index])
+            comb += replace_way.eq(plru_victim)
         with m.Else():
             comb += replace_way.eq(r1.store_way)
 
         with m.Else():
             comb += replace_way.eq(r1.store_way)
 
@@ -900,16 +1009,22 @@ class DCache(Elaboratable):
                            (perm_attr.wr_perm |
                               (r0.req.load & perm_attr.rd_perm)))
         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
                            (perm_attr.wr_perm |
                               (r0.req.load & perm_attr.rd_perm)))
         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
+
         # Combine the request and cache hit status to decide what
         # operation needs to be done
         comb += nc.eq(r0.req.nc | perm_attr.nocache)
         comb += op.eq(Op.OP_NONE)
         with m.If(go):
             with m.If(~access_ok):
         # Combine the request and cache hit status to decide what
         # operation needs to be done
         comb += nc.eq(r0.req.nc | perm_attr.nocache)
         comb += op.eq(Op.OP_NONE)
         with m.If(go):
             with m.If(~access_ok):
+                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
+                                 valid_ra, perm_ok, rc_ok)
                 comb += op.eq(Op.OP_BAD)
             with m.Elif(cancel_store):
                 comb += op.eq(Op.OP_BAD)
             with m.Elif(cancel_store):
+                m.d.sync += Display("DCACHE cancel store")
                 comb += op.eq(Op.OP_STCX_FAIL)
             with m.Else():
                 comb += op.eq(Op.OP_STCX_FAIL)
             with m.Else():
+                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
+                                 valid_ra, nc, r0.req.load)
                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                 with m.Switch(opsel):
                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                 with m.Switch(opsel):
                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
@@ -945,16 +1060,15 @@ class DCache(Elaboratable):
             # XXX generate alignment interrupt if address
             # is not aligned XXX or if r0.req.nc = '1'
             with m.If(r0.req.load):
             # XXX generate alignment interrupt if address
             # is not aligned XXX or if r0.req.nc = '1'
             with m.If(r0.req.load):
-                comb += set_rsrv.eq(1) # load with reservation
+                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
             with m.Else():
             with m.Else():
-                comb += clear_rsrv.eq(1) # store conditional
+                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                 with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                     comb += cancel_store.eq(1)
 
     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                         reservation, r0):
                 with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                     comb += cancel_store.eq(1)
 
     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                         reservation, r0):
-
         comb = m.d.comb
         sync = m.d.sync
 
         comb = m.d.comb
         sync = m.d.sync
 
@@ -991,6 +1105,7 @@ class DCache(Elaboratable):
                 dsel = data_fwd.word_select(i, 8)
                 comb += data_out.word_select(i, 8).eq(dsel)
 
                 dsel = data_fwd.word_select(i, 8)
                 comb += data_out.word_select(i, 8).eq(dsel)
 
+        # DCache output to LoadStore
         comb += d_out.valid.eq(r1.ls_valid)
         comb += d_out.data.eq(data_out)
         comb += d_out.store_done.eq(~r1.stcx_fail)
         comb += d_out.valid.eq(r1.ls_valid)
         comb += d_out.data.eq(data_out)
         comb += d_out.store_done.eq(~r1.stcx_fail)
@@ -1029,12 +1144,15 @@ class DCache(Elaboratable):
 
             # error cases complete without stalling
             with m.If(r1.ls_error):
 
             # error cases complete without stalling
             with m.If(r1.ls_error):
-                sync += Display("completing ld/st with error")
+                with m.If(r1.dcbz):
+                    sync += Display("completing dcbz with error")
+                with m.Else():
+                    sync += Display("completing ld/st with error")
 
             # Slow ops (load miss, NC, stores)
             with m.If(r1.slow_valid):
 
             # Slow ops (load miss, NC, stores)
             with m.If(r1.slow_valid):
-                sync += Display("completing store or load miss data=%x",
-                                data_out)
+                sync += Display("completing store or load miss adr=%x data=%x",
+                                r1.req.real_addr, data_out)
 
         with m.Else():
             # Request came from MMU
 
         with m.Else():
             # Request came from MMU
@@ -1047,8 +1165,8 @@ class DCache(Elaboratable):
 
             # Slow ops (i.e. load miss)
             with m.If(r1.slow_valid):
 
             # Slow ops (i.e. load miss)
             with m.If(r1.slow_valid):
-                sync += Display("completing MMU load miss, data=%x",
-                                m_out.data)
+                sync += Display("completing MMU load miss, adr=%x data=%x",
+                                r1.req.real_addr, m_out.data)
 
     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
         """rams
 
     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
         """rams
@@ -1062,62 +1180,80 @@ class DCache(Elaboratable):
         account by using 1-cycle delayed signals for load hits.
         """
         comb = m.d.comb
         account by using 1-cycle delayed signals for load hits.
         """
         comb = m.d.comb
-        wb_in = self.wb_in
+        bus = self.bus
+
+        # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
+        # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
+        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
+        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
+                   ~r1.write_bram))
+        comb += rwe.i.eq(replace_way)
+
+        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
+        comb += hwe.i.eq(r1.hit_way)
+
+        # this one is gated with write_bram, and replace_way_e can never be
+        # set at the same time.  that means that do_write can OR the outputs
+        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
+        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
+        comb += hre.i.eq(r1.req.hit_way)
+
+        # common Signals
+        do_read  = Signal()
+        wr_addr  = Signal(ROW_BITS)
+        wr_data  = Signal(WB_DATA_BITS)
+        wr_sel   = Signal(ROW_SIZE)
+        rd_addr  = Signal(ROW_BITS)
+
+        comb += do_read.eq(1) # always enable
+        comb += rd_addr.eq(early_req_row)
+
+        # Write mux:
+        #
+        # Defaults to wishbone read responses (cache refill)
+        #
+        # For timing, the mux on wr_data/sel/addr is not
+        # dependent on anything other than the current state.
+
+        with m.If(r1.write_bram):
+            # Write store data to BRAM.  This happens one
+            # cycle after the store is in r0.
+            comb += wr_data.eq(r1.req.data)
+            comb += wr_sel.eq(r1.req.byte_sel)
+            comb += wr_addr.eq(get_row(r1.req.real_addr))
 
 
+        with m.Else():
+            # Otherwise, we might be doing a reload or a DCBZ
+            with m.If(r1.dcbz):
+                comb += wr_data.eq(0)
+            with m.Else():
+                comb += wr_data.eq(bus.dat_r)
+            comb += wr_addr.eq(r1.store_row)
+            comb += wr_sel.eq(~0) # all 1s
+
+        # set up Cache Rams
         for i in range(NUM_WAYS):
         for i in range(NUM_WAYS):
-            do_read  = Signal(name="do_rd%d" % i)
-            rd_addr  = Signal(ROW_BITS)
             do_write = Signal(name="do_wr%d" % i)
             do_write = Signal(name="do_wr%d" % i)
-            wr_addr  = Signal(ROW_BITS)
-            wr_data  = Signal(WB_DATA_BITS)
-            wr_sel   = Signal(ROW_SIZE)
-            wr_sel_m = Signal(ROW_SIZE)
-            _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
+            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
+            d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
 
 
-            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
+            m.submodules["cacheram_%d" % i] = way
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
-            comb += _d_out.eq(way.rd_data_o)
+            comb += d_out.eq(way.rd_data_o)
             comb += way.wr_sel.eq(wr_sel_m)
             comb += way.wr_addr.eq(wr_addr)
             comb += way.wr_data.eq(wr_data)
 
             # Cache hit reads
             comb += way.wr_sel.eq(wr_sel_m)
             comb += way.wr_addr.eq(wr_addr)
             comb += way.wr_data.eq(wr_data)
 
             # Cache hit reads
-            comb += do_read.eq(1)
-            comb += rd_addr.eq(early_req_row)
-            with m.If(r1.hit_way == i):
-                comb += cache_out_row.eq(_d_out)
-
-            # Write mux:
-            #
-            # Defaults to wishbone read responses (cache refill)
-            #
-            # For timing, the mux on wr_data/sel/addr is not
-            # dependent on anything other than the current state.
-
-            with m.If(r1.write_bram):
-                # Write store data to BRAM.  This happens one
-                # cycle after the store is in r0.
-                comb += wr_data.eq(r1.req.data)
-                comb += wr_sel.eq(r1.req.byte_sel)
-                comb += wr_addr.eq(get_row(r1.req.real_addr))
-
-                with m.If(i == r1.req.hit_way):
-                    comb += do_write.eq(1)
-            with m.Else():
-                # Otherwise, we might be doing a reload or a DCBZ
-                with m.If(r1.dcbz):
-                    comb += wr_data.eq(0)
-                with m.Else():
-                    comb += wr_data.eq(wb_in.dat)
-                comb += wr_addr.eq(r1.store_row)
-                comb += wr_sel.eq(~0) # all 1s
+            with m.If(hwe.o[i]):
+                comb += cache_out_row.eq(d_out)
 
 
-            with m.If((r1.state == State.RELOAD_WAIT_ACK)
-                      & wb_in.ack & (replace_way == i)):
-                comb += do_write.eq(1)
+            # these are mutually-exclusive via their Decoder-enablers
+            # (note: Decoder-enable is inverted)
+            comb += do_write.eq(hre.o[i] | rwe.o[i])
 
             # Mask write selects with do_write since BRAM
             # doesn't have a global write-enable
 
             # Mask write selects with do_write since BRAM
             # doesn't have a global write-enable
@@ -1129,8 +1265,7 @@ class DCache(Elaboratable):
     # It also handles error cases (TLB miss, cache paradox)
     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
     # It also handles error cases (TLB miss, cache paradox)
     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
-                        tlb_hit, tlb_hit_way, tlb_req_index):
-
+                        tlb_hit, tlb_req_index):
         comb = m.d.comb
         sync = m.d.sync
 
         comb = m.d.comb
         sync = m.d.sync
 
@@ -1147,36 +1282,26 @@ class DCache(Elaboratable):
         sync += r1.hit_way.eq(req_hit_way)
         sync += r1.hit_index.eq(req_index)
 
         sync += r1.hit_way.eq(req_hit_way)
         sync += r1.hit_index.eq(req_index)
 
-        with m.If(req_op == Op.OP_LOAD_HIT):
-            sync += r1.hit_load_valid.eq(1)
-        with m.Else():
-            sync += r1.hit_load_valid.eq(0)
-
-        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
-            sync += r1.cache_hit.eq(1)
-        with m.Else():
-            sync += r1.cache_hit.eq(0)
+        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
+        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
+                                (req_op == Op.OP_STORE_HIT))
 
         with m.If(req_op == Op.OP_BAD):
 
         with m.If(req_op == Op.OP_BAD):
-            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
-            #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
+            sync += Display("Signalling ld/st error "
+                            "ls_error=%i mmu_error=%i cache_paradox=%i",
+                            ~r0.mmu_req,r0.mmu_req,access_ok)
             sync += r1.ls_error.eq(~r0.mmu_req)
             sync += r1.mmu_error.eq(r0.mmu_req)
             sync += r1.cache_paradox.eq(access_ok)
             sync += r1.ls_error.eq(~r0.mmu_req)
             sync += r1.mmu_error.eq(r0.mmu_req)
             sync += r1.cache_paradox.eq(access_ok)
-
-            with m.Else():
-                sync += r1.ls_error.eq(0)
-                sync += r1.mmu_error.eq(0)
-                sync += r1.cache_paradox.eq(0)
-
-        with m.If(req_op == Op.OP_STCX_FAIL):
-            sync += r1.stcx_fail.eq(1)
         with m.Else():
         with m.Else():
-            sync += r1.stcx_fail.eq(0)
+            sync += r1.ls_error.eq(0)
+            sync += r1.mmu_error.eq(0)
+            sync += r1.cache_paradox.eq(0)
+
+        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
 
         # Record TLB hit information for updating TLB PLRU
         sync += r1.tlb_hit.eq(tlb_hit)
 
         # Record TLB hit information for updating TLB PLRU
         sync += r1.tlb_hit.eq(tlb_hit)
-        sync += r1.tlb_hit_way.eq(tlb_hit_way)
         sync += r1.tlb_hit_index.eq(tlb_req_index)
 
     # Memory accesses are handled by this state machine:
         sync += r1.tlb_hit_index.eq(tlb_req_index)
 
     # Memory accesses are handled by this state machine:
@@ -1188,17 +1313,16 @@ class DCache(Elaboratable):
     # All wishbone requests generation is done here.
     # This machine operates at stage 1.
     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
     # All wishbone requests generation is done here.
     # This machine operates at stage 1.
     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
-                    cache_valids, r0, replace_way,
+                    r0, replace_way,
                     req_hit_way, req_same_tag,
                     r0_valid, req_op, cache_tags, req_go, ra):
 
         comb = m.d.comb
         sync = m.d.sync
                     req_hit_way, req_same_tag,
                     r0_valid, req_op, cache_tags, req_go, ra):
 
         comb = m.d.comb
         sync = m.d.sync
-        wb_in = self.wb_in
+        bus = self.bus
+        d_in = self.d_in
 
         req         = MemAccessRequest("mreq_ds")
 
         req         = MemAccessRequest("mreq_ds")
-        acks        = Signal(3)
-        adjust_acks = Signal(3)
 
         req_row = Signal(ROW_BITS)
         req_idx = Signal(INDEX_BITS)
 
         req_row = Signal(ROW_BITS)
         req_idx = Signal(INDEX_BITS)
@@ -1226,7 +1350,7 @@ class DCache(Elaboratable):
             with m.If(r1.dcbz):
                 sync += r1.forward_data1.eq(0)
             with m.Else():
             with m.If(r1.dcbz):
                 sync += r1.forward_data1.eq(0)
             with m.Else():
-                sync += r1.forward_data1.eq(wb_in.dat)
+                sync += r1.forward_data1.eq(bus.dat_r)
             sync += r1.forward_sel1.eq(~0) # all 1s
             sync += r1.forward_way1.eq(replace_way)
             sync += r1.forward_row1.eq(r1.store_row)
             sync += r1.forward_sel1.eq(~0) # all 1s
             sync += r1.forward_way1.eq(replace_way)
             sync += r1.forward_row1.eq(r1.store_row)
@@ -1243,19 +1367,21 @@ class DCache(Elaboratable):
         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
 
         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
 
         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
-            with m.If(~r0.mmu_req):
-                sync += r1.ls_valid.eq(1)
-            with m.Else():
+            with m.If(r0.mmu_req):
                 sync += r1.mmu_done.eq(1)
                 sync += r1.mmu_done.eq(1)
+            with m.Else():
+                sync += r1.ls_valid.eq(1)
 
         with m.If(r1.write_tag):
             # Store new tag in selected way
 
         with m.If(r1.write_tag):
             # Store new tag in selected way
+            replace_way_onehot = Signal(NUM_WAYS)
+            comb += replace_way_onehot.eq(1<<replace_way)
             for i in range(NUM_WAYS):
             for i in range(NUM_WAYS):
-                with m.If(i == replace_way):
+                with m.If(replace_way_onehot[i]):
                     ct = Signal(TAG_RAM_WIDTH)
                     ct = Signal(TAG_RAM_WIDTH)
-                    comb += ct.eq(cache_tags[r1.store_index])
+                    comb += ct.eq(cache_tags[r1.store_index].tag)
                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
-                    sync += cache_tags[r1.store_index].eq(ct)
+                    sync += cache_tags[r1.store_index].tag.eq(ct)
             sync += r1.store_way.eq(replace_way)
             sync += r1.write_tag.eq(0)
 
             sync += r1.store_way.eq(replace_way)
             sync += r1.write_tag.eq(0)
 
@@ -1270,10 +1396,13 @@ class DCache(Elaboratable):
             comb += req.dcbz.eq(r0.req.dcbz)
             comb += req.real_addr.eq(ra)
 
             comb += req.dcbz.eq(r0.req.dcbz)
             comb += req.real_addr.eq(ra)
 
-            with m.If(~r0.req.dcbz):
+            with m.If(r0.req.dcbz):
+                # force data to 0 for dcbz
+                comb += req.data.eq(0)
+            with m.Elif(r0.d_valid):
                 comb += req.data.eq(r0.req.data)
             with m.Else():
                 comb += req.data.eq(r0.req.data)
             with m.Else():
-                comb += req.data.eq(0)
+                comb += req.data.eq(d_in.data)
 
             # Select all bytes for dcbz
             # and for cacheable loads
 
             # Select all bytes for dcbz
             # and for cacheable loads
@@ -1298,7 +1427,7 @@ class DCache(Elaboratable):
         with m.Switch(r1.state):
 
             with m.Case(State.IDLE):
         with m.Switch(r1.state):
 
             with m.Case(State.IDLE):
-                sync += r1.real_adr.eq(req.real_addr)
+                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
                 sync += r1.wb.sel.eq(req.byte_sel)
                 sync += r1.wb.dat.eq(req.data)
                 sync += r1.dcbz.eq(req.dcbz)
                 sync += r1.wb.sel.eq(req.byte_sel)
                 sync += r1.wb.dat.eq(req.data)
                 sync += r1.dcbz.eq(req.dcbz)
@@ -1354,10 +1483,10 @@ class DCache(Elaboratable):
                             sync += r1.full.eq(0)
                             sync += r1.slow_valid.eq(1)
 
                             sync += r1.full.eq(0)
                             sync += r1.slow_valid.eq(1)
 
-                            with m.If(~req.mmu_req):
-                                sync += r1.ls_valid.eq(1)
-                            with m.Else():
+                            with m.If(req.mmu_req):
                                 sync += r1.mmu_done.eq(1)
                                 sync += r1.mmu_done.eq(1)
+                            with m.Else():
+                                sync += r1.ls_valid.eq(1)
 
                             with m.If(req.op == Op.OP_STORE_HIT):
                                 sync += r1.write_bram.eq(1)
 
                             with m.If(req.op == Op.OP_STORE_HIT):
                                 sync += r1.write_bram.eq(1)
@@ -1388,22 +1517,25 @@ class DCache(Elaboratable):
                 # Requests are all sent if stb is 0
                 comb += ld_stbs_done.eq(~r1.wb.stb)
 
                 # Requests are all sent if stb is 0
                 comb += ld_stbs_done.eq(~r1.wb.stb)
 
-                with m.If((~wb_in.stall) & r1.wb.stb):
+                # If we are still sending requests, was one accepted?
+                with m.If((~bus.stall) & r1.wb.stb):
                     # That was the last word?  We are done sending.
                     # Clear stb and set ld_stbs_done so we can handle an
                     # eventual last ack on the same cycle.
                     # That was the last word?  We are done sending.
                     # Clear stb and set ld_stbs_done so we can handle an
                     # eventual last ack on the same cycle.
-                    with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
+                    # sigh - reconstruct wb adr with 3 extra 0s at front
+                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
+                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                         sync += r1.wb.stb.eq(0)
                         comb += ld_stbs_done.eq(1)
 
                     # Calculate the next row address in the current cache line
                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                         sync += r1.wb.stb.eq(0)
                         comb += ld_stbs_done.eq(1)
 
                     # Calculate the next row address in the current cache line
                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
-                    comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
-                    sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
+                    comb += row.eq(r1.wb.adr)
+                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
 
                 # Incoming acks processing
 
                 # Incoming acks processing
-                sync += r1.forward_valid1.eq(wb_in.ack)
-                with m.If(wb_in.ack):
+                sync += r1.forward_valid1.eq(bus.ack)
+                with m.If(bus.ack):
                     srow = Signal(ROW_LINE_BITS)
                     comb += srow.eq(r1.store_row)
                     sync += r1.rows_valid[srow].eq(1)
                     srow = Signal(ROW_LINE_BITS)
                     comb += srow.eq(r1.store_row)
                     sync += r1.rows_valid[srow].eq(1)
@@ -1413,16 +1545,16 @@ class DCache(Elaboratable):
                     # Compare the whole address in case the
                     # request in r1.req is not the one that
                     # started this refill.
                     # Compare the whole address in case the
                     # request in r1.req is not the one that
                     # started this refill.
-                    with m.If(r1.full & r1.req.same_tag &
+                    with m.If(req.valid & r1.req.same_tag &
                               ((r1.dcbz & r1.req.dcbz) |
                               ((r1.dcbz & r1.req.dcbz) |
-                               ((~r1.dcbz) & (r1.req.op == Op.OP_LOAD_MISS))) &
-                                (r1.store_row == get_row(r1.req.real_addr))):
+                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
+                                (r1.store_row == get_row(req.real_addr))):
                         sync += r1.full.eq(0)
                         sync += r1.slow_valid.eq(1)
                         sync += r1.full.eq(0)
                         sync += r1.slow_valid.eq(1)
-                        with m.If(~r1.mmu_req):
-                            sync += r1.ls_valid.eq(1)
-                        with m.Else():
+                        with m.If(r1.mmu_req):
                             sync += r1.mmu_done.eq(1)
                             sync += r1.mmu_done.eq(1)
+                        with m.Else():
+                            sync += r1.ls_valid.eq(1)
                         sync += r1.forward_sel.eq(~0) # all 1s
                         sync += r1.use_forward1.eq(1)
 
                         sync += r1.forward_sel.eq(~0) # all 1s
                         sync += r1.use_forward1.eq(1)
 
@@ -1434,41 +1566,45 @@ class DCache(Elaboratable):
 
                         # Cache line is now valid
                         cv = Signal(INDEX_BITS)
 
                         # Cache line is now valid
                         cv = Signal(INDEX_BITS)
-                        comb += cv.eq(cache_valids[r1.store_index])
+                        comb += cv.eq(cache_tags[r1.store_index].valid)
                         comb += cv.bit_select(r1.store_way, 1).eq(1)
                         comb += cv.bit_select(r1.store_way, 1).eq(1)
-                        sync += cache_valids[r1.store_index].eq(cv)
+                        sync += cache_tags[r1.store_index].valid.eq(cv)
 
                         sync += r1.state.eq(State.IDLE)
 
                         sync += r1.state.eq(State.IDLE)
+                        sync += Display("cache valid set %x "
+                                        "idx %d way %d",
+                                         cv, r1.store_index, r1.store_way)
 
                     # Increment store row counter
                     sync += r1.store_row.eq(next_row(r1.store_row))
 
             with m.Case(State.STORE_WAIT_ACK):
                 st_stbs_done = Signal()
 
                     # Increment store row counter
                     sync += r1.store_row.eq(next_row(r1.store_row))
 
             with m.Case(State.STORE_WAIT_ACK):
                 st_stbs_done = Signal()
+                adjust_acks = Signal(3)
+
                 comb += st_stbs_done.eq(~r1.wb.stb)
                 comb += st_stbs_done.eq(~r1.wb.stb)
-                comb += acks.eq(r1.acks_pending)
 
                 with m.If(r1.inc_acks != r1.dec_acks):
                     with m.If(r1.inc_acks):
 
                 with m.If(r1.inc_acks != r1.dec_acks):
                     with m.If(r1.inc_acks):
-                        comb += adjust_acks.eq(acks + 1)
+                        comb += adjust_acks.eq(r1.acks_pending + 1)
                     with m.Else():
                     with m.Else():
-                        comb += adjust_acks.eq(acks - 1)
+                        comb += adjust_acks.eq(r1.acks_pending - 1)
                 with m.Else():
                 with m.Else():
-                    comb += adjust_acks.eq(acks)
+                    comb += adjust_acks.eq(r1.acks_pending)
 
                 sync += r1.acks_pending.eq(adjust_acks)
 
                 # Clear stb when slave accepted request
 
                 sync += r1.acks_pending.eq(adjust_acks)
 
                 # Clear stb when slave accepted request
-                with m.If(~wb_in.stall):
+                with m.If(~bus.stall):
                     # See if there is another store waiting
                     # to be done which is in the same real page.
                     with m.If(req.valid):
                     # See if there is another store waiting
                     # to be done which is in the same real page.
                     with m.If(req.valid):
-                        ra = req.real_addr[0:SET_SIZE_BITS]
-                        sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
+                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
+                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                         sync += r1.wb.dat.eq(req.data)
                         sync += r1.wb.sel.eq(req.byte_sel)
 
                         sync += r1.wb.dat.eq(req.data)
                         sync += r1.wb.sel.eq(req.byte_sel)
 
-                    with m.Elif((adjust_acks < 7) & req.same_tag &
+                    with m.If((adjust_acks < 7) & req.same_tag &
                                 ((req.op == Op.OP_STORE_MISS)
                                  | (req.op == Op.OP_STORE_HIT))):
                         sync += r1.wb.stb.eq(1)
                                 ((req.op == Op.OP_STORE_MISS)
                                  | (req.op == Op.OP_STORE_HIT))):
                         sync += r1.wb.stb.eq(1)
@@ -1488,7 +1624,9 @@ class DCache(Elaboratable):
                         comb += st_stbs_done.eq(1)
 
                 # Got ack ? See if complete.
                         comb += st_stbs_done.eq(1)
 
                 # Got ack ? See if complete.
-                with m.If(wb_in.ack):
+                sync += Display("got ack %d %d stbs %d adjust_acks %d",
+                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
+                with m.If(bus.ack):
                     with m.If(st_stbs_done & (adjust_acks == 1)):
                         sync += r1.state.eq(State.IDLE)
                         sync += r1.wb.cyc.eq(0)
                     with m.If(st_stbs_done & (adjust_acks == 1)):
                         sync += r1.state.eq(State.IDLE)
                         sync += r1.wb.cyc.eq(0)
@@ -1497,44 +1635,44 @@ class DCache(Elaboratable):
 
             with m.Case(State.NC_LOAD_WAIT_ACK):
                 # Clear stb when slave accepted request
 
             with m.Case(State.NC_LOAD_WAIT_ACK):
                 # Clear stb when slave accepted request
-                with m.If(~wb_in.stall):
+                with m.If(~bus.stall):
                     sync += r1.wb.stb.eq(0)
 
                 # Got ack ? complete.
                     sync += r1.wb.stb.eq(0)
 
                 # Got ack ? complete.
-                with m.If(wb_in.ack):
+                with m.If(bus.ack):
                     sync += r1.state.eq(State.IDLE)
                     sync += r1.full.eq(0)
                     sync += r1.slow_valid.eq(1)
 
                     sync += r1.state.eq(State.IDLE)
                     sync += r1.full.eq(0)
                     sync += r1.slow_valid.eq(1)
 
-                    with m.If(~r1.mmu_req):
-                        sync += r1.ls_valid.eq(1)
-                    with m.Else():
+                    with m.If(r1.mmu_req):
                         sync += r1.mmu_done.eq(1)
                         sync += r1.mmu_done.eq(1)
+                    with m.Else():
+                        sync += r1.ls_valid.eq(1)
 
                     sync += r1.forward_sel.eq(~0) # all 1s
                     sync += r1.use_forward1.eq(1)
                     sync += r1.wb.cyc.eq(0)
                     sync += r1.wb.stb.eq(0)
 
 
                     sync += r1.forward_sel.eq(~0) # all 1s
                     sync += r1.use_forward1.eq(1)
                     sync += r1.wb.cyc.eq(0)
                     sync += r1.wb.stb.eq(0)
 
-    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
+    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
 
         sync = m.d.sync
 
         sync = m.d.sync
-        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
+        d_out, bus, log_out = self.d_out, self.bus, self.log_out
 
 
-        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
+        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                                stall_out, req_op[:3], d_out.valid, d_out.error,
                                stall_out, req_op[:3], d_out.valid, d_out.error,
-                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
+                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                                r1.real_adr[3:6]))
 
     def elaborate(self, platform):
 
         m = Module()
         comb = m.d.comb
                                r1.real_adr[3:6]))
 
     def elaborate(self, platform):
 
         m = Module()
         comb = m.d.comb
+        d_in = self.d_in
 
         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
         cache_tags       = CacheTagArray()
         cache_tag_set    = Signal(TAG_RAM_WIDTH)
 
         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
         cache_tags       = CacheTagArray()
         cache_tag_set    = Signal(TAG_RAM_WIDTH)
-        cache_valids = CacheValidBitsArray()
 
         # TODO attribute ram_style : string;
         # TODO attribute ram_style of cache_tags : signal is "distributed";
 
         # TODO attribute ram_style : string;
         # TODO attribute ram_style of cache_tags : signal is "distributed";
@@ -1542,9 +1680,6 @@ class DCache(Elaboratable):
         """note: these are passed to nmigen.hdl.Memory as "attributes".
            don't know how, just that they are.
         """
         """note: these are passed to nmigen.hdl.Memory as "attributes".
            don't know how, just that they are.
         """
-        dtlb_valid_bits = TLBValidBitsArray()
-        dtlb_tags       = TLBTagsArray()
-        dtlb_ptes       = TLBPtesArray()
         # TODO attribute ram_style of
         #  dtlb_tags : signal is "distributed";
         # TODO attribute ram_style of
         # TODO attribute ram_style of
         #  dtlb_tags : signal is "distributed";
         # TODO attribute ram_style of
@@ -1581,19 +1716,16 @@ class DCache(Elaboratable):
 
         cache_out_row     = Signal(WB_DATA_BITS)
 
 
         cache_out_row     = Signal(WB_DATA_BITS)
 
-        plru_victim       = PLRUOut()
+        plru_victim       = Signal(WAY_BITS)
         replace_way       = Signal(WAY_BITS)
 
         # Wishbone read/write/cache write formatting signals
         bus_sel           = Signal(8)
 
         # TLB signals
         replace_way       = Signal(WAY_BITS)
 
         # Wishbone read/write/cache write formatting signals
         bus_sel           = Signal(8)
 
         # TLB signals
-        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
-        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
-        tlb_valid_way = Signal(TLB_NUM_WAYS)
+        tlb_way       = TLBRecord("tlb_way")
         tlb_req_index = Signal(TLB_SET_BITS)
         tlb_req_index = Signal(TLB_SET_BITS)
-        tlb_hit       = Signal()
-        tlb_hit_way   = Signal(TLB_WAY_BITS)
+        tlb_hit       = TLBHit("tlb_hit")
         pte           = Signal(TLB_PTE_BITS)
         ra            = Signal(REAL_ADDR_BITS)
         valid_ra      = Signal()
         pte           = Signal(TLB_PTE_BITS)
         ra            = Signal(REAL_ADDR_BITS)
         valid_ra      = Signal()
@@ -1602,44 +1734,52 @@ class DCache(Elaboratable):
         perm_ok       = Signal()
         access_ok     = Signal()
 
         perm_ok       = Signal()
         access_ok     = Signal()
 
-        tlb_plru_victim = TLBPLRUOut()
+        tlb_plru_victim = Signal(TLB_WAY_BITS)
 
         # we don't yet handle collisions between loadstore1 requests
         # and MMU requests
         comb += self.m_out.stall.eq(0)
 
         # Hold off the request in r0 when r1 has an uncompleted request
 
         # we don't yet handle collisions between loadstore1 requests
         # and MMU requests
         comb += self.m_out.stall.eq(0)
 
         # Hold off the request in r0 when r1 has an uncompleted request
-        comb += r0_stall.eq(r0_full & r1.full)
-        comb += r0_valid.eq(r0_full & ~r1.full)
+        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
+        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
         comb += self.stall_out.eq(r0_stall)
 
         comb += self.stall_out.eq(r0_stall)
 
+        # deal with litex not doing wishbone pipeline mode
+        # XXX in wrong way.  FIFOs are needed in the SRAM test
+        # so that stb/ack match up. same thing done in icache.py
+        if not self.microwatt_compat:
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
         # Wire up wishbone request latch out of stage 1
         # Wire up wishbone request latch out of stage 1
-        comb += r1.wb.adr.eq(r1.real_adr)
-        comb += self.wb_out.eq(r1.wb)
-        comb += self.wb_out.adr.eq(r1.wb.adr[3:]) # truncate LSBs
+        comb += self.bus.we.eq(r1.wb.we)
+        comb += self.bus.adr.eq(r1.wb.adr)
+        comb += self.bus.sel.eq(r1.wb.sel)
+        comb += self.bus.stb.eq(r1.wb.stb)
+        comb += self.bus.dat_w.eq(r1.wb.dat)
+        comb += self.bus.cyc.eq(r1.wb.cyc)
+
+        # create submodule TLBUpdate
+        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
 
         # call sub-functions putting everything together, using shared
         # signals established above
         self.stage_0(m, r0, r1, r0_full)
 
         # call sub-functions putting everything together, using shared
         # signals established above
         self.stage_0(m, r0, r1, r0_full)
-        self.tlb_read(m, r0_stall, tlb_valid_way,
-                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
-                      dtlb_tags, dtlb_ptes)
+        self.tlb_read(m, r0_stall, tlb_way)
         self.tlb_search(m, tlb_req_index, r0, r0_valid,
         self.tlb_search(m, tlb_req_index, r0, r0_valid,
-                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
-                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
-        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
-                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
-                        dtlb_tags, tlb_pte_way, dtlb_ptes)
+                        tlb_way,
+                        pte, tlb_hit, valid_ra, perm_attr, ra)
+        self.tlb_update(m, r0_valid, r0, tlb_req_index,
+                        tlb_hit, tlb_plru_victim)
         self.maybe_plrus(m, r1, plru_victim)
         self.maybe_plrus(m, r1, plru_victim)
-        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
+        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
-                           r0_valid, r1, cache_valids, replace_way,
+                           r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
-                           tlb_pte_way,
-                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+                           tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall, early_req_row)
         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                            r0_valid, r0, reservation)
                            cancel_store, req_same_tag, r0_stall, early_req_row)
         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                            r0_valid, r0, reservation)
@@ -1649,214 +1789,18 @@ class DCache(Elaboratable):
         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                         req_hit_way, req_index, req_tag, access_ok,
-                        tlb_hit, tlb_hit_way, tlb_req_index)
+                        tlb_hit, tlb_req_index)
         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
-                    cache_valids, r0, replace_way,
+                    r0, replace_way,
                     req_hit_way, req_same_tag,
                          r0_valid, req_op, cache_tags, req_go, ra)
                     req_hit_way, req_same_tag,
                          r0_valid, req_op, cache_tags, req_go, ra)
-        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
+        #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
 
         return m
 
 
         return m
 
-def dcache_load(dut, addr, nc=0):
-    yield dut.d_in.load.eq(1)
-    yield dut.d_in.nc.eq(nc)
-    yield dut.d_in.addr.eq(addr)
-    yield dut.d_in.byte_sel.eq(~0)
-    yield dut.d_in.valid.eq(1)
-    yield
-    yield dut.d_in.valid.eq(0)
-    yield dut.d_in.byte_sel.eq(0)
-    while not (yield dut.d_out.valid):
-        yield
-    data = yield dut.d_out.data
-    return data
-
-
-def dcache_store(dut, addr, data, nc=0):
-    yield dut.d_in.load.eq(0)
-    yield dut.d_in.nc.eq(nc)
-    yield dut.d_in.data.eq(data)
-    yield dut.d_in.byte_sel.eq(~0)
-    yield dut.d_in.addr.eq(addr)
-    yield dut.d_in.valid.eq(1)
-    yield
-    yield dut.d_in.valid.eq(0)
-    yield dut.d_in.byte_sel.eq(0)
-    while not (yield dut.d_out.valid):
-        yield
-
-
-def dcache_random_sim(dut, mem):
-
-    # start copy of mem
-    sim_mem = deepcopy(mem)
-    print ("mem len", len(sim_mem))
-
-    # clear stuff
-    yield dut.d_in.valid.eq(0)
-    yield dut.d_in.load.eq(0)
-    yield dut.d_in.priv_mode.eq(1)
-    yield dut.d_in.nc.eq(0)
-    yield dut.d_in.addr.eq(0)
-    yield dut.d_in.data.eq(0)
-    yield dut.m_in.valid.eq(0)
-    yield dut.m_in.addr.eq(0)
-    yield dut.m_in.pte.eq(0)
-    # wait 4 * clk_period
-    yield
-    yield
-    yield
-    yield
-
-    print ()
-
-    #for i in range(1024):
-    #    sim_mem[i] = i
-
-    for i in range(1024):
-        addr = randint(0, 1023)
-        data = randint(0, (1<<64)-1)
-        sim_mem[addr] = data
-        row = addr
-        addr *= 8
-
-        print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))
-
-        yield from dcache_load(dut, addr)
-        yield from dcache_store(dut, addr, data)
-
-        addr = randint(0, 1023)
-        sim_data = sim_mem[addr]
-        row = addr
-        addr *= 8
-
-        print ("    load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
-        data = yield from dcache_load(dut, addr)
-        assert data == sim_data, \
-            "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
-
-    for addr in range(1024):
-        data = yield from dcache_load(dut, addr*8)
-        assert data == sim_mem[addr], \
-            "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
-
-
-def dcache_sim(dut, mem):
-    # clear stuff
-    yield dut.d_in.valid.eq(0)
-    yield dut.d_in.load.eq(0)
-    yield dut.d_in.priv_mode.eq(1)
-    yield dut.d_in.nc.eq(0)
-    yield dut.d_in.addr.eq(0)
-    yield dut.d_in.data.eq(0)
-    yield dut.m_in.valid.eq(0)
-    yield dut.m_in.addr.eq(0)
-    yield dut.m_in.pte.eq(0)
-    # wait 4 * clk_period
-    yield
-    yield
-    yield
-    yield
-
-    # Cacheable read of address 4
-    data = yield from dcache_load(dut, 0x58)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000001700000016, \
-        f"data @%x=%x expected 0x0000001700000016" % (addr, data)
-
-    # Cacheable read of address 20
-    data = yield from dcache_load(dut, 0x20)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000000900000008, \
-        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
-
-    # Cacheable read of address 30
-    data = yield from dcache_load(dut, 0x530)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000014D0000014C, \
-        f"data @%x=%x expected 0000014D0000014C" % (addr, data)
-
-    # 2nd Cacheable read of address 30
-    data = yield from dcache_load(dut, 0x530)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000014D0000014C, \
-        f"data @%x=%x expected 0000014D0000014C" % (addr, data)
-
-    # Non-cacheable read of address 100
-    data = yield from dcache_load(dut, 0x100, nc=1)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000004100000040, \
-        f"data @%x=%x expected 0000004100000040" % (addr, data)
-
-    # Store at address 530
-    yield from dcache_store(dut, 0x530, 0x121)
-
-    # Store at address 30
-    yield from dcache_store(dut, 0x530, 0x12345678)
-
-    # 3nd Cacheable read of address 530
-    data = yield from dcache_load(dut, 0x530)
-    addr = yield dut.d_in.addr
-    assert data == 0x12345678, \
-        f"data @%x=%x expected 0x12345678" % (addr, data)
-
-    # 4th Cacheable read of address 20
-    data = yield from dcache_load(dut, 0x20)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000000900000008, \
-        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
-
-    yield
-    yield
-    yield
-    yield
-
-
-def test_dcache(mem, test_fn, test_name):
-    dut = DCache()
-
-    memory = Memory(width=64, depth=16*64, init=mem, simulate=True)
-    sram = SRAM(memory=memory, granularity=8)
-
-    m = Module()
-    m.submodules.dcache = dut
-    m.submodules.sram = sram
-
-    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
-    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
-    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
-    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
-    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
-    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
-
-    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
-    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
-
-    # nmigen Simulation
-    sim = Simulator(m)
-    sim.add_clock(1e-6)
-
-    sim.add_sync_process(wrap(test_fn(dut, mem)))
-    with sim.write_vcd('test_dcache%s.vcd' % test_name):
-        sim.run()
 
 if __name__ == '__main__':
 
 if __name__ == '__main__':
-    seed(0)
     dut = DCache()
     vl = rtlil.convert(dut, ports=[])
     with open("test_dcache.il", "w") as f:
         f.write(vl)
     dut = DCache()
     vl = rtlil.convert(dut, ports=[])
     with open("test_dcache.il", "w") as f:
         f.write(vl)
-
-    mem = []
-    for i in range(1024):
-        mem.append((i*2)| ((i*2+1)<<32))
-
-    test_dcache(mem, dcache_sim, "")
-
-    mem = []
-    for i in range(0, 1024):
-        mem.append(i)
-
-    test_dcache(mem, dcache_random_sim, "random")
-