double the number of lines in the L1 D/I-Cache to match microwatt

[soc.git] / src / soc / experiment / dcache.py
diff --git a/src/soc/experiment/dcache.py b/src/soc/experiment/dcache.py

index 7cd75a79257413d99ab2161b3328f2408abd1452..a828e3c3c2137b05e3197ef54e2c69bc2a80496e 100644 (file)
--- a/src/soc/experiment/dcache.py
+++ b/src/soc/experiment/dcache.py
@@ -2,16 +2,40 @@
  
  based on Anton Blanchard microwatt dcache.vhdl
  
  
  based on Anton Blanchard microwatt dcache.vhdl
  
+note that the microwatt dcache wishbone interface expects "stall".
+for simplicity at the moment this is hard-coded to cyc & ~ack.
+see WB4 spec, p84, section 5.2.1
+
+IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
+is raised.  sigh
+
+Links:
+
+* https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
+* https://bugs.libre-soc.org/show_bug.cgi?id=469
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
+
  """
  
  """
  
+import sys
+
+from nmutil.gtkw import write_gtkw
+
+sys.setrecursionlimit(1000000)
+
  from enum import Enum, unique
  
  from enum import Enum, unique
  
-from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
+from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
+                    Record, Memory)
  from nmutil.util import Display
  from nmutil.util import Display
+from nmigen.lib.coding import Decoder
  
  from copy import deepcopy
  from random import randint, seed
  
  
  from copy import deepcopy
  from random import randint, seed
  
+from nmigen_soc.wishbone.bus import Interface
+
  from nmigen.cli import main
  from nmutil.iocontrol import RecordObject
  from nmigen.utils import log2_int
  from nmigen.cli import main
  from nmutil.iocontrol import RecordObject
  from nmigen.utils import log2_int
@@ -27,8 +51,8 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                  WBIOMasterOut, WBIOSlaveOut)
  
  from soc.experiment.cache_ram import CacheRam
                                  WBIOMasterOut, WBIOSlaveOut)
  
  from soc.experiment.cache_ram import CacheRam
-#from soc.experiment.plru import PLRU
-from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
+#from nmutil.plru import PLRU, PLRUs
  
  # for test
  from soc.bus.sram import SRAM
  
  # for test
  from soc.bus.sram import SRAM
@@ -44,7 +68,7 @@ from nmutil.util import wrap
  
  # TODO: make these parameters of DCache at some point
  LINE_SIZE = 64    # Line size in bytes
  
  # TODO: make these parameters of DCache at some point
  LINE_SIZE = 64    # Line size in bytes
-NUM_LINES = 16    # Number of lines in a set
+NUM_LINES = 32    # Number of lines in a set
  NUM_WAYS = 4      # Number of ways
  TLB_SET_SIZE = 64 # L1 DTLB entries per set
  TLB_NUM_WAYS = 2  # L1 DTLB number of sets
  NUM_WAYS = 4      # Number of ways
  TLB_SET_SIZE = 64 # L1 DTLB entries per set
  TLB_NUM_WAYS = 2  # L1 DTLB number of sets
@@ -54,7 +78,7 @@ LOG_LENGTH = 0    # Non-zero to enable log data collection
  # BRAM organisation: We never access more than
  #     -- WB_DATA_BITS at a time so to save
  #     -- resources we make the array only that wide, and
  # BRAM organisation: We never access more than
  #     -- WB_DATA_BITS at a time so to save
  #     -- resources we make the array only that wide, and
-#     -- use consecutive indices for to make a cache "line"
+#     -- use consecutive indices to make a cache "line"
  #     --
  #     -- ROW_SIZE is the width in bytes of the BRAM
  #     -- (based on WB, so 64-bits)
  #     --
  #     -- ROW_SIZE is the width in bytes of the BRAM
  #     -- (based on WB, so 64-bits)
@@ -112,15 +136,18 @@ TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
  WAY_BITS = log2_int(NUM_WAYS)
  
  # Example of layout for 32 lines of 64 bytes:
  WAY_BITS = log2_int(NUM_WAYS)
  
  # Example of layout for 32 lines of 64 bytes:
-layout = """\
+layout = f"""\
+  DCache Layout:
+ |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
+  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
    ..  tag    |index|  line  |
    ..         |   row   |    |
    ..  tag    |index|  line  |
    ..         |   row   |    |
-  ..         |     |---|    | ROW_LINE_BITS  (3)
-  ..         |     |--- - --| LINE_OFF_BITS (6)
-  ..         |         |- --| ROW_OFF_BITS  (3)
-  ..         |----- ---|    | ROW_BITS      (8)
-  ..         |-----|        | INDEX_BITS    (5)
-  .. --------|              | TAG_BITS      (45)
+  ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
+  ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
+  ..         |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
+  ..         |----- ---|    | ROW_BITS      ({ROW_BITS})
+  ..         |-----|        | INDEX_BITS    ({INDEX_BITS})
+  .. --------|              | TAG_BITS      ({TAG_BITS})
  """
  print (layout)
  print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
  """
  print (layout)
  print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
@@ -133,14 +160,15 @@ print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
  TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
  
  print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
  TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
  
  print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
+print ("    TAG_WIDTH", TAG_WIDTH)
+print ("     NUM_WAYS", NUM_WAYS)
+print ("    NUM_LINES", NUM_LINES)
  
  def CacheTagArray():
  
  def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
-                        for x in range(NUM_LINES))
-
-def CacheValidBitsArray():
-    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
-                        for x in range(NUM_LINES))
+    tag_layout = [('valid', NUM_WAYS),
+                  ('tag', TAG_RAM_WIDTH),
+                 ]
+    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
  
  def RowPerLineValidArray():
      return Array(Signal(name="rows_valid%d" % x) \
  
  def RowPerLineValidArray():
      return Array(Signal(name="rows_valid%d" % x) \
@@ -172,21 +200,24 @@ assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
  assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
  
  
  assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
  
  
-def TLBValidBitsArray():
-    return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
-                for x in range(TLB_SET_SIZE))
+def TLBHit(name):
+    return Record([('valid', 1),
+                   ('way', TLB_WAY_BITS)], name=name)
  
  def TLBTagEAArray():
      return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                  for x in range (TLB_NUM_WAYS))
  
  
  def TLBTagEAArray():
      return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                  for x in range (TLB_NUM_WAYS))
  
-def TLBTagsArray():
-    return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
-                for x in range (TLB_SET_SIZE))
+def TLBRecord(name):
+    tlb_layout = [('valid', TLB_NUM_WAYS),
+                  ('tag', TLB_TAG_WAY_BITS),
+                  ('pte', TLB_PTE_WAY_BITS)
+                 ]
+    return Record(tlb_layout, name=name)
  
  
-def TLBPtesArray():
-    return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
-                for x in range(TLB_SET_SIZE))
+def TLBValidArray():
+    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
+                        for x in range(TLB_SET_SIZE))
  
  def HitWaySet():
      return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
  
  def HitWaySet():
      return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
@@ -321,10 +352,11 @@ class RegStage0(RecordObject):
      def __init__(self, name=None):
          super().__init__(name=name)
          self.req     = LoadStore1ToDCacheType(name="lsmem")
      def __init__(self, name=None):
          super().__init__(name=name)
          self.req     = LoadStore1ToDCacheType(name="lsmem")
-        self.tlbie   = Signal()
-        self.doall   = Signal()
-        self.tlbld   = Signal()
+        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
+        self.doall   = Signal() # with tlbie, indicates flush whole TLB
+        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
          self.mmu_req = Signal() # indicates source of request
          self.mmu_req = Signal() # indicates source of request
+        self.d_valid = Signal() # indicates req.data is valid now
  
  
  class MemAccessRequest(RecordObject):
  
  
  class MemAccessRequest(RecordObject):
@@ -358,9 +390,8 @@ class RegStage1(RecordObject):
          self.cache_hit        = Signal()
  
          # TLB hit state
          self.cache_hit        = Signal()
  
          # TLB hit state
-        self.tlb_hit          = Signal()
-        self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
-        self.tlb_hit_index    = Signal(TLB_WAY_BITS)
+        self.tlb_hit          = TLBHit("tlb_hit")
+        self.tlb_hit_index    = Signal(TLB_SET_BITS)
  
          # 2-stage data buffer for data forwarded from writes to reads
          self.forward_data1    = Signal(64)
  
          # 2-stage data buffer for data forwarded from writes to reads
          self.forward_data1    = Signal(64)
@@ -378,7 +409,6 @@ class RegStage1(RecordObject):
          self.write_bram       = Signal()
          self.write_tag        = Signal()
          self.slow_valid       = Signal()
          self.write_bram       = Signal()
          self.write_tag        = Signal()
          self.slow_valid       = Signal()
-        self.real_adr         = Signal(REAL_ADDR_BITS)
          self.wb               = WBMasterOut("wb")
          self.reload_tag       = Signal(TAG_BITS)
          self.store_way        = Signal(WAY_BITS)
          self.wb               = WBMasterOut("wb")
          self.reload_tag       = Signal(TAG_BITS)
          self.store_way        = Signal(WAY_BITS)
@@ -414,83 +444,156 @@ class DTLBUpdate(Elaboratable):
          self.tlbie    = Signal()
          self.tlbwe    = Signal()
          self.doall    = Signal()
          self.tlbie    = Signal()
          self.tlbwe    = Signal()
          self.doall    = Signal()
-        self.updated  = Signal()
-        self.v_updated  = Signal()
-        self.tlb_hit    = Signal()
+        self.tlb_hit     = TLBHit("tlb_hit")
          self.tlb_req_index = Signal(TLB_SET_BITS)
  
          self.tlb_req_index = Signal(TLB_SET_BITS)
  
-        self.tlb_hit_way     = Signal(TLB_WAY_BITS)
-        self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
-        self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
          self.repl_way        = Signal(TLB_WAY_BITS)
          self.eatag           = Signal(TLB_EA_TAG_BITS)
          self.pte_data        = Signal(TLB_PTE_BITS)
  
          self.repl_way        = Signal(TLB_WAY_BITS)
          self.eatag           = Signal(TLB_EA_TAG_BITS)
          self.pte_data        = Signal(TLB_PTE_BITS)
  
-        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
-
-        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
-        self.pb_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
-        self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        # read from dtlb array
+        self.tlb_read       = Signal()
+        self.tlb_read_index = Signal(TLB_SET_BITS)
+        self.tlb_way        = TLBRecord("o_tlb_way")
  
      def elaborate(self, platform):
          m = Module()
          comb = m.d.comb
          sync = m.d.sync
  
  
      def elaborate(self, platform):
          m = Module()
          comb = m.d.comb
          sync = m.d.sync
  
-        tagset   = Signal(TLB_TAG_WAY_BITS)
-        pteset   = Signal(TLB_PTE_WAY_BITS)
-
-        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
-        comb += db_out.eq(self.dv)
+        # there are 3 parts to this:
+        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
+        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
+        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
+        # be a Memory because they can all be cleared (tlbie, doall), i mean,
+        # we _could_, in theory, by overriding the Reset Signal of the Memory,
+        # hmmm....
+
+        dtlb_valid = TLBValidArray()
+        tlb_req_index = self.tlb_req_index
+
+        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
+        print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
+        print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
+        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
+        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
+        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)
+
+        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
+        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
+        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
+        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
+                                    granularity=TLB_EA_TAG_BITS)
+
+        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
+        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
+        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
+                                    granularity=TLB_PTE_BITS)
+
+        # commented out for now, can be put in if Memory.reset can be
+        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
+        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
+        #m.submodules.rd_valid = rd_valid = validm.read_port()
+        #m.submodules.wr_valid = wr_valid = validm.write_port(
+                                    #granularity=1)
+
+        # connect up read and write addresses to Valid/PTE/TAG SRAMs
+        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
+        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
+        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
+        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
+        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
+        #m.d.comb += wr_valid.addr.eq(tlb_req_index)
+
+        updated  = Signal()
+        v_updated  = Signal()
+        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
+        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
+        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
+
+        comb += dv.eq(dtlb_valid[tlb_req_index])
+        comb += db_out.eq(dv)
  
          with m.If(self.tlbie & self.doall):
  
          with m.If(self.tlbie & self.doall):
-            pass # clear all back in parent
+            # clear all valid bits at once
+            # XXX hmmm, validm _could_ use Memory reset here...
+            for i in range(TLB_SET_SIZE):
+                sync += dtlb_valid[i].eq(0)
          with m.Elif(self.tlbie):
          with m.Elif(self.tlbie):
-            with m.If(self.tlb_hit):
-                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
-                comb += self.v_updated.eq(1)
-
+            # invalidate just the hit_way
+            with m.If(self.tlb_hit.valid):
+                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
+                comb += v_updated.eq(1)
          with m.Elif(self.tlbwe):
          with m.Elif(self.tlbwe):
-
-            comb += tagset.eq(self.tlb_tag_way)
-            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
-            comb += tb_out.eq(tagset)
-
-            comb += pteset.eq(self.tlb_pte_way)
-            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
-            comb += pb_out.eq(pteset)
-
+            # write to the requested tag and PTE
+            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
+            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
+            # set valid bit
              comb += db_out.bit_select(self.repl_way, 1).eq(1)
  
              comb += db_out.bit_select(self.repl_way, 1).eq(1)
  
-            comb += self.updated.eq(1)
-            comb += self.v_updated.eq(1)
+            comb += updated.eq(1)
+            comb += v_updated.eq(1)
+
+        # above, sometimes valid is requested to be updated but data not
+        # therefore split them out, here.  note the granularity thing matches
+        # with the shift-up of the eatag/pte_data into the correct TLB way.
+        # thus it is not necessary to write the entire lot, just the portion
+        # being altered: hence writing the *old* copy of the row is not needed
+        with m.If(updated): # PTE and TAG to be written
+            comb += wr_pteway.data.eq(pb_out)
+            comb += wr_pteway.en.eq(1<<self.repl_way)
+            comb += wr_tagway.data.eq(tb_out)
+            comb += wr_tagway.en.eq(1<<self.repl_way)
+        with m.If(v_updated): # Valid to be written
+            sync += dtlb_valid[tlb_req_index].eq(db_out)
+            #comb += wr_valid.data.eq(db_out)
+            #comb += wr_valid.en.eq(1<<self.repl_way)
+
+        # select one TLB way, use a register here
+        r_tlb_way        = TLBRecord("r_tlb_way")
+        r_delay = Signal()
+        sync += r_delay.eq(self.tlb_read)
+        with m.If(self.tlb_read):
+            sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
+        with m.If(r_delay):
+            # on one clock delay, output the contents of the read port(s)
+            # comb += self.tlb_way.valid.eq(rd_valid.data)
+            comb += self.tlb_way.tag.eq(rd_tagway.data)
+            comb += self.tlb_way.pte.eq(rd_pteway.data)
+            # and also capture the (delayed) output...
+            #sync += r_tlb_way.valid.eq(rd_valid.data)
+            sync += r_tlb_way.tag.eq(rd_tagway.data)
+            sync += r_tlb_way.pte.eq(rd_pteway.data)
+        with m.Else():
+            # ... so that the register can output it when no read is requested
+            # it's rather overkill but better to be safe than sorry
+            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
+            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
+            #comb += self.tlb_way.eq(r_tlb_way)
  
          return m
  
  
  class DCachePendingHit(Elaboratable):
  
  
          return m
  
  
  class DCachePendingHit(Elaboratable):
  
-    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
-                      cache_valid_idx, cache_tag_set,
-                    req_addr,
-                    hit_set):
+    def __init__(self, tlb_way,
+                      cache_i_validdx, cache_tag_set,
+                    req_addr):
  
          self.go          = Signal()
          self.virt_mode   = Signal()
          self.is_hit      = Signal()
  
          self.go          = Signal()
          self.virt_mode   = Signal()
          self.is_hit      = Signal()
-        self.tlb_hit     = Signal()
+        self.tlb_hit      = TLBHit("tlb_hit")
          self.hit_way     = Signal(WAY_BITS)
          self.rel_match   = Signal()
          self.req_index   = Signal(INDEX_BITS)
          self.reload_tag  = Signal(TAG_BITS)
  
          self.hit_way     = Signal(WAY_BITS)
          self.rel_match   = Signal()
          self.req_index   = Signal(INDEX_BITS)
          self.reload_tag  = Signal(TAG_BITS)
  
-        self.tlb_hit_way = tlb_hit_way
-        self.tlb_pte_way = tlb_pte_way
-        self.tlb_valid_way = tlb_valid_way
-        self.cache_valid_idx = cache_valid_idx
+        self.tlb_way = tlb_way
+        self.cache_i_validdx = cache_i_validdx
          self.cache_tag_set = cache_tag_set
          self.req_addr = req_addr
          self.cache_tag_set = cache_tag_set
          self.req_addr = req_addr
-        self.hit_set = hit_set
  
      def elaborate(self, platform):
          m = Module()
  
      def elaborate(self, platform):
          m = Module()
@@ -500,19 +603,18 @@ class DCachePendingHit(Elaboratable):
          go = self.go
          virt_mode = self.virt_mode
          is_hit = self.is_hit
          go = self.go
          virt_mode = self.virt_mode
          is_hit = self.is_hit
-        tlb_pte_way = self.tlb_pte_way
-        tlb_valid_way = self.tlb_valid_way
-        cache_valid_idx = self.cache_valid_idx
+        tlb_way = self.tlb_way
+        cache_i_validdx = self.cache_i_validdx
          cache_tag_set = self.cache_tag_set
          req_addr = self.req_addr
          cache_tag_set = self.cache_tag_set
          req_addr = self.req_addr
-        tlb_hit_way = self.tlb_hit_way
          tlb_hit = self.tlb_hit
          tlb_hit = self.tlb_hit
-        hit_set = self.hit_set
          hit_way = self.hit_way
          rel_match = self.rel_match
          req_index = self.req_index
          reload_tag = self.reload_tag
  
          hit_way = self.hit_way
          rel_match = self.rel_match
          req_index = self.req_index
          reload_tag = self.reload_tag
  
+        hit_set     = Array(Signal(name="hit_set_%d" % i) \
+                                  for i in range(TLB_NUM_WAYS))
          rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                      for i in range(TLB_NUM_WAYS))
          hit_way_set = HitWaySet()
          rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                      for i in range(TLB_NUM_WAYS))
          hit_way_set = HitWaySet()
@@ -526,35 +628,35 @@ class DCachePendingHit(Elaboratable):
          with m.If(virt_mode):
              for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                  s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
          with m.If(virt_mode):
              for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                  s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
-                s_hit       = Signal()
-                s_pte       = Signal(TLB_PTE_BITS)
-                s_ra        = Signal(REAL_ADDR_BITS)
-                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
+                s_hit       = Signal(name="s_hit%d" % j)
+                s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
+                s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
+                # read the PTE, calc the Real Address, get the tag
+                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                  comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                      s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                  comb += s_tag.eq(get_tag(s_ra))
                  comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                      s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                  comb += s_tag.eq(get_tag(s_ra))
-
+                # for each way check the tag against the cache tag set
                  for i in range(NUM_WAYS): # way_t
                      is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                  for i in range(NUM_WAYS): # way_t
                      is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
-                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
+                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                    (read_tag(i, cache_tag_set) == s_tag)
                                    (read_tag(i, cache_tag_set) == s_tag)
-                                  & tlb_valid_way[j])
+                                  & (tlb_way.valid[j]))
                      with m.If(is_tag_hit):
                          comb += hit_way_set[j].eq(i)
                          comb += s_hit.eq(1)
                  comb += hit_set[j].eq(s_hit)
                      with m.If(is_tag_hit):
                          comb += hit_way_set[j].eq(i)
                          comb += s_hit.eq(1)
                  comb += hit_set[j].eq(s_hit)
-                with m.If(s_tag == reload_tag):
-                    comb += rel_matches[j].eq(1)
-            with m.If(tlb_hit):
-                comb += is_hit.eq(hit_set[tlb_hit_way])
-                comb += hit_way.eq(hit_way_set[tlb_hit_way])
-                comb += rel_match.eq(rel_matches[tlb_hit_way])
+                comb += rel_matches[j].eq(s_tag == reload_tag)
+            with m.If(tlb_hit.valid):
+                comb += is_hit.eq(hit_set[tlb_hit.way])
+                comb += hit_way.eq(hit_way_set[tlb_hit.way])
+                comb += rel_match.eq(rel_matches[tlb_hit.way])
          with m.Else():
              s_tag       = Signal(TAG_BITS)
              comb += s_tag.eq(get_tag(req_addr))
              for i in range(NUM_WAYS): # way_t
                  is_tag_hit = Signal(name="is_tag_hit_%d" % i)
          with m.Else():
              s_tag       = Signal(TAG_BITS)
              comb += s_tag.eq(get_tag(req_addr))
              for i in range(NUM_WAYS): # way_t
                  is_tag_hit = Signal(name="is_tag_hit_%d" % i)
-                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
+                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                            (read_tag(i, cache_tag_set) == s_tag))
                  with m.If(is_tag_hit):
                      comb += hit_way.eq(i)
                            (read_tag(i, cache_tag_set) == s_tag))
                  with m.If(is_tag_hit):
                      comb += hit_way.eq(i)
@@ -567,13 +669,14 @@ class DCachePendingHit(Elaboratable):
  
  class DCache(Elaboratable):
      """Set associative dcache write-through
  
  class DCache(Elaboratable):
      """Set associative dcache write-through
+
      TODO (in no specific order):
      * See list in icache.vhdl
      * Complete load misses on the cycle when WB data comes instead of
        at the end of line (this requires dealing with requests coming in
        while not idle...)
      """
      TODO (in no specific order):
      * See list in icache.vhdl
      * Complete load misses on the cycle when WB data comes instead of
        at the end of line (this requires dealing with requests coming in
        while not idle...)
      """
-    def __init__(self):
+    def __init__(self, pspec=None):
          self.d_in      = LoadStore1ToDCacheType("d_in")
          self.d_out     = DCacheToLoadStore1Type("d_out")
  
          self.d_in      = LoadStore1ToDCacheType("d_in")
          self.d_out     = DCacheToLoadStore1Type("d_out")
  
@@ -582,11 +685,20 @@ class DCache(Elaboratable):
  
          self.stall_out = Signal()
  
  
          self.stall_out = Signal()
  
-        self.wb_out    = WBMasterOut()
-        self.wb_in     = WBSlaveOut()
+        # standard naming (wired to non-standard for compatibility)
+        self.bus = Interface(addr_width=32,
+                            data_width=64,
+                            granularity=8,
+                            features={'stall'},
+                            alignment=0,
+                            name="dcache")
  
          self.log_out   = Signal(20)
  
  
          self.log_out   = Signal(20)
  
+        # test if microwatt compatibility is to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+
      def stage_0(self, m, r0, r1, r0_full):
          """Latch the request in r0.req as long as we're not stalling
          """
      def stage_0(self, m, r0, r1, r0_full):
          """Latch the request in r0.req as long as we're not stalling
          """
@@ -602,7 +714,7 @@ class DCache(Elaboratable):
  
          with m.If(m_in.valid):
              comb += r.req.valid.eq(1)
  
          with m.If(m_in.valid):
              comb += r.req.valid.eq(1)
-            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
+            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
              comb += r.req.dcbz.eq(0)
              comb += r.req.nc.eq(0)
              comb += r.req.reserve.eq(0)
              comb += r.req.dcbz.eq(0)
              comb += r.req.nc.eq(0)
              comb += r.req.reserve.eq(0)
@@ -615,19 +727,34 @@ class DCache(Elaboratable):
              comb += r.doall.eq(m_in.doall)
              comb += r.tlbld.eq(m_in.tlbld)
              comb += r.mmu_req.eq(1)
              comb += r.doall.eq(m_in.doall)
              comb += r.tlbld.eq(m_in.tlbld)
              comb += r.mmu_req.eq(1)
+            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
+                                 m_in.addr, m_in.pte, r.req.load)
+
          with m.Else():
              comb += r.req.eq(d_in)
          with m.Else():
              comb += r.req.eq(d_in)
+            comb += r.req.data.eq(0)
              comb += r.tlbie.eq(0)
              comb += r.doall.eq(0)
              comb += r.tlbld.eq(0)
              comb += r.mmu_req.eq(0)
              comb += r.tlbie.eq(0)
              comb += r.doall.eq(0)
              comb += r.tlbld.eq(0)
              comb += r.mmu_req.eq(0)
-        with m.If(~(r1.full & r0_full)):
+
+        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
              sync += r0.eq(r)
              sync += r0_full.eq(r.req.valid)
              sync += r0.eq(r)
              sync += r0_full.eq(r.req.valid)
-
-    def tlb_read(self, m, r0_stall, tlb_valid_way,
-                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
-                 dtlb_tags, dtlb_ptes):
+            # Sample data the cycle after a request comes in from loadstore1.
+            # If another request has come in already then the data will get
+            # put directly into req.data below.
+            with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
+                     ~r0.mmu_req):
+                sync += r0.req.data.eq(d_in.data)
+                sync += r0.d_valid.eq(1)
+        with m.If(d_in.valid):
+            m.d.sync += Display("    DCACHE req cache "
+                                "virt %d addr %x data %x ld %d",
+                                 r.req.virt_mode, r.req.addr,
+                                 r.req.data, r.req.load)
+
+    def tlb_read(self, m, r0_stall, tlb_way):
          """TLB
          Operates in the second cycle on the request latched in r0.req.
          TLB updates write the entry at the end of the second cycle.
          """TLB
          Operates in the second cycle on the request latched in r0.req.
          TLB updates write the entry at the end of the second cycle.
@@ -636,7 +763,6 @@ class DCache(Elaboratable):
          sync = m.d.sync
          m_in, d_in = self.m_in, self.d_in
  
          sync = m.d.sync
          m_in, d_in = self.m_in, self.d_in
  
-        index    = Signal(TLB_SET_BITS)
          addrbits = Signal(TLB_SET_BITS)
  
          amin = TLB_LG_PGSZ
          addrbits = Signal(TLB_SET_BITS)
  
          amin = TLB_LG_PGSZ
@@ -646,16 +772,15 @@ class DCache(Elaboratable):
              comb += addrbits.eq(m_in.addr[amin : amax])
          with m.Else():
              comb += addrbits.eq(d_in.addr[amin : amax])
              comb += addrbits.eq(m_in.addr[amin : amax])
          with m.Else():
              comb += addrbits.eq(d_in.addr[amin : amax])
-        comb += index.eq(addrbits)
  
          # If we have any op and the previous op isn't finished,
          # then keep the same output for next cycle.
  
          # If we have any op and the previous op isn't finished,
          # then keep the same output for next cycle.
-        with m.If(~r0_stall):
-            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
-            sync += tlb_tag_way.eq(dtlb_tags[index])
-            sync += tlb_pte_way.eq(dtlb_ptes[index])
+        d = self.dtlb_update
+        comb += d.tlb_read_index.eq(addrbits)
+        comb += d.tlb_read.eq(~r0_stall)
+        comb += tlb_way.eq(d.tlb_way)
  
  
-    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
+    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
          """Generate TLB PLRUs
          """
          comb = m.d.comb
          """Generate TLB PLRUs
          """
          comb = m.d.comb
@@ -663,20 +788,19 @@ class DCache(Elaboratable):
  
          if TLB_NUM_WAYS == 0:
              return
  
          if TLB_NUM_WAYS == 0:
              return
-        for i in range(TLB_SET_SIZE):
-            # TLB PLRU interface
-            tlb_plru        = PLRU(TLB_WAY_BITS)
-            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
-            tlb_plru_acc_en = Signal()
  
  
-            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
-            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
-            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
-            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
+        # suite of PLRUs with a selection and output mechanism
+        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
+        m.submodules.tlb_plrus = tlb_plrus
+        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
+        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
+        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
+        comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
+        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
  
      def tlb_search(self, m, tlb_req_index, r0, r0_valid,
  
      def tlb_search(self, m, tlb_req_index, r0, r0_valid,
-                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
-                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
+                   tlb_way,
+                   pte, tlb_hit, valid_ra, perm_attr, ra):
  
          comb = m.d.comb
  
  
          comb = m.d.comb
  
@@ -689,19 +813,20 @@ class DCache(Elaboratable):
          comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
  
          for i in range(TLB_NUM_WAYS):
          comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
  
          for i in range(TLB_NUM_WAYS):
-            is_tag_hit = Signal()
-            comb += is_tag_hit.eq(tlb_valid_way[i]
-                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
+            is_tag_hit = Signal(name="is_tag_hit%d" % i)
+            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
+            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
+            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
              with m.If(is_tag_hit):
                  comb += hitway.eq(i)
                  comb += hit.eq(1)
  
              with m.If(is_tag_hit):
                  comb += hitway.eq(i)
                  comb += hit.eq(1)
  
-        comb += tlb_hit.eq(hit & r0_valid)
-        comb += tlb_hit_way.eq(hitway)
+        comb += tlb_hit.valid.eq(hit & r0_valid)
+        comb += tlb_hit.way.eq(hitway)
  
  
-        with m.If(tlb_hit):
-            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
-        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
+        with m.If(tlb_hit.valid):
+            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
+        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
  
          with m.If(r0.req.virt_mode):
              comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
  
          with m.If(r0.req.virt_mode):
              comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
@@ -723,11 +848,18 @@ class DCache(Elaboratable):
              comb += perm_attr.rd_perm.eq(1)
              comb += perm_attr.wr_perm.eq(1)
  
              comb += perm_attr.rd_perm.eq(1)
              comb += perm_attr.wr_perm.eq(1)
  
-    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
-                    tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
-                    dtlb_tags, tlb_pte_way, dtlb_ptes):
+        with m.If(valid_ra):
+            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
+                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
+            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
+            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
+            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
+            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
+            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
+            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
  
  
-        dtlb_valids = TLBValidBitsArray()
+    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
+                    tlb_hit, tlb_plru_victim):
  
          comb = m.d.comb
          sync = m.d.sync
  
          comb = m.d.comb
          sync = m.d.sync
@@ -738,32 +870,18 @@ class DCache(Elaboratable):
          comb += tlbie.eq(r0_valid & r0.tlbie)
          comb += tlbwe.eq(r0_valid & r0.tlbld)
  
          comb += tlbie.eq(r0_valid & r0.tlbie)
          comb += tlbwe.eq(r0_valid & r0.tlbld)
  
-        m.submodules.tlb_update = d = DTLBUpdate()
-        with m.If(tlbie & r0.doall):
-            # clear all valid bits at once
-            for i in range(TLB_SET_SIZE):
-                sync += dtlb_valid_bits[i].eq(0)
-        with m.If(d.updated):
-            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
-            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
-        with m.If(d.v_updated):
-            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
-
-        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
+        d = self.dtlb_update
  
          comb += d.tlbie.eq(tlbie)
          comb += d.tlbwe.eq(tlbwe)
          comb += d.doall.eq(r0.doall)
          comb += d.tlb_hit.eq(tlb_hit)
  
          comb += d.tlbie.eq(tlbie)
          comb += d.tlbwe.eq(tlbwe)
          comb += d.doall.eq(r0.doall)
          comb += d.tlb_hit.eq(tlb_hit)
-        comb += d.tlb_hit_way.eq(tlb_hit_way)
-        comb += d.tlb_tag_way.eq(tlb_tag_way)
-        comb += d.tlb_pte_way.eq(tlb_pte_way)
          comb += d.tlb_req_index.eq(tlb_req_index)
  
          comb += d.tlb_req_index.eq(tlb_req_index)
  
-        with m.If(tlb_hit):
-            comb += d.repl_way.eq(tlb_hit_way)
+        with m.If(tlb_hit.valid):
+            comb += d.repl_way.eq(tlb_hit.way)
          with m.Else():
          with m.Else():
-            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
+            comb += d.repl_way.eq(tlb_plru_victim)
          comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
          comb += d.pte_data.eq(r0.req.data)
  
          comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
          comb += d.pte_data.eq(r0.req.data)
  
@@ -776,16 +894,13 @@ class DCache(Elaboratable):
          if TLB_NUM_WAYS == 0:
              return
  
          if TLB_NUM_WAYS == 0:
              return
  
-        for i in range(NUM_LINES):
-            # PLRU interface
-            plru        = PLRU(WAY_BITS)
-            setattr(m.submodules, "plru%d" % i, plru)
-            plru_acc_en = Signal()
-
-            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
-            comb += plru.acc_en.eq(plru_acc_en)
-            comb += plru.acc_i.eq(r1.hit_way)
-            comb += plru_victim[i].eq(plru.lru_o)
+        # suite of PLRUs with a selection and output mechanism
+        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
+        comb += plrus.way.eq(r1.hit_way)
+        comb += plrus.valid.eq(r1.cache_hit)
+        comb += plrus.index.eq(r1.hit_index)
+        comb += plrus.isel.eq(r1.store_index) # select victim
+        comb += plru_victim.eq(plrus.o_index) # selected victim
  
      def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
          """Cache tag RAM read port
  
      def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
          """Cache tag RAM read port
@@ -802,21 +917,19 @@ class DCache(Elaboratable):
              comb += index.eq(get_index(m_in.addr))
          with m.Else():
              comb += index.eq(get_index(d_in.addr))
              comb += index.eq(get_index(m_in.addr))
          with m.Else():
              comb += index.eq(get_index(d_in.addr))
-        sync += cache_tag_set.eq(cache_tags[index])
+        sync += cache_tag_set.eq(cache_tags[index].tag)
  
      def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
  
      def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
-                       r0_valid, r1, cache_valids, replace_way,
+                       r0_valid, r1, cache_tags, replace_way,
                         use_forward1_next, use_forward2_next,
                         req_hit_way, plru_victim, rc_ok, perm_attr,
                         valid_ra, perm_ok, access_ok, req_op, req_go,
                         use_forward1_next, use_forward2_next,
                         req_hit_way, plru_victim, rc_ok, perm_attr,
                         valid_ra, perm_ok, access_ok, req_op, req_go,
-                       tlb_pte_way,
-                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+                       tlb_hit, tlb_way, cache_tag_set,
                         cancel_store, req_same_tag, r0_stall, early_req_row):
          """Cache request parsing and hit detection
          """
  
          comb = m.d.comb
                         cancel_store, req_same_tag, r0_stall, early_req_row):
          """Cache request parsing and hit detection
          """
  
          comb = m.d.comb
-        sync = m.d.sync
          m_in, d_in = self.m_in, self.d_in
  
          is_hit      = Signal()
          m_in, d_in = self.m_in, self.d_in
  
          is_hit      = Signal()
@@ -825,9 +938,7 @@ class DCache(Elaboratable):
          opsel       = Signal(3)
          go          = Signal()
          nc          = Signal()
          opsel       = Signal(3)
          go          = Signal()
          nc          = Signal()
-        hit_set     = Array(Signal(name="hit_set_%d" % i) \
-                                  for i in range(TLB_NUM_WAYS))
-        cache_valid_idx = Signal(NUM_WAYS)
+        cache_i_validdx = Signal(NUM_WAYS)
  
          # Extract line, row and tag from request
          comb += req_index.eq(get_index(r0.req.addr))
  
          # Extract line, row and tag from request
          comb += req_index.eq(get_index(r0.req.addr))
@@ -839,19 +950,17 @@ class DCache(Elaboratable):
                      r0.req.addr, ra, req_index, req_tag, req_row)
  
          comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
                      r0.req.addr, ra, req_index, req_tag, req_row)
  
          comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
-        comb += cache_valid_idx.eq(cache_valids[req_index])
-
-        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
-                                tlb_valid_way, tlb_hit_way,
-                                cache_valid_idx, cache_tag_set,
-                                r0.req.addr,
-                                hit_set)
+        comb += cache_i_validdx.eq(cache_tags[req_index].valid)
  
  
+        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
+                                            cache_i_validdx, cache_tag_set,
+                                            r0.req.addr)
          comb += dc.tlb_hit.eq(tlb_hit)
          comb += dc.reload_tag.eq(r1.reload_tag)
          comb += dc.virt_mode.eq(r0.req.virt_mode)
          comb += dc.go.eq(go)
          comb += dc.req_index.eq(req_index)
          comb += dc.tlb_hit.eq(tlb_hit)
          comb += dc.reload_tag.eq(r1.reload_tag)
          comb += dc.virt_mode.eq(r0.req.virt_mode)
          comb += dc.go.eq(go)
          comb += dc.req_index.eq(req_index)
+
          comb += is_hit.eq(dc.is_hit)
          comb += hit_way.eq(dc.hit_way)
          comb += req_same_tag.eq(dc.rel_match)
          comb += is_hit.eq(dc.is_hit)
          comb += hit_way.eq(dc.hit_way)
          comb += req_same_tag.eq(dc.rel_match)
@@ -888,7 +997,7 @@ class DCache(Elaboratable):
  
          # The way to replace on a miss
          with m.If(r1.write_tag):
  
          # The way to replace on a miss
          with m.If(r1.write_tag):
-            comb += replace_way.eq(plru_victim[r1.store_index])
+            comb += replace_way.eq(plru_victim)
          with m.Else():
              comb += replace_way.eq(r1.store_way)
  
          with m.Else():
              comb += replace_way.eq(r1.store_way)
  
@@ -900,16 +1009,22 @@ class DCache(Elaboratable):
                             (perm_attr.wr_perm |
                                (r0.req.load & perm_attr.rd_perm)))
          comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
                             (perm_attr.wr_perm |
                                (r0.req.load & perm_attr.rd_perm)))
          comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
+
          # Combine the request and cache hit status to decide what
          # operation needs to be done
          comb += nc.eq(r0.req.nc | perm_attr.nocache)
          comb += op.eq(Op.OP_NONE)
          with m.If(go):
              with m.If(~access_ok):
          # Combine the request and cache hit status to decide what
          # operation needs to be done
          comb += nc.eq(r0.req.nc | perm_attr.nocache)
          comb += op.eq(Op.OP_NONE)
          with m.If(go):
              with m.If(~access_ok):
+                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
+                                 valid_ra, perm_ok, rc_ok)
                  comb += op.eq(Op.OP_BAD)
              with m.Elif(cancel_store):
                  comb += op.eq(Op.OP_BAD)
              with m.Elif(cancel_store):
+                m.d.sync += Display("DCACHE cancel store")
                  comb += op.eq(Op.OP_STCX_FAIL)
              with m.Else():
                  comb += op.eq(Op.OP_STCX_FAIL)
              with m.Else():
+                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
+                                 valid_ra, nc, r0.req.load)
                  comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                  with m.Switch(opsel):
                      with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                  comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                  with m.Switch(opsel):
                      with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
@@ -945,16 +1060,15 @@ class DCache(Elaboratable):
              # XXX generate alignment interrupt if address
              # is not aligned XXX or if r0.req.nc = '1'
              with m.If(r0.req.load):
              # XXX generate alignment interrupt if address
              # is not aligned XXX or if r0.req.nc = '1'
              with m.If(r0.req.load):
-                comb += set_rsrv.eq(1) # load with reservation
+                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
              with m.Else():
              with m.Else():
-                comb += clear_rsrv.eq(1) # store conditional
+                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                  with m.If((~reservation.valid) |
                           (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                      comb += cancel_store.eq(1)
  
      def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                          reservation, r0):
                  with m.If((~reservation.valid) |
                           (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                      comb += cancel_store.eq(1)
  
      def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                          reservation, r0):
-
          comb = m.d.comb
          sync = m.d.sync
  
          comb = m.d.comb
          sync = m.d.sync
  
@@ -991,6 +1105,7 @@ class DCache(Elaboratable):
                  dsel = data_fwd.word_select(i, 8)
                  comb += data_out.word_select(i, 8).eq(dsel)
  
                  dsel = data_fwd.word_select(i, 8)
                  comb += data_out.word_select(i, 8).eq(dsel)
  
+        # DCache output to LoadStore
          comb += d_out.valid.eq(r1.ls_valid)
          comb += d_out.data.eq(data_out)
          comb += d_out.store_done.eq(~r1.stcx_fail)
          comb += d_out.valid.eq(r1.ls_valid)
          comb += d_out.data.eq(data_out)
          comb += d_out.store_done.eq(~r1.stcx_fail)
@@ -1029,12 +1144,15 @@ class DCache(Elaboratable):
  
              # error cases complete without stalling
              with m.If(r1.ls_error):
  
              # error cases complete without stalling
              with m.If(r1.ls_error):
-                sync += Display("completing ld/st with error")
+                with m.If(r1.dcbz):
+                    sync += Display("completing dcbz with error")
+                with m.Else():
+                    sync += Display("completing ld/st with error")
  
              # Slow ops (load miss, NC, stores)
              with m.If(r1.slow_valid):
  
              # Slow ops (load miss, NC, stores)
              with m.If(r1.slow_valid):
-                sync += Display("completing store or load miss data=%x",
-                                data_out)
+                sync += Display("completing store or load miss adr=%x data=%x",
+                                r1.req.real_addr, data_out)
  
          with m.Else():
              # Request came from MMU
  
          with m.Else():
              # Request came from MMU
@@ -1047,8 +1165,8 @@ class DCache(Elaboratable):
  
              # Slow ops (i.e. load miss)
              with m.If(r1.slow_valid):
  
              # Slow ops (i.e. load miss)
              with m.If(r1.slow_valid):
-                sync += Display("completing MMU load miss, data=%x",
-                                m_out.data)
+                sync += Display("completing MMU load miss, adr=%x data=%x",
+                                r1.req.real_addr, m_out.data)
  
      def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
          """rams
  
      def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
          """rams
@@ -1062,62 +1180,80 @@ class DCache(Elaboratable):
          account by using 1-cycle delayed signals for load hits.
          """
          comb = m.d.comb
          account by using 1-cycle delayed signals for load hits.
          """
          comb = m.d.comb
-        wb_in = self.wb_in
+        bus = self.bus
+
+        # Binary-to-Unary one-hot decoders here.  replace-way one-hot is gated
+        # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
+        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
+        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
+                   ~r1.write_bram))
+        comb += rwe.i.eq(replace_way)
+
+        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
+        comb += hwe.i.eq(r1.hit_way)
+
+        # this one is gated with write_bram, and replace_way_e can never be
+        # set at the same time.  that means that do_write can OR the outputs
+        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
+        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
+        comb += hre.i.eq(r1.req.hit_way)
+
+        # common Signals
+        do_read  = Signal()
+        wr_addr  = Signal(ROW_BITS)
+        wr_data  = Signal(WB_DATA_BITS)
+        wr_sel   = Signal(ROW_SIZE)
+        rd_addr  = Signal(ROW_BITS)
+
+        comb += do_read.eq(1) # always enable
+        comb += rd_addr.eq(early_req_row)
+
+        # Write mux:
+        #
+        # Defaults to wishbone read responses (cache refill)
+        #
+        # For timing, the mux on wr_data/sel/addr is not
+        # dependent on anything other than the current state.
+
+        with m.If(r1.write_bram):
+            # Write store data to BRAM.  This happens one
+            # cycle after the store is in r0.
+            comb += wr_data.eq(r1.req.data)
+            comb += wr_sel.eq(r1.req.byte_sel)
+            comb += wr_addr.eq(get_row(r1.req.real_addr))
  
  
+        with m.Else():
+            # Otherwise, we might be doing a reload or a DCBZ
+            with m.If(r1.dcbz):
+                comb += wr_data.eq(0)
+            with m.Else():
+                comb += wr_data.eq(bus.dat_r)
+            comb += wr_addr.eq(r1.store_row)
+            comb += wr_sel.eq(~0) # all 1s
+
+        # set up Cache Rams
          for i in range(NUM_WAYS):
          for i in range(NUM_WAYS):
-            do_read  = Signal(name="do_rd%d" % i)
-            rd_addr  = Signal(ROW_BITS)
              do_write = Signal(name="do_wr%d" % i)
              do_write = Signal(name="do_wr%d" % i)
-            wr_addr  = Signal(ROW_BITS)
-            wr_data  = Signal(WB_DATA_BITS)
-            wr_sel   = Signal(ROW_SIZE)
-            wr_sel_m = Signal(ROW_SIZE)
-            _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
+            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
+            d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
  
  
-            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
+            m.submodules["cacheram_%d" % i] = way
  
              comb += way.rd_en.eq(do_read)
              comb += way.rd_addr.eq(rd_addr)
  
              comb += way.rd_en.eq(do_read)
              comb += way.rd_addr.eq(rd_addr)
-            comb += _d_out.eq(way.rd_data_o)
+            comb += d_out.eq(way.rd_data_o)
              comb += way.wr_sel.eq(wr_sel_m)
              comb += way.wr_addr.eq(wr_addr)
              comb += way.wr_data.eq(wr_data)
  
              # Cache hit reads
              comb += way.wr_sel.eq(wr_sel_m)
              comb += way.wr_addr.eq(wr_addr)
              comb += way.wr_data.eq(wr_data)
  
              # Cache hit reads
-            comb += do_read.eq(1)
-            comb += rd_addr.eq(early_req_row)
-            with m.If(r1.hit_way == i):
-                comb += cache_out_row.eq(_d_out)
-
-            # Write mux:
-            #
-            # Defaults to wishbone read responses (cache refill)
-            #
-            # For timing, the mux on wr_data/sel/addr is not
-            # dependent on anything other than the current state.
-
-            with m.If(r1.write_bram):
-                # Write store data to BRAM.  This happens one
-                # cycle after the store is in r0.
-                comb += wr_data.eq(r1.req.data)
-                comb += wr_sel.eq(r1.req.byte_sel)
-                comb += wr_addr.eq(get_row(r1.req.real_addr))
-
-                with m.If(i == r1.req.hit_way):
-                    comb += do_write.eq(1)
-            with m.Else():
-                # Otherwise, we might be doing a reload or a DCBZ
-                with m.If(r1.dcbz):
-                    comb += wr_data.eq(0)
-                with m.Else():
-                    comb += wr_data.eq(wb_in.dat)
-                comb += wr_addr.eq(r1.store_row)
-                comb += wr_sel.eq(~0) # all 1s
+            with m.If(hwe.o[i]):
+                comb += cache_out_row.eq(d_out)
  
  
-            with m.If((r1.state == State.RELOAD_WAIT_ACK)
-                      & wb_in.ack & (replace_way == i)):
-                comb += do_write.eq(1)
+            # these are mutually-exclusive via their Decoder-enablers
+            # (note: Decoder-enable is inverted)
+            comb += do_write.eq(hre.o[i] | rwe.o[i])
  
              # Mask write selects with do_write since BRAM
              # doesn't have a global write-enable
  
              # Mask write selects with do_write since BRAM
              # doesn't have a global write-enable
@@ -1129,8 +1265,7 @@ class DCache(Elaboratable):
      # It also handles error cases (TLB miss, cache paradox)
      def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                          req_hit_way, req_index, req_tag, access_ok,
      # It also handles error cases (TLB miss, cache paradox)
      def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                          req_hit_way, req_index, req_tag, access_ok,
-                        tlb_hit, tlb_hit_way, tlb_req_index):
-
+                        tlb_hit, tlb_req_index):
          comb = m.d.comb
          sync = m.d.sync
  
          comb = m.d.comb
          sync = m.d.sync
  
@@ -1147,36 +1282,26 @@ class DCache(Elaboratable):
          sync += r1.hit_way.eq(req_hit_way)
          sync += r1.hit_index.eq(req_index)
  
          sync += r1.hit_way.eq(req_hit_way)
          sync += r1.hit_index.eq(req_index)
  
-        with m.If(req_op == Op.OP_LOAD_HIT):
-            sync += r1.hit_load_valid.eq(1)
-        with m.Else():
-            sync += r1.hit_load_valid.eq(0)
-
-        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
-            sync += r1.cache_hit.eq(1)
-        with m.Else():
-            sync += r1.cache_hit.eq(0)
+        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
+        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
+                                (req_op == Op.OP_STORE_HIT))
  
          with m.If(req_op == Op.OP_BAD):
  
          with m.If(req_op == Op.OP_BAD):
-            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
-            #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
+            sync += Display("Signalling ld/st error "
+                            "ls_error=%i mmu_error=%i cache_paradox=%i",
+                            ~r0.mmu_req,r0.mmu_req,access_ok)
              sync += r1.ls_error.eq(~r0.mmu_req)
              sync += r1.mmu_error.eq(r0.mmu_req)
              sync += r1.cache_paradox.eq(access_ok)
              sync += r1.ls_error.eq(~r0.mmu_req)
              sync += r1.mmu_error.eq(r0.mmu_req)
              sync += r1.cache_paradox.eq(access_ok)
-
-            with m.Else():
-                sync += r1.ls_error.eq(0)
-                sync += r1.mmu_error.eq(0)
-                sync += r1.cache_paradox.eq(0)
-
-        with m.If(req_op == Op.OP_STCX_FAIL):
-            sync += r1.stcx_fail.eq(1)
          with m.Else():
          with m.Else():
-            sync += r1.stcx_fail.eq(0)
+            sync += r1.ls_error.eq(0)
+            sync += r1.mmu_error.eq(0)
+            sync += r1.cache_paradox.eq(0)
+
+        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
  
          # Record TLB hit information for updating TLB PLRU
          sync += r1.tlb_hit.eq(tlb_hit)
  
          # Record TLB hit information for updating TLB PLRU
          sync += r1.tlb_hit.eq(tlb_hit)
-        sync += r1.tlb_hit_way.eq(tlb_hit_way)
          sync += r1.tlb_hit_index.eq(tlb_req_index)
  
      # Memory accesses are handled by this state machine:
          sync += r1.tlb_hit_index.eq(tlb_req_index)
  
      # Memory accesses are handled by this state machine:
@@ -1188,17 +1313,16 @@ class DCache(Elaboratable):
      # All wishbone requests generation is done here.
      # This machine operates at stage 1.
      def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
      # All wishbone requests generation is done here.
      # This machine operates at stage 1.
      def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
-                    cache_valids, r0, replace_way,
+                    r0, replace_way,
                      req_hit_way, req_same_tag,
                      r0_valid, req_op, cache_tags, req_go, ra):
  
          comb = m.d.comb
          sync = m.d.sync
                      req_hit_way, req_same_tag,
                      r0_valid, req_op, cache_tags, req_go, ra):
  
          comb = m.d.comb
          sync = m.d.sync
-        wb_in = self.wb_in
+        bus = self.bus
+        d_in = self.d_in
  
          req         = MemAccessRequest("mreq_ds")
  
          req         = MemAccessRequest("mreq_ds")
-        acks        = Signal(3)
-        adjust_acks = Signal(3)
  
          req_row = Signal(ROW_BITS)
          req_idx = Signal(INDEX_BITS)
  
          req_row = Signal(ROW_BITS)
          req_idx = Signal(INDEX_BITS)
@@ -1226,7 +1350,7 @@ class DCache(Elaboratable):
              with m.If(r1.dcbz):
                  sync += r1.forward_data1.eq(0)
              with m.Else():
              with m.If(r1.dcbz):
                  sync += r1.forward_data1.eq(0)
              with m.Else():
-                sync += r1.forward_data1.eq(wb_in.dat)
+                sync += r1.forward_data1.eq(bus.dat_r)
              sync += r1.forward_sel1.eq(~0) # all 1s
              sync += r1.forward_way1.eq(replace_way)
              sync += r1.forward_row1.eq(r1.store_row)
              sync += r1.forward_sel1.eq(~0) # all 1s
              sync += r1.forward_way1.eq(replace_way)
              sync += r1.forward_row1.eq(r1.store_row)
@@ -1243,19 +1367,21 @@ class DCache(Elaboratable):
          sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
  
          with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
          sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
  
          with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
-            with m.If(~r0.mmu_req):
-                sync += r1.ls_valid.eq(1)
-            with m.Else():
+            with m.If(r0.mmu_req):
                  sync += r1.mmu_done.eq(1)
                  sync += r1.mmu_done.eq(1)
+            with m.Else():
+                sync += r1.ls_valid.eq(1)
  
          with m.If(r1.write_tag):
              # Store new tag in selected way
  
          with m.If(r1.write_tag):
              # Store new tag in selected way
+            replace_way_onehot = Signal(NUM_WAYS)
+            comb += replace_way_onehot.eq(1<<replace_way)
              for i in range(NUM_WAYS):
              for i in range(NUM_WAYS):
-                with m.If(i == replace_way):
+                with m.If(replace_way_onehot[i]):
                      ct = Signal(TAG_RAM_WIDTH)
                      ct = Signal(TAG_RAM_WIDTH)
-                    comb += ct.eq(cache_tags[r1.store_index])
+                    comb += ct.eq(cache_tags[r1.store_index].tag)
                      comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                      comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
-                    sync += cache_tags[r1.store_index].eq(ct)
+                    sync += cache_tags[r1.store_index].tag.eq(ct)
              sync += r1.store_way.eq(replace_way)
              sync += r1.write_tag.eq(0)
  
              sync += r1.store_way.eq(replace_way)
              sync += r1.write_tag.eq(0)
  
@@ -1270,10 +1396,13 @@ class DCache(Elaboratable):
              comb += req.dcbz.eq(r0.req.dcbz)
              comb += req.real_addr.eq(ra)
  
              comb += req.dcbz.eq(r0.req.dcbz)
              comb += req.real_addr.eq(ra)
  
-            with m.If(~r0.req.dcbz):
+            with m.If(r0.req.dcbz):
+                # force data to 0 for dcbz
+                comb += req.data.eq(0)
+            with m.Elif(r0.d_valid):
                  comb += req.data.eq(r0.req.data)
              with m.Else():
                  comb += req.data.eq(r0.req.data)
              with m.Else():
-                comb += req.data.eq(0)
+                comb += req.data.eq(d_in.data)
  
              # Select all bytes for dcbz
              # and for cacheable loads
  
              # Select all bytes for dcbz
              # and for cacheable loads
@@ -1298,7 +1427,7 @@ class DCache(Elaboratable):
          with m.Switch(r1.state):
  
              with m.Case(State.IDLE):
          with m.Switch(r1.state):
  
              with m.Case(State.IDLE):
-                sync += r1.real_adr.eq(req.real_addr)
+                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
                  sync += r1.wb.sel.eq(req.byte_sel)
                  sync += r1.wb.dat.eq(req.data)
                  sync += r1.dcbz.eq(req.dcbz)
                  sync += r1.wb.sel.eq(req.byte_sel)
                  sync += r1.wb.dat.eq(req.data)
                  sync += r1.dcbz.eq(req.dcbz)
@@ -1354,10 +1483,10 @@ class DCache(Elaboratable):
                              sync += r1.full.eq(0)
                              sync += r1.slow_valid.eq(1)
  
                              sync += r1.full.eq(0)
                              sync += r1.slow_valid.eq(1)
  
-                            with m.If(~req.mmu_req):
-                                sync += r1.ls_valid.eq(1)
-                            with m.Else():
+                            with m.If(req.mmu_req):
                                  sync += r1.mmu_done.eq(1)
                                  sync += r1.mmu_done.eq(1)
+                            with m.Else():
+                                sync += r1.ls_valid.eq(1)
  
                              with m.If(req.op == Op.OP_STORE_HIT):
                                  sync += r1.write_bram.eq(1)
  
                              with m.If(req.op == Op.OP_STORE_HIT):
                                  sync += r1.write_bram.eq(1)
@@ -1388,22 +1517,25 @@ class DCache(Elaboratable):
                  # Requests are all sent if stb is 0
                  comb += ld_stbs_done.eq(~r1.wb.stb)
  
                  # Requests are all sent if stb is 0
                  comb += ld_stbs_done.eq(~r1.wb.stb)
  
-                with m.If((~wb_in.stall) & r1.wb.stb):
+                # If we are still sending requests, was one accepted?
+                with m.If((~bus.stall) & r1.wb.stb):
                      # That was the last word?  We are done sending.
                      # Clear stb and set ld_stbs_done so we can handle an
                      # eventual last ack on the same cycle.
                      # That was the last word?  We are done sending.
                      # Clear stb and set ld_stbs_done so we can handle an
                      # eventual last ack on the same cycle.
-                    with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
+                    # sigh - reconstruct wb adr with 3 extra 0s at front
+                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
+                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                          sync += r1.wb.stb.eq(0)
                          comb += ld_stbs_done.eq(1)
  
                      # Calculate the next row address in the current cache line
                      row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                          sync += r1.wb.stb.eq(0)
                          comb += ld_stbs_done.eq(1)
  
                      # Calculate the next row address in the current cache line
                      row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
-                    comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
-                    sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
+                    comb += row.eq(r1.wb.adr)
+                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
  
                  # Incoming acks processing
  
                  # Incoming acks processing
-                sync += r1.forward_valid1.eq(wb_in.ack)
-                with m.If(wb_in.ack):
+                sync += r1.forward_valid1.eq(bus.ack)
+                with m.If(bus.ack):
                      srow = Signal(ROW_LINE_BITS)
                      comb += srow.eq(r1.store_row)
                      sync += r1.rows_valid[srow].eq(1)
                      srow = Signal(ROW_LINE_BITS)
                      comb += srow.eq(r1.store_row)
                      sync += r1.rows_valid[srow].eq(1)
@@ -1413,16 +1545,16 @@ class DCache(Elaboratable):
                      # Compare the whole address in case the
                      # request in r1.req is not the one that
                      # started this refill.
                      # Compare the whole address in case the
                      # request in r1.req is not the one that
                      # started this refill.
-                    with m.If(r1.full & r1.req.same_tag &
+                    with m.If(req.valid & r1.req.same_tag &
                                ((r1.dcbz & r1.req.dcbz) |
                                ((r1.dcbz & r1.req.dcbz) |
-                               ((~r1.dcbz) & (r1.req.op == Op.OP_LOAD_MISS))) &
-                                (r1.store_row == get_row(r1.req.real_addr))):
+                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
+                                (r1.store_row == get_row(req.real_addr))):
                          sync += r1.full.eq(0)
                          sync += r1.slow_valid.eq(1)
                          sync += r1.full.eq(0)
                          sync += r1.slow_valid.eq(1)
-                        with m.If(~r1.mmu_req):
-                            sync += r1.ls_valid.eq(1)
-                        with m.Else():
+                        with m.If(r1.mmu_req):
                              sync += r1.mmu_done.eq(1)
                              sync += r1.mmu_done.eq(1)
+                        with m.Else():
+                            sync += r1.ls_valid.eq(1)
                          sync += r1.forward_sel.eq(~0) # all 1s
                          sync += r1.use_forward1.eq(1)
  
                          sync += r1.forward_sel.eq(~0) # all 1s
                          sync += r1.use_forward1.eq(1)
  
@@ -1434,41 +1566,45 @@ class DCache(Elaboratable):
  
                          # Cache line is now valid
                          cv = Signal(INDEX_BITS)
  
                          # Cache line is now valid
                          cv = Signal(INDEX_BITS)
-                        comb += cv.eq(cache_valids[r1.store_index])
+                        comb += cv.eq(cache_tags[r1.store_index].valid)
                          comb += cv.bit_select(r1.store_way, 1).eq(1)
                          comb += cv.bit_select(r1.store_way, 1).eq(1)
-                        sync += cache_valids[r1.store_index].eq(cv)
+                        sync += cache_tags[r1.store_index].valid.eq(cv)
  
                          sync += r1.state.eq(State.IDLE)
  
                          sync += r1.state.eq(State.IDLE)
+                        sync += Display("cache valid set %x "
+                                        "idx %d way %d",
+                                         cv, r1.store_index, r1.store_way)
  
                      # Increment store row counter
                      sync += r1.store_row.eq(next_row(r1.store_row))
  
              with m.Case(State.STORE_WAIT_ACK):
                  st_stbs_done = Signal()
  
                      # Increment store row counter
                      sync += r1.store_row.eq(next_row(r1.store_row))
  
              with m.Case(State.STORE_WAIT_ACK):
                  st_stbs_done = Signal()
+                adjust_acks = Signal(3)
+
                  comb += st_stbs_done.eq(~r1.wb.stb)
                  comb += st_stbs_done.eq(~r1.wb.stb)
-                comb += acks.eq(r1.acks_pending)
  
                  with m.If(r1.inc_acks != r1.dec_acks):
                      with m.If(r1.inc_acks):
  
                  with m.If(r1.inc_acks != r1.dec_acks):
                      with m.If(r1.inc_acks):
-                        comb += adjust_acks.eq(acks + 1)
+                        comb += adjust_acks.eq(r1.acks_pending + 1)
                      with m.Else():
                      with m.Else():
-                        comb += adjust_acks.eq(acks - 1)
+                        comb += adjust_acks.eq(r1.acks_pending - 1)
                  with m.Else():
                  with m.Else():
-                    comb += adjust_acks.eq(acks)
+                    comb += adjust_acks.eq(r1.acks_pending)
  
                  sync += r1.acks_pending.eq(adjust_acks)
  
                  # Clear stb when slave accepted request
  
                  sync += r1.acks_pending.eq(adjust_acks)
  
                  # Clear stb when slave accepted request
-                with m.If(~wb_in.stall):
+                with m.If(~bus.stall):
                      # See if there is another store waiting
                      # to be done which is in the same real page.
                      with m.If(req.valid):
                      # See if there is another store waiting
                      # to be done which is in the same real page.
                      with m.If(req.valid):
-                        ra = req.real_addr[0:SET_SIZE_BITS]
-                        sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
+                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
+                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                          sync += r1.wb.dat.eq(req.data)
                          sync += r1.wb.sel.eq(req.byte_sel)
  
                          sync += r1.wb.dat.eq(req.data)
                          sync += r1.wb.sel.eq(req.byte_sel)
  
-                    with m.Elif((adjust_acks < 7) & req.same_tag &
+                    with m.If((adjust_acks < 7) & req.same_tag &
                                  ((req.op == Op.OP_STORE_MISS)
                                   | (req.op == Op.OP_STORE_HIT))):
                          sync += r1.wb.stb.eq(1)
                                  ((req.op == Op.OP_STORE_MISS)
                                   | (req.op == Op.OP_STORE_HIT))):
                          sync += r1.wb.stb.eq(1)
@@ -1488,7 +1624,9 @@ class DCache(Elaboratable):
                          comb += st_stbs_done.eq(1)
  
                  # Got ack ? See if complete.
                          comb += st_stbs_done.eq(1)
  
                  # Got ack ? See if complete.
-                with m.If(wb_in.ack):
+                sync += Display("got ack %d %d stbs %d adjust_acks %d",
+                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
+                with m.If(bus.ack):
                      with m.If(st_stbs_done & (adjust_acks == 1)):
                          sync += r1.state.eq(State.IDLE)
                          sync += r1.wb.cyc.eq(0)
                      with m.If(st_stbs_done & (adjust_acks == 1)):
                          sync += r1.state.eq(State.IDLE)
                          sync += r1.wb.cyc.eq(0)
@@ -1497,44 +1635,44 @@ class DCache(Elaboratable):
  
              with m.Case(State.NC_LOAD_WAIT_ACK):
                  # Clear stb when slave accepted request
  
              with m.Case(State.NC_LOAD_WAIT_ACK):
                  # Clear stb when slave accepted request
-                with m.If(~wb_in.stall):
+                with m.If(~bus.stall):
                      sync += r1.wb.stb.eq(0)
  
                  # Got ack ? complete.
                      sync += r1.wb.stb.eq(0)
  
                  # Got ack ? complete.
-                with m.If(wb_in.ack):
+                with m.If(bus.ack):
                      sync += r1.state.eq(State.IDLE)
                      sync += r1.full.eq(0)
                      sync += r1.slow_valid.eq(1)
  
                      sync += r1.state.eq(State.IDLE)
                      sync += r1.full.eq(0)
                      sync += r1.slow_valid.eq(1)
  
-                    with m.If(~r1.mmu_req):
-                        sync += r1.ls_valid.eq(1)
-                    with m.Else():
+                    with m.If(r1.mmu_req):
                          sync += r1.mmu_done.eq(1)
                          sync += r1.mmu_done.eq(1)
+                    with m.Else():
+                        sync += r1.ls_valid.eq(1)
  
                      sync += r1.forward_sel.eq(~0) # all 1s
                      sync += r1.use_forward1.eq(1)
                      sync += r1.wb.cyc.eq(0)
                      sync += r1.wb.stb.eq(0)
  
  
                      sync += r1.forward_sel.eq(~0) # all 1s
                      sync += r1.use_forward1.eq(1)
                      sync += r1.wb.cyc.eq(0)
                      sync += r1.wb.stb.eq(0)
  
-    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
+    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
  
          sync = m.d.sync
  
          sync = m.d.sync
-        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
+        d_out, bus, log_out = self.d_out, self.bus, self.log_out
  
  
-        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
+        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                                 stall_out, req_op[:3], d_out.valid, d_out.error,
                                 stall_out, req_op[:3], d_out.valid, d_out.error,
-                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
+                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                                 r1.real_adr[3:6]))
  
      def elaborate(self, platform):
  
          m = Module()
          comb = m.d.comb
                                 r1.real_adr[3:6]))
  
      def elaborate(self, platform):
  
          m = Module()
          comb = m.d.comb
+        d_in = self.d_in
  
          # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
          cache_tags       = CacheTagArray()
          cache_tag_set    = Signal(TAG_RAM_WIDTH)
  
          # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
          cache_tags       = CacheTagArray()
          cache_tag_set    = Signal(TAG_RAM_WIDTH)
-        cache_valids = CacheValidBitsArray()
  
          # TODO attribute ram_style : string;
          # TODO attribute ram_style of cache_tags : signal is "distributed";
  
          # TODO attribute ram_style : string;
          # TODO attribute ram_style of cache_tags : signal is "distributed";
@@ -1542,9 +1680,6 @@ class DCache(Elaboratable):
          """note: these are passed to nmigen.hdl.Memory as "attributes".
             don't know how, just that they are.
          """
          """note: these are passed to nmigen.hdl.Memory as "attributes".
             don't know how, just that they are.
          """
-        dtlb_valid_bits = TLBValidBitsArray()
-        dtlb_tags       = TLBTagsArray()
-        dtlb_ptes       = TLBPtesArray()
          # TODO attribute ram_style of
          #  dtlb_tags : signal is "distributed";
          # TODO attribute ram_style of
          # TODO attribute ram_style of
          #  dtlb_tags : signal is "distributed";
          # TODO attribute ram_style of
@@ -1581,19 +1716,16 @@ class DCache(Elaboratable):
  
          cache_out_row     = Signal(WB_DATA_BITS)
  
  
          cache_out_row     = Signal(WB_DATA_BITS)
  
-        plru_victim       = PLRUOut()
+        plru_victim       = Signal(WAY_BITS)
          replace_way       = Signal(WAY_BITS)
  
          # Wishbone read/write/cache write formatting signals
          bus_sel           = Signal(8)
  
          # TLB signals
          replace_way       = Signal(WAY_BITS)
  
          # Wishbone read/write/cache write formatting signals
          bus_sel           = Signal(8)
  
          # TLB signals
-        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
-        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
-        tlb_valid_way = Signal(TLB_NUM_WAYS)
+        tlb_way       = TLBRecord("tlb_way")
          tlb_req_index = Signal(TLB_SET_BITS)
          tlb_req_index = Signal(TLB_SET_BITS)
-        tlb_hit       = Signal()
-        tlb_hit_way   = Signal(TLB_WAY_BITS)
+        tlb_hit       = TLBHit("tlb_hit")
          pte           = Signal(TLB_PTE_BITS)
          ra            = Signal(REAL_ADDR_BITS)
          valid_ra      = Signal()
          pte           = Signal(TLB_PTE_BITS)
          ra            = Signal(REAL_ADDR_BITS)
          valid_ra      = Signal()
@@ -1602,44 +1734,52 @@ class DCache(Elaboratable):
          perm_ok       = Signal()
          access_ok     = Signal()
  
          perm_ok       = Signal()
          access_ok     = Signal()
  
-        tlb_plru_victim = TLBPLRUOut()
+        tlb_plru_victim = Signal(TLB_WAY_BITS)
  
          # we don't yet handle collisions between loadstore1 requests
          # and MMU requests
          comb += self.m_out.stall.eq(0)
  
          # Hold off the request in r0 when r1 has an uncompleted request
  
          # we don't yet handle collisions between loadstore1 requests
          # and MMU requests
          comb += self.m_out.stall.eq(0)
  
          # Hold off the request in r0 when r1 has an uncompleted request
-        comb += r0_stall.eq(r0_full & r1.full)
-        comb += r0_valid.eq(r0_full & ~r1.full)
+        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
+        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
          comb += self.stall_out.eq(r0_stall)
  
          comb += self.stall_out.eq(r0_stall)
  
+        # deal with litex not doing wishbone pipeline mode
+        # XXX in wrong way.  FIFOs are needed in the SRAM test
+        # so that stb/ack match up. same thing done in icache.py
+        if not self.microwatt_compat:
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+
          # Wire up wishbone request latch out of stage 1
          # Wire up wishbone request latch out of stage 1
-        comb += r1.wb.adr.eq(r1.real_adr)
-        comb += self.wb_out.eq(r1.wb)
-        comb += self.wb_out.adr.eq(r1.wb.adr[3:]) # truncate LSBs
+        comb += self.bus.we.eq(r1.wb.we)
+        comb += self.bus.adr.eq(r1.wb.adr)
+        comb += self.bus.sel.eq(r1.wb.sel)
+        comb += self.bus.stb.eq(r1.wb.stb)
+        comb += self.bus.dat_w.eq(r1.wb.dat)
+        comb += self.bus.cyc.eq(r1.wb.cyc)
+
+        # create submodule TLBUpdate
+        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
  
          # call sub-functions putting everything together, using shared
          # signals established above
          self.stage_0(m, r0, r1, r0_full)
  
          # call sub-functions putting everything together, using shared
          # signals established above
          self.stage_0(m, r0, r1, r0_full)
-        self.tlb_read(m, r0_stall, tlb_valid_way,
-                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
-                      dtlb_tags, dtlb_ptes)
+        self.tlb_read(m, r0_stall, tlb_way)
          self.tlb_search(m, tlb_req_index, r0, r0_valid,
          self.tlb_search(m, tlb_req_index, r0, r0_valid,
-                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
-                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
-        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
-                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
-                        dtlb_tags, tlb_pte_way, dtlb_ptes)
+                        tlb_way,
+                        pte, tlb_hit, valid_ra, perm_attr, ra)
+        self.tlb_update(m, r0_valid, r0, tlb_req_index,
+                        tlb_hit, tlb_plru_victim)
          self.maybe_plrus(m, r1, plru_victim)
          self.maybe_plrus(m, r1, plru_victim)
-        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
+        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
          self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
          self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
          self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
          self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
-                           r0_valid, r1, cache_valids, replace_way,
+                           r0_valid, r1, cache_tags, replace_way,
                             use_forward1_next, use_forward2_next,
                             req_hit_way, plru_victim, rc_ok, perm_attr,
                             valid_ra, perm_ok, access_ok, req_op, req_go,
                             use_forward1_next, use_forward2_next,
                             req_hit_way, plru_victim, rc_ok, perm_attr,
                             valid_ra, perm_ok, access_ok, req_op, req_go,
-                           tlb_pte_way,
-                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
+                           tlb_hit, tlb_way, cache_tag_set,
                             cancel_store, req_same_tag, r0_stall, early_req_row)
          self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                             r0_valid, r0, reservation)
                             cancel_store, req_same_tag, r0_stall, early_req_row)
          self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                             r0_valid, r0, reservation)
@@ -1649,214 +1789,18 @@ class DCache(Elaboratable):
          self.rams(m, r1, early_req_row, cache_out_row, replace_way)
          self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                          req_hit_way, req_index, req_tag, access_ok,
          self.rams(m, r1, early_req_row, cache_out_row, replace_way)
          self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                          req_hit_way, req_index, req_tag, access_ok,
-                        tlb_hit, tlb_hit_way, tlb_req_index)
+                        tlb_hit, tlb_req_index)
          self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
          self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
-                    cache_valids, r0, replace_way,
+                    r0, replace_way,
                      req_hit_way, req_same_tag,
                           r0_valid, req_op, cache_tags, req_go, ra)
                      req_hit_way, req_same_tag,
                           r0_valid, req_op, cache_tags, req_go, ra)
-        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
+        #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
  
          return m
  
  
          return m
  
-def dcache_load(dut, addr, nc=0):
-    yield dut.d_in.load.eq(1)
-    yield dut.d_in.nc.eq(nc)
-    yield dut.d_in.addr.eq(addr)
-    yield dut.d_in.byte_sel.eq(~0)
-    yield dut.d_in.valid.eq(1)
-    yield
-    yield dut.d_in.valid.eq(0)
-    yield dut.d_in.byte_sel.eq(0)
-    while not (yield dut.d_out.valid):
-        yield
-    data = yield dut.d_out.data
-    return data
-
-
-def dcache_store(dut, addr, data, nc=0):
-    yield dut.d_in.load.eq(0)
-    yield dut.d_in.nc.eq(nc)
-    yield dut.d_in.data.eq(data)
-    yield dut.d_in.byte_sel.eq(~0)
-    yield dut.d_in.addr.eq(addr)
-    yield dut.d_in.valid.eq(1)
-    yield
-    yield dut.d_in.valid.eq(0)
-    yield dut.d_in.byte_sel.eq(0)
-    while not (yield dut.d_out.valid):
-        yield
-
-
-def dcache_random_sim(dut, mem):
-
-    # start copy of mem
-    sim_mem = deepcopy(mem)
-    print ("mem len", len(sim_mem))
-
-    # clear stuff
-    yield dut.d_in.valid.eq(0)
-    yield dut.d_in.load.eq(0)
-    yield dut.d_in.priv_mode.eq(1)
-    yield dut.d_in.nc.eq(0)
-    yield dut.d_in.addr.eq(0)
-    yield dut.d_in.data.eq(0)
-    yield dut.m_in.valid.eq(0)
-    yield dut.m_in.addr.eq(0)
-    yield dut.m_in.pte.eq(0)
-    # wait 4 * clk_period
-    yield
-    yield
-    yield
-    yield
-
-    print ()
-
-    #for i in range(1024):
-    #    sim_mem[i] = i
-
-    for i in range(1024):
-        addr = randint(0, 1023)
-        data = randint(0, (1<<64)-1)
-        sim_mem[addr] = data
-        row = addr
-        addr *= 8
-
-        print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))
-
-        yield from dcache_load(dut, addr)
-        yield from dcache_store(dut, addr, data)
-
-        addr = randint(0, 1023)
-        sim_data = sim_mem[addr]
-        row = addr
-        addr *= 8
-
-        print ("    load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
-        data = yield from dcache_load(dut, addr)
-        assert data == sim_data, \
-            "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
-
-    for addr in range(1024):
-        data = yield from dcache_load(dut, addr*8)
-        assert data == sim_mem[addr], \
-            "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
-
-
-def dcache_sim(dut, mem):
-    # clear stuff
-    yield dut.d_in.valid.eq(0)
-    yield dut.d_in.load.eq(0)
-    yield dut.d_in.priv_mode.eq(1)
-    yield dut.d_in.nc.eq(0)
-    yield dut.d_in.addr.eq(0)
-    yield dut.d_in.data.eq(0)
-    yield dut.m_in.valid.eq(0)
-    yield dut.m_in.addr.eq(0)
-    yield dut.m_in.pte.eq(0)
-    # wait 4 * clk_period
-    yield
-    yield
-    yield
-    yield
-
-    # Cacheable read of address 4
-    data = yield from dcache_load(dut, 0x58)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000001700000016, \
-        f"data @%x=%x expected 0x0000001700000016" % (addr, data)
-
-    # Cacheable read of address 20
-    data = yield from dcache_load(dut, 0x20)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000000900000008, \
-        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
-
-    # Cacheable read of address 30
-    data = yield from dcache_load(dut, 0x530)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000014D0000014C, \
-        f"data @%x=%x expected 0000014D0000014C" % (addr, data)
-
-    # 2nd Cacheable read of address 30
-    data = yield from dcache_load(dut, 0x530)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000014D0000014C, \
-        f"data @%x=%x expected 0000014D0000014C" % (addr, data)
-
-    # Non-cacheable read of address 100
-    data = yield from dcache_load(dut, 0x100, nc=1)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000004100000040, \
-        f"data @%x=%x expected 0000004100000040" % (addr, data)
-
-    # Store at address 530
-    yield from dcache_store(dut, 0x530, 0x121)
-
-    # Store at address 30
-    yield from dcache_store(dut, 0x530, 0x12345678)
-
-    # 3nd Cacheable read of address 530
-    data = yield from dcache_load(dut, 0x530)
-    addr = yield dut.d_in.addr
-    assert data == 0x12345678, \
-        f"data @%x=%x expected 0x12345678" % (addr, data)
-
-    # 4th Cacheable read of address 20
-    data = yield from dcache_load(dut, 0x20)
-    addr = yield dut.d_in.addr
-    assert data == 0x0000000900000008, \
-        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
-
-    yield
-    yield
-    yield
-    yield
-
-
-def test_dcache(mem, test_fn, test_name):
-    dut = DCache()
-
-    memory = Memory(width=64, depth=16*64, init=mem, simulate=True)
-    sram = SRAM(memory=memory, granularity=8)
-
-    m = Module()
-    m.submodules.dcache = dut
-    m.submodules.sram = sram
-
-    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
-    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
-    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
-    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
-    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
-    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
-
-    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
-    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
-
-    # nmigen Simulation
-    sim = Simulator(m)
-    sim.add_clock(1e-6)
-
-    sim.add_sync_process(wrap(test_fn(dut, mem)))
-    with sim.write_vcd('test_dcache%s.vcd' % test_name):
-        sim.run()
  
  if __name__ == '__main__':
  
  if __name__ == '__main__':
-    seed(0)
      dut = DCache()
      vl = rtlil.convert(dut, ports=[])
      with open("test_dcache.il", "w") as f:
          f.write(vl)
      dut = DCache()
      vl = rtlil.convert(dut, ports=[])
      with open("test_dcache.il", "w") as f:
          f.write(vl)
-
-    mem = []
-    for i in range(1024):
-        mem.append((i*2)| ((i*2+1)<<32))
-
-    test_dcache(mem, dcache_sim, "")
-
-    mem = []
-    for i in range(0, 1024):
-        mem.append(i)
-
-    test_dcache(mem, dcache_random_sim, "random")
-