Allow more test cases to be run with CXXSim

[soc.git] / src / soc / experiment / dcache.py
diff --git a/src/soc/experiment/dcache.py b/src/soc/experiment/dcache.py

index 64503ec2bd0f1c5ee223d0363fd301c6ed77bb75..e1f82b77dc337467c1f9eeff306adc2ade4a7120 100644 (file)
--- a/src/soc/experiment/dcache.py
+++ b/src/soc/experiment/dcache.py
@@ -7,15 +7,12 @@ based on Anton Blanchard microwatt dcache.vhdl
  from enum import Enum, unique
  
  from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  from enum import Enum, unique
  
  from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
-try:
-    from nmigen.hdl.ast import Display
-except ImportError:
-    def Display(*args):
-        return []
+from nmutil.util import Display
+
+from random import randint
  
  from nmigen.cli import main
  from nmutil.iocontrol import RecordObject
  
  from nmigen.cli import main
  from nmutil.iocontrol import RecordObject
-from nmutil.util import wrap
  from nmigen.utils import log2_int
  from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                       DCacheToLoadStore1Type,
  from nmigen.utils import log2_int
  from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                       DCacheToLoadStore1Type,
@@ -29,16 +26,19 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                  WBIOMasterOut, WBIOSlaveOut)
  
  from soc.experiment.cache_ram import CacheRam
                                  WBIOMasterOut, WBIOSlaveOut)
  
  from soc.experiment.cache_ram import CacheRam
-from soc.experiment.plru import PLRU
+#from soc.experiment.plru import PLRU
+from nmutil.plru import PLRU
  
  # for test
  from nmigen_soc.wishbone.sram import SRAM
  from nmigen import Memory
  from nmigen.cli import rtlil
  
  # for test
  from nmigen_soc.wishbone.sram import SRAM
  from nmigen import Memory
  from nmigen.cli import rtlil
-if True:
-    from nmigen.back.pysim import Simulator, Delay, Settle
-else:
-    from nmigen.sim.cxxsim import Simulator, Delay, Settle
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+from nmutil.sim_tmp_alternative import Simulator
+
+from nmutil.util import wrap
  
  
  # TODO: make these parameters of DCache at some point
  
  
  # TODO: make these parameters of DCache at some point
@@ -46,7 +46,7 @@ LINE_SIZE = 64    # Line size in bytes
  NUM_LINES = 16    # Number of lines in a set
  NUM_WAYS = 4      # Number of ways
  TLB_SET_SIZE = 64 # L1 DTLB entries per set
  NUM_LINES = 16    # Number of lines in a set
  NUM_WAYS = 4      # Number of ways
  TLB_SET_SIZE = 64 # L1 DTLB entries per set
-TLB_NUM_WAYS = 2  # L1 DTLB number of sets
+TLB_NUM_WAYS = 4  # L1 DTLB number of sets
  TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  LOG_LENGTH = 0    # Non-zero to enable log data collection
  
  TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  LOG_LENGTH = 0    # Non-zero to enable log data collection
  
@@ -67,6 +67,10 @@ ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  # to represent the full dcache
  BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  
  # to represent the full dcache
  BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  
+print ("ROW_SIZE", ROW_SIZE)
+print ("ROW_PER_LINE", ROW_PER_LINE)
+print ("BRAM_ROWS", BRAM_ROWS)
+print ("NUM_WAYS", NUM_WAYS)
  
  # Bit fields counts in the address
  
  
  # Bit fields counts in the address
  
@@ -107,24 +111,34 @@ TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
  WAY_BITS = log2_int(NUM_WAYS)
  
  # Example of layout for 32 lines of 64 bytes:
  WAY_BITS = log2_int(NUM_WAYS)
  
  # Example of layout for 32 lines of 64 bytes:
-#
-# ..  tag    |index|  line  |
-# ..         |   row   |    |
-# ..         |     |---|    | ROW_LINE_BITS  (3)
-# ..         |     |--- - --| LINE_OFF_BITS (6)
-# ..         |         |- --| ROW_OFF_BITS  (3)
-# ..         |----- ---|    | ROW_BITS      (8)
-# ..         |-----|        | INDEX_BITS    (5)
-# .. --------|              | TAG_BITS      (45)
+layout = """\
+  ..  tag    |index|  line  |
+  ..         |   row   |    |
+  ..         |     |---|    | ROW_LINE_BITS  (3)
+  ..         |     |--- - --| LINE_OFF_BITS (6)
+  ..         |         |- --| ROW_OFF_BITS  (3)
+  ..         |----- ---|    | ROW_BITS      (8)
+  ..         |-----|        | INDEX_BITS    (5)
+  .. --------|              | TAG_BITS      (45)
+"""
+print (layout)
+print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
+            (TAG_BITS, INDEX_BITS, ROW_BITS,
+             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
+print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
+print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
+print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
  
  TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
  
  
  TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
  
+print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
+
  def CacheTagArray():
      return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
                          for x in range(NUM_LINES))
  
  def CacheValidBitsArray():
  def CacheTagArray():
      return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
                          for x in range(NUM_LINES))
  
  def CacheValidBitsArray():
-    return Array(Signal(INDEX_BITS, name="cachevalid_%d" % x) \
+    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
                          for x in range(NUM_LINES))
  
  def RowPerLineValidArray():
                          for x in range(NUM_LINES))
  
  def RowPerLineValidArray():
@@ -139,10 +153,13 @@ TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
  TLB_PTE_BITS     = 64
  TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
  
  TLB_PTE_BITS     = 64
  TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
  
+def ispow2(x):
+    return (1<<log2_int(x, False)) == x
+
  assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
  assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
-assert (LINE_SIZE % 2) == 0, "LINE_SIZE not power of 2"
-assert (NUM_LINES % 2) == 0, "NUM_LINES not power of 2"
-assert (ROW_PER_LINE % 2) == 0, "ROW_PER_LINE not power of 2"
+assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
+assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
+assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
  assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
  assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
          "geometry bits don't add up"
  assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
  assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
          "geometry bits don't add up"
@@ -155,16 +172,20 @@ assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
  
  
  def TLBValidBitsArray():
  
  
  def TLBValidBitsArray():
-    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
+    return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
+                for x in range(TLB_SET_SIZE))
  
  def TLBTagEAArray():
  
  def TLBTagEAArray():
-    return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
+    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
+                for x in range (TLB_NUM_WAYS))
  
  def TLBTagsArray():
  
  def TLBTagsArray():
-    return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
+    return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
+                for x in range (TLB_SET_SIZE))
  
  def TLBPtesArray():
  
  def TLBPtesArray():
-    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
+    return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
+                for x in range(TLB_SET_SIZE))
  
  def HitWaySet():
      return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
  
  def HitWaySet():
      return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
@@ -177,11 +198,13 @@ def CacheRamOut():
  
  # PLRU output interface
  def PLRUOut():
  
  # PLRU output interface
  def PLRUOut():
-    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
+    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
+                for x in range(NUM_LINES))
  
  # TLB PLRU output interface
  def TLBPLRUOut():
  
  # TLB PLRU output interface
  def TLBPLRUOut():
-    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
+    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
+                for x in range(TLB_SET_SIZE))
  
  # Helper functions to decode incoming requests
  #
  
  # Helper functions to decode incoming requests
  #
@@ -195,7 +218,7 @@ def get_row(addr):
  
  # Return the index of a row within a line
  def get_row_of_line(row):
  
  # Return the index of a row within a line
  def get_row_of_line(row):
-    return row[:ROW_LINE_BITS]
+    return row[:ROW_BITS][:ROW_LINE_BITS]
  
  # Returns whether this is the last row of a line
  def is_last_row_addr(addr, last):
  
  # Returns whether this is the last row of a line
  def is_last_row_addr(addr, last):
@@ -251,12 +274,6 @@ class PermAttr(RecordObject):
  
  def extract_perm_attr(pte):
      pa = PermAttr()
  
  def extract_perm_attr(pte):
      pa = PermAttr()
-    pa.reference = pte[8]
-    pa.changed   = pte[7]
-    pa.nocache   = pte[5]
-    pa.priv      = pte[3]
-    pa.rd_perm   = pte[2]
-    pa.wr_perm   = pte[1]
      return pa;
  
  
      return pa;
  
  
@@ -360,7 +377,8 @@ class RegStage1(RecordObject):
          self.write_bram       = Signal()
          self.write_tag        = Signal()
          self.slow_valid       = Signal()
          self.write_bram       = Signal()
          self.write_tag        = Signal()
          self.slow_valid       = Signal()
-        self.wb               = WBMasterOut()
+        self.real_adr         = Signal(REAL_ADDR_BITS)
+        self.wb               = WBMasterOut("wb")
          self.reload_tag       = Signal(TAG_BITS)
          self.store_way        = Signal(WAY_BITS)
          self.store_row        = Signal(ROW_BITS)
          self.reload_tag       = Signal(TAG_BITS)
          self.store_way        = Signal(WAY_BITS)
          self.store_row        = Signal(ROW_BITS)
@@ -448,16 +466,6 @@ class DTLBUpdate(Elaboratable):
  
          return m
  
  
          return m
  
-    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
-                       r0_valid, r1, cache_valid_bits, replace_way,
-                       use_forward1_next, use_forward2_next,
-                       req_hit_way, plru_victim, rc_ok, perm_attr,
-                       valid_ra, perm_ok, access_ok, req_op, req_go,
-                       tlb_pte_way,
-                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
-                       cancel_store, req_same_tag, r0_stall, early_req_row):
-        """Cache request parsing and hit detection
-        """
  
  class DCachePendingHit(Elaboratable):
  
  
  class DCachePendingHit(Elaboratable):
  
@@ -588,9 +596,8 @@ class DCache(Elaboratable):
          r = RegStage0("stage0")
  
          # TODO, this goes in unit tests and formal proofs
          r = RegStage0("stage0")
  
          # TODO, this goes in unit tests and formal proofs
-        with m.If(~(d_in.valid & m_in.valid)):
-            #sync += Display("request collision loadstore vs MMU")
-            pass
+        with m.If(d_in.valid & m_in.valid):
+            sync += Display("request collision loadstore vs MMU")
  
          with m.If(m_in.valid):
              sync += r.req.valid.eq(1)
  
          with m.If(m_in.valid):
              sync += r.req.valid.eq(1)
@@ -598,7 +605,7 @@ class DCache(Elaboratable):
              sync += r.req.dcbz.eq(0)
              sync += r.req.nc.eq(0)
              sync += r.req.reserve.eq(0)
              sync += r.req.dcbz.eq(0)
              sync += r.req.nc.eq(0)
              sync += r.req.reserve.eq(0)
-            sync += r.req.virt_mode.eq(1)
+            sync += r.req.virt_mode.eq(0)
              sync += r.req.priv_mode.eq(1)
              sync += r.req.addr.eq(m_in.addr)
              sync += r.req.data.eq(m_in.pte)
              sync += r.req.priv_mode.eq(1)
              sync += r.req.addr.eq(m_in.addr)
              sync += r.req.data.eq(m_in.pte)
@@ -657,13 +664,13 @@ class DCache(Elaboratable):
              return
          for i in range(TLB_SET_SIZE):
              # TLB PLRU interface
              return
          for i in range(TLB_SET_SIZE):
              # TLB PLRU interface
-            tlb_plru        = PLRU(WAY_BITS)
+            tlb_plru        = PLRU(TLB_WAY_BITS)
              setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
              tlb_plru_acc_en = Signal()
  
              comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
              comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
              setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
              tlb_plru_acc_en = Signal()
  
              comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
              comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
-            comb += tlb_plru.acc.eq(r1.tlb_hit_way)
+            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
              comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
  
      def tlb_search(self, m, tlb_req_index, r0, r0_valid,
              comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
  
      def tlb_search(self, m, tlb_req_index, r0, r0_valid,
@@ -684,7 +691,7 @@ class DCache(Elaboratable):
          for i in range(TLB_NUM_WAYS):
              is_tag_hit = Signal()
              comb += is_tag_hit.eq(tlb_valid_way[i]
          for i in range(TLB_NUM_WAYS):
              is_tag_hit = Signal()
              comb += is_tag_hit.eq(tlb_valid_way[i]
-                                  & read_tlb_tag(i, tlb_tag_way) == eatag)
+                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
              with m.If(is_tag_hit):
                  comb += hitway.eq(i)
                  comb += hit.eq(1)
              with m.If(is_tag_hit):
                  comb += hitway.eq(i)
                  comb += hit.eq(1)
@@ -701,7 +708,12 @@ class DCache(Elaboratable):
              comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                                r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                                pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
              comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                                r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                                pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
-            comb += perm_attr.eq(extract_perm_attr(pte))
+            comb += perm_attr.reference.eq(pte[8])
+            comb += perm_attr.changed.eq(pte[7])
+            comb += perm_attr.nocache.eq(pte[5])
+            comb += perm_attr.priv.eq(pte[3])
+            comb += perm_attr.rd_perm.eq(pte[2])
+            comb += perm_attr.wr_perm.eq(pte[1])
          with m.Else():
              comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                                r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
          with m.Else():
              comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                                r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
@@ -772,7 +784,7 @@ class DCache(Elaboratable):
  
              comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
              comb += plru.acc_en.eq(plru_acc_en)
  
              comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
              comb += plru.acc_en.eq(plru_acc_en)
-            comb += plru.acc.eq(r1.hit_way)
+            comb += plru.acc_i.eq(r1.hit_way)
              comb += plru_victim[i].eq(plru.lru_o)
  
      def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
              comb += plru_victim[i].eq(plru.lru_o)
  
      def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
@@ -793,7 +805,7 @@ class DCache(Elaboratable):
          sync += cache_tag_set.eq(cache_tags[index])
  
      def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
          sync += cache_tag_set.eq(cache_tags[index])
  
      def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
-                       r0_valid, r1, cache_valid_bits, replace_way,
+                       r0_valid, r1, cache_valids, replace_way,
                         use_forward1_next, use_forward2_next,
                         req_hit_way, plru_victim, rc_ok, perm_attr,
                         valid_ra, perm_ok, access_ok, req_op, req_go,
                         use_forward1_next, use_forward2_next,
                         req_hit_way, plru_victim, rc_ok, perm_attr,
                         valid_ra, perm_ok, access_ok, req_op, req_go,
@@ -815,15 +827,19 @@ class DCache(Elaboratable):
          nc          = Signal()
          hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
          nc          = Signal()
          hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
-        cache_valid_idx = Signal(INDEX_BITS)
+        cache_valid_idx = Signal(NUM_WAYS)
  
          # Extract line, row and tag from request
          comb += req_index.eq(get_index(r0.req.addr))
          comb += req_row.eq(get_row(r0.req.addr))
          comb += req_tag.eq(get_tag(ra))
  
  
          # Extract line, row and tag from request
          comb += req_index.eq(get_index(r0.req.addr))
          comb += req_row.eq(get_row(r0.req.addr))
          comb += req_tag.eq(get_tag(ra))
  
+        if False: # display on comb is a bit... busy.
+            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
+                    r0.req.addr, ra, req_index, req_tag, req_row)
+
          comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
          comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
-        comb += cache_valid_idx.eq(cache_valid_bits[req_index])
+        comb += cache_valid_idx.eq(cache_valids[req_index])
  
          m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                  tlb_valid_way, tlb_hit_way,
  
          m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                  tlb_valid_way, tlb_hit_way,
@@ -846,7 +862,9 @@ class DCache(Elaboratable):
              # For a store, consider this a hit even if the row isn't
              # valid since it will be by the time we perform the store.
              # For a load, check the appropriate row valid bit.
              # For a store, consider this a hit even if the row isn't
              # valid since it will be by the time we perform the store.
              # For a load, check the appropriate row valid bit.
-            valid = r1.rows_valid[req_row % ROW_PER_LINE]
+            rrow = Signal(ROW_LINE_BITS)
+            comb += rrow.eq(req_row)
+            valid = r1.rows_valid[rrow]
              comb += is_hit.eq(~r0.req.load | valid)
              comb += hit_way.eq(replace_way)
  
              comb += is_hit.eq(~r0.req.load | valid)
              comb += hit_way.eq(replace_way)
  
@@ -895,24 +913,14 @@ class DCache(Elaboratable):
              with m.Else():
                  comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                  with m.Switch(opsel):
              with m.Else():
                  comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                  with m.Switch(opsel):
-                    with m.Case(0b101):
-                        comb += op.eq(Op.OP_LOAD_HIT)
-                    with m.Case(0b100):
-                        comb += op.eq(Op.OP_LOAD_MISS)
-                    with m.Case(0b110):
-                        comb += op.eq(Op.OP_LOAD_NC)
-                    with m.Case(0b001):
-                        comb += op.eq(Op.OP_STORE_HIT)
-                    with m.Case(0b000):
-                        comb += op.eq(Op.OP_STORE_MISS)
-                    with m.Case(0b010):
-                        comb += op.eq(Op.OP_STORE_MISS)
-                    with m.Case(0b011):
-                        comb += op.eq(Op.OP_BAD)
-                    with m.Case(0b111):
-                        comb += op.eq(Op.OP_BAD)
-                    with m.Default():
-                        comb += op.eq(Op.OP_NONE)
+                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
+                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
+                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
+                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
+                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
+                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
+                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
+                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
          comb += req_op.eq(op)
          comb += req_go.eq(go)
  
          comb += req_op.eq(op)
          comb += req_go.eq(go)
  
@@ -936,14 +944,14 @@ class DCache(Elaboratable):
          sync = m.d.sync
  
          with m.If(r0_valid & r0.req.reserve):
          sync = m.d.sync
  
          with m.If(r0_valid & r0.req.reserve):
-
              # XXX generate alignment interrupt if address
              # is not aligned XXX or if r0.req.nc = '1'
              with m.If(r0.req.load):
                  comb += set_rsrv.eq(1) # load with reservation
              with m.Else():
                  comb += clear_rsrv.eq(1) # store conditional
              # XXX generate alignment interrupt if address
              # is not aligned XXX or if r0.req.nc = '1'
              with m.If(r0.req.load):
                  comb += set_rsrv.eq(1) # load with reservation
              with m.Else():
                  comb += clear_rsrv.eq(1) # store conditional
-                with m.If(~reservation.valid | r0.req.addr[LINE_OFF_BITS:64]):
+                with m.If(~reservation.valid |
+                         (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                      comb += cancel_store.eq(1)
  
      def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                      comb += cancel_store.eq(1)
  
      def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
@@ -959,7 +967,7 @@ class DCache(Elaboratable):
                  sync += reservation.valid.eq(1)
                  sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
  
                  sync += reservation.valid.eq(1)
                  sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
  
-    def writeback_control(self, m, r1, cache_out):
+    def writeback_control(self, m, r1, cache_out_row):
          """Return data for loads & completion control logic
          """
          comb = m.d.comb
          """Return data for loads & completion control logic
          """
          comb = m.d.comb
@@ -978,7 +986,7 @@ class DCache(Elaboratable):
          with m.Else():
              comb += data_fwd.eq(r1.forward_data2)
  
          with m.Else():
              comb += data_fwd.eq(r1.forward_data2)
  
-        comb += data_out.eq(cache_out[r1.hit_way])
+        comb += data_out.eq(cache_out_row)
  
          for i in range(8):
              with m.If(r1.forward_sel[i]):
  
          for i in range(8):
              with m.If(r1.forward_sel[i]):
@@ -1044,7 +1052,7 @@ class DCache(Elaboratable):
                  sync += Display("completing MMU load miss, data=%x",
                                  m_out.data)
  
                  sync += Display("completing MMU load miss, data=%x",
                                  m_out.data)
  
-    def rams(self, m, r1, early_req_row, cache_out, replace_way):
+    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
          """rams
          Generate a cache RAM for each way. This handles the normal
          reads, writes from reloads and the special store-hit update
          """rams
          Generate a cache RAM for each way. This handles the normal
          reads, writes from reloads and the special store-hit update
@@ -1068,7 +1076,7 @@ class DCache(Elaboratable):
              wr_sel_m = Signal(ROW_SIZE)
              _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i)
  
              wr_sel_m = Signal(ROW_SIZE)
              _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i)
  
-            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
+            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
              setattr(m.submodules, "cacheram_%d" % i, way)
  
              comb += way.rd_en.eq(do_read)
              setattr(m.submodules, "cacheram_%d" % i, way)
  
              comb += way.rd_en.eq(do_read)
@@ -1081,7 +1089,8 @@ class DCache(Elaboratable):
              # Cache hit reads
              comb += do_read.eq(1)
              comb += rd_addr.eq(early_req_row[:ROW_BITS])
              # Cache hit reads
              comb += do_read.eq(1)
              comb += rd_addr.eq(early_req_row[:ROW_BITS])
-            comb += cache_out[i].eq(_d_out)
+            with m.If(r1.hit_way == i):
+                comb += cache_out_row.eq(_d_out)
  
              # Write mux:
              #
  
              # Write mux:
              #
@@ -1181,7 +1190,7 @@ class DCache(Elaboratable):
      # All wishbone requests generation is done here.
      # This machine operates at stage 1.
      def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
      # All wishbone requests generation is done here.
      # This machine operates at stage 1.
      def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
-                    cache_valid_bits, r0, replace_way,
+                    cache_valids, r0, replace_way,
                      req_hit_way, req_same_tag,
                      r0_valid, req_op, cache_tags, req_go, ra):
  
                      req_hit_way, req_same_tag,
                      r0_valid, req_op, cache_tags, req_go, ra):
  
@@ -1193,6 +1202,13 @@ class DCache(Elaboratable):
          acks        = Signal(3)
          adjust_acks = Signal(3)
  
          acks        = Signal(3)
          adjust_acks = Signal(3)
  
+        req_row = Signal(ROW_BITS)
+        req_idx = Signal(INDEX_BITS)
+        req_tag = Signal(TAG_BITS)
+        comb += req_idx.eq(get_index(req.real_addr))
+        comb += req_row.eq(get_row(req.real_addr))
+        comb += req_tag.eq(get_tag(req.real_addr))
+
          sync += r1.use_forward1.eq(use_forward1_next)
          sync += r1.forward_sel.eq(0)
  
          sync += r1.use_forward1.eq(use_forward1_next)
          sync += r1.forward_sel.eq(0)
  
@@ -1285,23 +1301,17 @@ class DCache(Elaboratable):
          with m.Switch(r1.state):
  
              with m.Case(State.IDLE):
          with m.Switch(r1.state):
  
              with m.Case(State.IDLE):
-                # XXX check 'left downto.  probably means len(r1.wb.adr)
-                #                     r1.wb.adr <= req.real_addr(
-                #                                   r1.wb.adr'left downto 0
-                #                                  );
-                sync += r1.wb.adr.eq(req.real_addr)
+                sync += r1.real_adr.eq(req.real_addr)
                  sync += r1.wb.sel.eq(req.byte_sel)
                  sync += r1.wb.dat.eq(req.data)
                  sync += r1.dcbz.eq(req.dcbz)
  
                  # Keep track of our index and way
                  # for subsequent stores.
                  sync += r1.wb.sel.eq(req.byte_sel)
                  sync += r1.wb.dat.eq(req.data)
                  sync += r1.dcbz.eq(req.dcbz)
  
                  # Keep track of our index and way
                  # for subsequent stores.
-                sync += r1.store_index.eq(get_index(req.real_addr))
-                sync += r1.store_row.eq(get_row(req.real_addr))
-                sync += r1.end_row_ix.eq(
-                         get_row_of_line(get_row(req.real_addr))
-                        )
-                sync += r1.reload_tag.eq(get_tag(req.real_addr))
+                sync += r1.store_index.eq(req_idx)
+                sync += r1.store_row.eq(req_row)
+                sync += r1.end_row_ix.eq(get_row_of_line(req_row))
+                sync += r1.reload_tag.eq(req_tag)
                  sync += r1.req.same_tag.eq(1)
  
                  with m.If(req.op == Op.OP_STORE_HIT):
                  sync += r1.req.same_tag.eq(1)
  
                  with m.If(req.op == Op.OP_STORE_HIT):
@@ -1321,11 +1331,9 @@ class DCache(Elaboratable):
                          pass
  
                      with m.Case(Op.OP_LOAD_MISS):
                          pass
  
                      with m.Case(Op.OP_LOAD_MISS):
-                        #Display(f"cache miss real addr:" \
-                        #      f"{req_real_addr}" \
-                        #      f" idx:{get_index(req_real_addr)}" \
-                        #      f" tag:{get_tag(req.real_addr)}")
-                        pass
+                        sync += Display("cache miss real addr: %x " \
+                                "idx: %x tag: %x",
+                                req.real_addr, req_row, req_tag)
  
                          # Start the wishbone cycle
                          sync += r1.wb.we.eq(0)
  
                          # Start the wishbone cycle
                          sync += r1.wb.we.eq(0)
@@ -1389,18 +1397,21 @@ class DCache(Elaboratable):
                      # Clear stb and set ld_stbs_done
                      # so we can handle an eventual
                      # last ack on the same cycle.
                      # Clear stb and set ld_stbs_done
                      # so we can handle an eventual
                      # last ack on the same cycle.
-                    with m.If(is_last_row_addr(r1.wb.adr, r1.end_row_ix)):
+                    with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
                          sync += r1.wb.stb.eq(0)
                          comb += ld_stbs_done.eq(1)
  
                      # Calculate the next row address in the current cache line
                          sync += r1.wb.stb.eq(0)
                          comb += ld_stbs_done.eq(1)
  
                      # Calculate the next row address in the current cache line
-                    rarange = r1.wb.adr[ROW_OFF_BITS : LINE_OFF_BITS]
-                    sync += rarange.eq(rarange + 1)
+                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
+                    comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
+                    sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
  
                  # Incoming acks processing
                  sync += r1.forward_valid1.eq(wb_in.ack)
                  with m.If(wb_in.ack):
  
                  # Incoming acks processing
                  sync += r1.forward_valid1.eq(wb_in.ack)
                  with m.If(wb_in.ack):
-                    sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
+                    srow = Signal(ROW_LINE_BITS)
+                    comb += srow.eq(r1.store_row)
+                    sync += r1.rows_valid[srow].eq(1)
  
                      # If this is the data we were looking for,
                      # we can complete the request next cycle.
  
                      # If this is the data we were looking for,
                      # we can complete the request next cycle.
@@ -1428,9 +1439,9 @@ class DCache(Elaboratable):
  
                          # Cache line is now valid
                          cv = Signal(INDEX_BITS)
  
                          # Cache line is now valid
                          cv = Signal(INDEX_BITS)
-                        comb += cv.eq(cache_valid_bits[r1.store_index])
+                        comb += cv.eq(cache_valids[r1.store_index])
                          comb += cv.bit_select(r1.store_way, 1).eq(1)
                          comb += cv.bit_select(r1.store_way, 1).eq(1)
-                        sync += cache_valid_bits[r1.store_index].eq(cv)
+                        sync += cache_valids[r1.store_index].eq(cv)
                          sync += r1.state.eq(State.IDLE)
  
                      # Increment store row counter
                          sync += r1.state.eq(State.IDLE)
  
                      # Increment store row counter
@@ -1457,7 +1468,7 @@ class DCache(Elaboratable):
                      # to be done which is in the same real page.
                      with m.If(req.valid):
                          ra = req.real_addr[0:SET_SIZE_BITS]
                      # to be done which is in the same real page.
                      with m.If(req.valid):
                          ra = req.real_addr[0:SET_SIZE_BITS]
-                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
+                        sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
                          sync += r1.wb.dat.eq(req.data)
                          sync += r1.wb.sel.eq(req.byte_sel)
  
                          sync += r1.wb.dat.eq(req.data)
                          sync += r1.wb.sel.eq(req.byte_sel)
  
@@ -1517,7 +1528,7 @@ class DCache(Elaboratable):
          sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                                 stall_out, req_op[:3], d_out.valid, d_out.error,
                                 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
          sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                                 stall_out, req_op[:3], d_out.valid, d_out.error,
                                 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
-                               r1.wb.adr[3:6]))
+                               r1.real_adr[3:6]))
  
      def elaborate(self, platform):
  
  
      def elaborate(self, platform):
  
@@ -1527,7 +1538,7 @@ class DCache(Elaboratable):
          # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
          cache_tags       = CacheTagArray()
          cache_tag_set    = Signal(TAG_RAM_WIDTH)
          # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
          cache_tags       = CacheTagArray()
          cache_tag_set    = Signal(TAG_RAM_WIDTH)
-        cache_valid_bits = CacheValidBitsArray()
+        cache_valids = CacheValidBitsArray()
  
          # TODO attribute ram_style : string;
          # TODO attribute ram_style of cache_tags : signal is "distributed";
  
          # TODO attribute ram_style : string;
          # TODO attribute ram_style of cache_tags : signal is "distributed";
@@ -1572,7 +1583,7 @@ class DCache(Elaboratable):
          use_forward1_next = Signal()
          use_forward2_next = Signal()
  
          use_forward1_next = Signal()
          use_forward2_next = Signal()
  
-        cache_out         = CacheRamOut()
+        cache_out_row     = Signal(WB_DATA_BITS)
  
          plru_victim       = PLRUOut()
          replace_way       = Signal(WAY_BITS)
  
          plru_victim       = PLRUOut()
          replace_way       = Signal(WAY_BITS)
@@ -1607,6 +1618,7 @@ class DCache(Elaboratable):
          comb += self.stall_out.eq(r0_stall)
  
          # Wire up wishbone request latch out of stage 1
          comb += self.stall_out.eq(r0_stall)
  
          # Wire up wishbone request latch out of stage 1
+        comb += r1.wb.adr.eq(r1.real_adr[ROW_OFF_BITS:]) # truncate LSBs
          comb += self.wb_out.eq(r1.wb)
  
          # call sub-functions putting everything together, using shared
          comb += self.wb_out.eq(r1.wb)
  
          # call sub-functions putting everything together, using shared
@@ -1625,7 +1637,7 @@ class DCache(Elaboratable):
          self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
          self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
          self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
          self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
          self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
          self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
-                           r0_valid, r1, cache_valid_bits, replace_way,
+                           r0_valid, r1, cache_valids, replace_way,
                             use_forward1_next, use_forward2_next,
                             req_hit_way, plru_victim, rc_ok, perm_attr,
                             valid_ra, perm_ok, access_ok, req_op, req_go,
                             use_forward1_next, use_forward2_next,
                             req_hit_way, plru_victim, rc_ok, perm_attr,
                             valid_ra, perm_ok, access_ok, req_op, req_go,
@@ -1636,13 +1648,13 @@ class DCache(Elaboratable):
                             r0_valid, r0, reservation)
          self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
                             r0_valid, r0, reservation)
          self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
-        self.writeback_control(m, r1, cache_out)
-        self.rams(m, r1, early_req_row, cache_out, replace_way)
+        self.writeback_control(m, r1, cache_out_row)
+        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
          self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                          req_hit_way, req_index, req_tag, access_ok,
                          tlb_hit, tlb_hit_way, tlb_req_index)
          self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
          self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                          req_hit_way, req_index, req_tag, access_ok,
                          tlb_hit, tlb_hit_way, tlb_req_index)
          self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
-                    cache_valid_bits, r0, replace_way,
+                    cache_valids, r0, replace_way,
                      req_hit_way, req_same_tag,
                           r0_valid, req_op, cache_tags, req_go, ra)
          #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
                      req_hit_way, req_same_tag,
                           r0_valid, req_op, cache_tags, req_go, ra)
          #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
@@ -1653,9 +1665,11 @@ def dcache_load(dut, addr, nc=0):
      yield dut.d_in.load.eq(1)
      yield dut.d_in.nc.eq(nc)
      yield dut.d_in.addr.eq(addr)
      yield dut.d_in.load.eq(1)
      yield dut.d_in.nc.eq(nc)
      yield dut.d_in.addr.eq(addr)
+    yield dut.d_in.byte_sel.eq(~0)
      yield dut.d_in.valid.eq(1)
      yield
      yield dut.d_in.valid.eq(0)
      yield dut.d_in.valid.eq(1)
      yield
      yield dut.d_in.valid.eq(0)
+    yield dut.d_in.byte_sel.eq(0)
      yield
      while not (yield dut.d_out.valid):
          yield
      yield
      while not (yield dut.d_out.valid):
          yield
@@ -1678,6 +1692,53 @@ def dcache_store(dut, addr, data, nc=0):
          yield
  
  
          yield
  
  
+def dcache_random_sim(dut):
+    # Randomised store-then-load check of the DCache `dut` against `sim_mem`.
+    # software model of memory: one expected value per word index
+    sim_mem = [0] * 512  # only indices 0..255 are ever used below
+
+    # drive the d_in and m_in inputs to a known idle state
+    yield dut.d_in.valid.eq(0)
+    yield dut.d_in.load.eq(0)
+    yield dut.d_in.priv_mode.eq(1)
+    yield dut.d_in.nc.eq(0)
+    yield dut.d_in.addr.eq(0)
+    yield dut.d_in.data.eq(0)
+    yield dut.m_in.valid.eq(0)
+    yield dut.m_in.addr.eq(0)
+    yield dut.m_in.pte.eq(0)
+    # wait 4 * clk_period
+    yield
+    yield
+    yield
+    yield
+
+    print ()
+
+    for i in range(256):
+        addr = randint(0, 255)  # random word index into sim_mem
+        data = randint(0, (1<<64)-1)  # random value in the 64-bit range
+        sim_mem[addr] = data  # record the expected contents in the model
+        addr *= 8  # scale index by 8 (presumably 8 bytes per 64-bit word)
+
+        print ("testing %x data %x" % (addr, data))
+
+        yield from dcache_load(dut, addr)  # load result is discarded here
+        yield from dcache_store(dut, addr, data)
+        # read back a second, independently chosen word and compare to model
+        addr = randint(0, 255)
+        sim_data = sim_mem[addr]
+        addr *= 8
+
+        data = yield from dcache_load(dut, addr)
+        assert data == sim_data, \
+            "check %x data %x != %x" % (addr, data, sim_data)
+    # final sweep: every one of the 256 words must match the model
+    for addr in range(256):
+        data = yield from dcache_load(dut, addr*8)
+        assert data == sim_mem[addr], \
+            "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
+
  def dcache_sim(dut):
      # clear stuff
      yield dut.d_in.valid.eq(0)
  def dcache_sim(dut):
      # clear stuff
      yield dut.d_in.valid.eq(0)
@@ -1696,22 +1757,28 @@ def dcache_sim(dut):
      yield
  
      # Cacheable read of address 4
      yield
  
      # Cacheable read of address 4
-    data = yield from dcache_load(dut, 0x4)
+    data = yield from dcache_load(dut, 0x58)
      addr = yield dut.d_in.addr
      addr = yield dut.d_in.addr
-    assert data == 0x0000000100000000, \
-        f"data @%x=%x expected 0x0000000100000000" % (addr, data)
+    assert data == 0x0000001700000016, \
+        f"data @%x=%x expected 0x0000001700000016" % (addr, data)
+
+    # Cacheable read of address 20
+    data = yield from dcache_load(dut, 0x20)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000000900000008, \
+        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
  
      # Cacheable read of address 30
      data = yield from dcache_load(dut, 0x530)
      addr = yield dut.d_in.addr
  
      # Cacheable read of address 30
      data = yield from dcache_load(dut, 0x530)
      addr = yield dut.d_in.addr
-    assert data == 0x0000004D0000004C, \
-        f"data @%x=%x expected 0000004D0000004C" % (addr, data)
+    assert data == 0x0000014D0000014C, \
+        f"data @%x=%x expected 0000014D0000014C" % (addr, data)
  
      # 2nd Cacheable read of address 30
      data = yield from dcache_load(dut, 0x530)
      addr = yield dut.d_in.addr
  
      # 2nd Cacheable read of address 30
      data = yield from dcache_load(dut, 0x530)
      addr = yield dut.d_in.addr
-    assert data == 0x0000004D0000004C, \
-        f"data @%x=%x expected 0000004D0000004C" % (addr, data)
+    assert data == 0x0000014D0000014C, \
+        f"data @%x=%x expected 0000014D0000014C" % (addr, data)
  
      # Non-cacheable read of address 100
      data = yield from dcache_load(dut, 0x100, nc=1)
  
      # Non-cacheable read of address 100
      data = yield from dcache_load(dut, 0x100, nc=1)
@@ -1731,22 +1798,22 @@ def dcache_sim(dut):
      assert data == 0x12345678, \
          f"data @%x=%x expected 0x12345678" % (addr, data)
  
      assert data == 0x12345678, \
          f"data @%x=%x expected 0x12345678" % (addr, data)
  
+    # 4th Cacheable read of address 20
+    data = yield from dcache_load(dut, 0x20)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000000900000008, \
+        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
+
      yield
      yield
      yield
      yield
  
  
      yield
      yield
      yield
      yield
  
  
-def test_dcache():
+def test_dcache(mem, test_fn, test_name):
      dut = DCache()
      dut = DCache()
-    vl = rtlil.convert(dut, ports=[])
-    with open("test_dcache.il", "w") as f:
-        f.write(vl)
  
  
-    mem = []
-    for i in range(0,128):
-        mem.append((i*2)| ((i*2+1)<<32))
-    memory = Memory(width=64, depth=16*8, init=mem)
+    memory = Memory(width=64, depth=16*64, init=mem)
      sram = SRAM(memory=memory, granularity=8)
  
      m = Module()
      sram = SRAM(memory=memory, granularity=8)
  
      m = Module()
@@ -1757,7 +1824,7 @@ def test_dcache():
      m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
      m.d.comb += sram.bus.we.eq(dut.wb_out.we)
      m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
      m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
      m.d.comb += sram.bus.we.eq(dut.wb_out.we)
      m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
-    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr[3:])
+    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
      m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
  
      m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
      m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
  
      m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
@@ -1767,10 +1834,20 @@ def test_dcache():
      sim = Simulator(m)
      sim.add_clock(1e-6)
  
      sim = Simulator(m)
      sim.add_clock(1e-6)
  
-    sim.add_sync_process(wrap(dcache_sim(dut)))
-    with sim.write_vcd('test_dcache.vcd'):
+    sim.add_sync_process(wrap(test_fn(dut)))
+    with sim.write_vcd('test_dcache%s.vcd' % test_name):
          sim.run()
  
  if __name__ == '__main__':
          sim.run()
  
  if __name__ == '__main__':
-    test_dcache()
+    dut = DCache()
+    vl = rtlil.convert(dut, ports=[])  # convert a standalone DCache via rtlil
+    with open("test_dcache.il", "w") as f:
+        f.write(vl)
+    # initial memory image: 512 words, word i = (2*i) | ((2*i+1) << 32)
+    mem = []
+    for i in range(0,512):
+        mem.append((i*2)| ((i*2+1)<<32))
+    # directed test on the patterned image, then the random test with mem=None
+    test_dcache(mem, dcache_sim, "")  # "" -> VCD written to test_dcache.vcd
+    test_dcache(None, dcache_random_sim, "random")  # -> test_dcacherandom.vcd