Allow more test cases to be run with CXXSim
[soc.git] / src / soc / experiment / dcache.py
index 8756b58952b7205b271326995d185d1a873085c4..e1f82b77dc337467c1f9eeff306adc2ade4a7120 100644 (file)
@@ -7,12 +7,13 @@ based on Anton Blanchard microwatt dcache.vhdl
 from enum import Enum, unique
 
 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
 from enum import Enum, unique
 
 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
+from nmutil.util import Display
+
+from random import randint
+
 from nmigen.cli import main
 from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
 from nmigen.cli import main
 from nmutil.iocontrol import RecordObject
 from nmigen.utils import log2_int
-from nmigen.cli import rtlil
-
-
 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
@@ -25,15 +26,27 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBIOMasterOut, WBIOSlaveOut)
 
 from soc.experiment.cache_ram import CacheRam
                                 WBIOMasterOut, WBIOSlaveOut)
 
 from soc.experiment.cache_ram import CacheRam
-from soc.experiment.plru import PLRU
+#from soc.experiment.plru import PLRU
+from nmutil.plru import PLRU
+
+# for test
+from nmigen_soc.wishbone.sram import SRAM
+from nmigen import Memory
+from nmigen.cli import rtlil
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+from nmutil.sim_tmp_alternative import Simulator
+
+from nmutil.util import wrap
 
 
 # TODO: make these parameters of DCache at some point
 LINE_SIZE = 64    # Line size in bytes
 
 
 # TODO: make these parameters of DCache at some point
 LINE_SIZE = 64    # Line size in bytes
-NUM_LINES = 32    # Number of lines in a set
+NUM_LINES = 16    # Number of lines in a set
 NUM_WAYS = 4      # Number of ways
 TLB_SET_SIZE = 64 # L1 DTLB entries per set
 NUM_WAYS = 4      # Number of ways
 TLB_SET_SIZE = 64 # L1 DTLB entries per set
-TLB_NUM_WAYS = 2  # L1 DTLB number of sets
+TLB_NUM_WAYS = 4  # L1 DTLB number of sets
 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
 LOG_LENGTH = 0    # Non-zero to enable log data collection
 
 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
 LOG_LENGTH = 0    # Non-zero to enable log data collection
 
@@ -54,6 +67,10 @@ ROW_PER_LINE = LINE_SIZE // ROW_SIZE
 # to represent the full dcache
 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
 
 # to represent the full dcache
 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
 
+print ("ROW_SIZE", ROW_SIZE)
+print ("ROW_PER_LINE", ROW_PER_LINE)
+print ("BRAM_ROWS", BRAM_ROWS)
+print ("NUM_WAYS", NUM_WAYS)
 
 # Bit fields counts in the address
 
 
 # Bit fields counts in the address
 
@@ -94,26 +111,39 @@ TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 WAY_BITS = log2_int(NUM_WAYS)
 
 # Example of layout for 32 lines of 64 bytes:
 WAY_BITS = log2_int(NUM_WAYS)
 
 # Example of layout for 32 lines of 64 bytes:
-#
-# ..  tag    |index|  line  |
-# ..         |   row   |    |
-# ..         |     |---|    | ROW_LINE_BITS  (3)
-# ..         |     |--- - --| LINE_OFF_BITS (6)
-# ..         |         |- --| ROW_OFF_BITS  (3)
-# ..         |----- ---|    | ROW_BITS      (8)
-# ..         |-----|        | INDEX_BITS    (5)
-# .. --------|              | TAG_BITS      (45)
+layout = """\
+  ..  tag    |index|  line  |
+  ..         |   row   |    |
+  ..         |     |---|    | ROW_LINE_BITS  (3)
+  ..         |     |--- - --| LINE_OFF_BITS (6)
+  ..         |         |- --| ROW_OFF_BITS  (3)
+  ..         |----- ---|    | ROW_BITS      (8)
+  ..         |-----|        | INDEX_BITS    (5)
+  .. --------|              | TAG_BITS      (45)
+"""
+print (layout)
+print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
+            (TAG_BITS, INDEX_BITS, ROW_BITS,
+             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
+print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
+print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
+print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 
 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 
 
 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 
+print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
+
 def CacheTagArray():
 def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))
+    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
+                        for x in range(NUM_LINES))
 
 def CacheValidBitsArray():
 
 def CacheValidBitsArray():
-    return Array(Signal(INDEX_BITS) for x in range(NUM_LINES))
+    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
+                        for x in range(NUM_LINES))
 
 def RowPerLineValidArray():
 
 def RowPerLineValidArray():
-    return Array(Signal() for x in range(ROW_PER_LINE))
+    return Array(Signal(name="rows_valid%d" % x) \
+                        for x in range(ROW_PER_LINE))
 
 # L1 TLB
 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 
 # L1 TLB
 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
@@ -123,10 +153,13 @@ TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 TLB_PTE_BITS     = 64
 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 
 TLB_PTE_BITS     = 64
 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 
+def ispow2(x):
+    return (1<<log2_int(x, False)) == x
+
 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
-assert (LINE_SIZE % 2) == 0, "LINE_SIZE not power of 2"
-assert (NUM_LINES % 2) == 0, "NUM_LINES not power of 2"
-assert (ROW_PER_LINE % 2) == 0, "ROW_PER_LINE not power of 2"
+assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
+assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
+assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
         "geometry bits don't add up"
 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
         "geometry bits don't add up"
@@ -139,31 +172,39 @@ assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 
 
 def TLBValidBitsArray():
 
 
 def TLBValidBitsArray():
-    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
+    return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
+                for x in range(TLB_SET_SIZE))
 
 def TLBTagEAArray():
 
 def TLBTagEAArray():
-    return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
+    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
+                for x in range (TLB_NUM_WAYS))
 
 def TLBTagsArray():
 
 def TLBTagsArray():
-    return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
+    return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
+                for x in range (TLB_SET_SIZE))
 
 def TLBPtesArray():
 
 def TLBPtesArray():
-    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
+    return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
+                for x in range(TLB_SET_SIZE))
 
 def HitWaySet():
 
 def HitWaySet():
-    return Array(Signal(NUM_WAYS) for x in range(TLB_NUM_WAYS))
+    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
+                        for x in range(TLB_NUM_WAYS))
 
 # Cache RAM interface
 def CacheRamOut():
 
 # Cache RAM interface
 def CacheRamOut():
-    return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))
+    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
+                 for x in range(NUM_WAYS))
 
 # PLRU output interface
 def PLRUOut():
 
 # PLRU output interface
 def PLRUOut():
-    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
+    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
+                for x in range(NUM_LINES))
 
 # TLB PLRU output interface
 def TLBPLRUOut():
 
 # TLB PLRU output interface
 def TLBPLRUOut():
-    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
+    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
+                for x in range(TLB_SET_SIZE))
 
 # Helper functions to decode incoming requests
 #
 
 # Helper functions to decode incoming requests
 #
@@ -177,7 +218,7 @@ def get_row(addr):
 
 # Return the index of a row within a line
 def get_row_of_line(row):
 
 # Return the index of a row within a line
 def get_row_of_line(row):
-    return row[:ROW_LINE_BITS]
+    return row[:ROW_BITS][:ROW_LINE_BITS]
 
 # Returns whether this is the last row of a line
 def is_last_row_addr(addr, last):
 
 # Returns whether this is the last row of a line
 def is_last_row_addr(addr, last):
@@ -221,8 +262,8 @@ def write_tlb_pte(way, ptes, newpte):
 
 # Record for storing permission, attribute, etc. bits from a PTE
 class PermAttr(RecordObject):
 
 # Record for storing permission, attribute, etc. bits from a PTE
 class PermAttr(RecordObject):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name=None):
+        super().__init__(name=name)
         self.reference = Signal()
         self.changed   = Signal()
         self.nocache   = Signal()
         self.reference = Signal()
         self.changed   = Signal()
         self.nocache   = Signal()
@@ -233,12 +274,6 @@ class PermAttr(RecordObject):
 
 def extract_perm_attr(pte):
     pa = PermAttr()
 
 def extract_perm_attr(pte):
     pa = PermAttr()
-    pa.reference = pte[8]
-    pa.changed   = pte[7]
-    pa.nocache   = pte[5]
-    pa.priv      = pte[3]
-    pa.rd_perm   = pte[2]
-    pa.wr_perm   = pte[1]
     return pa;
 
 
     return pa;
 
 
@@ -282,9 +317,9 @@ class State(Enum):
 # Stage 0 register, basically contains just the latched request
 
 class RegStage0(RecordObject):
 # Stage 0 register, basically contains just the latched request
 
 class RegStage0(RecordObject):
-    def __init__(self):
-        super().__init__()
-        self.req     = LoadStore1ToDCacheType()
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.req     = LoadStore1ToDCacheType(name="lsmem")
         self.tlbie   = Signal()
         self.doall   = Signal()
         self.tlbld   = Signal()
         self.tlbie   = Signal()
         self.doall   = Signal()
         self.tlbld   = Signal()
@@ -292,8 +327,8 @@ class RegStage0(RecordObject):
 
 
 class MemAccessRequest(RecordObject):
 
 
 class MemAccessRequest(RecordObject):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name=None):
+        super().__init__(name=name)
         self.op        = Signal(Op)
         self.valid     = Signal()
         self.dcbz      = Signal()
         self.op        = Signal(Op)
         self.valid     = Signal()
         self.dcbz      = Signal()
@@ -308,17 +343,17 @@ class MemAccessRequest(RecordObject):
 # First stage register, contains state for stage 1 of load hits
 # and for the state machine used by all other operations
 class RegStage1(RecordObject):
 # First stage register, contains state for stage 1 of load hits
 # and for the state machine used by all other operations
 class RegStage1(RecordObject):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name=None):
+        super().__init__(name=name)
         # Info about the request
         self.full             = Signal() # have uncompleted request
         self.mmu_req          = Signal() # request is from MMU
         # Info about the request
         self.full             = Signal() # have uncompleted request
         self.mmu_req          = Signal() # request is from MMU
-        self.req              = MemAccessRequest()
+        self.req              = MemAccessRequest(name="reqmem")
 
         # Cache hit state
         self.hit_way          = Signal(WAY_BITS)
         self.hit_load_valid   = Signal()
 
         # Cache hit state
         self.hit_way          = Signal(WAY_BITS)
         self.hit_load_valid   = Signal()
-        self.hit_index        = Signal(NUM_LINES)
+        self.hit_index        = Signal(INDEX_BITS)
         self.cache_hit        = Signal()
 
         # TLB hit state
         self.cache_hit        = Signal()
 
         # TLB hit state
@@ -342,12 +377,13 @@ class RegStage1(RecordObject):
         self.write_bram       = Signal()
         self.write_tag        = Signal()
         self.slow_valid       = Signal()
         self.write_bram       = Signal()
         self.write_tag        = Signal()
         self.slow_valid       = Signal()
-        self.wb               = WBMasterOut()
+        self.real_adr         = Signal(REAL_ADDR_BITS)
+        self.wb               = WBMasterOut("wb")
         self.reload_tag       = Signal(TAG_BITS)
         self.store_way        = Signal(WAY_BITS)
         self.store_row        = Signal(ROW_BITS)
         self.store_index      = Signal(INDEX_BITS)
         self.reload_tag       = Signal(TAG_BITS)
         self.store_way        = Signal(WAY_BITS)
         self.store_row        = Signal(ROW_BITS)
         self.store_index      = Signal(INDEX_BITS)
-        self.end_row_ix       = Signal(log2_int(ROW_LINE_BITS, False))
+        self.end_row_ix       = Signal(ROW_LINE_BITS)
         self.rows_valid       = RowPerLineValidArray()
         self.acks_pending     = Signal(3)
         self.inc_acks         = Signal()
         self.rows_valid       = RowPerLineValidArray()
         self.acks_pending     = Signal(3)
         self.inc_acks         = Signal()
@@ -373,16 +409,15 @@ class Reservation(RecordObject):
 
 
 class DTLBUpdate(Elaboratable):
 
 
 class DTLBUpdate(Elaboratable):
-    def __init__(self, dtlb_valid_bits, dtlb_ptes):
+    def __init__(self):
         self.tlbie    = Signal()
         self.tlbwe    = Signal()
         self.doall    = Signal()
         self.tlbie    = Signal()
         self.tlbwe    = Signal()
         self.doall    = Signal()
+        self.updated  = Signal()
+        self.v_updated  = Signal()
         self.tlb_hit    = Signal()
         self.tlb_req_index = Signal(TLB_SET_BITS)
 
         self.tlb_hit    = Signal()
         self.tlb_req_index = Signal(TLB_SET_BITS)
 
-        self.dtlb_valid_bits = dtlb_valid_bits
-        self.dtlb_ptes       = dtlb_ptes
-
         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
@@ -390,6 +425,12 @@ class DTLBUpdate(Elaboratable):
         self.eatag           = Signal(TLB_EA_TAG_BITS)
         self.pte_data        = Signal(TLB_PTE_BITS)
 
         self.eatag           = Signal(TLB_EA_TAG_BITS)
         self.pte_data        = Signal(TLB_PTE_BITS)
 
+        self.dv = Signal(TLB_PTE_WAY_BITS)
+
+        self.tb_out = Signal(TLB_TAG_WAY_BITS)
+        self.pb_out = Signal(TLB_NUM_WAYS)
+        self.db_out = Signal(TLB_PTE_WAY_BITS)
+
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
@@ -398,32 +439,127 @@ class DTLBUpdate(Elaboratable):
         tagset   = Signal(TLB_TAG_WAY_BITS)
         pteset   = Signal(TLB_PTE_WAY_BITS)
 
         tagset   = Signal(TLB_TAG_WAY_BITS)
         pteset   = Signal(TLB_PTE_WAY_BITS)
 
-        vb = Signal(TLB_NUM_WAYS)
-        db = Signal(TLB_PTE_WAY_BITS)
-
-        sync += vb.eq(self.dtlb_valid_bits[self.tlb_req_index])
-        sync += db.eq(self.dtlb_ptes[self.tlb_req_index])
+        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 
         with m.If(self.tlbie & self.doall):
 
         with m.If(self.tlbie & self.doall):
-            # clear all valid bits at once
-            for i in range(TLB_SET_SIZE):
-                sync += self.dtlb_valid_bits[i].eq(0)
-
+            pass # clear all back in parent
         with m.Elif(self.tlbie):
             with m.If(self.tlb_hit):
         with m.Elif(self.tlbie):
             with m.If(self.tlb_hit):
-                sync += vb.bit_select(self.tlb_hit_way, 1).eq(Const(0, 1))
+                comb += db_out.eq(self.dv)
+                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
+                comb += self.v_updated.eq(1)
 
         with m.Elif(self.tlbwe):
 
             comb += tagset.eq(self.tlb_tag_way)
             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 
         with m.Elif(self.tlbwe):
 
             comb += tagset.eq(self.tlb_tag_way)
             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
-            sync += db.eq(tagset)
+            comb += tb_out.eq(tagset)
 
             comb += pteset.eq(self.tlb_pte_way)
             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 
             comb += pteset.eq(self.tlb_pte_way)
             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
-            sync += db.eq(pteset)
+            comb += pb_out.eq(pteset)
+
+            comb += db_out.bit_select(self.repl_way, 1).eq(1)
+
+            comb += self.updated.eq(1)
+            comb += self.v_updated.eq(1)
+
+        return m
+
+
+class DCachePendingHit(Elaboratable):
+
+    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
+                      cache_valid_idx, cache_tag_set,
+                    req_addr,
+                    hit_set):
+
+        self.go          = Signal()
+        self.virt_mode   = Signal()
+        self.is_hit      = Signal()
+        self.tlb_hit     = Signal()
+        self.hit_way     = Signal(WAY_BITS)
+        self.rel_match   = Signal()
+        self.req_index   = Signal(INDEX_BITS)
+        self.reload_tag  = Signal(TAG_BITS)
+
+        self.tlb_hit_way = tlb_hit_way
+        self.tlb_pte_way = tlb_pte_way
+        self.tlb_valid_way = tlb_valid_way
+        self.cache_valid_idx = cache_valid_idx
+        self.cache_tag_set = cache_tag_set
+        self.req_addr = req_addr
+        self.hit_set = hit_set
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        go = self.go
+        virt_mode = self.virt_mode
+        is_hit = self.is_hit
+        tlb_pte_way = self.tlb_pte_way
+        tlb_valid_way = self.tlb_valid_way
+        cache_valid_idx = self.cache_valid_idx
+        cache_tag_set = self.cache_tag_set
+        req_addr = self.req_addr
+        tlb_hit_way = self.tlb_hit_way
+        tlb_hit = self.tlb_hit
+        hit_set = self.hit_set
+        hit_way = self.hit_way
+        rel_match = self.rel_match
+        req_index = self.req_index
+        reload_tag = self.reload_tag
+
+        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
+                                    for i in range(TLB_NUM_WAYS))
+        hit_way_set = HitWaySet()
+
+        # Test if pending request is a hit on any way
+        # In order to make timing in virtual mode,
+        # when we are using the TLB, we compare each
+        # way with each of the real addresses from each way of
+        # the TLB, and then decide later which match to use.
+
+        with m.If(virt_mode):
+            for j in range(TLB_NUM_WAYS):
+                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
+                s_hit       = Signal()
+                s_pte       = Signal(TLB_PTE_BITS)
+                s_ra        = Signal(REAL_ADDR_BITS)
+                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
+                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
+                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
+                comb += s_tag.eq(get_tag(s_ra))
 
 
-            sync += vb.bit_select(self.repl_way, 1).eq(1)
+                for i in range(NUM_WAYS):
+                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
+                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
+                                  (read_tag(i, cache_tag_set) == s_tag)
+                                  & tlb_valid_way[j])
+                    with m.If(is_tag_hit):
+                        comb += hit_way_set[j].eq(i)
+                        comb += s_hit.eq(1)
+                comb += hit_set[j].eq(s_hit)
+                with m.If(s_tag == reload_tag):
+                    comb += rel_matches[j].eq(1)
+            with m.If(tlb_hit):
+                comb += is_hit.eq(hit_set[tlb_hit_way])
+                comb += hit_way.eq(hit_way_set[tlb_hit_way])
+                comb += rel_match.eq(rel_matches[tlb_hit_way])
+        with m.Else():
+            s_tag       = Signal(TAG_BITS)
+            comb += s_tag.eq(get_tag(req_addr))
+            for i in range(NUM_WAYS):
+                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
+                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
+                          (read_tag(i, cache_tag_set) == s_tag))
+                with m.If(is_tag_hit):
+                    comb += hit_way.eq(i)
+                    comb += is_hit.eq(1)
+            with m.If(s_tag == reload_tag):
+                comb += rel_match.eq(1)
 
         return m
 
 
         return m
 
@@ -437,11 +573,11 @@ class DCache(Elaboratable):
       while not idle...)
     """
     def __init__(self):
       while not idle...)
     """
     def __init__(self):
-        self.d_in      = LoadStore1ToDCacheType()
-        self.d_out     = DCacheToLoadStore1Type()
+        self.d_in      = LoadStore1ToDCacheType("d_in")
+        self.d_out     = DCacheToLoadStore1Type("d_out")
 
 
-        self.m_in      = MMUToDCacheType()
-        self.m_out     = DCacheToMMUType()
+        self.m_in      = MMUToDCacheType("m_in")
+        self.m_out     = DCacheToMMUType("m_out")
 
         self.stall_out = Signal()
 
 
         self.stall_out = Signal()
 
@@ -457,12 +593,11 @@ class DCache(Elaboratable):
         sync = m.d.sync
         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 
         sync = m.d.sync
         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 
-        r = RegStage0()
+        r = RegStage0("stage0")
 
         # TODO, this goes in unit tests and formal proofs
 
         # TODO, this goes in unit tests and formal proofs
-        with m.If(~(d_in.valid & m_in.valid)):
-            #sync += Display("request collision loadstore vs MMU")
-            pass
+        with m.If(d_in.valid & m_in.valid):
+            sync += Display("request collision loadstore vs MMU")
 
         with m.If(m_in.valid):
             sync += r.req.valid.eq(1)
 
         with m.If(m_in.valid):
             sync += r.req.valid.eq(1)
@@ -470,7 +605,7 @@ class DCache(Elaboratable):
             sync += r.req.dcbz.eq(0)
             sync += r.req.nc.eq(0)
             sync += r.req.reserve.eq(0)
             sync += r.req.dcbz.eq(0)
             sync += r.req.nc.eq(0)
             sync += r.req.reserve.eq(0)
-            sync += r.req.virt_mode.eq(1)
+            sync += r.req.virt_mode.eq(0)
             sync += r.req.priv_mode.eq(1)
             sync += r.req.addr.eq(m_in.addr)
             sync += r.req.data.eq(m_in.pte)
             sync += r.req.priv_mode.eq(1)
             sync += r.req.addr.eq(m_in.addr)
             sync += r.req.data.eq(m_in.pte)
@@ -519,33 +654,24 @@ class DCache(Elaboratable):
             sync += tlb_tag_way.eq(dtlb_tags[index])
             sync += tlb_pte_way.eq(dtlb_ptes[index])
 
             sync += tlb_tag_way.eq(dtlb_tags[index])
             sync += tlb_pte_way.eq(dtlb_ptes[index])
 
-    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, acc, acc_en, lru):
+    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
         """Generate TLB PLRUs
         """
         comb = m.d.comb
         sync = m.d.sync
 
         """Generate TLB PLRUs
         """
         comb = m.d.comb
         sync = m.d.sync
 
-        with m.If(TLB_NUM_WAYS > 1):
-            for i in range(TLB_SET_SIZE):
-                # TLB PLRU interface
-                tlb_plru        = PLRU(TLB_WAY_BITS)
-                setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
-                tlb_plru_acc    = Signal(TLB_WAY_BITS)
-                tlb_plru_acc_en = Signal()
-                tlb_plru_out    = Signal(TLB_WAY_BITS)
-
-                comb += tlb_plru.acc.eq(tlb_plru_acc)
-                comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
-                comb += tlb_plru.lru.eq(tlb_plru_out)
-
-                # PLRU interface
-                with m.If(r1.tlb_hit_index == i):
-                    comb += tlb_plru.acc_en.eq(r1.tlb_hit)
-                with m.Else():
-                    comb += tlb_plru.acc_en.eq(0)
-                comb += tlb_plru.acc.eq(r1.tlb_hit_way)
+        if TLB_NUM_WAYS == 0:
+            return
+        for i in range(TLB_SET_SIZE):
+            # TLB PLRU interface
+            tlb_plru        = PLRU(TLB_WAY_BITS)
+            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
+            tlb_plru_acc_en = Signal()
 
 
-                comb += tlb_plru_victim[i].eq(tlb_plru.lru)
+            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
+            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
+            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
+            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 
     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                    tlb_valid_way, tlb_tag_way, tlb_hit_way,
 
     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                    tlb_valid_way, tlb_tag_way, tlb_hit_way,
@@ -563,8 +689,10 @@ class DCache(Elaboratable):
         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 
         for i in range(TLB_NUM_WAYS):
         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 
         for i in range(TLB_NUM_WAYS):
-            with m.If(tlb_valid_way[i]
-                      & read_tlb_tag(i, tlb_tag_way) == eatag):
+            is_tag_hit = Signal()
+            comb += is_tag_hit.eq(tlb_valid_way[i]
+                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
+            with m.If(is_tag_hit):
                 comb += hitway.eq(i)
                 comb += hit.eq(1)
 
                 comb += hitway.eq(i)
                 comb += hit.eq(1)
 
@@ -580,15 +708,20 @@ class DCache(Elaboratable):
             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
-            comb += perm_attr.eq(extract_perm_attr(pte))
+            comb += perm_attr.reference.eq(pte[8])
+            comb += perm_attr.changed.eq(pte[7])
+            comb += perm_attr.nocache.eq(pte[5])
+            comb += perm_attr.priv.eq(pte[3])
+            comb += perm_attr.rd_perm.eq(pte[2])
+            comb += perm_attr.wr_perm.eq(pte[1])
         with m.Else():
             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 
             comb += perm_attr.reference.eq(1)
             comb += perm_attr.changed.eq(1)
         with m.Else():
             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 
             comb += perm_attr.reference.eq(1)
             comb += perm_attr.changed.eq(1)
-            comb += perm_attr.priv.eq(1)
             comb += perm_attr.nocache.eq(0)
             comb += perm_attr.nocache.eq(0)
+            comb += perm_attr.priv.eq(1)
             comb += perm_attr.rd_perm.eq(1)
             comb += perm_attr.wr_perm.eq(1)
 
             comb += perm_attr.rd_perm.eq(1)
             comb += perm_attr.wr_perm.eq(1)
 
@@ -597,6 +730,7 @@ class DCache(Elaboratable):
                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 
         comb = m.d.comb
                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 
         comb = m.d.comb
+        sync = m.d.sync
 
         tlbie    = Signal()
         tlbwe    = Signal()
 
         tlbie    = Signal()
         tlbwe    = Signal()
@@ -604,7 +738,19 @@ class DCache(Elaboratable):
         comb += tlbie.eq(r0_valid & r0.tlbie)
         comb += tlbwe.eq(r0_valid & r0.tlbld)
 
         comb += tlbie.eq(r0_valid & r0.tlbie)
         comb += tlbwe.eq(r0_valid & r0.tlbld)
 
-        m.submodules.tlb_update = d = DTLBUpdate(dtlb_valid_bits, dtlb_ptes)
+        m.submodules.tlb_update = d = DTLBUpdate()
+        with m.If(tlbie & r0.doall):
+            # clear all valid bits at once
+            for i in range(TLB_SET_SIZE):
+                sync += dtlb_valid_bits[i].eq(0)
+        with m.If(d.updated):
+            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
+            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
+        with m.If(d.v_updated):
+            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
+
+        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
+
         comb += d.tlbie.eq(tlbie)
         comb += d.tlbwe.eq(tlbwe)
         comb += d.doall.eq(r0.doall)
         comb += d.tlbie.eq(tlbie)
         comb += d.tlbwe.eq(tlbwe)
         comb += d.doall.eq(r0.doall)
@@ -627,23 +773,19 @@ class DCache(Elaboratable):
         comb = m.d.comb
         sync = m.d.sync
 
         comb = m.d.comb
         sync = m.d.sync
 
+        if TLB_NUM_WAYS == 0:
+            return
+
         for i in range(NUM_LINES):
             # PLRU interface
         for i in range(NUM_LINES):
             # PLRU interface
-            plru        = PLRU(TLB_WAY_BITS)
+            plru        = PLRU(WAY_BITS)
             setattr(m.submodules, "plru%d" % i, plru)
             setattr(m.submodules, "plru%d" % i, plru)
-            plru_acc    = Signal(WAY_BITS)
             plru_acc_en = Signal()
             plru_acc_en = Signal()
-            plru_out    = Signal(WAY_BITS)
 
 
-            comb += plru.acc.eq(plru_acc)
+            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
             comb += plru.acc_en.eq(plru_acc_en)
             comb += plru.acc_en.eq(plru_acc_en)
-            comb += plru_out.eq(plru.lru_o)
-
-            with m.If(r1.hit_index == i):
-                comb += plru_acc_en.eq(r1.cache_hit)
-
-            comb += plru_acc.eq(r1.hit_way)
-            comb += plru_victim[i].eq(plru_out)
+            comb += plru.acc_i.eq(r1.hit_way)
+            comb += plru_victim[i].eq(plru.lru_o)
 
     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
         """Cache tag RAM read port
 
     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
         """Cache tag RAM read port
@@ -663,7 +805,7 @@ class DCache(Elaboratable):
         sync += cache_tag_set.eq(cache_tags[index])
 
     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
         sync += cache_tag_set.eq(cache_tags[index])
 
     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
-                       r0_valid, r1, cache_valid_bits, replace_way,
+                       r0_valid, r1, cache_valids, replace_way,
                        use_forward1_next, use_forward2_next,
                        req_hit_way, plru_victim, rc_ok, perm_attr,
                        valid_ra, perm_ok, access_ok, req_op, req_go,
                        use_forward1_next, use_forward2_next,
                        req_hit_way, plru_victim, rc_ok, perm_attr,
                        valid_ra, perm_ok, access_ok, req_op, req_go,
@@ -683,72 +825,50 @@ class DCache(Elaboratable):
         opsel       = Signal(3)
         go          = Signal()
         nc          = Signal()
         opsel       = Signal(3)
         go          = Signal()
         nc          = Signal()
-        s_hit       = Signal()
-        s_tag       = Signal(TAG_BITS)
-        s_pte       = Signal(TLB_PTE_BITS)
-        s_ra        = Signal(REAL_ADDR_BITS)
-        hit_set     = Signal(TLB_NUM_WAYS)
-        hit_way_set = HitWaySet()
-        rel_matches = Signal(TLB_NUM_WAYS)
-        rel_match   = Signal()
+        hit_set     = Array(Signal(name="hit_set_%d" % i) \
+                                  for i in range(TLB_NUM_WAYS))
+        cache_valid_idx = Signal(NUM_WAYS)
 
         # Extract line, row and tag from request
         comb += req_index.eq(get_index(r0.req.addr))
         comb += req_row.eq(get_row(r0.req.addr))
         comb += req_tag.eq(get_tag(ra))
 
 
         # Extract line, row and tag from request
         comb += req_index.eq(get_index(r0.req.addr))
         comb += req_row.eq(get_row(r0.req.addr))
         comb += req_tag.eq(get_tag(ra))
 
-        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
-
-        # Test if pending request is a hit on any way
-        # In order to make timing in virtual mode,
-        # when we are using the TLB, we compare each
-        # way with each of the real addresses from each way of
-        # the TLB, and then decide later which match to use.
-
-        with m.If(r0.req.virt_mode):
-            comb += rel_matches.eq(0)
-            for j in range(TLB_NUM_WAYS):
-                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
-                comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
-                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
-                comb += s_tag.eq(get_tag(s_ra))
+        if False: # display on comb is a bit... busy.
+            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
+                    r0.req.addr, ra, req_index, req_tag, req_row)
 
 
-                for i in range(NUM_WAYS):
-                    with m.If(go & cache_valid_bits[req_index][i] &
-                              (read_tag(i, cache_tag_set) == s_tag)
-                              & tlb_valid_way[j]):
-                        comb += hit_way_set[j].eq(i)
-                        comb += s_hit.eq(1)
-                comb += hit_set[j].eq(s_hit)
-                with m.If(s_tag == r1.reload_tag):
-                    comb += rel_matches[j].eq(1)
-            with m.If(tlb_hit):
-                comb += is_hit.eq(hit_set.bit_select(tlb_hit_way, 1))
-                comb += hit_way.eq(hit_way_set[tlb_hit_way])
-                comb += rel_match.eq(rel_matches.bit_select(tlb_hit_way, 1))
-        with m.Else():
-            comb += s_tag.eq(get_tag(r0.req.addr))
-            for i in range(NUM_WAYS):
-                with m.If(go & cache_valid_bits[req_index][i] &
-                          read_tag(i, cache_tag_set) == s_tag):
-                    comb += hit_way.eq(i)
-                    comb += is_hit.eq(1)
-            with m.If(s_tag == r1.reload_tag):
-                comb += rel_match.eq(1)
-        comb += req_same_tag.eq(rel_match)
+        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
+        comb += cache_valid_idx.eq(cache_valids[req_index])
+
+        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
+                                tlb_valid_way, tlb_hit_way,
+                                cache_valid_idx, cache_tag_set,
+                                r0.req.addr,
+                                hit_set)
+
+        comb += dc.tlb_hit.eq(tlb_hit)
+        comb += dc.reload_tag.eq(r1.reload_tag)
+        comb += dc.virt_mode.eq(r0.req.virt_mode)
+        comb += dc.go.eq(go)
+        comb += dc.req_index.eq(req_index)
+        comb += is_hit.eq(dc.is_hit)
+        comb += hit_way.eq(dc.hit_way)
+        comb += req_same_tag.eq(dc.rel_match)
 
         # See if the request matches the line currently being reloaded
         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 
         # See if the request matches the line currently being reloaded
         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
-                  (req_index == r1.store_index) & rel_match):
+                  (req_index == r1.store_index) & req_same_tag):
             # For a store, consider this a hit even if the row isn't
             # valid since it will be by the time we perform the store.
             # For a load, check the appropriate row valid bit.
             # For a store, consider this a hit even if the row isn't
             # valid since it will be by the time we perform the store.
             # For a load, check the appropriate row valid bit.
-            valid = r1.rows_valid[req_row % ROW_PER_LINE]
+            rrow = Signal(ROW_LINE_BITS)
+            comb += rrow.eq(req_row)
+            valid = r1.rows_valid[rrow]
             comb += is_hit.eq(~r0.req.load | valid)
             comb += hit_way.eq(replace_way)
 
         # Whether to use forwarded data for a load or not
             comb += is_hit.eq(~r0.req.load | valid)
             comb += hit_way.eq(replace_way)
 
         # Whether to use forwarded data for a load or not
-        comb += use_forward1_next.eq(0)
         with m.If((get_row(r1.req.real_addr) == req_row) &
                   (r1.req.hit_way == hit_way)):
             # Only need to consider r1.write_bram here, since if we
         with m.If((get_row(r1.req.real_addr) == req_row) &
                   (r1.req.hit_way == hit_way)):
             # Only need to consider r1.write_bram here, since if we
@@ -760,7 +880,6 @@ class DCache(Elaboratable):
             # cycles after the refill starts before we see the updated
             # cache tag. In that case we don't use the bypass.)
             comb += use_forward1_next.eq(r1.write_bram)
             # cycles after the refill starts before we see the updated
             # cache tag. In that case we don't use the bypass.)
             comb += use_forward1_next.eq(r1.write_bram)
-        comb += use_forward2_next.eq(0)
         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
             comb += use_forward2_next.eq(r1.forward_valid1)
 
         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
             comb += use_forward2_next.eq(r1.forward_valid1)
 
@@ -769,7 +888,7 @@ class DCache(Elaboratable):
 
         # The way to replace on a miss
         with m.If(r1.write_tag):
 
         # The way to replace on a miss
         with m.If(r1.write_tag):
-            replace_way.eq(plru_victim[r1.store_index])
+            comb += replace_way.eq(plru_victim[r1.store_index])
         with m.Else():
             comb += replace_way.eq(r1.store_way)
 
         with m.Else():
             comb += replace_way.eq(r1.store_way)
 
@@ -778,10 +897,9 @@ class DCache(Elaboratable):
         comb += rc_ok.eq(perm_attr.reference
                          & (r0.req.load | perm_attr.changed)
                 )
         comb += rc_ok.eq(perm_attr.reference
                          & (r0.req.load | perm_attr.changed)
                 )
-        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
-                           & perm_attr.wr_perm
-                           | (r0.req.load & perm_attr.rd_perm)
-                          )
+        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
+                           (perm_attr.wr_perm |
+                              (r0.req.load & perm_attr.rd_perm)))
         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
         # Combine the request and cache hit status to decide what
         # operation needs to be done
         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
         # Combine the request and cache hit status to decide what
         # operation needs to be done
@@ -795,24 +913,14 @@ class DCache(Elaboratable):
             with m.Else():
                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                 with m.Switch(opsel):
             with m.Else():
                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                 with m.Switch(opsel):
-                    with m.Case(0b101):
-                        comb += op.eq(Op.OP_LOAD_HIT)
-                    with m.Case(0b100):
-                        comb += op.eq(Op.OP_LOAD_MISS)
-                    with m.Case(0b110):
-                        comb += op.eq(Op.OP_LOAD_NC)
-                    with m.Case(0b001):
-                        comb += op.eq(Op.OP_STORE_HIT)
-                    with m.Case(0b000):
-                        comb += op.eq(Op.OP_STORE_MISS)
-                    with m.Case(0b010):
-                        comb += op.eq(Op.OP_STORE_MISS)
-                    with m.Case(0b011):
-                        comb += op.eq(Op.OP_BAD)
-                    with m.Case(0b111):
-                        comb += op.eq(Op.OP_BAD)
-                    with m.Default():
-                        comb += op.eq(Op.OP_NONE)
+                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
+                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
+                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
+                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
+                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
+                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
+                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
+                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
         comb += req_op.eq(op)
         comb += req_go.eq(go)
 
         comb += req_op.eq(op)
         comb += req_go.eq(go)
 
@@ -836,14 +944,14 @@ class DCache(Elaboratable):
         sync = m.d.sync
 
         with m.If(r0_valid & r0.req.reserve):
         sync = m.d.sync
 
         with m.If(r0_valid & r0.req.reserve):
-
             # XXX generate alignment interrupt if address
             # is not aligned XXX or if r0.req.nc = '1'
             with m.If(r0.req.load):
                 comb += set_rsrv.eq(1) # load with reservation
             with m.Else():
                 comb += clear_rsrv.eq(1) # store conditional
             # XXX generate alignment interrupt if address
             # is not aligned XXX or if r0.req.nc = '1'
             with m.If(r0.req.load):
                 comb += set_rsrv.eq(1) # load with reservation
             with m.Else():
                 comb += clear_rsrv.eq(1) # store conditional
-                with m.If(~reservation.valid | r0.req.addr[LINE_OFF_BITS:64]):
+                with m.If(~reservation.valid |
+                         (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                     comb += cancel_store.eq(1)
 
     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                     comb += cancel_store.eq(1)
 
     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
@@ -859,7 +967,7 @@ class DCache(Elaboratable):
                 sync += reservation.valid.eq(1)
                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 
                 sync += reservation.valid.eq(1)
                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 
-    def writeback_control(self, m, r1, cache_out):
+    def writeback_control(self, m, r1, cache_out_row):
         """Return data for loads & completion control logic
         """
         comb = m.d.comb
         """Return data for loads & completion control logic
         """
         comb = m.d.comb
@@ -878,7 +986,7 @@ class DCache(Elaboratable):
         with m.Else():
             comb += data_fwd.eq(r1.forward_data2)
 
         with m.Else():
             comb += data_fwd.eq(r1.forward_data2)
 
-        comb += data_out.eq(cache_out[r1.hit_way])
+        comb += data_out.eq(cache_out_row)
 
         for i in range(8):
             with m.If(r1.forward_sel[i]):
 
         for i in range(8):
             with m.If(r1.forward_sel[i]):
@@ -919,35 +1027,32 @@ class DCache(Elaboratable):
             # Request came from loadstore1...
             # Load hit case is the standard path
             with m.If(r1.hit_load_valid):
             # Request came from loadstore1...
             # Load hit case is the standard path
             with m.If(r1.hit_load_valid):
-                #Display(f"completing load hit data={data_out}")
-                pass
+                sync += Display("completing load hit data=%x", data_out)
 
             # error cases complete without stalling
             with m.If(r1.ls_error):
 
             # error cases complete without stalling
             with m.If(r1.ls_error):
-                # Display("completing ld/st with error")
-                pass
+                sync += Display("completing ld/st with error")
 
             # Slow ops (load miss, NC, stores)
             with m.If(r1.slow_valid):
 
             # Slow ops (load miss, NC, stores)
             with m.If(r1.slow_valid):
-                #Display(f"completing store or load miss data={data_out}")
-                pass
+                sync += Display("completing store or load miss data=%x",
+                                data_out)
 
         with m.Else():
             # Request came from MMU
             with m.If(r1.hit_load_valid):
 
         with m.Else():
             # Request came from MMU
             with m.If(r1.hit_load_valid):
-                # Display(f"completing load hit to MMU, data={m_out.data}")
-                pass
+                sync += Display("completing load hit to MMU, data=%x",
+                                m_out.data)
             # error cases complete without stalling
             with m.If(r1.mmu_error):
             # error cases complete without stalling
             with m.If(r1.mmu_error):
-                #Display("combpleting MMU ld with error")
-                pass
+                sync += Display("combpleting MMU ld with error")
 
             # Slow ops (i.e. load miss)
             with m.If(r1.slow_valid):
 
             # Slow ops (i.e. load miss)
             with m.If(r1.slow_valid):
-                #Display("completing MMU load miss, data={m_out.data}")
-                pass
+                sync += Display("completing MMU load miss, data=%x",
+                                m_out.data)
 
 
-    def rams(self, m, r1, early_req_row, cache_out, replace_way):
+    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
         """rams
         Generate a cache RAM for each way. This handles the normal
         reads, writes from reloads and the special store-hit update
         """rams
         Generate a cache RAM for each way. This handles the normal
         reads, writes from reloads and the special store-hit update
@@ -962,16 +1067,16 @@ class DCache(Elaboratable):
         wb_in = self.wb_in
 
         for i in range(NUM_WAYS):
         wb_in = self.wb_in
 
         for i in range(NUM_WAYS):
-            do_read  = Signal()
+            do_read  = Signal(name="do_rd%d" % i)
             rd_addr  = Signal(ROW_BITS)
             rd_addr  = Signal(ROW_BITS)
-            do_write = Signal()
+            do_write = Signal(name="do_wr%d" % i)
             wr_addr  = Signal(ROW_BITS)
             wr_data  = Signal(WB_DATA_BITS)
             wr_sel   = Signal(ROW_SIZE)
             wr_sel_m = Signal(ROW_SIZE)
             wr_addr  = Signal(ROW_BITS)
             wr_data  = Signal(WB_DATA_BITS)
             wr_sel   = Signal(ROW_SIZE)
             wr_sel_m = Signal(ROW_SIZE)
-            _d_out   = Signal(WB_DATA_BITS)
+            _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i)
 
 
-            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
+            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
             setattr(m.submodules, "cacheram_%d" % i, way)
 
             comb += way.rd_en.eq(do_read)
             setattr(m.submodules, "cacheram_%d" % i, way)
 
             comb += way.rd_en.eq(do_read)
@@ -983,8 +1088,9 @@ class DCache(Elaboratable):
 
             # Cache hit reads
             comb += do_read.eq(1)
 
             # Cache hit reads
             comb += do_read.eq(1)
-            comb += rd_addr.eq(early_req_row)
-            comb += cache_out[i].eq(_d_out)
+            comb += rd_addr.eq(early_req_row[:ROW_BITS])
+            with m.If(r1.hit_way == i):
+                comb += cache_out_row.eq(_d_out)
 
             # Write mux:
             #
 
             # Write mux:
             #
@@ -1015,26 +1121,25 @@ class DCache(Elaboratable):
                       & wb_in.ack & (replace_way == i)):
                 comb += do_write.eq(1)
 
                       & wb_in.ack & (replace_way == i)):
                 comb += do_write.eq(1)
 
-                # Mask write selects with do_write since BRAM
-                # doesn't have a global write-enable
-                with m.If(do_write):
-                    comb += wr_sel_m.eq(wr_sel)
+            # Mask write selects with do_write since BRAM
+            # doesn't have a global write-enable
+            with m.If(do_write):
+                comb += wr_sel_m.eq(wr_sel)
 
     # Cache hit synchronous machine for the easy case.
     # This handles load hits.
     # It also handles error cases (TLB miss, cache paradox)
     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
 
     # Cache hit synchronous machine for the easy case.
     # This handles load hits.
     # It also handles error cases (TLB miss, cache paradox)
     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
-                        req_hit_way, req_index, access_ok,
+                        req_hit_way, req_index, req_tag, access_ok,
                         tlb_hit, tlb_hit_way, tlb_req_index):
 
         comb = m.d.comb
         sync = m.d.sync
 
         with m.If(req_op != Op.OP_NONE):
                         tlb_hit, tlb_hit_way, tlb_req_index):
 
         comb = m.d.comb
         sync = m.d.sync
 
         with m.If(req_op != Op.OP_NONE):
-            #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
-            #      f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
-            #     )
-            pass
+            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
+                    req_op, r0.req.addr, r0.req.nc,
+                    req_index, req_tag, req_hit_way)
 
         with m.If(r0_valid):
             sync += r1.mmu_req.eq(r0.mmu_req)
 
         with m.If(r0_valid):
             sync += r1.mmu_req.eq(r0.mmu_req)
@@ -1085,18 +1190,24 @@ class DCache(Elaboratable):
     # All wishbone requests generation is done here.
     # This machine operates at stage 1.
     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
     # All wishbone requests generation is done here.
     # This machine operates at stage 1.
     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
-                    cache_valid_bits, r0, replace_way,
+                    cache_valids, r0, replace_way,
                     req_hit_way, req_same_tag,
                     req_hit_way, req_same_tag,
-                    r0_valid, req_op, cache_tag, req_go, ra):
+                    r0_valid, req_op, cache_tags, req_go, ra):
 
         comb = m.d.comb
         sync = m.d.sync
         wb_in = self.wb_in
 
 
         comb = m.d.comb
         sync = m.d.sync
         wb_in = self.wb_in
 
-        req         = MemAccessRequest()
+        req         = MemAccessRequest("mreq_ds")
         acks        = Signal(3)
         adjust_acks = Signal(3)
         acks        = Signal(3)
         adjust_acks = Signal(3)
-        stbs_done = Signal()
+
+        req_row = Signal(ROW_BITS)
+        req_idx = Signal(INDEX_BITS)
+        req_tag = Signal(TAG_BITS)
+        comb += req_idx.eq(get_index(req.real_addr))
+        comb += req_row.eq(get_row(req.real_addr))
+        comb += req_tag.eq(get_tag(req.real_addr))
 
         sync += r1.use_forward1.eq(use_forward1_next)
         sync += r1.forward_sel.eq(0)
 
         sync += r1.use_forward1.eq(use_forward1_next)
         sync += r1.forward_sel.eq(0)
@@ -1145,8 +1256,9 @@ class DCache(Elaboratable):
             for i in range(NUM_WAYS):
                 with m.If(i == replace_way):
                     ct = Signal(TAG_RAM_WIDTH)
             for i in range(NUM_WAYS):
                 with m.If(i == replace_way):
                     ct = Signal(TAG_RAM_WIDTH)
-                    sync += ct.eq(cache_tag[r1.store_index])
-                    sync += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
+                    comb += ct.eq(cache_tags[r1.store_index])
+                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
+                    sync += cache_tags[r1.store_index].eq(ct)
             sync += r1.store_way.eq(replace_way)
             sync += r1.write_tag.eq(0)
 
             sync += r1.store_way.eq(replace_way)
             sync += r1.write_tag.eq(0)
 
@@ -1189,23 +1301,17 @@ class DCache(Elaboratable):
         with m.Switch(r1.state):
 
             with m.Case(State.IDLE):
         with m.Switch(r1.state):
 
             with m.Case(State.IDLE):
-# XXX check 'left downto.  probably means len(r1.wb.adr)
-#                     r1.wb.adr <= req.real_addr(
-#                                   r1.wb.adr'left downto 0
-#                                  );
-                sync += r1.wb.adr.eq(req.real_addr)
+                sync += r1.real_adr.eq(req.real_addr)
                 sync += r1.wb.sel.eq(req.byte_sel)
                 sync += r1.wb.dat.eq(req.data)
                 sync += r1.dcbz.eq(req.dcbz)
 
                 # Keep track of our index and way
                 # for subsequent stores.
                 sync += r1.wb.sel.eq(req.byte_sel)
                 sync += r1.wb.dat.eq(req.data)
                 sync += r1.dcbz.eq(req.dcbz)
 
                 # Keep track of our index and way
                 # for subsequent stores.
-                sync += r1.store_index.eq(get_index(req.real_addr))
-                sync += r1.store_row.eq(get_row(req.real_addr))
-                sync += r1.end_row_ix.eq(
-                         get_row_of_line(get_row(req.real_addr))
-                        )
-                sync += r1.reload_tag.eq(get_tag(req.real_addr))
+                sync += r1.store_index.eq(req_idx)
+                sync += r1.store_row.eq(req_row)
+                sync += r1.end_row_ix.eq(get_row_of_line(req_row))
+                sync += r1.reload_tag.eq(req_tag)
                 sync += r1.req.same_tag.eq(1)
 
                 with m.If(req.op == Op.OP_STORE_HIT):
                 sync += r1.req.same_tag.eq(1)
 
                 with m.If(req.op == Op.OP_STORE_HIT):
@@ -1216,17 +1322,18 @@ class DCache(Elaboratable):
                 for i in range(ROW_PER_LINE):
                     sync += r1.rows_valid[i].eq(0)
 
                 for i in range(ROW_PER_LINE):
                     sync += r1.rows_valid[i].eq(0)
 
+                with m.If(req_op != Op.OP_NONE):
+                    sync += Display("cache op %d", req.op)
+
                 with m.Switch(req.op):
                     with m.Case(Op.OP_LOAD_HIT):
                         # stay in IDLE state
                         pass
 
                     with m.Case(Op.OP_LOAD_MISS):
                 with m.Switch(req.op):
                     with m.Case(Op.OP_LOAD_HIT):
                         # stay in IDLE state
                         pass
 
                     with m.Case(Op.OP_LOAD_MISS):
-                        #Display(f"cache miss real addr:" \
-                        #      f"{req_real_addr}" \
-                        #      f" idx:{get_index(req_real_addr)}" \
-                        #      f" tag:{get_tag(req.real_addr)}")
-                        pass
+                        sync += Display("cache miss real addr: %x " \
+                                "idx: %x tag: %x",
+                                req.real_addr, req_row, req_tag)
 
                         # Start the wishbone cycle
                         sync += r1.wb.we.eq(0)
 
                         # Start the wishbone cycle
                         sync += r1.wb.we.eq(0)
@@ -1258,6 +1365,8 @@ class DCache(Elaboratable):
                             with m.If(req.op == Op.OP_STORE_HIT):
                                 sync += r1.write_bram.eq(1)
                         with m.Else():
                             with m.If(req.op == Op.OP_STORE_HIT):
                                 sync += r1.write_bram.eq(1)
                         with m.Else():
+                            # dcbz is handled much like a load miss except
+                            # that we are writing to memory instead of reading
                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
 
                             with m.If(req.op == Op.OP_STORE_MISS):
                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
 
                             with m.If(req.op == Op.OP_STORE_MISS):
@@ -1278,29 +1387,31 @@ class DCache(Elaboratable):
                         pass
 
             with m.Case(State.RELOAD_WAIT_ACK):
                         pass
 
             with m.Case(State.RELOAD_WAIT_ACK):
+                ld_stbs_done = Signal()
                 # Requests are all sent if stb is 0
                 # Requests are all sent if stb is 0
-                comb += stbs_done.eq(~r1.wb.stb)
+                comb += ld_stbs_done.eq(~r1.wb.stb)
 
 
-                with m.If(~wb_in.stall & ~stbs_done):
+                with m.If((~wb_in.stall) & r1.wb.stb):
                     # That was the last word?
                     # We are done sending.
                     # That was the last word?
                     # We are done sending.
-                    # Clear stb and set stbs_done
+                    # Clear stb and set ld_stbs_done
                     # so we can handle an eventual
                     # last ack on the same cycle.
                     # so we can handle an eventual
                     # last ack on the same cycle.
-                    with m.If(is_last_row_addr(
-                              r1.wb.adr, r1.end_row_ix)):
+                    with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
                         sync += r1.wb.stb.eq(0)
                         sync += r1.wb.stb.eq(0)
-                        comb += stbs_done.eq(0)
+                        comb += ld_stbs_done.eq(1)
 
                     # Calculate the next row address in the current cache line
 
                     # Calculate the next row address in the current cache line
-                    rarange = r1.wb.adr[ROW_OFF_BITS : LINE_OFF_BITS]
-                    sync += rarange.eq(rarange + 1)
+                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
+                    comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
+                    sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
 
                 # Incoming acks processing
                 sync += r1.forward_valid1.eq(wb_in.ack)
                 with m.If(wb_in.ack):
 
                 # Incoming acks processing
                 sync += r1.forward_valid1.eq(wb_in.ack)
                 with m.If(wb_in.ack):
-                    # XXX needs an Array bit-accessor here
-                    sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
+                    srow = Signal(ROW_LINE_BITS)
+                    comb += srow.eq(r1.store_row)
+                    sync += r1.rows_valid[srow].eq(1)
 
                     # If this is the data we were looking for,
                     # we can complete the request next cycle.
 
                     # If this is the data we were looking for,
                     # we can complete the request next cycle.
@@ -1321,22 +1432,24 @@ class DCache(Elaboratable):
                         sync += r1.use_forward1.eq(1)
 
                     # Check for completion
                         sync += r1.use_forward1.eq(1)
 
                     # Check for completion
-                    with m.If(stbs_done & is_last_row(r1.store_row,
+                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                       r1.end_row_ix)):
                         # Complete wishbone cycle
                         sync += r1.wb.cyc.eq(0)
 
                         # Cache line is now valid
                         cv = Signal(INDEX_BITS)
                                                       r1.end_row_ix)):
                         # Complete wishbone cycle
                         sync += r1.wb.cyc.eq(0)
 
                         # Cache line is now valid
                         cv = Signal(INDEX_BITS)
-                        sync += cv.eq(cache_valid_bits[r1.store_index])
-                        sync += cv.bit_select(r1.store_way, 1).eq(1)
+                        comb += cv.eq(cache_valids[r1.store_index])
+                        comb += cv.bit_select(r1.store_way, 1).eq(1)
+                        sync += cache_valids[r1.store_index].eq(cv)
                         sync += r1.state.eq(State.IDLE)
 
                     # Increment store row counter
                     sync += r1.store_row.eq(next_row(r1.store_row))
 
             with m.Case(State.STORE_WAIT_ACK):
                         sync += r1.state.eq(State.IDLE)
 
                     # Increment store row counter
                     sync += r1.store_row.eq(next_row(r1.store_row))
 
             with m.Case(State.STORE_WAIT_ACK):
-                comb += stbs_done.eq(~r1.wb.stb)
+                st_stbs_done = Signal()
+                comb += st_stbs_done.eq(~r1.wb.stb)
                 comb += acks.eq(r1.acks_pending)
 
                 with m.If(r1.inc_acks != r1.dec_acks):
                 comb += acks.eq(r1.acks_pending)
 
                 with m.If(r1.inc_acks != r1.dec_acks):
@@ -1355,7 +1468,7 @@ class DCache(Elaboratable):
                     # to be done which is in the same real page.
                     with m.If(req.valid):
                         ra = req.real_addr[0:SET_SIZE_BITS]
                     # to be done which is in the same real page.
                     with m.If(req.valid):
                         ra = req.real_addr[0:SET_SIZE_BITS]
-                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
+                        sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
                         sync += r1.wb.dat.eq(req.data)
                         sync += r1.wb.sel.eq(req.byte_sel)
 
                         sync += r1.wb.dat.eq(req.data)
                         sync += r1.wb.sel.eq(req.byte_sel)
 
@@ -1363,7 +1476,7 @@ class DCache(Elaboratable):
                                 ((req.op == Op.OP_STORE_MISS)
                                  | (req.op == Op.OP_STORE_HIT))):
                         sync += r1.wb.stb.eq(1)
                                 ((req.op == Op.OP_STORE_MISS)
                                  | (req.op == Op.OP_STORE_HIT))):
                         sync += r1.wb.stb.eq(1)
-                        comb += stbs_done.eq(0)
+                        comb += st_stbs_done.eq(0)
 
                         with m.If(req.op == Op.OP_STORE_HIT):
                             sync += r1.write_bram.eq(1)
 
                         with m.If(req.op == Op.OP_STORE_HIT):
                             sync += r1.write_bram.eq(1)
@@ -1372,15 +1485,15 @@ class DCache(Elaboratable):
 
                         # Store requests never come from the MMU
                         sync += r1.ls_valid.eq(1)
 
                         # Store requests never come from the MMU
                         sync += r1.ls_valid.eq(1)
-                        comb += stbs_done.eq(0)
+                        comb += st_stbs_done.eq(0)
                         sync += r1.inc_acks.eq(1)
                     with m.Else():
                         sync += r1.wb.stb.eq(0)
                         sync += r1.inc_acks.eq(1)
                     with m.Else():
                         sync += r1.wb.stb.eq(0)
-                        comb += stbs_done.eq(1)
+                        comb += st_stbs_done.eq(1)
 
                 # Got ack ? See if complete.
                 with m.If(wb_in.ack):
 
                 # Got ack ? See if complete.
                 with m.If(wb_in.ack):
-                    with m.If(stbs_done & (adjust_acks == 1)):
+                    with m.If(st_stbs_done & (adjust_acks == 1)):
                         sync += r1.state.eq(State.IDLE)
                         sync += r1.wb.cyc.eq(0)
                         sync += r1.wb.stb.eq(0)
                         sync += r1.state.eq(State.IDLE)
                         sync += r1.wb.cyc.eq(0)
                         sync += r1.wb.stb.eq(0)
@@ -1415,7 +1528,7 @@ class DCache(Elaboratable):
         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                                stall_out, req_op[:3], d_out.valid, d_out.error,
                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                                stall_out, req_op[:3], d_out.valid, d_out.error,
                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
-                               r1.wb.adr[3:6]))
+                               r1.real_adr[3:6]))
 
     def elaborate(self, platform):
 
 
     def elaborate(self, platform):
 
@@ -1425,7 +1538,7 @@ class DCache(Elaboratable):
         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
         cache_tags       = CacheTagArray()
         cache_tag_set    = Signal(TAG_RAM_WIDTH)
         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
         cache_tags       = CacheTagArray()
         cache_tag_set    = Signal(TAG_RAM_WIDTH)
-        cache_valid_bits = CacheValidBitsArray()
+        cache_valids = CacheValidBitsArray()
 
         # TODO attribute ram_style : string;
         # TODO attribute ram_style of cache_tags : signal is "distributed";
 
         # TODO attribute ram_style : string;
         # TODO attribute ram_style of cache_tags : signal is "distributed";
@@ -1441,10 +1554,10 @@ class DCache(Elaboratable):
         # TODO attribute ram_style of
         #  dtlb_ptes : signal is "distributed";
 
         # TODO attribute ram_style of
         #  dtlb_ptes : signal is "distributed";
 
-        r0      = RegStage0()
+        r0      = RegStage0("r0")
         r0_full = Signal()
 
         r0_full = Signal()
 
-        r1 = RegStage1()
+        r1 = RegStage1("r1")
 
         reservation = Reservation()
 
 
         reservation = Reservation()
 
@@ -1470,7 +1583,7 @@ class DCache(Elaboratable):
         use_forward1_next = Signal()
         use_forward2_next = Signal()
 
         use_forward1_next = Signal()
         use_forward2_next = Signal()
 
-        cache_out         = CacheRamOut()
+        cache_out_row     = Signal(WB_DATA_BITS)
 
         plru_victim       = PLRUOut()
         replace_way       = Signal(WAY_BITS)
 
         plru_victim       = PLRUOut()
         replace_way       = Signal(WAY_BITS)
@@ -1488,7 +1601,7 @@ class DCache(Elaboratable):
         pte           = Signal(TLB_PTE_BITS)
         ra            = Signal(REAL_ADDR_BITS)
         valid_ra      = Signal()
         pte           = Signal(TLB_PTE_BITS)
         ra            = Signal(REAL_ADDR_BITS)
         valid_ra      = Signal()
-        perm_attr     = PermAttr()
+        perm_attr     = PermAttr("dc_perms")
         rc_ok         = Signal()
         perm_ok       = Signal()
         access_ok     = Signal()
         rc_ok         = Signal()
         perm_ok       = Signal()
         access_ok     = Signal()
@@ -1505,6 +1618,7 @@ class DCache(Elaboratable):
         comb += self.stall_out.eq(r0_stall)
 
         # Wire up wishbone request latch out of stage 1
         comb += self.stall_out.eq(r0_stall)
 
         # Wire up wishbone request latch out of stage 1
+        comb += r1.wb.adr.eq(r1.real_adr[ROW_OFF_BITS:]) # truncate LSBs
         comb += self.wb_out.eq(r1.wb)
 
         # call sub-functions putting everything together, using shared
         comb += self.wb_out.eq(r1.wb)
 
         # call sub-functions putting everything together, using shared
@@ -1520,9 +1634,10 @@ class DCache(Elaboratable):
                         tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                         dtlb_tags, tlb_pte_way, dtlb_ptes)
         self.maybe_plrus(m, r1, plru_victim)
                         tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                         dtlb_tags, tlb_pte_way, dtlb_ptes)
         self.maybe_plrus(m, r1, plru_victim)
+        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
-                           r0_valid, r1, cache_valid_bits, replace_way,
+                           r0_valid, r1, cache_valids, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
@@ -1533,160 +1648,61 @@ class DCache(Elaboratable):
                            r0_valid, r0, reservation)
         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                            reservation, r0)
                            r0_valid, r0, reservation)
         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                            reservation, r0)
-        self.writeback_control(m, r1, cache_out)
-        self.rams(m, r1, early_req_row, cache_out, replace_way)
+        self.writeback_control(m, r1, cache_out_row)
+        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
-                        req_hit_way, req_index, access_ok,
+                        req_hit_way, req_index, req_tag, access_ok,
                         tlb_hit, tlb_hit_way, tlb_req_index)
         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         tlb_hit, tlb_hit_way, tlb_req_index)
         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
-                    cache_valid_bits, r0, replace_way,
+                    cache_valids, r0, replace_way,
                     req_hit_way, req_same_tag,
                          r0_valid, req_op, cache_tags, req_go, ra)
         #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
 
         return m
 
                     req_hit_way, req_same_tag,
                          r0_valid, req_op, cache_tags, req_go, ra)
         #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
 
         return m
 
+def dcache_load(dut, addr, nc=0):
+    yield dut.d_in.load.eq(1)
+    yield dut.d_in.nc.eq(nc)
+    yield dut.d_in.addr.eq(addr)
+    yield dut.d_in.byte_sel.eq(~0)
+    yield dut.d_in.valid.eq(1)
+    yield
+    yield dut.d_in.valid.eq(0)
+    yield dut.d_in.byte_sel.eq(0)
+    yield
+    while not (yield dut.d_out.valid):
+        yield
+    data = yield dut.d_out.data
+    return data
+
+
+def dcache_store(dut, addr, data, nc=0):
+    yield dut.d_in.load.eq(0)
+    yield dut.d_in.nc.eq(nc)
+    yield dut.d_in.data.eq(data)
+    yield dut.d_in.byte_sel.eq(~0)
+    yield dut.d_in.addr.eq(addr)
+    yield dut.d_in.valid.eq(1)
+    yield
+    yield dut.d_in.valid.eq(0)
+    yield dut.d_in.byte_sel.eq(0)
+    yield
+    while not (yield dut.d_out.valid):
+        yield
+
+
+def dcache_random_sim(dut):
+
+    # start with stack of zeros
+    sim_mem = [0] * 512
 
 
-# dcache_tb.vhdl
-#
-# entity dcache_tb is
-# end dcache_tb;
-#
-# architecture behave of dcache_tb is
-#     signal clk          : std_ulogic;
-#     signal rst          : std_ulogic;
-#
-#     signal d_in         : Loadstore1ToDcacheType;
-#     signal d_out        : DcacheToLoadstore1Type;
-#
-#     signal m_in         : MmuToDcacheType;
-#     signal m_out        : DcacheToMmuType;
-#
-#     signal wb_bram_in   : wishbone_master_out;
-#     signal wb_bram_out  : wishbone_slave_out;
-#
-#     constant clk_period : time := 10 ns;
-# begin
-#     dcache0: entity work.dcache
-#         generic map(
-#
-#             LINE_SIZE => 64,
-#             NUM_LINES => 4
-#             )
-#         port map(
-#             clk => clk,
-#             rst => rst,
-#             d_in => d_in,
-#             d_out => d_out,
-#             m_in => m_in,
-#             m_out => m_out,
-#             wishbone_out => wb_bram_in,
-#             wishbone_in => wb_bram_out
-#             );
-#
-#     -- BRAM Memory slave
-#     bram0: entity work.wishbone_bram_wrapper
-#         generic map(
-#             MEMORY_SIZE   => 1024,
-#             RAM_INIT_FILE => "icache_test.bin"
-#             )
-#         port map(
-#             clk => clk,
-#             rst => rst,
-#             wishbone_in => wb_bram_in,
-#             wishbone_out => wb_bram_out
-#             );
-#
-#     clk_process: process
-#     begin
-#         clk <= '0';
-#         wait for clk_period/2;
-#         clk <= '1';
-#         wait for clk_period/2;
-#     end process;
-#
-#     rst_process: process
-#     begin
-#         rst <= '1';
-#         wait for 2*clk_period;
-#         rst <= '0';
-#         wait;
-#     end process;
-#
-#     stim: process
-#     begin
-#     -- Clear stuff
-#     d_in.valid <= '0';
-#     d_in.load <= '0';
-#     d_in.nc <= '0';
-#     d_in.addr <= (others => '0');
-#     d_in.data <= (others => '0');
-#         m_in.valid <= '0';
-#         m_in.addr <= (others => '0');
-#         m_in.pte <= (others => '0');
-#
-#         wait for 4*clk_period;
-#     wait until rising_edge(clk);
-#
-#     -- Cacheable read of address 4
-#     d_in.load <= '1';
-#     d_in.nc <= '0';
-#         d_in.addr <= x"0000000000000004";
-#         d_in.valid <= '1';
-#     wait until rising_edge(clk);
-#         d_in.valid <= '0';
-#
-#     wait until rising_edge(clk) and d_out.valid = '1';
-#         assert d_out.data = x"0000000100000000"
-#         report "data @" & to_hstring(d_in.addr) &
-#         "=" & to_hstring(d_out.data) &
-#         " expected 0000000100000000"
-#         severity failure;
-# --      wait for clk_period;
-#
-#     -- Cacheable read of address 30
-#     d_in.load <= '1';
-#     d_in.nc <= '0';
-#         d_in.addr <= x"0000000000000030";
-#         d_in.valid <= '1';
-#     wait until rising_edge(clk);
-#         d_in.valid <= '0';
-#
-#     wait until rising_edge(clk) and d_out.valid = '1';
-#         assert d_out.data = x"0000000D0000000C"
-#         report "data @" & to_hstring(d_in.addr) &
-#         "=" & to_hstring(d_out.data) &
-#         " expected 0000000D0000000C"
-#         severity failure;
-#
-#     -- Non-cacheable read of address 100
-#     d_in.load <= '1';
-#     d_in.nc <= '1';
-#         d_in.addr <= x"0000000000000100";
-#         d_in.valid <= '1';
-#     wait until rising_edge(clk);
-#     d_in.valid <= '0';
-#     wait until rising_edge(clk) and d_out.valid = '1';
-#         assert d_out.data = x"0000004100000040"
-#         report "data @" & to_hstring(d_in.addr) &
-#         "=" & to_hstring(d_out.data) &
-#         " expected 0000004100000040"
-#         severity failure;
-#
-#     wait until rising_edge(clk);
-#     wait until rising_edge(clk);
-#     wait until rising_edge(clk);
-#     wait until rising_edge(clk);
-#
-#     std.env.finish;
-#     end process;
-# end;
-def dcache_sim(dut):
     # clear stuff
     yield dut.d_in.valid.eq(0)
     yield dut.d_in.load.eq(0)
     # clear stuff
     yield dut.d_in.valid.eq(0)
     yield dut.d_in.load.eq(0)
+    yield dut.d_in.priv_mode.eq(1)
     yield dut.d_in.nc.eq(0)
     yield dut.d_in.nc.eq(0)
-    yield dut.d_in.adrr.eq(0)
+    yield dut.d_in.addr.eq(0)
     yield dut.d_in.data.eq(0)
     yield dut.m_in.valid.eq(0)
     yield dut.m_in.addr.eq(0)
     yield dut.d_in.data.eq(0)
     yield dut.m_in.valid.eq(0)
     yield dut.m_in.addr.eq(0)
@@ -1696,48 +1712,97 @@ def dcache_sim(dut):
     yield
     yield
     yield
     yield
     yield
     yield
-    # wait_until rising_edge(clk)
-    yield
-    # Cacheable read of address 4
-    yield dut.d_in.load.eq(1)
+
+    print ()
+
+    for i in range(256):
+        addr = randint(0, 255)
+        data = randint(0, (1<<64)-1)
+        sim_mem[addr] = data
+        addr *= 8
+
+        print ("testing %x data %x" % (addr, data))
+
+        yield from dcache_load(dut, addr)
+        yield from dcache_store(dut, addr, data)
+
+        addr = randint(0, 255)
+        sim_data = sim_mem[addr]
+        addr *= 8
+
+        data = yield from dcache_load(dut, addr)
+        assert data == sim_data, \
+            "check %x data %x != %x" % (addr, data, sim_data)
+
+    for addr in range(256):
+        data = yield from dcache_load(dut, addr*8)
+        assert data == sim_mem[addr], \
+            "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
+
+def dcache_sim(dut):
+    # clear stuff
+    yield dut.d_in.valid.eq(0)
+    yield dut.d_in.load.eq(0)
+    yield dut.d_in.priv_mode.eq(1)
     yield dut.d_in.nc.eq(0)
     yield dut.d_in.nc.eq(0)
-    yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
-    yield dut.d_in.valid.eq(1)
-    # wait-until rising_edge(clk)
+    yield dut.d_in.addr.eq(0)
+    yield dut.d_in.data.eq(0)
+    yield dut.m_in.valid.eq(0)
+    yield dut.m_in.addr.eq(0)
+    yield dut.m_in.pte.eq(0)
+    # wait 4 * clk_period
+    yield
+    yield
     yield
     yield
-    yield dut.d_in.valid.eq(0)
     yield
     yield
-    while not (yield dut.d_out.valid):
-        yield
-    assert dut.d_out.data == 0x0000000100000000, \
-        f"data @ {dut.d_in.addr}={dut.d_in.data} expected 0000000100000000"
 
 
+    # Cacheable read of address 4
+    data = yield from dcache_load(dut, 0x58)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000001700000016, \
+        f"data @%x=%x expected 0x0000001700000016" % (addr, data)
+
+    # Cacheable read of address 20
+    data = yield from dcache_load(dut, 0x20)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000000900000008, \
+        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
 
     # Cacheable read of address 30
 
     # Cacheable read of address 30
-    yield dut.d_in.load.eq(1)
-    yield dut.d_in.nc.eq(0)
-    yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
-    yield dut.d_in.valid.eq(1)
-    yield
-    yield dut.d_in.valid.eq(0)
-    yield
-    while not (yield dut.d_out.valid):
-        yield
-    assert dut.d_out.data == 0x0000000D0000000C, \
-        f"data @{dut.d_in.addr}={dut.d_out.data} expected 0000000D0000000C"
+    data = yield from dcache_load(dut, 0x530)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000014D0000014C, \
+        f"data @%x=%x expected 0000014D0000014C" % (addr, data)
+
+    # 2nd Cacheable read of address 30
+    data = yield from dcache_load(dut, 0x530)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000014D0000014C, \
+        f"data @%x=%x expected 0000014D0000014C" % (addr, data)
 
     # Non-cacheable read of address 100
 
     # Non-cacheable read of address 100
-    yield dut.d_in.load.eq(1)
-    yield dut.d_in.nc.eq(1)
-    yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
-    yield dut.d_in.valid.eq(1)
-    yield
-    yield dut.d_in.valid.eq(0)
-    yield
-    while not (yield dut.d_out.valid):
-        yield
-    assert dut.d_out.data == 0x0000004100000040, \
-        f"data @ {dut.d_in.addr}={dut.d_out.data} expected 0000004100000040"
+    data = yield from dcache_load(dut, 0x100, nc=1)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000004100000040, \
+        f"data @%x=%x expected 0000004100000040" % (addr, data)
+
+    # Store at address 530
+    yield from dcache_store(dut, 0x530, 0x121)
+
+    # Store at address 30
+    yield from dcache_store(dut, 0x530, 0x12345678)
+
+    # 3nd Cacheable read of address 530
+    data = yield from dcache_load(dut, 0x530)
+    addr = yield dut.d_in.addr
+    assert data == 0x12345678, \
+        f"data @%x=%x expected 0x12345678" % (addr, data)
+
+    # 4th Cacheable read of address 20
+    data = yield from dcache_load(dut, 0x20)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000000900000008, \
+        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
 
     yield
     yield
 
     yield
     yield
@@ -1745,14 +1810,44 @@ def dcache_sim(dut):
     yield
 
 
     yield
 
 
-def test_dcache():
+def test_dcache(mem, test_fn, test_name):
+    dut = DCache()
+
+    memory = Memory(width=64, depth=16*64, init=mem)
+    sram = SRAM(memory=memory, granularity=8)
+
+    m = Module()
+    m.submodules.dcache = dut
+    m.submodules.sram = sram
+
+    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
+    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
+    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
+    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
+    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
+    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
+
+    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
+    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(test_fn(dut)))
+    with sim.write_vcd('test_dcache%s.vcd' % test_name):
+        sim.run()
+
+if __name__ == '__main__':
     dut = DCache()
     vl = rtlil.convert(dut, ports=[])
     with open("test_dcache.il", "w") as f:
         f.write(vl)
 
     dut = DCache()
     vl = rtlil.convert(dut, ports=[])
     with open("test_dcache.il", "w") as f:
         f.write(vl)
 
-    #run_simulation(dut, dcache_sim(), vcd_name='test_dcache.vcd')
+    mem = []
+    for i in range(0,512):
+        mem.append((i*2)| ((i*2+1)<<32))
 
 
-if __name__ == '__main__':
-    test_dcache()
+    test_dcache(mem, dcache_sim, "")
+    test_dcache(None, dcache_random_sim, "random")