Allow more test cases to be run with CXXSim

[soc.git] / src / soc / experiment / dcache.py
diff --git a/src/soc/experiment/dcache.py b/src/soc/experiment/dcache.py

index 8756b58952b7205b271326995d185d1a873085c4..e1f82b77dc337467c1f9eeff306adc2ade4a7120 100644 (file)
--- a/src/soc/experiment/dcache.py
+++ b/src/soc/experiment/dcache.py
@@ -7,12 +7,13 @@ based on Anton Blanchard microwatt dcache.vhdl
  from enum import Enum, unique
  
  from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  from enum import Enum, unique
  
  from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
+from nmutil.util import Display
+
+from random import randint
+
  from nmigen.cli import main
  from nmutil.iocontrol import RecordObject
  from nmigen.utils import log2_int
  from nmigen.cli import main
  from nmutil.iocontrol import RecordObject
  from nmigen.utils import log2_int
-from nmigen.cli import rtlil
-
-
  from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                       DCacheToLoadStore1Type,
                                       MMUToDCacheType,
  from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                       DCacheToLoadStore1Type,
                                       MMUToDCacheType,
@@ -25,15 +26,27 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                  WBIOMasterOut, WBIOSlaveOut)
  
  from soc.experiment.cache_ram import CacheRam
                                  WBIOMasterOut, WBIOSlaveOut)
  
  from soc.experiment.cache_ram import CacheRam
-from soc.experiment.plru import PLRU
+#from soc.experiment.plru import PLRU
+from nmutil.plru import PLRU
+
+# for test
+from nmigen_soc.wishbone.sram import SRAM
+from nmigen import Memory
+from nmigen.cli import rtlil
+
+# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
+# Also, check out the cxxsim nmigen branch, and latest yosys from git
+from nmutil.sim_tmp_alternative import Simulator
+
+from nmutil.util import wrap
  
  
  # TODO: make these parameters of DCache at some point
  LINE_SIZE = 64    # Line size in bytes
  
  
  # TODO: make these parameters of DCache at some point
  LINE_SIZE = 64    # Line size in bytes
-NUM_LINES = 32    # Number of lines in a set
+NUM_LINES = 16    # Number of lines in a set
  NUM_WAYS = 4      # Number of ways
  TLB_SET_SIZE = 64 # L1 DTLB entries per set
  NUM_WAYS = 4      # Number of ways
  TLB_SET_SIZE = 64 # L1 DTLB entries per set
-TLB_NUM_WAYS = 2  # L1 DTLB number of sets
+TLB_NUM_WAYS = 4  # L1 DTLB number of sets
  TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  LOG_LENGTH = 0    # Non-zero to enable log data collection
  
  TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  LOG_LENGTH = 0    # Non-zero to enable log data collection
  
@@ -54,6 +67,10 @@ ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  # to represent the full dcache
  BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  
  # to represent the full dcache
  BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  
+print ("ROW_SIZE", ROW_SIZE)
+print ("ROW_PER_LINE", ROW_PER_LINE)
+print ("BRAM_ROWS", BRAM_ROWS)
+print ("NUM_WAYS", NUM_WAYS)
  
  # Bit fields counts in the address
  
  
  # Bit fields counts in the address
  
@@ -94,26 +111,39 @@ TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
  WAY_BITS = log2_int(NUM_WAYS)
  
  # Example of layout for 32 lines of 64 bytes:
  WAY_BITS = log2_int(NUM_WAYS)
  
  # Example of layout for 32 lines of 64 bytes:
-#
-# ..  tag    |index|  line  |
-# ..         |   row   |    |
-# ..         |     |---|    | ROW_LINE_BITS  (3)
-# ..         |     |--- - --| LINE_OFF_BITS (6)
-# ..         |         |- --| ROW_OFF_BITS  (3)
-# ..         |----- ---|    | ROW_BITS      (8)
-# ..         |-----|        | INDEX_BITS    (5)
-# .. --------|              | TAG_BITS      (45)
+layout = """\
+  ..  tag    |index|  line  |
+  ..         |   row   |    |
+  ..         |     |---|    | ROW_LINE_BITS  (3)
+  ..         |     |--- - --| LINE_OFF_BITS (6)
+  ..         |         |- --| ROW_OFF_BITS  (3)
+  ..         |----- ---|    | ROW_BITS      (8)
+  ..         |-----|        | INDEX_BITS    (5)
+  .. --------|              | TAG_BITS      (45)
+"""
+print (layout)
+print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
+            (TAG_BITS, INDEX_BITS, ROW_BITS,
+             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
+print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
+print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
+print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
  
  TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
  
  
  TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
  
+print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
+
  def CacheTagArray():
  def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))
+    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
+                        for x in range(NUM_LINES))
  
  def CacheValidBitsArray():
  
  def CacheValidBitsArray():
-    return Array(Signal(INDEX_BITS) for x in range(NUM_LINES))
+    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
+                        for x in range(NUM_LINES))
  
  def RowPerLineValidArray():
  
  def RowPerLineValidArray():
-    return Array(Signal() for x in range(ROW_PER_LINE))
+    return Array(Signal(name="rows_valid%d" % x) \
+                        for x in range(ROW_PER_LINE))
  
  # L1 TLB
  TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
  
  # L1 TLB
  TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
@@ -123,10 +153,13 @@ TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
  TLB_PTE_BITS     = 64
  TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
  
  TLB_PTE_BITS     = 64
  TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
  
+def ispow2(x):
+    """Return True when x is an exact power of two.
+
+    log2_int is called with its second argument False so that it does not
+    reject non-powers of two (presumably it then rounds the result up -
+    confirm against nmigen.utils.log2_int).  Shifting 1 left by that
+    result only reproduces x when x is an exact power of two.
+    """
+    return (1<<log2_int(x, False)) == x
+
  assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
  assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
-assert (LINE_SIZE % 2) == 0, "LINE_SIZE not power of 2"
-assert (NUM_LINES % 2) == 0, "NUM_LINES not power of 2"
-assert (ROW_PER_LINE % 2) == 0, "ROW_PER_LINE not power of 2"
+assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
+assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
+assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
  assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
  assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
          "geometry bits don't add up"
  assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
  assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
          "geometry bits don't add up"
@@ -139,31 +172,39 @@ assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
  
  
  def TLBValidBitsArray():
  
  
  def TLBValidBitsArray():
-    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
+    return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
+                for x in range(TLB_SET_SIZE))
  
  def TLBTagEAArray():
  
  def TLBTagEAArray():
-    return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
+    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
+                for x in range (TLB_NUM_WAYS))
  
  def TLBTagsArray():
  
  def TLBTagsArray():
-    return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
+    return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
+                for x in range (TLB_SET_SIZE))
  
  def TLBPtesArray():
  
  def TLBPtesArray():
-    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
+    return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
+                for x in range(TLB_SET_SIZE))
  
  def HitWaySet():
  
  def HitWaySet():
-    return Array(Signal(NUM_WAYS) for x in range(TLB_NUM_WAYS))
+    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
+                        for x in range(TLB_NUM_WAYS))
  
  # Cache RAM interface
  def CacheRamOut():
  
  # Cache RAM interface
  def CacheRamOut():
-    return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))
+    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
+                 for x in range(NUM_WAYS))
  
  # PLRU output interface
  def PLRUOut():
  
  # PLRU output interface
  def PLRUOut():
-    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
+    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
+                for x in range(NUM_LINES))
  
  # TLB PLRU output interface
  def TLBPLRUOut():
  
  # TLB PLRU output interface
  def TLBPLRUOut():
-    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
+    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
+                for x in range(TLB_SET_SIZE))
  
  # Helper functions to decode incoming requests
  #
  
  # Helper functions to decode incoming requests
  #
@@ -177,7 +218,7 @@ def get_row(addr):
  
  # Return the index of a row within a line
  def get_row_of_line(row):
  
  # Return the index of a row within a line
  def get_row_of_line(row):
-    return row[:ROW_LINE_BITS]
+    return row[:ROW_BITS][:ROW_LINE_BITS]
  
  # Returns whether this is the last row of a line
  def is_last_row_addr(addr, last):
  
  # Returns whether this is the last row of a line
  def is_last_row_addr(addr, last):
@@ -221,8 +262,8 @@ def write_tlb_pte(way, ptes, newpte):
  
  # Record for storing permission, attribute, etc. bits from a PTE
  class PermAttr(RecordObject):
  
  # Record for storing permission, attribute, etc. bits from a PTE
  class PermAttr(RecordObject):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name=None):
+        super().__init__(name=name)
          self.reference = Signal()
          self.changed   = Signal()
          self.nocache   = Signal()
          self.reference = Signal()
          self.changed   = Signal()
          self.nocache   = Signal()
@@ -233,12 +274,6 @@ class PermAttr(RecordObject):
  
  def extract_perm_attr(pte):
      pa = PermAttr()
  
  def extract_perm_attr(pte):
      pa = PermAttr()
-    pa.reference = pte[8]
-    pa.changed   = pte[7]
-    pa.nocache   = pte[5]
-    pa.priv      = pte[3]
-    pa.rd_perm   = pte[2]
-    pa.wr_perm   = pte[1]
      return pa;
  
  
      return pa;
  
  
@@ -282,9 +317,9 @@ class State(Enum):
  # Stage 0 register, basically contains just the latched request
  
  class RegStage0(RecordObject):
  # Stage 0 register, basically contains just the latched request
  
  class RegStage0(RecordObject):
-    def __init__(self):
-        super().__init__()
-        self.req     = LoadStore1ToDCacheType()
+    def __init__(self, name=None):
+        super().__init__(name=name)
+        self.req     = LoadStore1ToDCacheType(name="lsmem")
          self.tlbie   = Signal()
          self.doall   = Signal()
          self.tlbld   = Signal()
          self.tlbie   = Signal()
          self.doall   = Signal()
          self.tlbld   = Signal()
@@ -292,8 +327,8 @@ class RegStage0(RecordObject):
  
  
  class MemAccessRequest(RecordObject):
  
  
  class MemAccessRequest(RecordObject):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name=None):
+        super().__init__(name=name)
          self.op        = Signal(Op)
          self.valid     = Signal()
          self.dcbz      = Signal()
          self.op        = Signal(Op)
          self.valid     = Signal()
          self.dcbz      = Signal()
@@ -308,17 +343,17 @@ class MemAccessRequest(RecordObject):
  # First stage register, contains state for stage 1 of load hits
  # and for the state machine used by all other operations
  class RegStage1(RecordObject):
  # First stage register, contains state for stage 1 of load hits
  # and for the state machine used by all other operations
  class RegStage1(RecordObject):
-    def __init__(self):
-        super().__init__()
+    def __init__(self, name=None):
+        super().__init__(name=name)
          # Info about the request
          self.full             = Signal() # have uncompleted request
          self.mmu_req          = Signal() # request is from MMU
          # Info about the request
          self.full             = Signal() # have uncompleted request
          self.mmu_req          = Signal() # request is from MMU
-        self.req              = MemAccessRequest()
+        self.req              = MemAccessRequest(name="reqmem")
  
          # Cache hit state
          self.hit_way          = Signal(WAY_BITS)
          self.hit_load_valid   = Signal()
  
          # Cache hit state
          self.hit_way          = Signal(WAY_BITS)
          self.hit_load_valid   = Signal()
-        self.hit_index        = Signal(NUM_LINES)
+        self.hit_index        = Signal(INDEX_BITS)
          self.cache_hit        = Signal()
  
          # TLB hit state
          self.cache_hit        = Signal()
  
          # TLB hit state
@@ -342,12 +377,13 @@ class RegStage1(RecordObject):
          self.write_bram       = Signal()
          self.write_tag        = Signal()
          self.slow_valid       = Signal()
          self.write_bram       = Signal()
          self.write_tag        = Signal()
          self.slow_valid       = Signal()
-        self.wb               = WBMasterOut()
+        self.real_adr         = Signal(REAL_ADDR_BITS)
+        self.wb               = WBMasterOut("wb")
          self.reload_tag       = Signal(TAG_BITS)
          self.store_way        = Signal(WAY_BITS)
          self.store_row        = Signal(ROW_BITS)
          self.store_index      = Signal(INDEX_BITS)
          self.reload_tag       = Signal(TAG_BITS)
          self.store_way        = Signal(WAY_BITS)
          self.store_row        = Signal(ROW_BITS)
          self.store_index      = Signal(INDEX_BITS)
-        self.end_row_ix       = Signal(log2_int(ROW_LINE_BITS, False))
+        self.end_row_ix       = Signal(ROW_LINE_BITS)
          self.rows_valid       = RowPerLineValidArray()
          self.acks_pending     = Signal(3)
          self.inc_acks         = Signal()
          self.rows_valid       = RowPerLineValidArray()
          self.acks_pending     = Signal(3)
          self.inc_acks         = Signal()
@@ -373,16 +409,15 @@ class Reservation(RecordObject):
  
  
  class DTLBUpdate(Elaboratable):
  
  
  class DTLBUpdate(Elaboratable):
-    def __init__(self, dtlb_valid_bits, dtlb_ptes):
+    def __init__(self):
          self.tlbie    = Signal()
          self.tlbwe    = Signal()
          self.doall    = Signal()
          self.tlbie    = Signal()
          self.tlbwe    = Signal()
          self.doall    = Signal()
+        self.updated  = Signal()
+        self.v_updated  = Signal()
          self.tlb_hit    = Signal()
          self.tlb_req_index = Signal(TLB_SET_BITS)
  
          self.tlb_hit    = Signal()
          self.tlb_req_index = Signal(TLB_SET_BITS)
  
-        self.dtlb_valid_bits = dtlb_valid_bits
-        self.dtlb_ptes       = dtlb_ptes
-
          self.tlb_hit_way     = Signal(TLB_WAY_BITS)
          self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
          self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
          self.tlb_hit_way     = Signal(TLB_WAY_BITS)
          self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
          self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
@@ -390,6 +425,12 @@ class DTLBUpdate(Elaboratable):
          self.eatag           = Signal(TLB_EA_TAG_BITS)
          self.pte_data        = Signal(TLB_PTE_BITS)
  
          self.eatag           = Signal(TLB_EA_TAG_BITS)
          self.pte_data        = Signal(TLB_PTE_BITS)
  
+        # current valid bits of the addressed TLB set, one bit per way
+        # (driven by the parent from dtlb_valid_bits[tlb_req_index])
+        self.dv = Signal(TLB_NUM_WAYS)
+
+        # updated values for the addressed TLB set, handed back to the
+        # parent which owns the tag / PTE / valid-bit arrays.  Widths must
+        # match those arrays: pb_out carries the full PTE set, db_out
+        # carries one valid bit per way.
+        self.tb_out = Signal(TLB_TAG_WAY_BITS)
+        self.pb_out = Signal(TLB_PTE_WAY_BITS)
+        self.db_out = Signal(TLB_NUM_WAYS)
+
      def elaborate(self, platform):
          m = Module()
          comb = m.d.comb
      def elaborate(self, platform):
          m = Module()
          comb = m.d.comb
@@ -398,32 +439,127 @@ class DTLBUpdate(Elaboratable):
          tagset   = Signal(TLB_TAG_WAY_BITS)
          pteset   = Signal(TLB_PTE_WAY_BITS)
  
          tagset   = Signal(TLB_TAG_WAY_BITS)
          pteset   = Signal(TLB_PTE_WAY_BITS)
  
-        vb = Signal(TLB_NUM_WAYS)
-        db = Signal(TLB_PTE_WAY_BITS)
-
-        sync += vb.eq(self.dtlb_valid_bits[self.tlb_req_index])
-        sync += db.eq(self.dtlb_ptes[self.tlb_req_index])
+        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
  
          with m.If(self.tlbie & self.doall):
  
          with m.If(self.tlbie & self.doall):
-            # clear all valid bits at once
-            for i in range(TLB_SET_SIZE):
-                sync += self.dtlb_valid_bits[i].eq(0)
-
+            pass # clear all back in parent
          with m.Elif(self.tlbie):
              with m.If(self.tlb_hit):
          with m.Elif(self.tlbie):
              with m.If(self.tlb_hit):
-                sync += vb.bit_select(self.tlb_hit_way, 1).eq(Const(0, 1))
+                # invalidate only the way that hit: start from the set's
+                # current valid bits and clear (not set) that single bit
+                comb += db_out.eq(self.dv)
+                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
+                comb += self.v_updated.eq(1)
  
          with m.Elif(self.tlbwe):
  
              comb += tagset.eq(self.tlb_tag_way)
              comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
  
          with m.Elif(self.tlbwe):
  
              comb += tagset.eq(self.tlb_tag_way)
              comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
-            sync += db.eq(tagset)
+            comb += tb_out.eq(tagset)
  
              comb += pteset.eq(self.tlb_pte_way)
              comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
  
              comb += pteset.eq(self.tlb_pte_way)
              comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
-            sync += db.eq(pteset)
+            comb += pb_out.eq(pteset)
+
+            # mark only the replaced way valid: seed with the set's current
+            # valid bits so the other ways are not invalidated when the
+            # parent stores db_out back
+            comb += db_out.eq(self.dv)
+            comb += db_out.bit_select(self.repl_way, 1).eq(1)
+
+            comb += self.updated.eq(1)
+            comb += self.v_updated.eq(1)
+
+        return m
+
+
+class DCachePendingHit(Elaboratable):
+    """Tests whether the pending request hits on any cache way.
+
+    All logic in elaborate() is combinatorial.  Signals created here:
+
+    * go, virt_mode, tlb_hit, reload_tag: read by elaborate()
+    * is_hit, hit_way, rel_match: driven by elaborate()
+    * req_index: declared, but not used by elaborate()
+
+    The constructor arguments are stored unmodified.  elaborate() drives
+    hit_set (per TLB way, in virtual mode) and only reads the others.
+    """
+
+    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
+                      cache_valid_idx, cache_tag_set,
+                    req_addr,
+                    hit_set):
+
+        self.go          = Signal()
+        self.virt_mode   = Signal()
+        self.is_hit      = Signal()
+        self.tlb_hit     = Signal()
+        self.hit_way     = Signal(WAY_BITS)
+        self.rel_match   = Signal()
+        self.req_index   = Signal(INDEX_BITS)
+        self.reload_tag  = Signal(TAG_BITS)
+
+        self.tlb_hit_way = tlb_hit_way
+        self.tlb_pte_way = tlb_pte_way
+        self.tlb_valid_way = tlb_valid_way
+        self.cache_valid_idx = cache_valid_idx
+        self.cache_tag_set = cache_tag_set
+        self.req_addr = req_addr
+        self.hit_set = hit_set
+
+    def elaborate(self, platform):
+        m = Module()
+        comb = m.d.comb
+        sync = m.d.sync
+
+        go = self.go
+        virt_mode = self.virt_mode
+        is_hit = self.is_hit
+        tlb_pte_way = self.tlb_pte_way
+        tlb_valid_way = self.tlb_valid_way
+        cache_valid_idx = self.cache_valid_idx
+        cache_tag_set = self.cache_tag_set
+        req_addr = self.req_addr
+        tlb_hit_way = self.tlb_hit_way
+        tlb_hit = self.tlb_hit
+        hit_set = self.hit_set
+        hit_way = self.hit_way
+        rel_match = self.rel_match
+        req_index = self.req_index
+        reload_tag = self.reload_tag
+
+        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
+                                    for i in range(TLB_NUM_WAYS))
+        hit_way_set = HitWaySet()
+
+        # Test if pending request is a hit on any way
+        # In order to make timing in virtual mode,
+        # when we are using the TLB, we compare each
+        # way with each of the real addresses from each way of
+        # the TLB, and then decide later which match to use.
+
+        with m.If(virt_mode):
+            for j in range(TLB_NUM_WAYS):
+                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
+                s_hit       = Signal()
+                s_pte       = Signal(TLB_PTE_BITS)
+                s_ra        = Signal(REAL_ADDR_BITS)
+                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
+                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
+                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
+                comb += s_tag.eq(get_tag(s_ra))
  
  
-            sync += vb.bit_select(self.repl_way, 1).eq(1)
+                for i in range(NUM_WAYS):
+                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
+                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
+                                  (read_tag(i, cache_tag_set) == s_tag)
+                                  & tlb_valid_way[j])
+                    with m.If(is_tag_hit):
+                        comb += hit_way_set[j].eq(i)
+                        comb += s_hit.eq(1)
+                comb += hit_set[j].eq(s_hit)
+                with m.If(s_tag == reload_tag):
+                    comb += rel_matches[j].eq(1)
+            with m.If(tlb_hit):
+                comb += is_hit.eq(hit_set[tlb_hit_way])
+                comb += hit_way.eq(hit_way_set[tlb_hit_way])
+                comb += rel_match.eq(rel_matches[tlb_hit_way])
+        with m.Else():
+            s_tag       = Signal(TAG_BITS)
+            comb += s_tag.eq(get_tag(req_addr))
+            for i in range(NUM_WAYS):
+                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
+                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
+                          (read_tag(i, cache_tag_set) == s_tag))
+                with m.If(is_tag_hit):
+                    comb += hit_way.eq(i)
+                    comb += is_hit.eq(1)
+            with m.If(s_tag == reload_tag):
+                comb += rel_match.eq(1)
  
          return m
  
  
          return m
  
@@ -437,11 +573,11 @@ class DCache(Elaboratable):
        while not idle...)
      """
      def __init__(self):
        while not idle...)
      """
      def __init__(self):
-        self.d_in      = LoadStore1ToDCacheType()
-        self.d_out     = DCacheToLoadStore1Type()
+        self.d_in      = LoadStore1ToDCacheType("d_in")
+        self.d_out     = DCacheToLoadStore1Type("d_out")
  
  
-        self.m_in      = MMUToDCacheType()
-        self.m_out     = DCacheToMMUType()
+        self.m_in      = MMUToDCacheType("m_in")
+        self.m_out     = DCacheToMMUType("m_out")
  
          self.stall_out = Signal()
  
  
          self.stall_out = Signal()
  
@@ -457,12 +593,11 @@ class DCache(Elaboratable):
          sync = m.d.sync
          d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
  
          sync = m.d.sync
          d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
  
-        r = RegStage0()
+        r = RegStage0("stage0")
  
          # TODO, this goes in unit tests and formal proofs
  
          # TODO, this goes in unit tests and formal proofs
-        with m.If(~(d_in.valid & m_in.valid)):
-            #sync += Display("request collision loadstore vs MMU")
-            pass
+        with m.If(d_in.valid & m_in.valid):
+            sync += Display("request collision loadstore vs MMU")
  
          with m.If(m_in.valid):
              sync += r.req.valid.eq(1)
  
          with m.If(m_in.valid):
              sync += r.req.valid.eq(1)
@@ -470,7 +605,7 @@ class DCache(Elaboratable):
              sync += r.req.dcbz.eq(0)
              sync += r.req.nc.eq(0)
              sync += r.req.reserve.eq(0)
              sync += r.req.dcbz.eq(0)
              sync += r.req.nc.eq(0)
              sync += r.req.reserve.eq(0)
-            sync += r.req.virt_mode.eq(1)
+            sync += r.req.virt_mode.eq(0)
              sync += r.req.priv_mode.eq(1)
              sync += r.req.addr.eq(m_in.addr)
              sync += r.req.data.eq(m_in.pte)
              sync += r.req.priv_mode.eq(1)
              sync += r.req.addr.eq(m_in.addr)
              sync += r.req.data.eq(m_in.pte)
@@ -519,33 +654,24 @@ class DCache(Elaboratable):
              sync += tlb_tag_way.eq(dtlb_tags[index])
              sync += tlb_pte_way.eq(dtlb_ptes[index])
  
              sync += tlb_tag_way.eq(dtlb_tags[index])
              sync += tlb_pte_way.eq(dtlb_ptes[index])
  
-    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, acc, acc_en, lru):
+    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
          """Generate TLB PLRUs
          """
          comb = m.d.comb
          sync = m.d.sync
  
          """Generate TLB PLRUs
          """
          comb = m.d.comb
          sync = m.d.sync
  
-        with m.If(TLB_NUM_WAYS > 1):
-            for i in range(TLB_SET_SIZE):
-                # TLB PLRU interface
-                tlb_plru        = PLRU(TLB_WAY_BITS)
-                setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
-                tlb_plru_acc    = Signal(TLB_WAY_BITS)
-                tlb_plru_acc_en = Signal()
-                tlb_plru_out    = Signal(TLB_WAY_BITS)
-
-                comb += tlb_plru.acc.eq(tlb_plru_acc)
-                comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
-                comb += tlb_plru.lru.eq(tlb_plru_out)
-
-                # PLRU interface
-                with m.If(r1.tlb_hit_index == i):
-                    comb += tlb_plru.acc_en.eq(r1.tlb_hit)
-                with m.Else():
-                    comb += tlb_plru.acc_en.eq(0)
-                comb += tlb_plru.acc.eq(r1.tlb_hit_way)
+        # a PLRU is only needed when there is more than one way to choose
+        # a victim from (the code this replaces tested TLB_NUM_WAYS > 1)
+        if TLB_NUM_WAYS < 2:
+            return
+        for i in range(TLB_SET_SIZE):
+            # TLB PLRU interface
+            tlb_plru        = PLRU(TLB_WAY_BITS)
+            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
+            tlb_plru_acc_en = Signal()
  
  
-                comb += tlb_plru_victim[i].eq(tlb_plru.lru)
+            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
+            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
+            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
+            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
  
      def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                     tlb_valid_way, tlb_tag_way, tlb_hit_way,
  
      def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                     tlb_valid_way, tlb_tag_way, tlb_hit_way,
@@ -563,8 +689,10 @@ class DCache(Elaboratable):
          comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
  
          for i in range(TLB_NUM_WAYS):
          comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
  
          for i in range(TLB_NUM_WAYS):
-            with m.If(tlb_valid_way[i]
-                      & read_tlb_tag(i, tlb_tag_way) == eatag):
+            is_tag_hit = Signal()
+            comb += is_tag_hit.eq(tlb_valid_way[i]
+                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
+            with m.If(is_tag_hit):
                  comb += hitway.eq(i)
                  comb += hit.eq(1)
  
                  comb += hitway.eq(i)
                  comb += hit.eq(1)
  
@@ -580,15 +708,20 @@ class DCache(Elaboratable):
              comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                                r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                                pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
              comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                                r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                                pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
-            comb += perm_attr.eq(extract_perm_attr(pte))
+            comb += perm_attr.reference.eq(pte[8])
+            comb += perm_attr.changed.eq(pte[7])
+            comb += perm_attr.nocache.eq(pte[5])
+            comb += perm_attr.priv.eq(pte[3])
+            comb += perm_attr.rd_perm.eq(pte[2])
+            comb += perm_attr.wr_perm.eq(pte[1])
          with m.Else():
              comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                                r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
  
              comb += perm_attr.reference.eq(1)
              comb += perm_attr.changed.eq(1)
          with m.Else():
              comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                                r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
  
              comb += perm_attr.reference.eq(1)
              comb += perm_attr.changed.eq(1)
-            comb += perm_attr.priv.eq(1)
              comb += perm_attr.nocache.eq(0)
              comb += perm_attr.nocache.eq(0)
+            comb += perm_attr.priv.eq(1)
              comb += perm_attr.rd_perm.eq(1)
              comb += perm_attr.wr_perm.eq(1)
  
              comb += perm_attr.rd_perm.eq(1)
              comb += perm_attr.wr_perm.eq(1)
  
@@ -597,6 +730,7 @@ class DCache(Elaboratable):
                      dtlb_tags, tlb_pte_way, dtlb_ptes):
  
          comb = m.d.comb
                      dtlb_tags, tlb_pte_way, dtlb_ptes):
  
          comb = m.d.comb
+        sync = m.d.sync
  
          tlbie    = Signal()
          tlbwe    = Signal()
  
          tlbie    = Signal()
          tlbwe    = Signal()
@@ -604,7 +738,19 @@ class DCache(Elaboratable):
          comb += tlbie.eq(r0_valid & r0.tlbie)
          comb += tlbwe.eq(r0_valid & r0.tlbld)
  
          comb += tlbie.eq(r0_valid & r0.tlbie)
          comb += tlbwe.eq(r0_valid & r0.tlbld)
  
-        m.submodules.tlb_update = d = DTLBUpdate(dtlb_valid_bits, dtlb_ptes)
+        m.submodules.tlb_update = d = DTLBUpdate()
+        with m.If(tlbie & r0.doall):
+            # clear all valid bits at once
+            for i in range(TLB_SET_SIZE):
+                sync += dtlb_valid_bits[i].eq(0)
+        with m.If(d.updated):
+            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
+            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
+        with m.If(d.v_updated):
+            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
+
+        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
+
          comb += d.tlbie.eq(tlbie)
          comb += d.tlbwe.eq(tlbwe)
          comb += d.doall.eq(r0.doall)
          comb += d.tlbie.eq(tlbie)
          comb += d.tlbwe.eq(tlbwe)
          comb += d.doall.eq(r0.doall)
@@ -627,23 +773,19 @@ class DCache(Elaboratable):
          comb = m.d.comb
          sync = m.d.sync
  
          comb = m.d.comb
          sync = m.d.sync
  
+        # these PLRUs select a cache way (sized by WAY_BITS), so the guard
+        # must test the cache's way count, not the TLB's
+        if NUM_WAYS < 2:
+            return
+
          for i in range(NUM_LINES):
              # PLRU interface
          for i in range(NUM_LINES):
              # PLRU interface
-            plru        = PLRU(TLB_WAY_BITS)
+            plru        = PLRU(WAY_BITS)
              setattr(m.submodules, "plru%d" % i, plru)
              setattr(m.submodules, "plru%d" % i, plru)
-            plru_acc    = Signal(WAY_BITS)
              plru_acc_en = Signal()
              plru_acc_en = Signal()
-            plru_out    = Signal(WAY_BITS)
  
  
-            comb += plru.acc.eq(plru_acc)
+            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
              comb += plru.acc_en.eq(plru_acc_en)
              comb += plru.acc_en.eq(plru_acc_en)
-            comb += plru_out.eq(plru.lru_o)
-
-            with m.If(r1.hit_index == i):
-                comb += plru_acc_en.eq(r1.cache_hit)
-
-            comb += plru_acc.eq(r1.hit_way)
-            comb += plru_victim[i].eq(plru_out)
+            comb += plru.acc_i.eq(r1.hit_way)
+            comb += plru_victim[i].eq(plru.lru_o)
  
      def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
          """Cache tag RAM read port
  
      def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
          """Cache tag RAM read port
@@ -663,7 +805,7 @@ class DCache(Elaboratable):
          sync += cache_tag_set.eq(cache_tags[index])
  
      def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
          sync += cache_tag_set.eq(cache_tags[index])
  
      def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
-                       r0_valid, r1, cache_valid_bits, replace_way,
+                       r0_valid, r1, cache_valids, replace_way,
                         use_forward1_next, use_forward2_next,
                         req_hit_way, plru_victim, rc_ok, perm_attr,
                         valid_ra, perm_ok, access_ok, req_op, req_go,
                         use_forward1_next, use_forward2_next,
                         req_hit_way, plru_victim, rc_ok, perm_attr,
                         valid_ra, perm_ok, access_ok, req_op, req_go,
@@ -683,72 +825,50 @@ class DCache(Elaboratable):
          opsel       = Signal(3)
          go          = Signal()
          nc          = Signal()
          opsel       = Signal(3)
          go          = Signal()
          nc          = Signal()
-        s_hit       = Signal()
-        s_tag       = Signal(TAG_BITS)
-        s_pte       = Signal(TLB_PTE_BITS)
-        s_ra        = Signal(REAL_ADDR_BITS)
-        hit_set     = Signal(TLB_NUM_WAYS)
-        hit_way_set = HitWaySet()
-        rel_matches = Signal(TLB_NUM_WAYS)
-        rel_match   = Signal()
+        hit_set     = Array(Signal(name="hit_set_%d" % i) \
+                                  for i in range(TLB_NUM_WAYS))
+        cache_valid_idx = Signal(NUM_WAYS)
  
          # Extract line, row and tag from request
          comb += req_index.eq(get_index(r0.req.addr))
          comb += req_row.eq(get_row(r0.req.addr))
          comb += req_tag.eq(get_tag(ra))
  
  
          # Extract line, row and tag from request
          comb += req_index.eq(get_index(r0.req.addr))
          comb += req_row.eq(get_row(r0.req.addr))
          comb += req_tag.eq(get_tag(ra))
  
-        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
-
-        # Test if pending request is a hit on any way
-        # In order to make timing in virtual mode,
-        # when we are using the TLB, we compare each
-        # way with each of the real addresses from each way of
-        # the TLB, and then decide later which match to use.
-
-        with m.If(r0.req.virt_mode):
-            comb += rel_matches.eq(0)
-            for j in range(TLB_NUM_WAYS):
-                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
-                comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
-                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
-                comb += s_tag.eq(get_tag(s_ra))
+        if False: # display on comb is a bit... busy.
+            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
+                    r0.req.addr, ra, req_index, req_tag, req_row)
  
  
-                for i in range(NUM_WAYS):
-                    with m.If(go & cache_valid_bits[req_index][i] &
-                              (read_tag(i, cache_tag_set) == s_tag)
-                              & tlb_valid_way[j]):
-                        comb += hit_way_set[j].eq(i)
-                        comb += s_hit.eq(1)
-                comb += hit_set[j].eq(s_hit)
-                with m.If(s_tag == r1.reload_tag):
-                    comb += rel_matches[j].eq(1)
-            with m.If(tlb_hit):
-                comb += is_hit.eq(hit_set.bit_select(tlb_hit_way, 1))
-                comb += hit_way.eq(hit_way_set[tlb_hit_way])
-                comb += rel_match.eq(rel_matches.bit_select(tlb_hit_way, 1))
-        with m.Else():
-            comb += s_tag.eq(get_tag(r0.req.addr))
-            for i in range(NUM_WAYS):
-                with m.If(go & cache_valid_bits[req_index][i] &
-                          read_tag(i, cache_tag_set) == s_tag):
-                    comb += hit_way.eq(i)
-                    comb += is_hit.eq(1)
-            with m.If(s_tag == r1.reload_tag):
-                comb += rel_match.eq(1)
-        comb += req_same_tag.eq(rel_match)
+        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
+        comb += cache_valid_idx.eq(cache_valids[req_index])
+
+        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
+                                tlb_valid_way, tlb_hit_way,
+                                cache_valid_idx, cache_tag_set,
+                                r0.req.addr,
+                                hit_set)
+
+        comb += dc.tlb_hit.eq(tlb_hit)
+        comb += dc.reload_tag.eq(r1.reload_tag)
+        comb += dc.virt_mode.eq(r0.req.virt_mode)
+        comb += dc.go.eq(go)
+        comb += dc.req_index.eq(req_index)
+        comb += is_hit.eq(dc.is_hit)
+        comb += hit_way.eq(dc.hit_way)
+        comb += req_same_tag.eq(dc.rel_match)
  
          # See if the request matches the line currently being reloaded
          with m.If((r1.state == State.RELOAD_WAIT_ACK) &
  
          # See if the request matches the line currently being reloaded
          with m.If((r1.state == State.RELOAD_WAIT_ACK) &
-                  (req_index == r1.store_index) & rel_match):
+                  (req_index == r1.store_index) & req_same_tag):
              # For a store, consider this a hit even if the row isn't
              # valid since it will be by the time we perform the store.
              # For a load, check the appropriate row valid bit.
              # For a store, consider this a hit even if the row isn't
              # valid since it will be by the time we perform the store.
              # For a load, check the appropriate row valid bit.
-            valid = r1.rows_valid[req_row % ROW_PER_LINE]
+            rrow = Signal(ROW_LINE_BITS)
+            comb += rrow.eq(req_row)
+            valid = r1.rows_valid[rrow]
              comb += is_hit.eq(~r0.req.load | valid)
              comb += hit_way.eq(replace_way)
  
          # Whether to use forwarded data for a load or not
              comb += is_hit.eq(~r0.req.load | valid)
              comb += hit_way.eq(replace_way)
  
          # Whether to use forwarded data for a load or not
-        comb += use_forward1_next.eq(0)
          with m.If((get_row(r1.req.real_addr) == req_row) &
                    (r1.req.hit_way == hit_way)):
              # Only need to consider r1.write_bram here, since if we
          with m.If((get_row(r1.req.real_addr) == req_row) &
                    (r1.req.hit_way == hit_way)):
              # Only need to consider r1.write_bram here, since if we
@@ -760,7 +880,6 @@ class DCache(Elaboratable):
              # cycles after the refill starts before we see the updated
              # cache tag. In that case we don't use the bypass.)
              comb += use_forward1_next.eq(r1.write_bram)
              # cycles after the refill starts before we see the updated
              # cache tag. In that case we don't use the bypass.)
              comb += use_forward1_next.eq(r1.write_bram)
-        comb += use_forward2_next.eq(0)
          with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
              comb += use_forward2_next.eq(r1.forward_valid1)
  
          with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
              comb += use_forward2_next.eq(r1.forward_valid1)
  
@@ -769,7 +888,7 @@ class DCache(Elaboratable):
  
          # The way to replace on a miss
          with m.If(r1.write_tag):
  
          # The way to replace on a miss
          with m.If(r1.write_tag):
-            replace_way.eq(plru_victim[r1.store_index])
+            comb += replace_way.eq(plru_victim[r1.store_index])
          with m.Else():
              comb += replace_way.eq(r1.store_way)
  
          with m.Else():
              comb += replace_way.eq(r1.store_way)
  
@@ -778,10 +897,9 @@ class DCache(Elaboratable):
          comb += rc_ok.eq(perm_attr.reference
                           & (r0.req.load | perm_attr.changed)
                  )
          comb += rc_ok.eq(perm_attr.reference
                           & (r0.req.load | perm_attr.changed)
                  )
-        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
-                           & perm_attr.wr_perm
-                           | (r0.req.load & perm_attr.rd_perm)
-                          )
+        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
+                           (perm_attr.wr_perm |
+                              (r0.req.load & perm_attr.rd_perm)))
          comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
          # Combine the request and cache hit status to decide what
          # operation needs to be done
          comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
          # Combine the request and cache hit status to decide what
          # operation needs to be done
@@ -795,24 +913,14 @@ class DCache(Elaboratable):
              with m.Else():
                  comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                  with m.Switch(opsel):
              with m.Else():
                  comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                  with m.Switch(opsel):
-                    with m.Case(0b101):
-                        comb += op.eq(Op.OP_LOAD_HIT)
-                    with m.Case(0b100):
-                        comb += op.eq(Op.OP_LOAD_MISS)
-                    with m.Case(0b110):
-                        comb += op.eq(Op.OP_LOAD_NC)
-                    with m.Case(0b001):
-                        comb += op.eq(Op.OP_STORE_HIT)
-                    with m.Case(0b000):
-                        comb += op.eq(Op.OP_STORE_MISS)
-                    with m.Case(0b010):
-                        comb += op.eq(Op.OP_STORE_MISS)
-                    with m.Case(0b011):
-                        comb += op.eq(Op.OP_BAD)
-                    with m.Case(0b111):
-                        comb += op.eq(Op.OP_BAD)
-                    with m.Default():
-                        comb += op.eq(Op.OP_NONE)
+                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
+                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
+                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
+                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
+                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
+                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
+                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
+                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
          comb += req_op.eq(op)
          comb += req_go.eq(go)
  
          comb += req_op.eq(op)
          comb += req_go.eq(go)
  
@@ -836,14 +944,14 @@ class DCache(Elaboratable):
          sync = m.d.sync
  
          with m.If(r0_valid & r0.req.reserve):
          sync = m.d.sync
  
          with m.If(r0_valid & r0.req.reserve):
-
              # XXX generate alignment interrupt if address
              # is not aligned XXX or if r0.req.nc = '1'
              with m.If(r0.req.load):
                  comb += set_rsrv.eq(1) # load with reservation
              with m.Else():
                  comb += clear_rsrv.eq(1) # store conditional
              # XXX generate alignment interrupt if address
              # is not aligned XXX or if r0.req.nc = '1'
              with m.If(r0.req.load):
                  comb += set_rsrv.eq(1) # load with reservation
              with m.Else():
                  comb += clear_rsrv.eq(1) # store conditional
-                with m.If(~reservation.valid | r0.req.addr[LINE_OFF_BITS:64]):
+                with m.If(~reservation.valid |
+                         (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                      comb += cancel_store.eq(1)
  
      def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                      comb += cancel_store.eq(1)
  
      def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
@@ -859,7 +967,7 @@ class DCache(Elaboratable):
                  sync += reservation.valid.eq(1)
                  sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
  
                  sync += reservation.valid.eq(1)
                  sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
  
-    def writeback_control(self, m, r1, cache_out):
+    def writeback_control(self, m, r1, cache_out_row):
          """Return data for loads & completion control logic
          """
          comb = m.d.comb
          """Return data for loads & completion control logic
          """
          comb = m.d.comb
@@ -878,7 +986,7 @@ class DCache(Elaboratable):
          with m.Else():
              comb += data_fwd.eq(r1.forward_data2)
  
          with m.Else():
              comb += data_fwd.eq(r1.forward_data2)
  
-        comb += data_out.eq(cache_out[r1.hit_way])
+        comb += data_out.eq(cache_out_row)
  
          for i in range(8):
              with m.If(r1.forward_sel[i]):
  
          for i in range(8):
              with m.If(r1.forward_sel[i]):
@@ -919,35 +1027,32 @@ class DCache(Elaboratable):
              # Request came from loadstore1...
              # Load hit case is the standard path
              with m.If(r1.hit_load_valid):
              # Request came from loadstore1...
              # Load hit case is the standard path
              with m.If(r1.hit_load_valid):
-                #Display(f"completing load hit data={data_out}")
-                pass
+                sync += Display("completing load hit data=%x", data_out)
  
              # error cases complete without stalling
              with m.If(r1.ls_error):
  
              # error cases complete without stalling
              with m.If(r1.ls_error):
-                # Display("completing ld/st with error")
-                pass
+                sync += Display("completing ld/st with error")
  
              # Slow ops (load miss, NC, stores)
              with m.If(r1.slow_valid):
  
              # Slow ops (load miss, NC, stores)
              with m.If(r1.slow_valid):
-                #Display(f"completing store or load miss data={data_out}")
-                pass
+                sync += Display("completing store or load miss data=%x",
+                                data_out)
  
          with m.Else():
              # Request came from MMU
              with m.If(r1.hit_load_valid):
  
          with m.Else():
              # Request came from MMU
              with m.If(r1.hit_load_valid):
-                # Display(f"completing load hit to MMU, data={m_out.data}")
-                pass
+                sync += Display("completing load hit to MMU, data=%x",
+                                m_out.data)
              # error cases complete without stalling
              with m.If(r1.mmu_error):
              # error cases complete without stalling
              with m.If(r1.mmu_error):
-                #Display("combpleting MMU ld with error")
-                pass
+                sync += Display("combpleting MMU ld with error")
  
              # Slow ops (i.e. load miss)
              with m.If(r1.slow_valid):
  
              # Slow ops (i.e. load miss)
              with m.If(r1.slow_valid):
-                #Display("completing MMU load miss, data={m_out.data}")
-                pass
+                sync += Display("completing MMU load miss, data=%x",
+                                m_out.data)
  
  
-    def rams(self, m, r1, early_req_row, cache_out, replace_way):
+    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
          """rams
          Generate a cache RAM for each way. This handles the normal
          reads, writes from reloads and the special store-hit update
          """rams
          Generate a cache RAM for each way. This handles the normal
          reads, writes from reloads and the special store-hit update
@@ -962,16 +1067,16 @@ class DCache(Elaboratable):
          wb_in = self.wb_in
  
          for i in range(NUM_WAYS):
          wb_in = self.wb_in
  
          for i in range(NUM_WAYS):
-            do_read  = Signal()
+            do_read  = Signal(name="do_rd%d" % i)
              rd_addr  = Signal(ROW_BITS)
              rd_addr  = Signal(ROW_BITS)
-            do_write = Signal()
+            do_write = Signal(name="do_wr%d" % i)
              wr_addr  = Signal(ROW_BITS)
              wr_data  = Signal(WB_DATA_BITS)
              wr_sel   = Signal(ROW_SIZE)
              wr_sel_m = Signal(ROW_SIZE)
              wr_addr  = Signal(ROW_BITS)
              wr_data  = Signal(WB_DATA_BITS)
              wr_sel   = Signal(ROW_SIZE)
              wr_sel_m = Signal(ROW_SIZE)
-            _d_out   = Signal(WB_DATA_BITS)
+            _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i)
  
  
-            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
+            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
              setattr(m.submodules, "cacheram_%d" % i, way)
  
              comb += way.rd_en.eq(do_read)
              setattr(m.submodules, "cacheram_%d" % i, way)
  
              comb += way.rd_en.eq(do_read)
@@ -983,8 +1088,9 @@ class DCache(Elaboratable):
  
              # Cache hit reads
              comb += do_read.eq(1)
  
              # Cache hit reads
              comb += do_read.eq(1)
-            comb += rd_addr.eq(early_req_row)
-            comb += cache_out[i].eq(_d_out)
+            comb += rd_addr.eq(early_req_row[:ROW_BITS])
+            with m.If(r1.hit_way == i):
+                comb += cache_out_row.eq(_d_out)
  
              # Write mux:
              #
  
              # Write mux:
              #
@@ -1015,26 +1121,25 @@ class DCache(Elaboratable):
                        & wb_in.ack & (replace_way == i)):
                  comb += do_write.eq(1)
  
                        & wb_in.ack & (replace_way == i)):
                  comb += do_write.eq(1)
  
-                # Mask write selects with do_write since BRAM
-                # doesn't have a global write-enable
-                with m.If(do_write):
-                    comb += wr_sel_m.eq(wr_sel)
+            # Mask write selects with do_write since BRAM
+            # doesn't have a global write-enable
+            with m.If(do_write):
+                comb += wr_sel_m.eq(wr_sel)
  
      # Cache hit synchronous machine for the easy case.
      # This handles load hits.
      # It also handles error cases (TLB miss, cache paradox)
      def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
  
      # Cache hit synchronous machine for the easy case.
      # This handles load hits.
      # It also handles error cases (TLB miss, cache paradox)
      def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
-                        req_hit_way, req_index, access_ok,
+                        req_hit_way, req_index, req_tag, access_ok,
                          tlb_hit, tlb_hit_way, tlb_req_index):
  
          comb = m.d.comb
          sync = m.d.sync
  
          with m.If(req_op != Op.OP_NONE):
                          tlb_hit, tlb_hit_way, tlb_req_index):
  
          comb = m.d.comb
          sync = m.d.sync
  
          with m.If(req_op != Op.OP_NONE):
-            #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
-            #      f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
-            #     )
-            pass
+            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
+                    req_op, r0.req.addr, r0.req.nc,
+                    req_index, req_tag, req_hit_way)
  
          with m.If(r0_valid):
              sync += r1.mmu_req.eq(r0.mmu_req)
  
          with m.If(r0_valid):
              sync += r1.mmu_req.eq(r0.mmu_req)
@@ -1085,18 +1190,24 @@ class DCache(Elaboratable):
      # All wishbone requests generation is done here.
      # This machine operates at stage 1.
      def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
      # All wishbone requests generation is done here.
      # This machine operates at stage 1.
      def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
-                    cache_valid_bits, r0, replace_way,
+                    cache_valids, r0, replace_way,
                      req_hit_way, req_same_tag,
                      req_hit_way, req_same_tag,
-                    r0_valid, req_op, cache_tag, req_go, ra):
+                    r0_valid, req_op, cache_tags, req_go, ra):
  
          comb = m.d.comb
          sync = m.d.sync
          wb_in = self.wb_in
  
  
          comb = m.d.comb
          sync = m.d.sync
          wb_in = self.wb_in
  
-        req         = MemAccessRequest()
+        req         = MemAccessRequest("mreq_ds")
          acks        = Signal(3)
          adjust_acks = Signal(3)
          acks        = Signal(3)
          adjust_acks = Signal(3)
-        stbs_done = Signal()
+
+        req_row = Signal(ROW_BITS)
+        req_idx = Signal(INDEX_BITS)
+        req_tag = Signal(TAG_BITS)
+        comb += req_idx.eq(get_index(req.real_addr))
+        comb += req_row.eq(get_row(req.real_addr))
+        comb += req_tag.eq(get_tag(req.real_addr))
  
          sync += r1.use_forward1.eq(use_forward1_next)
          sync += r1.forward_sel.eq(0)
  
          sync += r1.use_forward1.eq(use_forward1_next)
          sync += r1.forward_sel.eq(0)
@@ -1145,8 +1256,9 @@ class DCache(Elaboratable):
              for i in range(NUM_WAYS):
                  with m.If(i == replace_way):
                      ct = Signal(TAG_RAM_WIDTH)
              for i in range(NUM_WAYS):
                  with m.If(i == replace_way):
                      ct = Signal(TAG_RAM_WIDTH)
-                    sync += ct.eq(cache_tag[r1.store_index])
-                    sync += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
+                    comb += ct.eq(cache_tags[r1.store_index])
+                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
+                    sync += cache_tags[r1.store_index].eq(ct)
              sync += r1.store_way.eq(replace_way)
              sync += r1.write_tag.eq(0)
  
              sync += r1.store_way.eq(replace_way)
              sync += r1.write_tag.eq(0)
  
@@ -1189,23 +1301,17 @@ class DCache(Elaboratable):
          with m.Switch(r1.state):
  
              with m.Case(State.IDLE):
          with m.Switch(r1.state):
  
              with m.Case(State.IDLE):
-# XXX check 'left downto.  probably means len(r1.wb.adr)
-#                     r1.wb.adr <= req.real_addr(
-#                                   r1.wb.adr'left downto 0
-#                                  );
-                sync += r1.wb.adr.eq(req.real_addr)
+                sync += r1.real_adr.eq(req.real_addr)
                  sync += r1.wb.sel.eq(req.byte_sel)
                  sync += r1.wb.dat.eq(req.data)
                  sync += r1.dcbz.eq(req.dcbz)
  
                  # Keep track of our index and way
                  # for subsequent stores.
                  sync += r1.wb.sel.eq(req.byte_sel)
                  sync += r1.wb.dat.eq(req.data)
                  sync += r1.dcbz.eq(req.dcbz)
  
                  # Keep track of our index and way
                  # for subsequent stores.
-                sync += r1.store_index.eq(get_index(req.real_addr))
-                sync += r1.store_row.eq(get_row(req.real_addr))
-                sync += r1.end_row_ix.eq(
-                         get_row_of_line(get_row(req.real_addr))
-                        )
-                sync += r1.reload_tag.eq(get_tag(req.real_addr))
+                sync += r1.store_index.eq(req_idx)
+                sync += r1.store_row.eq(req_row)
+                sync += r1.end_row_ix.eq(get_row_of_line(req_row))
+                sync += r1.reload_tag.eq(req_tag)
                  sync += r1.req.same_tag.eq(1)
  
                  with m.If(req.op == Op.OP_STORE_HIT):
                  sync += r1.req.same_tag.eq(1)
  
                  with m.If(req.op == Op.OP_STORE_HIT):
@@ -1216,17 +1322,18 @@ class DCache(Elaboratable):
                  for i in range(ROW_PER_LINE):
                      sync += r1.rows_valid[i].eq(0)
  
                  for i in range(ROW_PER_LINE):
                      sync += r1.rows_valid[i].eq(0)
  
+                with m.If(req_op != Op.OP_NONE):
+                    sync += Display("cache op %d", req.op)
+
                  with m.Switch(req.op):
                      with m.Case(Op.OP_LOAD_HIT):
                          # stay in IDLE state
                          pass
  
                      with m.Case(Op.OP_LOAD_MISS):
                  with m.Switch(req.op):
                      with m.Case(Op.OP_LOAD_HIT):
                          # stay in IDLE state
                          pass
  
                      with m.Case(Op.OP_LOAD_MISS):
-                        #Display(f"cache miss real addr:" \
-                        #      f"{req_real_addr}" \
-                        #      f" idx:{get_index(req_real_addr)}" \
-                        #      f" tag:{get_tag(req.real_addr)}")
-                        pass
+                        sync += Display("cache miss real addr: %x " \
+                                "idx: %x tag: %x",
+                                req.real_addr, req_row, req_tag)
  
                          # Start the wishbone cycle
                          sync += r1.wb.we.eq(0)
  
                          # Start the wishbone cycle
                          sync += r1.wb.we.eq(0)
@@ -1258,6 +1365,8 @@ class DCache(Elaboratable):
                              with m.If(req.op == Op.OP_STORE_HIT):
                                  sync += r1.write_bram.eq(1)
                          with m.Else():
                              with m.If(req.op == Op.OP_STORE_HIT):
                                  sync += r1.write_bram.eq(1)
                          with m.Else():
+                            # dcbz is handled much like a load miss except
+                            # that we are writing to memory instead of reading
                              sync += r1.state.eq(State.RELOAD_WAIT_ACK)
  
                              with m.If(req.op == Op.OP_STORE_MISS):
                              sync += r1.state.eq(State.RELOAD_WAIT_ACK)
  
                              with m.If(req.op == Op.OP_STORE_MISS):
@@ -1278,29 +1387,31 @@ class DCache(Elaboratable):
                          pass
  
              with m.Case(State.RELOAD_WAIT_ACK):
                          pass
  
              with m.Case(State.RELOAD_WAIT_ACK):
+                ld_stbs_done = Signal()
                  # Requests are all sent if stb is 0
                  # Requests are all sent if stb is 0
-                comb += stbs_done.eq(~r1.wb.stb)
+                comb += ld_stbs_done.eq(~r1.wb.stb)
  
  
-                with m.If(~wb_in.stall & ~stbs_done):
+                with m.If((~wb_in.stall) & r1.wb.stb):
                      # That was the last word?
                      # We are done sending.
                      # That was the last word?
                      # We are done sending.
-                    # Clear stb and set stbs_done
+                    # Clear stb and set ld_stbs_done
                      # so we can handle an eventual
                      # last ack on the same cycle.
                      # so we can handle an eventual
                      # last ack on the same cycle.
-                    with m.If(is_last_row_addr(
-                              r1.wb.adr, r1.end_row_ix)):
+                    with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
                          sync += r1.wb.stb.eq(0)
                          sync += r1.wb.stb.eq(0)
-                        comb += stbs_done.eq(0)
+                        comb += ld_stbs_done.eq(1)
  
                      # Calculate the next row address in the current cache line
  
                      # Calculate the next row address in the current cache line
-                    rarange = r1.wb.adr[ROW_OFF_BITS : LINE_OFF_BITS]
-                    sync += rarange.eq(rarange + 1)
+                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
+                    comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
+                    sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
  
                  # Incoming acks processing
                  sync += r1.forward_valid1.eq(wb_in.ack)
                  with m.If(wb_in.ack):
  
                  # Incoming acks processing
                  sync += r1.forward_valid1.eq(wb_in.ack)
                  with m.If(wb_in.ack):
-                    # XXX needs an Array bit-accessor here
-                    sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
+                    srow = Signal(ROW_LINE_BITS)
+                    comb += srow.eq(r1.store_row)
+                    sync += r1.rows_valid[srow].eq(1)
  
                      # If this is the data we were looking for,
                      # we can complete the request next cycle.
  
                      # If this is the data we were looking for,
                      # we can complete the request next cycle.
@@ -1321,22 +1432,24 @@ class DCache(Elaboratable):
                          sync += r1.use_forward1.eq(1)
  
                      # Check for completion
                          sync += r1.use_forward1.eq(1)
  
                      # Check for completion
-                    with m.If(stbs_done & is_last_row(r1.store_row,
+                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                        r1.end_row_ix)):
                          # Complete wishbone cycle
                          sync += r1.wb.cyc.eq(0)
  
                          # Cache line is now valid
                          cv = Signal(INDEX_BITS)
                                                        r1.end_row_ix)):
                          # Complete wishbone cycle
                          sync += r1.wb.cyc.eq(0)
  
                          # Cache line is now valid
                          cv = Signal(INDEX_BITS)
-                        sync += cv.eq(cache_valid_bits[r1.store_index])
-                        sync += cv.bit_select(r1.store_way, 1).eq(1)
+                        comb += cv.eq(cache_valids[r1.store_index])
+                        comb += cv.bit_select(r1.store_way, 1).eq(1)
+                        sync += cache_valids[r1.store_index].eq(cv)
                          sync += r1.state.eq(State.IDLE)
  
                      # Increment store row counter
                      sync += r1.store_row.eq(next_row(r1.store_row))
  
              with m.Case(State.STORE_WAIT_ACK):
                          sync += r1.state.eq(State.IDLE)
  
                      # Increment store row counter
                      sync += r1.store_row.eq(next_row(r1.store_row))
  
              with m.Case(State.STORE_WAIT_ACK):
-                comb += stbs_done.eq(~r1.wb.stb)
+                st_stbs_done = Signal()
+                comb += st_stbs_done.eq(~r1.wb.stb)
                  comb += acks.eq(r1.acks_pending)
  
                  with m.If(r1.inc_acks != r1.dec_acks):
                  comb += acks.eq(r1.acks_pending)
  
                  with m.If(r1.inc_acks != r1.dec_acks):
@@ -1355,7 +1468,7 @@ class DCache(Elaboratable):
                      # to be done which is in the same real page.
                      with m.If(req.valid):
                          ra = req.real_addr[0:SET_SIZE_BITS]
                      # to be done which is in the same real page.
                      with m.If(req.valid):
                          ra = req.real_addr[0:SET_SIZE_BITS]
-                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
+                        sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
                          sync += r1.wb.dat.eq(req.data)
                          sync += r1.wb.sel.eq(req.byte_sel)
  
                          sync += r1.wb.dat.eq(req.data)
                          sync += r1.wb.sel.eq(req.byte_sel)
  
@@ -1363,7 +1476,7 @@ class DCache(Elaboratable):
                                  ((req.op == Op.OP_STORE_MISS)
                                   | (req.op == Op.OP_STORE_HIT))):
                          sync += r1.wb.stb.eq(1)
                                  ((req.op == Op.OP_STORE_MISS)
                                   | (req.op == Op.OP_STORE_HIT))):
                          sync += r1.wb.stb.eq(1)
-                        comb += stbs_done.eq(0)
+                        comb += st_stbs_done.eq(0)
  
                          with m.If(req.op == Op.OP_STORE_HIT):
                              sync += r1.write_bram.eq(1)
  
                          with m.If(req.op == Op.OP_STORE_HIT):
                              sync += r1.write_bram.eq(1)
@@ -1372,15 +1485,15 @@ class DCache(Elaboratable):
  
                          # Store requests never come from the MMU
                          sync += r1.ls_valid.eq(1)
  
                          # Store requests never come from the MMU
                          sync += r1.ls_valid.eq(1)
-                        comb += stbs_done.eq(0)
+                        comb += st_stbs_done.eq(0)
                          sync += r1.inc_acks.eq(1)
                      with m.Else():
                          sync += r1.wb.stb.eq(0)
                          sync += r1.inc_acks.eq(1)
                      with m.Else():
                          sync += r1.wb.stb.eq(0)
-                        comb += stbs_done.eq(1)
+                        comb += st_stbs_done.eq(1)
  
                  # Got ack ? See if complete.
                  with m.If(wb_in.ack):
  
                  # Got ack ? See if complete.
                  with m.If(wb_in.ack):
-                    with m.If(stbs_done & (adjust_acks == 1)):
+                    with m.If(st_stbs_done & (adjust_acks == 1)):
                          sync += r1.state.eq(State.IDLE)
                          sync += r1.wb.cyc.eq(0)
                          sync += r1.wb.stb.eq(0)
                          sync += r1.state.eq(State.IDLE)
                          sync += r1.wb.cyc.eq(0)
                          sync += r1.wb.stb.eq(0)
@@ -1415,7 +1528,7 @@ class DCache(Elaboratable):
          sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                                 stall_out, req_op[:3], d_out.valid, d_out.error,
                                 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
          sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                                 stall_out, req_op[:3], d_out.valid, d_out.error,
                                 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
-                               r1.wb.adr[3:6]))
+                               r1.real_adr[3:6]))
  
      def elaborate(self, platform):
  
  
      def elaborate(self, platform):
  
@@ -1425,7 +1538,7 @@ class DCache(Elaboratable):
          # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
          cache_tags       = CacheTagArray()
          cache_tag_set    = Signal(TAG_RAM_WIDTH)
          # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
          cache_tags       = CacheTagArray()
          cache_tag_set    = Signal(TAG_RAM_WIDTH)
-        cache_valid_bits = CacheValidBitsArray()
+        cache_valids = CacheValidBitsArray()
  
          # TODO attribute ram_style : string;
          # TODO attribute ram_style of cache_tags : signal is "distributed";
  
          # TODO attribute ram_style : string;
          # TODO attribute ram_style of cache_tags : signal is "distributed";
@@ -1441,10 +1554,10 @@ class DCache(Elaboratable):
          # TODO attribute ram_style of
          #  dtlb_ptes : signal is "distributed";
  
          # TODO attribute ram_style of
          #  dtlb_ptes : signal is "distributed";
  
-        r0      = RegStage0()
+        r0      = RegStage0("r0")
          r0_full = Signal()
  
          r0_full = Signal()
  
-        r1 = RegStage1()
+        r1 = RegStage1("r1")
  
          reservation = Reservation()
  
  
          reservation = Reservation()
  
@@ -1470,7 +1583,7 @@ class DCache(Elaboratable):
          use_forward1_next = Signal()
          use_forward2_next = Signal()
  
          use_forward1_next = Signal()
          use_forward2_next = Signal()
  
-        cache_out         = CacheRamOut()
+        cache_out_row     = Signal(WB_DATA_BITS)
  
          plru_victim       = PLRUOut()
          replace_way       = Signal(WAY_BITS)
  
          plru_victim       = PLRUOut()
          replace_way       = Signal(WAY_BITS)
@@ -1488,7 +1601,7 @@ class DCache(Elaboratable):
          pte           = Signal(TLB_PTE_BITS)
          ra            = Signal(REAL_ADDR_BITS)
          valid_ra      = Signal()
          pte           = Signal(TLB_PTE_BITS)
          ra            = Signal(REAL_ADDR_BITS)
          valid_ra      = Signal()
-        perm_attr     = PermAttr()
+        perm_attr     = PermAttr("dc_perms")
          rc_ok         = Signal()
          perm_ok       = Signal()
          access_ok     = Signal()
          rc_ok         = Signal()
          perm_ok       = Signal()
          access_ok     = Signal()
@@ -1505,6 +1618,7 @@ class DCache(Elaboratable):
          comb += self.stall_out.eq(r0_stall)
  
          # Wire up wishbone request latch out of stage 1
          comb += self.stall_out.eq(r0_stall)
  
          # Wire up wishbone request latch out of stage 1
+        comb += r1.wb.adr.eq(r1.real_adr[ROW_OFF_BITS:]) # truncate LSBs
          comb += self.wb_out.eq(r1.wb)
  
          # call sub-functions putting everything together, using shared
          comb += self.wb_out.eq(r1.wb)
  
          # call sub-functions putting everything together, using shared
@@ -1520,9 +1634,10 @@ class DCache(Elaboratable):
                          tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                          dtlb_tags, tlb_pte_way, dtlb_ptes)
          self.maybe_plrus(m, r1, plru_victim)
                          tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                          dtlb_tags, tlb_pte_way, dtlb_ptes)
          self.maybe_plrus(m, r1, plru_victim)
+        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
          self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
          self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
          self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
          self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
-                           r0_valid, r1, cache_valid_bits, replace_way,
+                           r0_valid, r1, cache_valids, replace_way,
                             use_forward1_next, use_forward2_next,
                             req_hit_way, plru_victim, rc_ok, perm_attr,
                             valid_ra, perm_ok, access_ok, req_op, req_go,
                             use_forward1_next, use_forward2_next,
                             req_hit_way, plru_victim, rc_ok, perm_attr,
                             valid_ra, perm_ok, access_ok, req_op, req_go,
@@ -1533,160 +1648,61 @@ class DCache(Elaboratable):
                             r0_valid, r0, reservation)
          self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
                             r0_valid, r0, reservation)
          self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
-        self.writeback_control(m, r1, cache_out)
-        self.rams(m, r1, early_req_row, cache_out, replace_way)
+        self.writeback_control(m, r1, cache_out_row)
+        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
          self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
          self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
-                        req_hit_way, req_index, access_ok,
+                        req_hit_way, req_index, req_tag, access_ok,
                          tlb_hit, tlb_hit_way, tlb_req_index)
          self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                          tlb_hit, tlb_hit_way, tlb_req_index)
          self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
-                    cache_valid_bits, r0, replace_way,
+                    cache_valids, r0, replace_way,
                      req_hit_way, req_same_tag,
                           r0_valid, req_op, cache_tags, req_go, ra)
          #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
  
          return m
  
                      req_hit_way, req_same_tag,
                           r0_valid, req_op, cache_tags, req_go, ra)
          #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
  
          return m
  
+def dcache_load(dut, addr, nc=0):
+    yield dut.d_in.load.eq(1)
+    yield dut.d_in.nc.eq(nc)
+    yield dut.d_in.addr.eq(addr)
+    yield dut.d_in.byte_sel.eq(~0)
+    yield dut.d_in.valid.eq(1)
+    yield
+    yield dut.d_in.valid.eq(0)
+    yield dut.d_in.byte_sel.eq(0)
+    yield
+    while not (yield dut.d_out.valid):
+        yield
+    data = yield dut.d_out.data
+    return data
+
+
+def dcache_store(dut, addr, data, nc=0):
+    yield dut.d_in.load.eq(0)
+    yield dut.d_in.nc.eq(nc)
+    yield dut.d_in.data.eq(data)
+    yield dut.d_in.byte_sel.eq(~0)
+    yield dut.d_in.addr.eq(addr)
+    yield dut.d_in.valid.eq(1)
+    yield
+    yield dut.d_in.valid.eq(0)
+    yield dut.d_in.byte_sel.eq(0)
+    yield
+    while not (yield dut.d_out.valid):
+        yield
+
+
+def dcache_random_sim(dut):
+
+    # start with stack of zeros
+    sim_mem = [0] * 512
  
  
-# dcache_tb.vhdl
-#
-# entity dcache_tb is
-# end dcache_tb;
-#
-# architecture behave of dcache_tb is
-#     signal clk          : std_ulogic;
-#     signal rst          : std_ulogic;
-#
-#     signal d_in         : Loadstore1ToDcacheType;
-#     signal d_out        : DcacheToLoadstore1Type;
-#
-#     signal m_in         : MmuToDcacheType;
-#     signal m_out        : DcacheToMmuType;
-#
-#     signal wb_bram_in   : wishbone_master_out;
-#     signal wb_bram_out  : wishbone_slave_out;
-#
-#     constant clk_period : time := 10 ns;
-# begin
-#     dcache0: entity work.dcache
-#         generic map(
-#
-#             LINE_SIZE => 64,
-#             NUM_LINES => 4
-#             )
-#         port map(
-#             clk => clk,
-#             rst => rst,
-#             d_in => d_in,
-#             d_out => d_out,
-#             m_in => m_in,
-#             m_out => m_out,
-#             wishbone_out => wb_bram_in,
-#             wishbone_in => wb_bram_out
-#             );
-#
-#     -- BRAM Memory slave
-#     bram0: entity work.wishbone_bram_wrapper
-#         generic map(
-#             MEMORY_SIZE   => 1024,
-#             RAM_INIT_FILE => "icache_test.bin"
-#             )
-#         port map(
-#             clk => clk,
-#             rst => rst,
-#             wishbone_in => wb_bram_in,
-#             wishbone_out => wb_bram_out
-#             );
-#
-#     clk_process: process
-#     begin
-#         clk <= '0';
-#         wait for clk_period/2;
-#         clk <= '1';
-#         wait for clk_period/2;
-#     end process;
-#
-#     rst_process: process
-#     begin
-#         rst <= '1';
-#         wait for 2*clk_period;
-#         rst <= '0';
-#         wait;
-#     end process;
-#
-#     stim: process
-#     begin
-#     -- Clear stuff
-#     d_in.valid <= '0';
-#     d_in.load <= '0';
-#     d_in.nc <= '0';
-#     d_in.addr <= (others => '0');
-#     d_in.data <= (others => '0');
-#         m_in.valid <= '0';
-#         m_in.addr <= (others => '0');
-#         m_in.pte <= (others => '0');
-#
-#         wait for 4*clk_period;
-#     wait until rising_edge(clk);
-#
-#     -- Cacheable read of address 4
-#     d_in.load <= '1';
-#     d_in.nc <= '0';
-#         d_in.addr <= x"0000000000000004";
-#         d_in.valid <= '1';
-#     wait until rising_edge(clk);
-#         d_in.valid <= '0';
-#
-#     wait until rising_edge(clk) and d_out.valid = '1';
-#         assert d_out.data = x"0000000100000000"
-#         report "data @" & to_hstring(d_in.addr) &
-#         "=" & to_hstring(d_out.data) &
-#         " expected 0000000100000000"
-#         severity failure;
-# --      wait for clk_period;
-#
-#     -- Cacheable read of address 30
-#     d_in.load <= '1';
-#     d_in.nc <= '0';
-#         d_in.addr <= x"0000000000000030";
-#         d_in.valid <= '1';
-#     wait until rising_edge(clk);
-#         d_in.valid <= '0';
-#
-#     wait until rising_edge(clk) and d_out.valid = '1';
-#         assert d_out.data = x"0000000D0000000C"
-#         report "data @" & to_hstring(d_in.addr) &
-#         "=" & to_hstring(d_out.data) &
-#         " expected 0000000D0000000C"
-#         severity failure;
-#
-#     -- Non-cacheable read of address 100
-#     d_in.load <= '1';
-#     d_in.nc <= '1';
-#         d_in.addr <= x"0000000000000100";
-#         d_in.valid <= '1';
-#     wait until rising_edge(clk);
-#     d_in.valid <= '0';
-#     wait until rising_edge(clk) and d_out.valid = '1';
-#         assert d_out.data = x"0000004100000040"
-#         report "data @" & to_hstring(d_in.addr) &
-#         "=" & to_hstring(d_out.data) &
-#         " expected 0000004100000040"
-#         severity failure;
-#
-#     wait until rising_edge(clk);
-#     wait until rising_edge(clk);
-#     wait until rising_edge(clk);
-#     wait until rising_edge(clk);
-#
-#     std.env.finish;
-#     end process;
-# end;
-def dcache_sim(dut):
      # clear stuff
      yield dut.d_in.valid.eq(0)
      yield dut.d_in.load.eq(0)
      # clear stuff
      yield dut.d_in.valid.eq(0)
      yield dut.d_in.load.eq(0)
+    yield dut.d_in.priv_mode.eq(1)
      yield dut.d_in.nc.eq(0)
      yield dut.d_in.nc.eq(0)
-    yield dut.d_in.adrr.eq(0)
+    yield dut.d_in.addr.eq(0)
      yield dut.d_in.data.eq(0)
      yield dut.m_in.valid.eq(0)
      yield dut.m_in.addr.eq(0)
      yield dut.d_in.data.eq(0)
      yield dut.m_in.valid.eq(0)
      yield dut.m_in.addr.eq(0)
@@ -1696,48 +1712,97 @@ def dcache_sim(dut):
      yield
      yield
      yield
      yield
      yield
      yield
-    # wait_until rising_edge(clk)
-    yield
-    # Cacheable read of address 4
-    yield dut.d_in.load.eq(1)
+
+    print ()
+
+    for i in range(256):
+        addr = randint(0, 255)
+        data = randint(0, (1<<64)-1)
+        sim_mem[addr] = data
+        addr *= 8
+
+        print ("testing %x data %x" % (addr, data))
+
+        yield from dcache_load(dut, addr)
+        yield from dcache_store(dut, addr, data)
+
+        addr = randint(0, 255)
+        sim_data = sim_mem[addr]
+        addr *= 8
+
+        data = yield from dcache_load(dut, addr)
+        assert data == sim_data, \
+            "check %x data %x != %x" % (addr, data, sim_data)
+
+    for addr in range(256):
+        data = yield from dcache_load(dut, addr*8)
+        assert data == sim_mem[addr], \
+            "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
+
+def dcache_sim(dut):
+    # clear stuff
+    yield dut.d_in.valid.eq(0)
+    yield dut.d_in.load.eq(0)
+    yield dut.d_in.priv_mode.eq(1)
      yield dut.d_in.nc.eq(0)
      yield dut.d_in.nc.eq(0)
-    yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
-    yield dut.d_in.valid.eq(1)
-    # wait-until rising_edge(clk)
+    yield dut.d_in.addr.eq(0)
+    yield dut.d_in.data.eq(0)
+    yield dut.m_in.valid.eq(0)
+    yield dut.m_in.addr.eq(0)
+    yield dut.m_in.pte.eq(0)
+    # wait 4 * clk_period
+    yield
+    yield
      yield
      yield
-    yield dut.d_in.valid.eq(0)
      yield
      yield
-    while not (yield dut.d_out.valid):
-        yield
-    assert dut.d_out.data == 0x0000000100000000, \
-        f"data @ {dut.d_in.addr}={dut.d_in.data} expected 0000000100000000"
  
  
+    # Cacheable read of address 58
+    data = yield from dcache_load(dut, 0x58)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000001700000016, \
+        f"data @%x=%x expected 0x0000001700000016" % (addr, data)
+
+    # Cacheable read of address 20
+    data = yield from dcache_load(dut, 0x20)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000000900000008, \
+        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
  
      # Cacheable read of address 30
  
      # Cacheable read of address 30
-    yield dut.d_in.load.eq(1)
-    yield dut.d_in.nc.eq(0)
-    yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
-    yield dut.d_in.valid.eq(1)
-    yield
-    yield dut.d_in.valid.eq(0)
-    yield
-    while not (yield dut.d_out.valid):
-        yield
-    assert dut.d_out.data == 0x0000000D0000000C, \
-        f"data @{dut.d_in.addr}={dut.d_out.data} expected 0000000D0000000C"
+    data = yield from dcache_load(dut, 0x530)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000014D0000014C, \
+        f"data @%x=%x expected 0000014D0000014C" % (addr, data)
+
+    # 2nd Cacheable read of address 530
+    data = yield from dcache_load(dut, 0x530)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000014D0000014C, \
+        f"data @%x=%x expected 0000014D0000014C" % (addr, data)
  
      # Non-cacheable read of address 100
  
      # Non-cacheable read of address 100
-    yield dut.d_in.load.eq(1)
-    yield dut.d_in.nc.eq(1)
-    yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
-    yield dut.d_in.valid.eq(1)
-    yield
-    yield dut.d_in.valid.eq(0)
-    yield
-    while not (yield dut.d_out.valid):
-        yield
-    assert dut.d_out.data == 0x0000004100000040, \
-        f"data @ {dut.d_in.addr}={dut.d_out.data} expected 0000004100000040"
+    data = yield from dcache_load(dut, 0x100, nc=1)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000004100000040, \
+        f"data @%x=%x expected 0000004100000040" % (addr, data)
+
+    # Store at address 530
+    yield from dcache_store(dut, 0x530, 0x121)
+
+    # Store at address 530
+    yield from dcache_store(dut, 0x530, 0x12345678)
+
+    # 3rd Cacheable read of address 530
+    data = yield from dcache_load(dut, 0x530)
+    addr = yield dut.d_in.addr
+    assert data == 0x12345678, \
+        f"data @%x=%x expected 0x12345678" % (addr, data)
+
+    # 2nd Cacheable read of address 20
+    data = yield from dcache_load(dut, 0x20)
+    addr = yield dut.d_in.addr
+    assert data == 0x0000000900000008, \
+        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
  
      yield
      yield
  
      yield
      yield
@@ -1745,14 +1810,44 @@ def dcache_sim(dut):
      yield
  
  
      yield
  
  
-def test_dcache():
+def test_dcache(mem, test_fn, test_name):
+    dut = DCache()
+
+    memory = Memory(width=64, depth=16*64, init=mem)
+    sram = SRAM(memory=memory, granularity=8)
+
+    m = Module()
+    m.submodules.dcache = dut
+    m.submodules.sram = sram
+
+    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
+    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
+    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
+    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
+    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
+    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
+
+    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
+    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+
+    # nmigen Simulation
+    sim = Simulator(m)
+    sim.add_clock(1e-6)
+
+    sim.add_sync_process(wrap(test_fn(dut)))
+    with sim.write_vcd('test_dcache%s.vcd' % test_name):
+        sim.run()
+
+if __name__ == '__main__':
      dut = DCache()
      vl = rtlil.convert(dut, ports=[])
      with open("test_dcache.il", "w") as f:
          f.write(vl)
  
      dut = DCache()
      vl = rtlil.convert(dut, ports=[])
      with open("test_dcache.il", "w") as f:
          f.write(vl)
  
-    #run_simulation(dut, dcache_sim(), vcd_name='test_dcache.vcd')
+    mem = []
+    for i in range(0,512):
+        mem.append((i*2)| ((i*2+1)<<32))
  
  
-if __name__ == '__main__':
-    test_dcache()
+    test_dcache(mem, dcache_sim, "")
+    test_dcache(None, dcache_random_sim, "random")