double the number of lines in the L1 D/I-Cache to match microwatt
diff --git a/src/soc/experiment/dcache.py b/src/soc/experiment/dcache.py
index 39c9e309b8f84aef5b545f7213a91bbda20fcdd4..a828e3c3c2137b05e3197ef54e2c69bc2a80496e 100644
--- a/src/soc/experiment/dcache.py
+++ b/src/soc/experiment/dcache.py
@@ -13,6 +13,8 @@ Links:
 
 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
 * https://bugs.libre-soc.org/show_bug.cgi?id=469
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
 
 """
 
@@ -25,8 +27,9 @@ sys.setrecursionlimit(1000000)
 from enum import Enum, unique
 
 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
-                    Record)
+                    Record, Memory)
 from nmutil.util import Display
+from nmigen.lib.coding import Decoder
 
 from copy import deepcopy
 from random import randint, seed
@@ -48,8 +51,8 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBIOMasterOut, WBIOSlaveOut)
 
 from soc.experiment.cache_ram import CacheRam
-#from soc.experiment.plru import PLRU
-from nmutil.plru import PLRU
+from soc.experiment.plru import PLRU, PLRUs
+#from nmutil.plru import PLRU, PLRUs
 
 # for test
 from soc.bus.sram import SRAM
@@ -65,7 +68,7 @@ from nmutil.util import wrap
 
 # TODO: make these parameters of DCache at some point
 LINE_SIZE = 64    # Line size in bytes
-NUM_LINES = 16    # Number of lines in a set
+NUM_LINES = 32    # Number of lines in a set
 NUM_WAYS = 4      # Number of ways
 TLB_SET_SIZE = 64 # L1 DTLB entries per set
 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
@@ -133,15 +136,18 @@ TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 WAY_BITS = log2_int(NUM_WAYS)
 
 # Example of layout for 32 lines of 64 bytes:
-layout = """\
+layout = f"""\
+  DCache Layout:
+ |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
+  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
   ..  tag    |index|  line  |
   ..         |   row   |    |
-  ..         |     |---|    | ROW_LINE_BITS  (3)
-  ..         |     |--- - --| LINE_OFF_BITS (6)
-  ..         |         |- --| ROW_OFF_BITS  (3)
-  ..         |----- ---|    | ROW_BITS      (8)
-  ..         |-----|        | INDEX_BITS    (5)
-  .. --------|              | TAG_BITS      (45)
+  ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
+  ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
+  ..         |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
+  ..         |----- ---|    | ROW_BITS      ({ROW_BITS})
+  ..         |-----|        | INDEX_BITS    ({INDEX_BITS})
+  .. --------|              | TAG_BITS      ({TAG_BITS})
 """
 print (layout)
 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
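
With NUM_LINES doubled to 32 the bracketed figures in the layout above work
out as follows; a plain-Python sketch of the same arithmetic, assuming
REAL_ADDR_BITS is 56 (as implied by TAG_BITS + SET_SIZE_BITS = 45 + 11):

    from math import log2

    LINE_SIZE      = 64                             # bytes per cache line
    NUM_LINES      = 32                             # lines per way (was 16)
    WB_DATA_BITS   = 64                             # wishbone data width
    REAL_ADDR_BITS = 56

    ROW_SIZE      = WB_DATA_BITS // 8               # 8 bytes per BRAM row
    ROW_PER_LINE  = LINE_SIZE // ROW_SIZE           # 8 rows per line
    BRAM_ROWS     = NUM_LINES * ROW_PER_LINE        # 256 rows per way

    ROW_OFF_BITS  = int(log2(ROW_SIZE))             # 3
    ROW_LINE_BITS = int(log2(ROW_PER_LINE))         # 3
    LINE_OFF_BITS = int(log2(LINE_SIZE))            # 6
    INDEX_BITS    = int(log2(NUM_LINES))            # 5
    ROW_BITS      = int(log2(BRAM_ROWS))            # 8
    SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS      # 11
    TAG_BITS      = REAL_ADDR_BITS - SET_SIZE_BITS  # 45

    print(ROW_BITS, INDEX_BITS, TAG_BITS)           # 8 5 45
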
@@ -154,9 +160,12 @@ print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 
 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
+print ("    TAG_WIDTH", TAG_WIDTH)
+print ("     NUM_WAYS", NUM_WAYS)
+print ("    NUM_LINES", NUM_LINES)
 
 def CacheTagArray():
-    tag_layout = [('valid', 1),
+    tag_layout = [('valid', NUM_WAYS),
                   ('tag', TAG_RAM_WIDTH),
                  ]
     return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
@@ -190,6 +199,7 @@ assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 
+
 def TLBHit(name):
     return Record([('valid', 1),
                    ('way', TLB_WAY_BITS)], name=name)
@@ -205,8 +215,9 @@ def TLBRecord(name):
                  ]
     return Record(tlb_layout, name=name)
 
-def TLBArray():
-    return Array(TLBRecord(name="tlb%d" % x) for x in range(TLB_SET_SIZE))
+def TLBValidArray():
+    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
+                        for x in range(TLB_SET_SIZE))
 
 def HitWaySet():
     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
@@ -433,55 +444,133 @@ class DTLBUpdate(Elaboratable):
         self.tlbie    = Signal()
         self.tlbwe    = Signal()
         self.doall    = Signal()
-        self.updated  = Signal()
-        self.v_updated  = Signal()
         self.tlb_hit     = TLBHit("tlb_hit")
         self.tlb_req_index = Signal(TLB_SET_BITS)
 
-        self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
-        self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
         self.repl_way        = Signal(TLB_WAY_BITS)
         self.eatag           = Signal(TLB_EA_TAG_BITS)
         self.pte_data        = Signal(TLB_PTE_BITS)
 
-        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
-
-        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
-        self.db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
-        self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        # read from dtlb array
+        self.tlb_read       = Signal()
+        self.tlb_read_index = Signal(TLB_SET_BITS)
+        self.tlb_way        = TLBRecord("o_tlb_way")
 
     def elaborate(self, platform):
         m = Module()
         comb = m.d.comb
         sync = m.d.sync
 
-        tagset   = Signal(TLB_TAG_WAY_BITS)
-        pteset   = Signal(TLB_PTE_WAY_BITS)
-
-        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
-        comb += db_out.eq(self.dv)
+        # there are 3 parts to this:
+        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
+        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
+        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
+        # be a Memory because they can all be cleared (tlbie, doall), i mean,
+        # we _could_, in theory, by overriding the Reset Signal of the Memory,
+        # hmmm....
+
+        dtlb_valid = TLBValidArray()
+        tlb_req_index = self.tlb_req_index
+
+        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
+        print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
+        print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
+        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
+        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
+        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)
+
+        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
+        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
+        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
+        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
+                                    granularity=TLB_EA_TAG_BITS)
+
+        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
+        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
+        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
+                                    granularity=TLB_PTE_BITS)
+
+        # commented out for now, can be put in if Memory.reset can be
+        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
+        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
+        #m.submodules.rd_valid = rd_valid = validm.read_port()
+        #m.submodules.wr_valid = wr_valid = validm.write_port(
+                                    #granularity=1)
+
+        # connect up read and write addresses to Valid/PTE/TAG SRAMs
+        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
+        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
+        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
+        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
+        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
+        #m.d.comb += wr_valid.addr.eq(tlb_req_index)
+
+        updated  = Signal()
+        v_updated  = Signal()
+        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
+        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
+        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
+        dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
+
+        comb += dv.eq(dtlb_valid[tlb_req_index])
+        comb += db_out.eq(dv)
 
         with m.If(self.tlbie & self.doall):
-            pass # clear all back in parent
+            # clear all valid bits at once
+            # XXX hmmm, validm _could_ use Memory reset here...
+            for i in range(TLB_SET_SIZE):
+                sync += dtlb_valid[i].eq(0)
         with m.Elif(self.tlbie):
+            # invalidate just the hit_way
             with m.If(self.tlb_hit.valid):
                 comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
-                comb += self.v_updated.eq(1)
-
+                comb += v_updated.eq(1)
         with m.Elif(self.tlbwe):
-
-            comb += tagset.eq(self.tlb_tag_way)
-            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
-            comb += tb_out.eq(tagset)
-
-            comb += pteset.eq(self.tlb_pte_way)
-            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
-            comb += pb_out.eq(pteset)
-
+            # write to the requested tag and PTE
+            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
+            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
+            # set valid bit
             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 
-            comb += self.updated.eq(1)
-            comb += self.v_updated.eq(1)
+            comb += updated.eq(1)
+            comb += v_updated.eq(1)
+
+        # above, sometimes valid is requested to be updated but data not
+        # therefore split them out, here.  note the granularity thing matches
+        # with the shift-up of the eatag/pte_data into the correct TLB way.
+        # thus it is not necessary to write the entire lot, just the portion
+        # being altered: hence writing the *old* copy of the row is not needed
+        with m.If(updated): # PTE and TAG to be written
+            comb += wr_pteway.data.eq(pb_out)
+            comb += wr_pteway.en.eq(1<<self.repl_way)
+            comb += wr_tagway.data.eq(tb_out)
+            comb += wr_tagway.en.eq(1<<self.repl_way)
+        with m.If(v_updated): # Valid to be written
+            sync += dtlb_valid[tlb_req_index].eq(db_out)
+            #comb += wr_valid.data.eq(db_out)
+            #comb += wr_valid.en.eq(1<<self.repl_way)
+
+        # select one TLB way, use a register here
+        r_tlb_way        = TLBRecord("r_tlb_way")
+        r_delay = Signal()
+        sync += r_delay.eq(self.tlb_read)
+        with m.If(self.tlb_read):
+            sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
+        with m.If(r_delay):
+            # on one clock delay, output the contents of the read port(s)
+            # comb += self.tlb_way.valid.eq(rd_valid.data)
+            comb += self.tlb_way.tag.eq(rd_tagway.data)
+            comb += self.tlb_way.pte.eq(rd_pteway.data)
+            # and also capture the (delayed) output...
+            #sync += r_tlb_way.valid.eq(rd_valid.data)
+            sync += r_tlb_way.tag.eq(rd_tagway.data)
+            sync += r_tlb_way.pte.eq(rd_pteway.data)
+        with m.Else():
+            # ... so that the register can output it when no read is requested
+            # it's rather overkill but better to be safe than sorry
+            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
+            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
+            #comb += self.tlb_way.eq(r_tlb_way)
 
         return m
 
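
The rewritten DTLBUpdate keeps the TLB tags and PTEs in nmigen Memory
instances whose write ports use a granularity of one way, so a one-hot en
updates only the selected way, while the per-way valid bits stay in plain
registers so that tlbie/doall can clear every set in a single cycle.  A
minimal sketch of that storage split, using illustrative parameters (WAYS,
SETS, TAG_W) rather than the real DTLBUpdate interface:

    from nmigen import Elaboratable, Module, Memory, Signal, Array, Repl, Mux

    WAYS, SETS, TAG_W = 2, 64, 46

    class TLBStoreSketch(Elaboratable):
        def __init__(self):
            self.wr_index  = Signal(range(SETS))    # set being written
            self.wr_way    = Signal(range(WAYS))    # way being written
            self.wr_tag    = Signal(TAG_W)
            self.wr_en     = Signal()
            self.clear_all = Signal()               # tlbie with doall
            self.rd_index  = Signal(range(SETS))
            self.rd_valid  = Signal(WAYS)           # valid bits of one set

        def elaborate(self, platform):
            m = Module()
            tags = Memory(width=TAG_W * WAYS, depth=SETS)
            m.submodules.wp = wp = tags.write_port(granularity=TAG_W)
            # replicate the tag across every way slot; the one-hot en
            # (granularity = one way) picks which slot is actually written
            m.d.comb += [
                wp.addr.eq(self.wr_index),
                wp.data.eq(Repl(self.wr_tag, WAYS)),
                wp.en.eq(Mux(self.wr_en, 1 << self.wr_way, 0)),
            ]
            # valid bits stay in registers so they can be cleared in one cycle
            valid = Array(Signal(WAYS, name="v%d" % i) for i in range(SETS))
            m.d.comb += self.rd_valid.eq(valid[self.rd_index])
            with m.If(self.clear_all):
                for i in range(SETS):
                    m.d.sync += valid[i].eq(0)
            return m
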
@@ -490,8 +579,7 @@ class DCachePendingHit(Elaboratable):
 
     def __init__(self, tlb_way,
                       cache_i_validdx, cache_tag_set,
-                    req_addr,
-                    hit_set):
+                    req_addr):
 
         self.go          = Signal()
         self.virt_mode   = Signal()
@@ -506,7 +594,6 @@ class DCachePendingHit(Elaboratable):
         self.cache_i_validdx = cache_i_validdx
         self.cache_tag_set = cache_tag_set
         self.req_addr = req_addr
-        self.hit_set = hit_set
 
     def elaborate(self, platform):
         m = Module()
@@ -521,12 +608,13 @@ class DCachePendingHit(Elaboratable):
         cache_tag_set = self.cache_tag_set
         req_addr = self.req_addr
         tlb_hit = self.tlb_hit
-        hit_set = self.hit_set
         hit_way = self.hit_way
         rel_match = self.rel_match
         req_index = self.req_index
         reload_tag = self.reload_tag
 
+        hit_set     = Array(Signal(name="hit_set_%d" % i) \
+                                  for i in range(TLB_NUM_WAYS))
         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                     for i in range(TLB_NUM_WAYS))
         hit_way_set = HitWaySet()
@@ -540,14 +628,15 @@ class DCachePendingHit(Elaboratable):
         with m.If(virt_mode):
             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
-                s_hit       = Signal()
-                s_pte       = Signal(TLB_PTE_BITS)
-                s_ra        = Signal(REAL_ADDR_BITS)
+                s_hit       = Signal(name="s_hit%d" % j)
+                s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
+                s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
+                # read the PTE, calc the Real Address, get the tag
                 comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                 comb += s_tag.eq(get_tag(s_ra))
-
+                # for each way check the tag against the cache tag set
                 for i in range(NUM_WAYS): # way_t
                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                     comb += is_tag_hit.eq(go & cache_i_validdx[i] &
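
The loop above forms each way's real address by keeping the page offset from
the effective address and taking the upper bits from that way's PTE, then
strips the set bits off to obtain the tag.  A plain-integer illustration of
the same bit splicing, using the constants from the layout above (4k pages,
56-bit real addresses, 11 set bits):

    TLB_LG_PGSZ    = 12
    REAL_ADDR_BITS = 56
    SET_SIZE_BITS  = 11

    def real_addr(ea, pte):
        page_off = ea & ((1 << TLB_LG_PGSZ) - 1)
        ppn_mask = ((1 << REAL_ADDR_BITS) - 1) & ~((1 << TLB_LG_PGSZ) - 1)
        return (pte & ppn_mask) | page_off

    def get_tag(ra):
        return ra >> SET_SIZE_BITS

    ra = real_addr(0x0000_7FFF_1234_5678, 0x0000_0012_3456_7000)
    assert ra == 0x12_3456_7678   # upper bits from the PTE, offset from the EA
    print(hex(get_tag(ra)))       # tag = real address above the set bits
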
@@ -557,9 +646,8 @@ class DCachePendingHit(Elaboratable):
                         comb += hit_way_set[j].eq(i)
                         comb += s_hit.eq(1)
                 comb += hit_set[j].eq(s_hit)
-                with m.If(s_tag == reload_tag):
-                    comb += rel_matches[j].eq(1)
-            with m.If(tlb_hit.way):
+                comb += rel_matches[j].eq(s_tag == reload_tag)
+            with m.If(tlb_hit.valid):
                 comb += is_hit.eq(hit_set[tlb_hit.way])
                 comb += hit_way.eq(hit_way_set[tlb_hit.way])
                 comb += rel_match.eq(rel_matches[tlb_hit.way])
@@ -588,7 +676,7 @@ class DCache(Elaboratable):
       at the end of line (this requires dealing with requests coming in
       while not idle...)
     """
-    def __init__(self):
+    def __init__(self, pspec=None):
         self.d_in      = LoadStore1ToDCacheType("d_in")
         self.d_out     = DCacheToLoadStore1Type("d_out")
 
@@ -607,6 +695,10 @@ class DCache(Elaboratable):
 
         self.log_out   = Signal(20)
 
+        # test if microwatt compatibility is to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+
     def stage_0(self, m, r0, r1, r0_full):
         """Latch the request in r0.req as long as we're not stalling
         """
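
The microwatt_compat flag is probed with hasattr(), so any object carrying
that attribute will do; the real pspec passed in by the SoC build carries
many more parameters than shown here.  A hypothetical usage sketch:

    from types import SimpleNamespace
    from soc.experiment.dcache import DCache

    pspec  = SimpleNamespace(microwatt_compat=True)
    dcache = DCache(pspec=pspec)   # bus.stall is then left to the toplevel
    plain  = DCache()              # pspec=None: flag stays False
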
@@ -645,6 +737,7 @@ class DCache(Elaboratable):
             comb += r.doall.eq(0)
             comb += r.tlbld.eq(0)
             comb += r.mmu_req.eq(0)
+
         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
             sync += r0.eq(r)
             sync += r0_full.eq(r.req.valid)
@@ -661,7 +754,7 @@ class DCache(Elaboratable):
                                  r.req.virt_mode, r.req.addr,
                                  r.req.data, r.req.load)
 
-    def tlb_read(self, m, r0_stall, tlb_way, dtlb):
+    def tlb_read(self, m, r0_stall, tlb_way):
         """TLB
         Operates in the second cycle on the request latched in r0.req.
         TLB updates write the entry at the end of the second cycle.
@@ -670,7 +763,6 @@ class DCache(Elaboratable):
         sync = m.d.sync
         m_in, d_in = self.m_in, self.d_in
 
-        index    = Signal(TLB_SET_BITS)
         addrbits = Signal(TLB_SET_BITS)
 
         amin = TLB_LG_PGSZ
@@ -680,14 +772,15 @@ class DCache(Elaboratable):
             comb += addrbits.eq(m_in.addr[amin : amax])
         with m.Else():
             comb += addrbits.eq(d_in.addr[amin : amax])
-        comb += index.eq(addrbits)
 
         # If we have any op and the previous op isn't finished,
         # then keep the same output for next cycle.
-        with m.If(~r0_stall):
-            sync += tlb_way.eq(dtlb[index])
+        d = self.dtlb_update
+        comb += d.tlb_read_index.eq(addrbits)
+        comb += d.tlb_read.eq(~r0_stall)
+        comb += tlb_way.eq(d.tlb_way)
 
-    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
+    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
         """Generate TLB PLRUs
         """
         comb = m.d.comb
@@ -695,17 +788,15 @@ class DCache(Elaboratable):
 
         if TLB_NUM_WAYS == 0:
             return
-        for i in range(TLB_SET_SIZE):
-            # TLB PLRU interface
-            tlb_plru        = PLRU(TLB_WAY_BITS)
-            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
-            tlb_plru_acc_en = Signal()
-
-            comb += tlb_plru_acc_en.eq(r1.tlb_hit.valid &
-                                       (r1.tlb_hit_index == i))
-            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
-            comb += tlb_plru.acc_i.eq(r1.tlb_hit.way)
-            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
+
+        # suite of PLRUs with a selection and output mechanism
+        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
+        m.submodules.tlb_plrus = tlb_plrus
+        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
+        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
+        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
+        comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
+        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 
     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                    tlb_way,
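
maybe_tlb_plrus now instantiates one PLRUs suite instead of a PLRU per TLB
set: way/valid/index record a hit, isel/o_index read back a victim.  The
real module is soc.experiment.plru.PLRUs; the sketch below only illustrates
how such a suite can be built from the per-index PLRU interface that the
removed loop used (acc_en, acc_i, lru_o):

    from nmigen import Elaboratable, Module, Signal, Array
    from soc.experiment.plru import PLRU

    class PLRUSuite(Elaboratable):
        def __init__(self, n_plrus, n_bits):
            self.way     = Signal(n_bits)          # way that was hit
            self.valid   = Signal()                # record the access?
            self.index   = Signal(range(n_plrus))  # PLRU that saw the hit
            self.isel    = Signal(range(n_plrus))  # PLRU whose victim to read
            self.o_index = Signal(n_bits)          # selected victim way
            self.n_plrus, self.n_bits = n_plrus, n_bits

        def elaborate(self, platform):
            m = Module()
            lru = Array(Signal(self.n_bits, name="lru%d" % i)
                        for i in range(self.n_plrus))
            for i in range(self.n_plrus):
                plru = PLRU(self.n_bits)
                m.submodules["plru_%d" % i] = plru
                m.d.comb += plru.acc_en.eq(self.valid & (self.index == i))
                m.d.comb += plru.acc_i.eq(self.way)
                m.d.comb += lru[i].eq(plru.lru_o)
            m.d.comb += self.o_index.eq(lru[self.isel])  # victim mux
            return m
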
@@ -767,8 +858,8 @@ class DCache(Elaboratable):
             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 
-    def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
-                    tlb_hit, tlb_plru_victim, tlb_way):
+    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
+                    tlb_hit, tlb_plru_victim):
 
         comb = m.d.comb
         sync = m.d.sync
@@ -779,31 +870,18 @@ class DCache(Elaboratable):
         comb += tlbie.eq(r0_valid & r0.tlbie)
         comb += tlbwe.eq(r0_valid & r0.tlbld)
 
-        m.submodules.tlb_update = d = DTLBUpdate()
-        with m.If(tlbie & r0.doall):
-            # clear all valid bits at once
-            for i in range(TLB_SET_SIZE):
-                sync += dtlb[i].valid.eq(0)
-        with m.If(d.updated):
-            sync += dtlb[tlb_req_index].tag.eq(d.tb_out)
-            sync += dtlb[tlb_req_index].pte.eq(d.pb_out)
-        with m.If(d.v_updated):
-            sync += dtlb[tlb_req_index].valid.eq(d.db_out)
-
-        comb += d.dv.eq(dtlb[tlb_req_index].valid)
+        d = self.dtlb_update
 
         comb += d.tlbie.eq(tlbie)
         comb += d.tlbwe.eq(tlbwe)
         comb += d.doall.eq(r0.doall)
         comb += d.tlb_hit.eq(tlb_hit)
-        comb += d.tlb_tag_way.eq(tlb_way.tag)
-        comb += d.tlb_pte_way.eq(tlb_way.pte)
         comb += d.tlb_req_index.eq(tlb_req_index)
 
         with m.If(tlb_hit.valid):
             comb += d.repl_way.eq(tlb_hit.way)
         with m.Else():
-            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
+            comb += d.repl_way.eq(tlb_plru_victim)
         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
         comb += d.pte_data.eq(r0.req.data)
 
@@ -816,16 +894,13 @@ class DCache(Elaboratable):
         if TLB_NUM_WAYS == 0:
             return
 
-        for i in range(NUM_LINES):
-            # PLRU interface
-            plru        = PLRU(WAY_BITS)
-            setattr(m.submodules, "plru%d" % i, plru)
-            plru_acc_en = Signal()
-
-            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
-            comb += plru.acc_en.eq(plru_acc_en)
-            comb += plru.acc_i.eq(r1.hit_way)
-            comb += plru_victim[i].eq(plru.lru_o)
+        # suite of PLRUs with a selection and output mechanism
+        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
+        comb += plrus.way.eq(r1.hit_way)
+        comb += plrus.valid.eq(r1.cache_hit)
+        comb += plrus.index.eq(r1.hit_index)
+        comb += plrus.isel.eq(r1.store_index) # select victim
+        comb += plru_victim.eq(plrus.o_index) # selected victim
 
     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
         """Cache tag RAM read port
@@ -863,8 +938,6 @@ class DCache(Elaboratable):
         opsel       = Signal(3)
         go          = Signal()
         nc          = Signal()
-        hit_set     = Array(Signal(name="hit_set_%d" % i) \
-                                  for i in range(TLB_NUM_WAYS))
         cache_i_validdx = Signal(NUM_WAYS)
 
         # Extract line, row and tag from request
@@ -881,8 +954,7 @@ class DCache(Elaboratable):
 
         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                             cache_i_validdx, cache_tag_set,
-                                            r0.req.addr,
-                                            hit_set)
+                                            r0.req.addr)
         comb += dc.tlb_hit.eq(tlb_hit)
         comb += dc.reload_tag.eq(r1.reload_tag)
         comb += dc.virt_mode.eq(r0.req.virt_mode)
@@ -925,7 +997,7 @@ class DCache(Elaboratable):
 
         # The way to replace on a miss
         with m.If(r1.write_tag):
-            comb += replace_way.eq(plru_victim[r1.store_index])
+            comb += replace_way.eq(plru_victim)
         with m.Else():
             comb += replace_way.eq(r1.store_way)
 
@@ -1110,60 +1182,78 @@ class DCache(Elaboratable):
         comb = m.d.comb
         bus = self.bus
 
+        # Binary-to-Unary one-hot decoders here.  the replace-way one-hot is gated
+        # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
+        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
+        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
+                   ~r1.write_bram))
+        comb += rwe.i.eq(replace_way)
+
+        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
+        comb += hwe.i.eq(r1.hit_way)
+
+        # this one is gated with write_bram, and replace_way_e can never be
+        # set at the same time.  that means that do_write can OR the outputs
+        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
+        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
+        comb += hre.i.eq(r1.req.hit_way)
+
+        # common Signals
+        do_read  = Signal()
+        wr_addr  = Signal(ROW_BITS)
+        wr_data  = Signal(WB_DATA_BITS)
+        wr_sel   = Signal(ROW_SIZE)
+        rd_addr  = Signal(ROW_BITS)
+
+        comb += do_read.eq(1) # always enable
+        comb += rd_addr.eq(early_req_row)
+
+        # Write mux:
+        #
+        # Defaults to wishbone read responses (cache refill)
+        #
+        # For timing, the mux on wr_data/sel/addr is not
+        # dependent on anything other than the current state.
+
+        with m.If(r1.write_bram):
+            # Write store data to BRAM.  This happens one
+            # cycle after the store is in r0.
+            comb += wr_data.eq(r1.req.data)
+            comb += wr_sel.eq(r1.req.byte_sel)
+            comb += wr_addr.eq(get_row(r1.req.real_addr))
+
+        with m.Else():
+            # Otherwise, we might be doing a reload or a DCBZ
+            with m.If(r1.dcbz):
+                comb += wr_data.eq(0)
+            with m.Else():
+                comb += wr_data.eq(bus.dat_r)
+            comb += wr_addr.eq(r1.store_row)
+            comb += wr_sel.eq(~0) # all 1s
+
+        # set up Cache Rams
         for i in range(NUM_WAYS):
-            do_read  = Signal(name="do_rd%d" % i)
-            rd_addr  = Signal(ROW_BITS, name="rd_addr_%d" % i)
             do_write = Signal(name="do_wr%d" % i)
-            wr_addr  = Signal(ROW_BITS, name="wr_addr_%d" % i)
-            wr_data  = Signal(WB_DATA_BITS, name="din_%d" % i)
-            wr_sel   = Signal(ROW_SIZE)
-            wr_sel_m = Signal(ROW_SIZE)
-            _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
+            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
+            d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
 
             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            m.submodules["cacheram_%d" % i] = way
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
-            comb += _d_out.eq(way.rd_data_o)
+            comb += d_out.eq(way.rd_data_o)
             comb += way.wr_sel.eq(wr_sel_m)
             comb += way.wr_addr.eq(wr_addr)
             comb += way.wr_data.eq(wr_data)
 
             # Cache hit reads
-            comb += do_read.eq(1)
-            comb += rd_addr.eq(early_req_row)
-            with m.If(r1.hit_way == i):
-                comb += cache_out_row.eq(_d_out)
-
-            # Write mux:
-            #
-            # Defaults to wishbone read responses (cache refill)
-            #
-            # For timing, the mux on wr_data/sel/addr is not
-            # dependent on anything other than the current state.
-
-            with m.If(r1.write_bram):
-                # Write store data to BRAM.  This happens one
-                # cycle after the store is in r0.
-                comb += wr_data.eq(r1.req.data)
-                comb += wr_sel.eq(r1.req.byte_sel)
-                comb += wr_addr.eq(get_row(r1.req.real_addr))
-
-                with m.If(i == r1.req.hit_way):
-                    comb += do_write.eq(1)
-            with m.Else():
-                # Otherwise, we might be doing a reload or a DCBZ
-                with m.If(r1.dcbz):
-                    comb += wr_data.eq(0)
-                with m.Else():
-                    comb += wr_data.eq(bus.dat_r)
-                comb += wr_addr.eq(r1.store_row)
-                comb += wr_sel.eq(~0) # all 1s
+            with m.If(hwe.o[i]):
+                comb += cache_out_row.eq(d_out)
 
-                with m.If((r1.state == State.RELOAD_WAIT_ACK)
-                          & bus.ack & (replace_way == i)):
-                    comb += do_write.eq(1)
+            # these are mutually-exclusive via their Decoder-enablers
+            # (note: Decoder-enable is inverted)
+            comb += do_write.eq(hre.o[i] | rwe.o[i])
 
             # Mask write selects with do_write since BRAM
             # doesn't have a global write-enable
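
The write enables rely on nmigen's Decoder driving o = (1 << i) with the n
input forcing o to all-zeroes when asserted; because hre is enabled only
when r1.write_bram is set and rwe only when it is clear, the per-way
do_write can simply OR the two one-hot outputs.  A wiring sketch of that
gating, with write_bram and reload_ack standing in for the r1/bus terms:

    from nmigen import Module, Signal
    from nmigen.lib.coding import Decoder

    m = Module()
    comb = m.d.comb

    write_bram  = Signal()   # store path active this cycle
    reload_ack  = Signal()   # refill path: RELOAD_WAIT_ACK and bus.ack
    hit_way     = Signal(2)
    replace_way = Signal(2)

    m.submodules.hre = hre = Decoder(4)
    m.submodules.rwe = rwe = Decoder(4)
    comb += [
        hre.i.eq(hit_way),     hre.n.eq(~write_bram),
        rwe.i.eq(replace_way), rwe.n.eq(~(reload_ack & ~write_bram)),
    ]

    do_write = [Signal(name="do_wr%d" % i) for i in range(4)]
    for i in range(4):
        comb += do_write[i].eq(hre.o[i] | rwe.o[i])  # exclusive one-hots
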
@@ -1277,15 +1367,17 @@ class DCache(Elaboratable):
         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
 
         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
-            with m.If(~r0.mmu_req):
-                sync += r1.ls_valid.eq(1)
-            with m.Else():
+            with m.If(r0.mmu_req):
                 sync += r1.mmu_done.eq(1)
+            with m.Else():
+                sync += r1.ls_valid.eq(1)
 
         with m.If(r1.write_tag):
             # Store new tag in selected way
+            replace_way_onehot = Signal(NUM_WAYS)
+            comb += replace_way_onehot.eq(1<<replace_way)
             for i in range(NUM_WAYS):
-                with m.If(i == replace_way):
+                with m.If(replace_way_onehot[i]):
                     ct = Signal(TAG_RAM_WIDTH)
                     comb += ct.eq(cache_tags[r1.store_index].tag)
                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
@@ -1391,10 +1483,10 @@ class DCache(Elaboratable):
                             sync += r1.full.eq(0)
                             sync += r1.slow_valid.eq(1)
 
-                            with m.If(~req.mmu_req):
-                                sync += r1.ls_valid.eq(1)
-                            with m.Else():
+                            with m.If(req.mmu_req):
                                 sync += r1.mmu_done.eq(1)
+                            with m.Else():
+                                sync += r1.ls_valid.eq(1)
 
                             with m.If(req.op == Op.OP_STORE_HIT):
                                 sync += r1.write_bram.eq(1)
@@ -1459,10 +1551,10 @@ class DCache(Elaboratable):
                                 (r1.store_row == get_row(req.real_addr))):
                         sync += r1.full.eq(0)
                         sync += r1.slow_valid.eq(1)
-                        with m.If(~r1.mmu_req):
-                            sync += r1.ls_valid.eq(1)
-                        with m.Else():
+                        with m.If(r1.mmu_req):
                             sync += r1.mmu_done.eq(1)
+                        with m.Else():
+                            sync += r1.ls_valid.eq(1)
                         sync += r1.forward_sel.eq(~0) # all 1s
                         sync += r1.use_forward1.eq(1)
 
@@ -1488,19 +1580,17 @@ class DCache(Elaboratable):
 
             with m.Case(State.STORE_WAIT_ACK):
                 st_stbs_done = Signal()
-                acks        = Signal(3)
                 adjust_acks = Signal(3)
 
                 comb += st_stbs_done.eq(~r1.wb.stb)
-                comb += acks.eq(r1.acks_pending)
 
                 with m.If(r1.inc_acks != r1.dec_acks):
                     with m.If(r1.inc_acks):
-                        comb += adjust_acks.eq(acks + 1)
+                        comb += adjust_acks.eq(r1.acks_pending + 1)
                     with m.Else():
-                        comb += adjust_acks.eq(acks - 1)
+                        comb += adjust_acks.eq(r1.acks_pending - 1)
                 with m.Else():
-                    comb += adjust_acks.eq(acks)
+                    comb += adjust_acks.eq(r1.acks_pending)
 
                 sync += r1.acks_pending.eq(adjust_acks)
 
@@ -1534,6 +1624,8 @@ class DCache(Elaboratable):
                         comb += st_stbs_done.eq(1)
 
                 # Got ack ? See if complete.
+                sync += Display("got ack %d %d stbs %d adjust_acks %d",
+                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
                 with m.If(bus.ack):
                     with m.If(st_stbs_done & (adjust_acks == 1)):
                         sync += r1.state.eq(State.IDLE)
@@ -1552,10 +1644,10 @@ class DCache(Elaboratable):
                     sync += r1.full.eq(0)
                     sync += r1.slow_valid.eq(1)
 
-                    with m.If(~r1.mmu_req):
-                        sync += r1.ls_valid.eq(1)
-                    with m.Else():
+                    with m.If(r1.mmu_req):
                         sync += r1.mmu_done.eq(1)
+                    with m.Else():
+                        sync += r1.ls_valid.eq(1)
 
                     sync += r1.forward_sel.eq(~0) # all 1s
                     sync += r1.use_forward1.eq(1)
@@ -1588,7 +1680,6 @@ class DCache(Elaboratable):
         """note: these are passed to nmigen.hdl.Memory as "attributes".
            don't know how, just that they are.
         """
-        dtlb            = TLBArray()
         # TODO attribute ram_style of
         #  dtlb_tags : signal is "distributed";
         # TODO attribute ram_style of
@@ -1625,7 +1716,7 @@ class DCache(Elaboratable):
 
         cache_out_row     = Signal(WB_DATA_BITS)
 
-        plru_victim       = PLRUOut()
+        plru_victim       = Signal(WAY_BITS)
         replace_way       = Signal(WAY_BITS)
 
         # Wishbone read/write/cache write formatting signals
@@ -1643,7 +1734,7 @@ class DCache(Elaboratable):
         perm_ok       = Signal()
         access_ok     = Signal()
 
-        tlb_plru_victim = TLBPLRUOut()
+        tlb_plru_victim = Signal(TLB_WAY_BITS)
 
         # we don't yet handle collisions between loadstore1 requests
         # and MMU requests
@@ -1657,7 +1748,8 @@ class DCache(Elaboratable):
         # deal with litex not doing wishbone pipeline mode
         # XXX in wrong way.  FIFOs are needed in the SRAM test
         # so that stb/ack match up. same thing done in icache.py
-        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+        if not self.microwatt_compat:
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
 
         # Wire up wishbone request latch out of stage 1
         comb += self.bus.we.eq(r1.wb.we)
@@ -1667,18 +1759,20 @@ class DCache(Elaboratable):
         comb += self.bus.dat_w.eq(r1.wb.dat)
         comb += self.bus.cyc.eq(r1.wb.cyc)
 
+        # create submodule TLBUpdate
+        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
+
         # call sub-functions putting everything together, using shared
         # signals established above
         self.stage_0(m, r0, r1, r0_full)
-        self.tlb_read(m, r0_stall, tlb_way, dtlb)
+        self.tlb_read(m, r0_stall, tlb_way)
         self.tlb_search(m, tlb_req_index, r0, r0_valid,
                         tlb_way,
                         pte, tlb_hit, valid_ra, perm_attr, ra)
-        self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
-                        tlb_hit, tlb_plru_victim,
-                        tlb_way)
+        self.tlb_update(m, r0_valid, r0, tlb_req_index,
+                        tlb_hit, tlb_plru_victim)
         self.maybe_plrus(m, r1, plru_victim)
-        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
+        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, replace_way,