double the number of lines in the L1 D/I-Cache to match microwatt
[soc.git] / src / soc / experiment / dcache.py
index b3b3fd57b954b7c24e8443bbaa39e0d9ab14a7bb..a828e3c3c2137b05e3197ef54e2c69bc2a80496e 100644 (file)
@@ -13,6 +13,8 @@ Links:
 
 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
 * https://bugs.libre-soc.org/show_bug.cgi?id=469
+* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
+  (discussion about brams for ECP5)
 
 """
 
@@ -49,8 +51,8 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBIOMasterOut, WBIOSlaveOut)
 
 from soc.experiment.cache_ram import CacheRam
-#from soc.experiment.plru import PLRU
-from nmutil.plru import PLRU, PLRUs
+from soc.experiment.plru import PLRU, PLRUs
+#from nmutil.plru import PLRU, PLRUs
 
 # for test
 from soc.bus.sram import SRAM
@@ -66,7 +68,7 @@ from nmutil.util import wrap
 
 # TODO: make these parameters of DCache at some point
 LINE_SIZE = 64    # Line size in bytes
-NUM_LINES = 16    # Number of lines in a set
+NUM_LINES = 32    # Number of lines in a set
 NUM_WAYS = 4      # Number of ways
 TLB_SET_SIZE = 64 # L1 DTLB entries per set
 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
@@ -160,9 +162,10 @@ TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 print ("    TAG_WIDTH", TAG_WIDTH)
 print ("     NUM_WAYS", NUM_WAYS)
+print ("    NUM_LINES", NUM_LINES)
 
 def CacheTagArray():
-    tag_layout = [('valid', 1),
+    tag_layout = [('valid', NUM_WAYS),
                   ('tag', TAG_RAM_WIDTH),
                  ]
     return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
@@ -625,14 +628,15 @@ class DCachePendingHit(Elaboratable):
         with m.If(virt_mode):
             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
-                s_hit       = Signal()
-                s_pte       = Signal(TLB_PTE_BITS)
-                s_ra        = Signal(REAL_ADDR_BITS)
+                s_hit       = Signal(name="s_hit%d" % j)
+                s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
+                s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
+                # read the PTE, calc the Real Address, get tge tag
                 comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                 comb += s_tag.eq(get_tag(s_ra))
-
+                # for each way check tge tag against the cache tag set
                 for i in range(NUM_WAYS): # way_t
                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                     comb += is_tag_hit.eq(go & cache_i_validdx[i] &
@@ -642,8 +646,7 @@ class DCachePendingHit(Elaboratable):
                         comb += hit_way_set[j].eq(i)
                         comb += s_hit.eq(1)
                 comb += hit_set[j].eq(s_hit)
-                with m.If(s_tag == reload_tag):
-                    comb += rel_matches[j].eq(1)
+                comb += rel_matches[j].eq(s_tag == reload_tag)
             with m.If(tlb_hit.valid):
                 comb += is_hit.eq(hit_set[tlb_hit.way])
                 comb += hit_way.eq(hit_way_set[tlb_hit.way])
@@ -673,7 +676,7 @@ class DCache(Elaboratable):
       at the end of line (this requires dealing with requests coming in
       while not idle...)
     """
-    def __init__(self):
+    def __init__(self, pspec=None):
         self.d_in      = LoadStore1ToDCacheType("d_in")
         self.d_out     = DCacheToLoadStore1Type("d_out")
 
@@ -692,6 +695,10 @@ class DCache(Elaboratable):
 
         self.log_out   = Signal(20)
 
+        # test if microwatt compatibility is to be enabled
+        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
+                                 (pspec.microwatt_compat == True))
+
     def stage_0(self, m, r0, r1, r0_full):
         """Latch the request in r0.req as long as we're not stalling
         """
@@ -730,6 +737,7 @@ class DCache(Elaboratable):
             comb += r.doall.eq(0)
             comb += r.tlbld.eq(0)
             comb += r.mmu_req.eq(0)
+
         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
             sync += r0.eq(r)
             sync += r0_full.eq(r.req.valid)
@@ -1230,7 +1238,7 @@ class DCache(Elaboratable):
             d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
 
             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
-            setattr(m.submodules, "cacheram_%d" % i, way)
+            m.submodules["cacheram_%d" % i] = way
 
             comb += way.rd_en.eq(do_read)
             comb += way.rd_addr.eq(rd_addr)
@@ -1359,10 +1367,10 @@ class DCache(Elaboratable):
         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
 
         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
-            with m.If(~r0.mmu_req):
-                sync += r1.ls_valid.eq(1)
-            with m.Else():
+            with m.If(r0.mmu_req):
                 sync += r1.mmu_done.eq(1)
+            with m.Else():
+                sync += r1.ls_valid.eq(1)
 
         with m.If(r1.write_tag):
             # Store new tag in selected way
@@ -1475,10 +1483,10 @@ class DCache(Elaboratable):
                             sync += r1.full.eq(0)
                             sync += r1.slow_valid.eq(1)
 
-                            with m.If(~req.mmu_req):
-                                sync += r1.ls_valid.eq(1)
-                            with m.Else():
+                            with m.If(req.mmu_req):
                                 sync += r1.mmu_done.eq(1)
+                            with m.Else():
+                                sync += r1.ls_valid.eq(1)
 
                             with m.If(req.op == Op.OP_STORE_HIT):
                                 sync += r1.write_bram.eq(1)
@@ -1543,10 +1551,10 @@ class DCache(Elaboratable):
                                 (r1.store_row == get_row(req.real_addr))):
                         sync += r1.full.eq(0)
                         sync += r1.slow_valid.eq(1)
-                        with m.If(~r1.mmu_req):
-                            sync += r1.ls_valid.eq(1)
-                        with m.Else():
+                        with m.If(r1.mmu_req):
                             sync += r1.mmu_done.eq(1)
+                        with m.Else():
+                            sync += r1.ls_valid.eq(1)
                         sync += r1.forward_sel.eq(~0) # all 1s
                         sync += r1.use_forward1.eq(1)
 
@@ -1572,19 +1580,17 @@ class DCache(Elaboratable):
 
             with m.Case(State.STORE_WAIT_ACK):
                 st_stbs_done = Signal()
-                acks        = Signal(3)
                 adjust_acks = Signal(3)
 
                 comb += st_stbs_done.eq(~r1.wb.stb)
-                comb += acks.eq(r1.acks_pending)
 
                 with m.If(r1.inc_acks != r1.dec_acks):
                     with m.If(r1.inc_acks):
-                        comb += adjust_acks.eq(acks + 1)
+                        comb += adjust_acks.eq(r1.acks_pending + 1)
                     with m.Else():
-                        comb += adjust_acks.eq(acks - 1)
+                        comb += adjust_acks.eq(r1.acks_pending - 1)
                 with m.Else():
-                    comb += adjust_acks.eq(acks)
+                    comb += adjust_acks.eq(r1.acks_pending)
 
                 sync += r1.acks_pending.eq(adjust_acks)
 
@@ -1618,6 +1624,8 @@ class DCache(Elaboratable):
                         comb += st_stbs_done.eq(1)
 
                 # Got ack ? See if complete.
+                sync += Display("got ack %d %d stbs %d adjust_acks %d",
+                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
                 with m.If(bus.ack):
                     with m.If(st_stbs_done & (adjust_acks == 1)):
                         sync += r1.state.eq(State.IDLE)
@@ -1636,10 +1644,10 @@ class DCache(Elaboratable):
                     sync += r1.full.eq(0)
                     sync += r1.slow_valid.eq(1)
 
-                    with m.If(~r1.mmu_req):
-                        sync += r1.ls_valid.eq(1)
-                    with m.Else():
+                    with m.If(r1.mmu_req):
                         sync += r1.mmu_done.eq(1)
+                    with m.Else():
+                        sync += r1.ls_valid.eq(1)
 
                     sync += r1.forward_sel.eq(~0) # all 1s
                     sync += r1.use_forward1.eq(1)
@@ -1740,7 +1748,8 @@ class DCache(Elaboratable):
         # deal with litex not doing wishbone pipeline mode
         # XXX in wrong way.  FIFOs are needed in the SRAM test
         # so that stb/ack match up. same thing done in icache.py
-        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
+        if not self.microwatt_compat:
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
 
         # Wire up wishbone request latch out of stage 1
         comb += self.bus.we.eq(r1.wb.we)