src/soc/experiment/l0_cache.py

   1 """L0 Cache/Buffer
   2
   3 This first version is intended for prototyping and test purposes:
   4 it has "direct" access to Memory.
   5
   6 The intention is that this version remains an integral part of the
   7 test infrastructure, and, just as with minerva's memory arrangement,
   8 a dynamic runtime config *selects* alternative memory arrangements
   9 rather than *replaces and discards* this code.
  10
  11 Links:
  12
  13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
  14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
  15
  16 """
  17
  18 from nmigen.compat.sim import run_simulation, Settle
  19 from nmigen.cli import verilog, rtlil
  20 from nmigen import Module, Signal, Mux, Elaboratable, Array, Cat
  21 from nmutil.iocontrol import RecordObject
  22 from nmigen.utils import log2_int
  23 from nmigen.hdl.rec import Record, Layout
  24
  25 from nmutil.latch import SRLatch, latchregister
  26 from soc.decoder.power_decoder2 import Data
  27 from soc.decoder.power_enums import InternalOp
  28 from soc.regfile.regfile import ortreereduce
  29 from nmutil.util import treereduce
  30
  31 from soc.decoder.power_decoder2 import Data
  32 #from nmutil.picker import PriorityPicker
  33 from nmigen.lib.coding import PriorityEncoder
  34 from soc.scoreboard.addr_split import LDSTSplitter
  35 from soc.scoreboard.addr_match import LenExpand
  36
  37 # for testing purposes
  38 from soc.config.test.test_loadstore import TestMemPspec
  39 from soc.config.loadstore import ConfigMemoryPortInterface
  40 from soc.experiment.pimem import PortInterface
  41 from soc.config.test.test_pi2ls import pi_ld, pi_st, pi_ldst
  42 import unittest
  43
  44
  45 class DualPortSplitter(Elaboratable):
  46     """DualPortSplitter
  47
  48     * one incoming PortInterface
  49     * two *OUTGOING* PortInterfaces
  50     * uses LDSTSplitter to do it
  51
  52     (actually, thinking about it LDSTSplitter could simply be
  53      modified to conform to PortInterface: one in, two out)
  54
  55     once that is done each pair of ports may be wired directly
  56     to the dual ports of L0CacheBuffer
  57
  58     The split is carried out so that, regardless of alignment or
  59     mis-alignment, outgoing PortInterface[0] takes bit 4 == 0
  60     of the address, whilst outgoing PortInterface[1] takes
  61     bit 4 == 1.
  62
  63     PortInterface *may* need to be changed so that the length is
  64     a binary number (accepting values 1-16).
  65     """
  66     def __init__(self):
  67         self.outp = [PortInterface(name="outp_0"),
  68                      PortInterface(name="outp_1")]
  69         self.inp  = PortInterface(name="inp")
  70         print(self.outp)
  71
  72     def elaborate(self, platform):
  73         m = Module()
  74         comb = m.d.comb
  75         m.submodules.splitter = splitter = LDSTSplitter(64, 48, 4)
  76         comb += splitter.addr_i.eq(self.inp.addr) #XXX
  77         #comb += splitter.len_i.eq()
  78         #comb += splitter.valid_i.eq()
  79         comb += splitter.is_ld_i.eq(self.inp.is_ld_i)
  80         comb += splitter.is_st_i.eq(self.inp.is_st_i)
  81         #comb += splitter.st_data_i.eq()
  82         #comb += splitter.sld_valid_i.eq()
  83         #comb += splitter.sld_data_i.eq()
  84         #comb += splitter.sst_valid_i.eq()
  85         return m
  86
  87
  88 class DataMergerRecord(Record):
  89     """
  90     {data: 128 bit, byte_enable: 16 bit}
  91     """
  92
  93     def __init__(self, name=None):
  94         layout = (('data', 128),
  95                   ('en', 16))
  96         Record.__init__(self, Layout(layout), name=name)
  97
  98         self.data.reset_less = True
  99         self.en.reset_less = True
 100
 101
 102 # TODO: formal verification
 103 class DataMerger(Elaboratable):
 104     """DataMerger
 105
 106     Merges data based on an address-match matrix.
 107     Identifies (picks) one (any) row, then uses that row,
 108     based on matching address bits, to merge (OR) all data
 109     rows into the output.
 110
 111     Basically, by the time DataMerger is used, all of its incoming data is
 112     determined not to conflict.  The last step before actually submitting
 113     the request to the Memory Subsystem is to work out which requests,
 114     on the same 128-bit cache line, can be "merged" due to them being:
 115     (A) on the same address (bits 4 and above) (B) having byte-enable
 116     lines that (as previously mentioned) do not conflict.
 117
 118     Therefore, put simply, this module will:
 119     (1) pick a row (any row) and identify it by an index labelled "idx"
 120     (2) merge all byte-enable lines which are on that same address, as
 121         indicated by addr_match_i[idx], onto the output
 122     """
 123
 124     def __init__(self, array_size):
 125         """
 126         :addr_array_i: an NxN Array of Signals with bits set indicating address
 127                        match.  bits across the diagonal (addr_array_i[x][x])
 128                        will always be set, to indicate "active".
 129         :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
 130         :data_o: an Output Record of same type
 131                  {data: 128 bit, byte_enable: 16 bit}
 132         """
 133         self.array_size = array_size
 134         ul = []
 135         for i in range(array_size):
 136             ul.append(Signal(array_size,
 137                              reset_less=True,
 138                              name="addr_match_%d" % i))
 139         self.addr_array_i = Array(ul)
 140
 141         ul = []
 142         for i in range(array_size):
 143             ul.append(DataMergerRecord())
 144         self.data_i = Array(ul)
 145         self.data_o = DataMergerRecord()
 146
 147     def elaborate(self, platform):
 148         m = Module()
 149         comb = m.d.comb
 150         #(1) pick a row
 151         m.submodules.pick = pick = PriorityEncoder(self.array_size)
 152         for j in range(self.array_size):
 153             comb += pick.i[j].eq(self.addr_array_i[j].bool())
 154         valid = ~pick.n
 155         idx = pick.o
 156         #(2) merge
 157         with m.If(valid):
 158             l = []
 159             for j in range(self.array_size):
 160                 select = self.addr_array_i[idx][j]
 161                 r = DataMergerRecord()
 162                 with m.If(select):
 163                     comb += r.eq(self.data_i[j])
 164                 l.append(r)
 165             comb += self.data_o.data.eq(ortreereduce(l,"data"))
 166             comb += self.data_o.en.eq(ortreereduce(l,"en"))
 167
 168         return m
 169
 170
class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per LDSTCompUnit,
    to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
    interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 0).

    This version is to be used for test purposes (and actively maintained
    for such, rather than "replaced")

    There are much better ways to implement this.  However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, where a nmigen FSM's state-changes only activate
    on clock-sync boundaries.

    Note: the data byte-order is *not* expected to be normalised (LE/BE)
    by this class.  That task is taken care of by LDSTCompUnit.
    """

    def __init__(self, n_units, pimem, regwid=64, addrwid=48):
        """
        :n_units: number of LD/ST PortInterfaces to create (self.dports)
        :pimem:   memory-side object; elaborate() uses its connect_port()
                  method and its pi.busy_o signal
        :regwid:  passed through to each PortInterface
        :addrwid: passed through to each PortInterface
        """
        self.n_units = n_units
        self.pimem = pimem
        self.regwid = regwid
        self.addrwid = addrwid
        # one PortInterface per unit, named ldst_port0, ldst_port1, ...
        ul = []
        for i in range(n_units):
            ul.append(PortInterface("ldst_port%d" % i, regwid, addrwid))
        self.dports = Array(ul)

    def elaborate(self, platform):
        """Pick one requesting port, connect it to pimem, release it afterwards.

        A port counts as requesting when its is_ld_i or is_st_i is set.
        The picked index is captured under control of the idx_l latch; while
        idx_l is set the picked port is connected to pimem, and once
        pimem.pi.busy_o is low the reset_l latch is set, which in turn
        clears both idx_l and reset_l.
        """
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        #for i in range(self.n_units):
        #    setattr(m.submodules, "port%d" % i, self.dports[i])

        # state-machine latches
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")

        # find one LD (or ST) and do it.  only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class

        m.submodules.pick = pick = PriorityEncoder(self.n_units)

        # one request bit per port: set when the port asks for a LD or a ST
        ldsti = []
        for i in range(self.n_units):
            pi = self.dports[i]
            busy = (pi.is_ld_i | pi.is_st_i)# & pi.busy_o
            ldsti.append(busy) # accumulate ld/st-req
        # put the requests into the priority-picker
        comb += pick.i.eq(Cat(*ldsti))

        # hmm, have to select (record) the right port index
        nbits = log2_int(self.n_units, False)
        idx = Signal(nbits, reset_less=False)

        # use these because of the sync-and-comb pass-through capability
        # NOTE(review): presumably idx follows pick.o and is held once
        # idx_l.q is set - confirm against nmutil.latch.latchregister
        latchregister(m, pick.o, idx, idx_l.q, name="idx_l")

        # convenience variables to reference the "picked" port
        port = self.dports[idx]

        # pick (and capture) the port index
        with m.If(~pick.n):
            comb += idx_l.s.eq(1)

        # from this point onwards, with the port "picked", it stays picked
        # until idx_l is deasserted
        # (defaults for the reset latch; conditionally overridden below)
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)

        with m.If(idx_l.q):
            comb += self.pimem.connect_port(port)
            with m.If(~self.pimem.pi.busy_o):
                comb += reset_l.s.eq(1) # reset when no longer busy

        # ugly hack, due to simultaneous addr req-go acknowledge
        # NOTE(review): reset_delay is assigned here but not read anywhere
        # else in this method
        reset_delay = Signal(reset_less=True)
        sync += reset_delay.eq(reset_l.q)

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.r.eq(1)  # deactivate port-index selector
            comb += reset_l.r.eq(1)     # clear reset

        return m

    def ports(self):
        """Yield the ports of every PortInterface in self.dports, in order."""
        for p in self.dports:
            yield from p.ports()
 264
 265
 266 class TstL0CacheBuffer(Elaboratable):
 267     def __init__(self, pspec, n_units=3):
 268         regwid = pspec.reg_wid
 269         addrwid = pspec.addr_wid
 270         self.cmpi = ConfigMemoryPortInterface(pspec)
 271         self.pimem = self.cmpi.pi
 272         self.l0 = L0CacheBuffer(n_units, self.pimem, regwid, addrwid<<1)
 273
 274     def elaborate(self, platform):
 275         m = Module()
 276         m.submodules.pimem = self.pimem
 277         m.submodules.l0 = self.l0
 278         if hasattr(self.cmpi, 'lsmem'): # hmmm not happy about this
 279             m.submodules.lsmem = self.cmpi.lsmem.lsi
 280
 281         return m
 282
 283     def ports(self):
 284         yield from self.cmpi.ports()
 285         yield from self.l0.ports()
 286         yield from self.pimem.ports()
 287
 288
 289 def wait_busy(port, no=False):
 290     while True:
 291         busy = yield port.busy_o
 292         print("busy", no, busy)
 293         if bool(busy) == no:
 294             break
 295         yield
 296
 297
 298 def wait_addr(port):
 299     while True:
 300         addr_ok = yield port.addr_ok_o
 301         print("addrok", addr_ok)
 302         if not addr_ok:
 303             break
 304         yield
 305
 306
 307 def wait_ldok(port):
 308     while True:
 309         ldok = yield port.ld.ok
 310         print("ldok", ldok)
 311         if ldok:
 312             break
 313         yield
 314
 315
 316 def l0_cache_st(dut, addr, data, datalen):
 317     return pi_st(dut.l0, addr, datalen)
 318
 319
 320 def l0_cache_ld(dut, addr, datalen, expected):
 321     return pi_ld(dut.l0, addr, datalen)
 322
 323
 324 def l0_cache_ldst(arg, dut):
 325     port0 = dut.l0.dports[0]
 326     return pi_ldst(arg, port0)
 327
 328
 329 def data_merger_merge(dut):
 330     print("data_merger")
 331     #starting with all inputs zero
 332     yield Settle()
 333     en = yield dut.data_o.en
 334     data = yield dut.data_o.data
 335     assert en == 0, "en must be zero"
 336     assert data == 0, "data must be zero"
 337     yield
 338
 339     yield dut.addr_array_i[0].eq(0xFF)
 340     for j in range(dut.array_size):
 341         yield dut.data_i[j].en.eq(1 << j)
 342         yield dut.data_i[j].data.eq(0xFF << (16*j))
 343     yield Settle()
 344
 345     en = yield dut.data_o.en
 346     data = yield dut.data_o.data
 347     assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
 348     assert en == 0xff
 349     yield
 350
 351
 352 class TestL0Cache(unittest.TestCase):
 353
 354     def test_l0_cache_test_bare_wb(self):
 355
 356         pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
 357                              addr_wid=48,
 358                              mask_wid=8,
 359                              reg_wid=64)
 360         dut = TstL0CacheBuffer(pspec)
 361         vl = rtlil.convert(dut, ports=[])# TODOdut.ports())
 362         with open("test_basic_l0_cache_bare_wb.il", "w") as f:
 363             f.write(vl)
 364
 365         run_simulation(dut, l0_cache_ldst(self, dut),
 366                        vcd_name='test_l0_cache_basic_bare_wb.vcd')
 367
 368     def test_l0_cache_testpi(self):
 369
 370         pspec = TestMemPspec(ldst_ifacetype='testpi',
 371                              addr_wid=48,
 372                              mask_wid=8,
 373                              reg_wid=64)
 374         dut = TstL0CacheBuffer(pspec)
 375         vl = rtlil.convert(dut, ports=[])# TODOdut.ports())
 376         with open("test_basic_l0_cache.il", "w") as f:
 377             f.write(vl)
 378
 379         run_simulation(dut, l0_cache_ldst(self, dut),
 380                        vcd_name='test_l0_cache_basic_testpi.vcd')
 381
 382
 383 class TestDataMerger(unittest.TestCase):
 384
 385     def test_data_merger(self):
 386
 387         dut = DataMerger(8)
 388         #vl = rtlil.convert(dut, ports=dut.ports())
 389         #with open("test_data_merger.il", "w") as f:
 390         #    f.write(vl)
 391
 392         run_simulation(dut, data_merger_merge(dut),
 393                        vcd_name='test_data_merger.vcd')
 394
 395
 396 class TestDualPortSplitter(unittest.TestCase):
 397
 398     def test_dual_port_splitter(self):
 399
 400         dut = DualPortSplitter()
 401         #vl = rtlil.convert(dut, ports=dut.ports())
 402         #with open("test_data_merger.il", "w") as f:
 403         #    f.write(vl)
 404
 405         #run_simulation(dut, data_merger_merge(dut),
 406         #               vcd_name='test_dual_port_splitter.vcd')
 407
 408
# run every TestCase in this file when executed as a script; exit=False
# keeps unittest from calling sys.exit() when the run finishes
if __name__ == '__main__':
    unittest.main(exit=False)
 411