src/soc/experiment/l0_cache.py

   1 """L0 Cache/Buffer
   2
   3 This first version is intended for prototyping and test purposes:
   4 it has "direct" access to Memory.
   5
   6 The intention is that this version remains an integral part of the
   7 test infrastructure, and, just as with minerva's memory arrangement,
   8 a dynamic runtime config *selects* alternative memory arrangements
   9 rather than *replaces and discards* this code.
  10
  11 Links:
  12
  13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
  14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
  15
  16 """
  17
  18 from nmigen.compat.sim import run_simulation, Settle
  19 from nmigen.cli import verilog, rtlil
  20 from nmigen import Module, Signal, Mux, Elaboratable, Array, Cat
  21 from nmutil.iocontrol import RecordObject
  22 from nmigen.utils import log2_int
  23 from nmigen.hdl.rec import Record, Layout
  24
  25 from nmutil.latch import SRLatch, latchregister
  26 from openpower.decoder.power_decoder2 import Data
  27 from openpower.decoder.power_enums import MicrOp
  28 from soc.regfile.regfile import ortreereduce
  29 from nmutil.util import treereduce
  30
  31 from openpower.decoder.power_decoder2 import Data
  32 #from nmutil.picker import PriorityPicker
  33 from nmigen.lib.coding import PriorityEncoder
  34 from soc.scoreboard.addr_split import LDSTSplitter
  35 from soc.scoreboard.addr_match import LenExpand
  36
  37 # for testing purposes
  38 from soc.config.test.test_loadstore import TestMemPspec
  39 from soc.config.loadstore import ConfigMemoryPortInterface
  40 from soc.experiment.pimem import PortInterface
  41 from soc.config.test.test_pi2ls import pi_ld, pi_st, pi_ldst
  42 import unittest
  43
  44 class L0CacheBuffer2(Elaboratable):
  45     """L0CacheBuffer2"""
  46     def __init__(self, n_units=8, regwid=64, addrwid=64):
  47         self.n_units = n_units
  48         self.regwid = regwid
  49         self.addrwid = addrwid
  50         ul = []
  51         for i in range(self.n_units):
  52             ul += [PortInterface()]
  53         self.dports = Array(ul)
  54
  55     def elaborate(self, platform):
  56         m = Module()
  57         comb, sync = m.d.comb, m.d.sync
  58
  59         # connect the ports as modules
  60
  61         for i in range(self.n_units):
  62             d = LDSTSplitter(64, 64, 4, self.dports[i])
  63             setattr(m.submodules, "ldst_splitter%d" % i, d)
  64
  65         # state-machine latches TODO
  66         return m
  67
  68 class DataMergerRecord(Record):
  69     """
  70     {data: 128 bit, byte_enable: 16 bit}
  71     """
  72
  73     def __init__(self, name=None):
  74         layout = (('data', 128),
  75                   ('en', 16))
  76         Record.__init__(self, Layout(layout), name=name)
  77
  78         self.data.reset_less = True
  79         self.en.reset_less = True
  80
  81 class CacheRecord(Record):
  82     def __init__(self, name=None):
  83         layout = (('addr', 37),
  84                   ('a_even', 7),
  85                   ('bytemask_even', 16),
  86                   ('data_even', 128),
  87                   ('a_odd', 7),
  88                   ('bytemask_odd', 16),
  89                   ('data_odd', 128))
  90         Record.__init__(self, Layout(layout), name=name)
  91
  92         self.addr.reset_less = True
  93         self.a_even.reset_less = True
  94         self.bytemask_even.reset_less = True
  95         self.data_even.reset_less = True
  96         self.a_odd.reset_less = True
  97         self.bytemask_odd.reset_less = True
  98         self.data_odd.reset_less = True
  99
 100
 101
 102 # TODO: formal verification
 103 class DataMerger(Elaboratable):
 104     """DataMerger
 105
 106     Merges data based on an address-match matrix.
 107     Identifies (picks) one (any) row, then uses that row,
 108     based on matching address bits, to merge (OR) all data
 109     rows into the output.
 110
 111     Basically, by the time DataMerger is used, all of its incoming data is
 112     determined not to conflict.  The last step before actually submitting
 113     the request to the Memory Subsystem is to work out which requests,
 114     on the same 128-bit cache line, can be "merged" due to them being:
 115     (A) on the same address (bits 4 and above) (B) having byte-enable
 116     lines that (as previously mentioned) do not conflict.
 117
 118     Therefore, put simply, this module will:
 119     (1) pick a row (any row) and identify it by an index labelled "idx"
 120     (2) merge all byte-enable lines which are on that same address, as
 121         indicated by addr_match_i[idx], onto the output
 122     """
 123
 124     def __init__(self, array_size):
 125         """
 126         :addr_array_i: an NxN Array of Signals with bits set indicating address
 127                        match.  bits across the diagonal (addr_array_i[x][x])
 128                        will always be set, to indicate "active".
 129         :i_data: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
 130         :o_data: an Output Record of same type
 131                  {data: 128 bit, byte_enable: 16 bit}
 132         """
 133         self.array_size = array_size
 134         ul = []
 135         for i in range(array_size):
 136             ul.append(Signal(array_size,
 137                              reset_less=True,
 138                              name="addr_match_%d" % i))
 139         self.addr_array_i = Array(ul)
 140
 141         ul = []
 142         for i in range(array_size):
 143             ul.append(DataMergerRecord())
 144         self.i_data = Array(ul)
 145         self.o_data = DataMergerRecord()
 146
 147     def elaborate(self, platform):
 148         m = Module()
 149         comb = m.d.comb
 150         # (1) pick a row
 151         m.submodules.pick = pick = PriorityEncoder(self.array_size)
 152         for j in range(self.array_size):
 153             comb += pick.i[j].eq(self.addr_array_i[j].bool())
 154         valid = ~pick.n
 155         idx = pick.o
 156         # (2) merge
 157         with m.If(valid):
 158             l = []
 159             for j in range(self.array_size):
 160                 select = self.addr_array_i[idx][j]
 161                 r = DataMergerRecord()
 162                 with m.If(select):
 163                     comb += r.eq(self.i_data[j])
 164                 l.append(r)
 165             comb += self.o_data.data.eq(ortreereduce(l, "data"))
 166             comb += self.o_data.en.eq(ortreereduce(l, "en"))
 167
 168         return m
 169
 170 class TstDataMerger2(Elaboratable):
 171     def __init__(self):
 172         self.data_odd = Signal(128,reset_less=True)
 173         self.data_even = Signal(128,reset_less=True)
 174         self.n_units = 8
 175         ul = []
 176         for i in range(self.n_units):
 177             ul.append(CacheRecord())
 178         self.input_array = Array(ul)
 179
 180     def addr_match(self,j,addr):
 181         ret = []
 182         for k in range(self.n_units):
 183             ret += [(addr[j] == addr[k])]
 184         return Cat(*ret)
 185
 186     def elaborate(self, platform):
 187         m = Module()
 188         m.submodules.dm_odd = dm_odd = DataMerger(self.n_units)
 189         m.submodules.dm_even = dm_even = DataMerger(self.n_units)
 190
 191         addr_even = []
 192         addr_odd = []
 193         for j in range(self.n_units):
 194             inp = self.input_array[j]
 195             addr_even += [Cat(inp.addr,inp.a_even)]
 196             addr_odd +=  [Cat(inp.addr,inp.a_odd)]
 197
 198         for j in range(self.n_units):
 199             inp = self.input_array[j]
 200             m.d.comb += dm_even.i_data[j].en.eq(inp.bytemask_even)
 201             m.d.comb += dm_odd.i_data[j].en.eq(inp.bytemask_odd)
 202             m.d.comb += dm_even.i_data[j].data.eq(inp.data_even)
 203             m.d.comb += dm_odd.i_data[j].data.eq(inp.data_odd)
 204             m.d.comb += dm_even.addr_array_i[j].eq(self.addr_match(j,addr_even))
 205             m.d.comb += dm_odd.addr_array_i[j].eq(self.addr_match(j,addr_odd))
 206
 207         m.d.comb += self.data_odd.eq(dm_odd.o_data.data)
 208         m.d.comb += self.data_even.eq(dm_even.o_data.data)
 209         return m
 210
 211
class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per LDSTCompUnit,
    to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
    interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 0).

    This version is to be used for test purposes (and actively maintained
    for such, rather than "replaced")

    There are much better ways to implement this.  However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, where a nmigen FSM's state-changes only activate
    on clock-sync boundaries.

    Note: the data byte-order is *not* expected to be normalised (LE/BE)
    by this class.  That task is taken care of by LDSTCompUnit.

    Constructor arguments:

    * n_units: number of PortInterfaces to create (exposed as ``dports``)
    * pimem: memory-side object; elaborate() uses its ``connect_port()``
      method and reads ``pimem.pi.busy_o``
    * regwid, addrwid: widths passed through to each PortInterface
    """

    def __init__(self, n_units, pimem, regwid=64, addrwid=64):
        self.n_units = n_units
        self.pimem = pimem
        self.regwid = regwid
        self.addrwid = addrwid
        # one PortInterface per unit, named "ldst_port<N>"
        ul = []
        for i in range(n_units):
            ul.append(PortInterface("ldst_port%d" % i, regwid, addrwid))
        self.dports = Array(ul)

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        # for i in range(self.n_units):
        #    setattr(m.submodules, "port%d" % i, self.dports[i])

        # state-machine latches
        # idx_l: set while a port is picked; reset_l: requests its release
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")

        # find one LD (or ST) and do it.  only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class

        m.submodules.pick = pick = PriorityEncoder(self.n_units)

        # a port is requesting when either its is_ld_i or is_st_i is set
        ldsti = []
        for i in range(self.n_units):
            pi = self.dports[i]
            busy = (pi.is_ld_i | pi.is_st_i)  # & pi.busy_o
            ldsti.append(busy)  # accumulate ld/st-req
        # put the requests into the priority-picker
        comb += pick.i.eq(Cat(*ldsti))

        # hmm, have to select (record) the right port index
        # (log2_int's second argument is need_pow2: n_units need not be
        # a power of two)
        nbits = log2_int(self.n_units, False)
        idx = Signal(nbits, reset_less=False)

        # use these because of the sync-and-comb pass-through capability
        latchregister(m, pick.o, idx, idx_l.q, name="idx_l")

        # convenience variables to reference the "picked" port
        port = self.dports[idx]

        # pick (and capture) the port index
        # (pick.n is low when at least one request is present)
        with m.If(~pick.n):
            comb += idx_l.s.eq(1)

        # from this point onwards, with the port "picked", it stays picked
        # until idx_l is deasserted
        # defaults for reset_l; the If-blocks below override them, so the
        # order of these statements matters
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)

        # while a port is picked, connect it through to the memory side
        with m.If(idx_l.q):
            comb += self.pimem.connect_port(port)
            with m.If(~self.pimem.pi.busy_o):
                comb += reset_l.s.eq(1)  # reset when no longer busy

        # ugly hack, due to simultaneous addr req-go acknowledge
        # NOTE(review): reset_delay is assigned here but not read anywhere
        # in this method - confirm whether it is still needed
        reset_delay = Signal(reset_less=True)
        sync += reset_delay.eq(reset_l.q)

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.r.eq(1)  # deactivate port-index selector
            comb += reset_l.r.eq(1)     # clear reset

        return m

    def __iter__(self):
        # yield every entry of each data port's ports() in port order
        for p in self.dports:
            yield from p.ports()

    def ports(self):
        # flat list of everything produced by __iter__
        return list(self)
 308
 309
 310 class TstL0CacheBuffer(Elaboratable):
 311     def __init__(self, pspec, n_units=3):
 312         self.pspec = pspec
 313         regwid = pspec.reg_wid
 314         addrwid = pspec.addr_wid
 315         self.cmpi = ConfigMemoryPortInterface(pspec)
 316         self.pimem = self.cmpi.pi
 317         self.l0 = L0CacheBuffer(n_units, self.pimem, regwid, addrwid << 1)
 318
 319     def elaborate(self, platform):
 320         m = Module()
 321         m.submodules.pimem = self.pimem
 322         m.submodules.l0 = self.l0
 323
 324         if not hasattr(self.cmpi, 'lsmem'):
 325             return m
 326
 327         # really bad hack, the LoadStore1 classes already have the
 328         # lsi (LoadStoreInterface) as a submodule.
 329         if self.pspec.ldst_ifacetype in ['mmu_cache_wb', 'test_mmu_cache_wb']:
 330             return m
 331
 332         # hmmm not happy about this - should not be digging down and
 333         # putting modules in
 334         m.submodules.lsmem = self.cmpi.lsmem.lsi
 335
 336         return m
 337
 338     def ports(self):
 339         yield from self.cmpi.ports()
 340         yield from self.l0.ports()
 341         yield from self.pimem.ports()
 342
 343
 344 def wait_busy(port, no=False):
 345     while True:
 346         busy = yield port.busy_o
 347         print("busy", no, busy)
 348         if bool(busy) == no:
 349             break
 350         yield
 351
 352
 353 def wait_addr(port):
 354     while True:
 355         addr_ok = yield port.addr_ok_o
 356         print("addrok", addr_ok)
 357         if not addr_ok:
 358             break
 359         yield
 360
 361
 362 def wait_ldok(port):
 363     while True:
 364         ldok = yield port.ld.ok
 365         print("ldok", ldok)
 366         if ldok:
 367             break
 368         yield
 369
 370
 371 def l0_cache_st(dut, addr, data, datalen):
 372     return pi_st(dut.l0, addr, datalen)
 373
 374
 375 def l0_cache_ld(dut, addr, datalen, expected):
 376     return pi_ld(dut.l0, addr, datalen)
 377
 378
 379 def l0_cache_ldst(arg, dut):
 380     port0 = dut.l0.dports[0]
 381     return pi_ldst(arg, port0)
 382
 383
 384 def data_merger_merge(dut):
 385     # starting with all inputs zero
 386     yield Settle()
 387     en = yield dut.o_data.en
 388     data = yield dut.o_data.data
 389     assert en == 0, "en must be zero"
 390     assert data == 0, "data must be zero"
 391     yield
 392
 393     yield dut.addr_array_i[0].eq(0xFF)
 394     for j in range(dut.array_size):
 395         yield dut.i_data[j].en.eq(1 << j)
 396         yield dut.i_data[j].data.eq(0xFF << (16*j))
 397     yield Settle()
 398
 399     en = yield dut.o_data.en
 400     data = yield dut.o_data.data
 401     assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
 402     assert en == 0xff
 403     yield
 404
 405 def data_merger_test2(dut):
 406     # starting with all inputs zero
 407     yield Settle()
 408     yield
 409     yield
 410
 411
 412 class TestL0Cache(unittest.TestCase):
 413
 414     def test_l0_cache_test_bare_wb(self):
 415
 416         pspec = TestMemPspec(ldst_ifacetype='test_bare_wb',
 417                              addr_wid=64,
 418                              mask_wid=8,
 419                              reg_wid=64)
 420         dut = TstL0CacheBuffer(pspec)
 421         vl = rtlil.convert(dut, ports=[])  # TODOdut.ports())
 422         with open("test_basic_l0_cache_bare_wb.il", "w") as f:
 423             f.write(vl)
 424
 425         run_simulation(dut, l0_cache_ldst(self, dut),
 426                        vcd_name='test_l0_cache_basic_bare_wb.vcd')
 427
 428     def test_l0_cache_testpi(self):
 429
 430         pspec = TestMemPspec(ldst_ifacetype='testpi',
 431                              addr_wid=64,
 432                              mask_wid=8,
 433                              reg_wid=64)
 434         dut = TstL0CacheBuffer(pspec)
 435         vl = rtlil.convert(dut, ports=[])  # TODOdut.ports())
 436         with open("test_basic_l0_cache.il", "w") as f:
 437             f.write(vl)
 438
 439         run_simulation(dut, l0_cache_ldst(self, dut),
 440                        vcd_name='test_l0_cache_basic_testpi.vcd')
 441
 442
 443 class TestDataMerger(unittest.TestCase):
 444
 445     def test_data_merger(self):
 446
 447         dut = TstDataMerger2()
 448         #vl = rtlil.convert(dut, ports=dut.ports())
 449         # with open("test_data_merger.il", "w") as f:
 450         #    f.write(vl)
 451
 452         run_simulation(dut, data_merger_test2(dut),
 453                        vcd_name='test_data_merger.vcd')
 454
 455
 456
 457 class TestDualPortSplitter(unittest.TestCase):
 458
 459     def test_dual_port_splitter(self):
 460
 461         dut = DualPortSplitter()
 462         #vl = rtlil.convert(dut, ports=dut.ports())
 463         # with open("test_data_merger.il", "w") as f:
 464         #    f.write(vl)
 465
 466         # run_simulation(dut, data_merger_merge(dut),
 467         #               vcd_name='test_dual_port_splitter.vcd')
 468
 469
# when executed as a script, run every unittest.TestCase in this module
if __name__ == '__main__':
    unittest.main()