src/soc/experiment/l0_cache.py

   1 """L0 Cache/Buffer
   2
   3 This first version is intended for prototyping and test purposes:
   4 it has "direct" access to Memory.
   5
   6 The intention is that this version remains an integral part of the
   7 test infrastructure, and, just as with minerva's memory arrangement,
   8 a dynamic runtime config *selects* alternative memory arrangements
   9 rather than *replaces and discards* this code.
  10
  11 Links:
  12
  13 * https://bugs.libre-soc.org/show_bug.cgi?id=216
  14 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/
  15
  16 """
  17
  18 from nmigen.compat.sim import run_simulation, Settle
  19 from nmigen.cli import verilog, rtlil
  20 from nmigen import Module, Signal, Mux, Elaboratable, Array, Cat
  21 from nmutil.iocontrol import RecordObject
  22 from nmigen.utils import log2_int
  23 from nmigen.hdl.rec import Record, Layout
  24
  25 from nmutil.latch import SRLatch, latchregister
  26 from soc.decoder.power_decoder2 import Data
  27 from soc.decoder.power_enums import InternalOp
  28 from soc.regfile.regfile import ortreereduce
  29 from nmutil.util import treereduce
  30
  31 from soc.decoder.power_decoder2 import Data
  32 #from nmutil.picker import PriorityPicker
  33 from nmigen.lib.coding import PriorityEncoder
  34 from soc.scoreboard.addr_split import LDSTSplitter
  35 from soc.scoreboard.addr_match import LenExpand
  36
  37 # for testing purposes
  38 from soc.experiment.testmem import TestMemory # TODO: replace with TMLSUI
  39 # TODO: from soc.experiment.testmem import TestMemoryLoadStoreUnit
  40 from soc.experiment.pimem import PortInterface, TestMemoryPortInterface
  41
  42 import unittest
  43
  44
  45 class DualPortSplitter(Elaboratable):
  46     """DualPortSplitter
  47
  48     * one incoming PortInterface
  49     * two *OUTGOING* PortInterfaces
  50     * uses LDSTSplitter to do it
  51
  52     (actually, thinking about it LDSTSplitter could simply be
  53      modified to conform to PortInterface: one in, two out)
  54
  55     once that is done each pair of ports may be wired directly
  56     to the dual ports of L0CacheBuffer
  57
  58     The split is carried out so that, regardless of alignment or
  59     mis-alignment, outgoing PortInterface[0] takes bit 4 == 0
  60     of the address, whilst outgoing PortInterface[1] takes
  61     bit 4 == 1.
  62
  63     PortInterface *may* need to be changed so that the length is
  64     a binary number (accepting values 1-16).
  65     """
  66     def __init__(self):
  67         self.outp = [PortInterface(name="outp_0"),
  68                      PortInterface(name="outp_1")]
  69         self.inp  = PortInterface(name="inp")
  70         print(self.outp)
  71
  72     def elaborate(self, platform):
  73         m = Module()
  74         comb = m.d.comb
  75         m.submodules.splitter = splitter = LDSTSplitter(64, 48, 4)
  76         comb += splitter.addr_i.eq(self.inp.addr) #XXX
  77         #comb += splitter.len_i.eq()
  78         #comb += splitter.valid_i.eq()
  79         comb += splitter.is_ld_i.eq(self.inp.is_ld_i)
  80         comb += splitter.is_st_i.eq(self.inp.is_st_i)
  81         #comb += splitter.st_data_i.eq()
  82         #comb += splitter.sld_valid_i.eq()
  83         #comb += splitter.sld_data_i.eq()
  84         #comb += splitter.sst_valid_i.eq()
  85         return m
  86
  87
  88 class DataMergerRecord(Record):
  89     """
  90     {data: 128 bit, byte_enable: 16 bit}
  91     """
  92
  93     def __init__(self, name=None):
  94         layout = (('data', 128),
  95                   ('en', 16))
  96         Record.__init__(self, Layout(layout), name=name)
  97
  98         self.data.reset_less = True
  99         self.en.reset_less = True
 100
 101
 102 # TODO: formal verification
 103 class DataMerger(Elaboratable):
 104     """DataMerger
 105
 106     Merges data based on an address-match matrix.
 107     Identifies (picks) one (any) row, then uses that row,
 108     based on matching address bits, to merge (OR) all data
 109     rows into the output.
 110
 111     Basically, by the time DataMerger is used, all of its incoming data is
 112     determined not to conflict.  The last step before actually submitting
 113     the request to the Memory Subsystem is to work out which requests,
 114     on the same 128-bit cache line, can be "merged" due to them being:
 115     (A) on the same address (bits 4 and above) (B) having byte-enable
 116     lines that (as previously mentioned) do not conflict.
 117
 118     Therefore, put simply, this module will:
 119     (1) pick a row (any row) and identify it by an index labelled "idx"
 120     (2) merge all byte-enable lines which are on that same address, as
 121         indicated by addr_match_i[idx], onto the output
 122     """
 123
 124     def __init__(self, array_size):
 125         """
 126         :addr_array_i: an NxN Array of Signals with bits set indicating address
 127                        match.  bits across the diagonal (addr_array_i[x][x])
 128                        will always be set, to indicate "active".
 129         :data_i: an Nx Array of Records {data: 128 bit, byte_enable: 16 bit}
 130         :data_o: an Output Record of same type
 131                  {data: 128 bit, byte_enable: 16 bit}
 132         """
 133         self.array_size = array_size
 134         ul = []
 135         for i in range(array_size):
 136             ul.append(Signal(array_size,
 137                              reset_less=True,
 138                              name="addr_match_%d" % i))
 139         self.addr_array_i = Array(ul)
 140
 141         ul = []
 142         for i in range(array_size):
 143             ul.append(DataMergerRecord())
 144         self.data_i = Array(ul)
 145         self.data_o = DataMergerRecord()
 146
 147     def elaborate(self, platform):
 148         m = Module()
 149         comb = m.d.comb
 150         #(1) pick a row
 151         m.submodules.pick = pick = PriorityEncoder(self.array_size)
 152         for j in range(self.array_size):
 153             comb += pick.i[j].eq(self.addr_array_i[j].bool())
 154         valid = ~pick.n
 155         idx = pick.o
 156         #(2) merge
 157         with m.If(valid):
 158             l = []
 159             for j in range(self.array_size):
 160                 select = self.addr_array_i[idx][j]
 161                 r = DataMergerRecord()
 162                 with m.If(select):
 163                     comb += r.eq(self.data_i[j])
 164                 l.append(r)
 165             comb += self.data_o.data.eq(ortreereduce(l,"data"))
 166             comb += self.data_o.en.eq(ortreereduce(l,"en"))
 167
 168         return m
 169
 170
class L0CacheBuffer(Elaboratable):
    """L0 Cache / Buffer

    Note that the final version will have *two* interfaces per LDSTCompUnit,
    to cover mis-aligned requests, as well as *two* 128-bit L1 Cache
    interfaces: one for odd (addr[4] == 1) and one for even (addr[4] == 0).

    This version is to be used for test purposes (and actively maintained
    for such, rather than "replaced")

    There are much better ways to implement this.  However it's only
    a "demo" / "test" class, and one important aspect: it responds
    combinatorially, where a nmigen FSM's state-changes only activate
    on clock-sync boundaries.

    Note: the data byte-order is *not* expected to be normalised (LE/BE)
    by this class.  That task is taken care of by LDSTCompUnit.

    Operation (as implemented by elaborate()): a PriorityEncoder picks
    one port that is requesting a LD or ST, the picked index is latched,
    and the picked port is connected to ``pimem`` until ``pimem`` reports
    that it is no longer busy.
    """

    def __init__(self, n_units, pimem, regwid=64, addrwid=48):
        """
        :n_units: number of PortInterfaces (``dports``) to create
        :pimem:   memory-side object; elaborate() uses its connect_port()
                  method and its ``pi.pi.busy_o`` signal
        :regwid:  passed through to each PortInterface
        :addrwid: passed through to each PortInterface
        """
        self.n_units = n_units
        self.pimem = pimem
        self.regwid = regwid
        self.addrwid = addrwid
        ul = []
        for i in range(n_units):
            ul.append(PortInterface("ldst_port%d" % i, regwid, addrwid))
        self.dports = Array(ul)

    def elaborate(self, platform):
        m = Module()
        comb, sync = m.d.comb, m.d.sync

        # connect the ports as modules
        #for i in range(self.n_units):
        #    setattr(m.submodules, "port%d" % i, self.dports[i])

        # state-machine latches
        # idx_l: set while a port is selected.  reset_l: requests the
        # de-selection (per the comment further down it is in "sync" mode,
        # which is presumably what the first SRLatch argument selects)
        m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l")
        m.submodules.reset_l = reset_l = SRLatch(True, name="reset")

        # find one LD (or ST) and do it.  only one per cycle.
        # TODO: in the "live" (production) L0Cache/Buffer, merge multiple
        # LD/STs using mask-expansion - see LenExpand class

        m.submodules.pick = pick = PriorityEncoder(self.n_units)
        # NOTE: lenexp is added as a submodule but nothing in this method
        # connects to it yet (see the TODO above)
        m.submodules.lenexp = lenexp = LenExpand(4, 8)

        # one request bit per port: set when the port asks for a LD or a ST
        ldsti = []
        for i in range(self.n_units):
            pi = self.dports[i]
            busy = (pi.is_ld_i | pi.is_st_i)# & pi.busy_o
            ldsti.append(busy) # accumulate ld/st-req
        # put the requests into the priority-picker
        comb += pick.i.eq(Cat(*ldsti))

        # hmm, have to select (record) the right port index
        # (second argument False: n_units need not be a power of two)
        nbits = log2_int(self.n_units, False)
        idx = Signal(nbits, reset_less=False)

        # use these because of the sync-and-comb pass-through capability
        latchregister(m, pick.o, idx, idx_l.q, name="idx_l")

        # convenience variables to reference the "picked" port
        port = self.dports[idx]

        # pick (and capture) the port index
        # (pick.n is the encoder's "no input asserted" output)
        with m.If(~pick.n):
            comb += idx_l.s.eq(1)

        # from this point onwards, with the port "picked", it stays picked
        # until idx_l is deasserted
        # these two are defaults: the m.If blocks below override them, so
        # the order of these statements matters
        comb += reset_l.s.eq(0)
        comb += reset_l.r.eq(0)

        # while a port is selected, hook it up to the memory interface
        with m.If(idx_l.q):
            comb += self.pimem.connect_port(port)
            with m.If(~self.pimem.pi.pi.busy_o):
                comb += reset_l.s.eq(1) # reset when no longer busy

        # ugly hack, due to simultaneous addr req-go acknowledge
        # NOTE: reset_delay is registered here but not read anywhere else
        # in this method
        reset_delay = Signal(reset_less=True)
        sync += reset_delay.eq(reset_l.q)

        # after waiting one cycle (reset_l is "sync" mode), reset the port
        with m.If(reset_l.q):
            comb += idx_l.r.eq(1)  # deactivate port-index selector
            comb += reset_l.r.eq(1)     # clear reset

        return m

    def ports(self):
        """Yield the ports of every PortInterface in ``dports``."""
        for p in self.dports:
            yield from p.ports()
 265
 266
 267 class TstL0CacheBuffer(Elaboratable):
 268     def __init__(self, n_units=3, regwid=16, addrwid=4):
 269         self.pimem = TestMemoryPortInterface(regwid, addrwid<<1)
 270         self.l0 = L0CacheBuffer(n_units, self.pimem, regwid, addrwid<<1)
 271
 272     def elaborate(self, platform):
 273         m = Module()
 274         m.submodules.pimem = self.pimem
 275         m.submodules.l0 = self.l0
 276
 277         return m
 278
 279     def ports(self):
 280         yield from self.l0.ports()
 281         yield from self.pimem
 282
 283
 284 def wait_busy(port, no=False):
 285     while True:
 286         busy = yield port.busy_o
 287         print("busy", no, busy)
 288         if bool(busy) == no:
 289             break
 290         yield
 291
 292
 293 def wait_addr(port):
 294     while True:
 295         addr_ok = yield port.addr_ok_o
 296         print("addrok", addr_ok)
 297         if not addr_ok:
 298             break
 299         yield
 300
 301
 302 def wait_ldok(port):
 303     while True:
 304         ldok = yield port.ld.ok
 305         print("ldok", ldok)
 306         if ldok:
 307             break
 308         yield
 309
 310
 311 def l0_cache_st(dut, addr, data, datalen):
 312     l0 = dut.l0
 313     mem = dut.pimem
 314     port0 = l0.dports[0]
 315     port1 = l0.dports[1]
 316
 317     # have to wait until not busy
 318     yield from wait_busy(port1, no=False)    # wait until not busy
 319
 320     # set up a ST on the port.  address first:
 321     yield port1.is_st_i.eq(1)  # indicate ST
 322     yield port1.data_len.eq(datalen)  # ST length (1/2/4/8)
 323
 324     yield port1.addr.data.eq(addr)  # set address
 325     yield port1.addr.ok.eq(1)  # set ok
 326     yield from wait_addr(port1)             # wait until addr ok
 327     # yield # not needed, just for checking
 328     # yield # not needed, just for checking
 329     # assert "ST" for one cycle (required by the API)
 330     yield port1.st.data.eq(data)
 331     yield port1.st.ok.eq(1)
 332     yield
 333     yield port1.st.ok.eq(0)
 334
 335     # can go straight to reset.
 336     yield port1.is_st_i.eq(0)  # end
 337     yield port1.addr.ok.eq(0)  # set !ok
 338     # yield from wait_busy(port1, False)    # wait until not busy
 339
 340
 341 def l0_cache_ld(dut, addr, datalen, expected):
 342
 343     l0 = dut.l0
 344     mem = dut.pimem
 345     port1 = l0.dports[0]
 346     port2 = l0.dports[2]
 347
 348     # have to wait until not busy
 349     yield from wait_busy(port1, no=False)    # wait until not busy
 350
 351     # set up a LD on the port.  address first:
 352     yield port1.is_ld_i.eq(1)  # indicate LD
 353     yield port1.data_len.eq(datalen)  # LD length (1/2/4/8)
 354
 355     yield port1.addr.data.eq(addr)  # set address
 356     yield port1.addr.ok.eq(1)  # set ok
 357     yield from wait_addr(port1)             # wait until addr ok
 358
 359     yield from wait_ldok(port1)             # wait until ld ok
 360     data = yield port1.ld.data
 361
 362     # cleanup
 363     yield port1.is_ld_i.eq(0)  # end
 364     yield port1.addr.ok.eq(0)  # set !ok
 365     # yield from wait_busy(port1, no=False)    # wait until not busy
 366
 367     return data
 368
 369
 370 def l0_cache_ldst(arg, dut):
 371     yield
 372     addr = 0x2
 373     data = 0xbeef
 374     data2 = 0xf00f
 375     #data = 0x4
 376     yield from l0_cache_st(dut, 0x2, data, 2)
 377     yield from l0_cache_st(dut, 0x4, data2, 2)
 378     result = yield from l0_cache_ld(dut, 0x2, 2, data)
 379     result2 = yield from l0_cache_ld(dut, 0x4, 2, data2)
 380     yield
 381     arg.assertEqual(data, result, "data %x != %x" % (result, data))
 382     arg.assertEqual(data2, result2, "data2 %x != %x" % (result2, data2))
 383
 384
 385 def data_merger_merge(dut):
 386     print("data_merger")
 387     #starting with all inputs zero
 388     yield Settle()
 389     en = yield dut.data_o.en
 390     data = yield dut.data_o.data
 391     assert en == 0, "en must be zero"
 392     assert data == 0, "data must be zero"
 393     yield
 394
 395     yield dut.addr_array_i[0].eq(0xFF)
 396     for j in range(dut.array_size):
 397         yield dut.data_i[j].en.eq(1 << j)
 398         yield dut.data_i[j].data.eq(0xFF << (16*j))
 399     yield Settle()
 400
 401     en = yield dut.data_o.en
 402     data = yield dut.data_o.data
 403     assert data == 0xff00ff00ff00ff00ff00ff00ff00ff
 404     assert en == 0xff
 405     yield
 406
 407
class TestL0Cache(unittest.TestCase):
    """Runs the l0_cache_ldst simulation process on a TstL0CacheBuffer."""

    def test_l0_cache(self):

        # 64-bit registers; n_units and addrwid keep their defaults
        dut = TstL0CacheBuffer(regwid=64)
        #vl = rtlil.convert(dut, ports=dut.ports())
        #with open("test_basic_l0_cache.il", "w") as f:
        #    f.write(vl)

        # the TestCase is handed to the process so it can call assertEqual;
        # a VCD trace is written to the named file
        run_simulation(dut, l0_cache_ldst(self, dut),
                       vcd_name='test_l0_cache_basic.vcd')
 419
 420
class TestDataMerger(unittest.TestCase):
    """Runs the data_merger_merge simulation process on a DataMerger."""

    def test_data_merger(self):

        # 8 rows: data_merger_merge drives one enable bit per row and
        # expects an 8-bit merged enable
        dut = DataMerger(8)
        #vl = rtlil.convert(dut, ports=dut.ports())
        #with open("test_data_merger.il", "w") as f:
        #    f.write(vl)

        # checks are plain asserts inside the process; a VCD trace is
        # written to the named file
        run_simulation(dut, data_merger_merge(dut),
                       vcd_name='test_data_merger.vcd')
 432
 433
class TestDualPortSplitter(unittest.TestCase):
    """Construction-only check of DualPortSplitter.

    No simulation is run: the test passes as long as the constructor
    does not raise.  The RTLIL output and the simulation call are both
    still commented out.
    """

    def test_dual_port_splitter(self):

        dut = DualPortSplitter()
        #vl = rtlil.convert(dut, ports=dut.ports())
        #with open("test_data_merger.il", "w") as f:
        #    f.write(vl)

        #run_simulation(dut, data_merger_merge(dut),
        #               vcd_name='test_dual_port_splitter.vcd')
 445
 446
# When run as a script, execute every TestCase in this module.
# exit=False stops unittest.main() from calling sys.exit() afterwards.
if __name__ == '__main__':
    unittest.main(exit=False)
 449