From 1bc0494489aa0948af04759025b803c5e72f7fcd Mon Sep 17 00:00:00 2001 From: Luke Kenneth Casson Leighton Date: Mon, 22 Jun 2020 20:24:02 +0100 Subject: [PATCH] simplified L0CacheBuffer down to a "PortInterface Arbiter" --- src/soc/experiment/l0_cache.py | 258 +++++---------------- src/soc/experiment/pimem.py | 19 +- src/soc/fu/compunits/test/test_compunit.py | 6 +- 3 files changed, 77 insertions(+), 206 deletions(-) diff --git a/src/soc/experiment/l0_cache.py b/src/soc/experiment/l0_cache.py index 89f8b024..946b344c 100644 --- a/src/soc/experiment/l0_cache.py +++ b/src/soc/experiment/l0_cache.py @@ -37,7 +37,7 @@ from soc.scoreboard.addr_match import LenExpand # for testing purposes from soc.experiment.testmem import TestMemory # TODO: replace with TMLSUI # TODO: from soc.experiment.testmem import TestMemoryLoadStoreUnit -from soc.experiment.pimem import PortInterface +from soc.experiment.pimem import PortInterface, TestMemoryPortInterface import unittest @@ -168,61 +168,6 @@ class DataMerger(Elaboratable): return m -class LDSTPort(Elaboratable): - def __init__(self, idx, regwid=64, addrwid=48): - self.pi = PortInterface("ldst_port%d" % idx, regwid, addrwid) - - def elaborate(self, platform): - m = Module() - comb, sync = m.d.comb, m.d.sync - - # latches - m.submodules.busy_l = busy_l = SRLatch(False, name="busy") - m.submodules.cyc_l = cyc_l = SRLatch(True, name="cyc") - comb += cyc_l.s.eq(0) - comb += cyc_l.r.eq(0) - - # this is a little weird: we let the L0Cache/Buffer set - # the outputs: this module just monitors "state". - - # LD/ST requested activates "busy" - with m.If(self.pi.is_ld_i | self.pi.is_st_i): - comb += busy_l.s.eq(1) - - # monitor for an exception or the completion of LD. - with m.If(self.pi.addr_exc_o): - comb += busy_l.r.eq(1) - - # however ST needs one cycle before busy is reset - with m.If(self.pi.st.ok | self.pi.ld.ok): - comb += cyc_l.s.eq(1) - - with m.If(cyc_l.q): - comb += cyc_l.r.eq(1) - comb += busy_l.r.eq(1) - - # busy latch outputs to interface - comb += self.pi.busy_o.eq(busy_l.q) - - return m - - def __iter__(self): - yield self.pi.is_ld_i - yield self.pi.is_st_i - yield from self.pi.op.ports() - yield self.pi.busy_o - yield self.pi.go_die_i - yield from self.pi.addr.ports() - yield self.pi.addr_ok_o - yield self.pi.addr_exc_o - - yield from self.pi.ld.ports() - yield from self.pi.st.ports() - - def ports(self): - return list(self) - - class L0CacheBuffer(Elaboratable): """L0 Cache / Buffer @@ -242,161 +187,75 @@ class L0CacheBuffer(Elaboratable): by this class. That task is taken care of by LDSTCompUnit. """ - def __init__(self, n_units, mem, regwid=64, addrwid=48): + def __init__(self, n_units, pimem, regwid=64, addrwid=48): self.n_units = n_units - self.mem = mem # TODO: remove, replace with lsui - # TODO: self.lsui = LoadStoreUnitInterface(addr_wid=addrwid....) + self.pimem = pimem self.regwid = regwid self.addrwid = addrwid ul = [] for i in range(n_units): - ul.append(LDSTPort(i, regwid, addrwid)) + ul.append(PortInterface("ldst_port%d" % i, regwid, addrwid)) self.dports = Array(ul) - @property - def addrbits(self): - return log2_int(self.mem.regwid//8) - - def splitaddr(self, addr): - """split the address into top and bottom bits of the memory granularity - """ - return addr[:self.addrbits], addr[self.addrbits:] - def elaborate(self, platform): m = Module() comb, sync = m.d.comb, m.d.sync # connect the ports as modules - for i in range(self.n_units): - setattr(m.submodules, "port%d" % i, self.dports[i]) + #for i in range(self.n_units): + # setattr(m.submodules, "port%d" % i, self.dports[i]) # state-machine latches - m.submodules.st_active = st_active = SRLatch(False, name="st_active") - m.submodules.ld_active = ld_active = SRLatch(False, name="ld_active") - m.submodules.reset_l = reset_l = SRLatch(True, name="reset") m.submodules.idx_l = idx_l = SRLatch(False, name="idx_l") - m.submodules.adrok_l = adrok_l = SRLatch(False, name="addr_acked") + m.submodules.reset_l = reset_l = SRLatch(True, name="reset") # find one LD (or ST) and do it. only one per cycle. # TODO: in the "live" (production) L0Cache/Buffer, merge multiple # LD/STs using mask-expansion - see LenExpand class - m.submodules.ldpick = ldpick = PriorityEncoder(self.n_units) - m.submodules.stpick = stpick = PriorityEncoder(self.n_units) + m.submodules.pick = pick = PriorityEncoder(self.n_units) m.submodules.lenexp = lenexp = LenExpand(4, 8) - lds = Signal(self.n_units, reset_less=True) - sts = Signal(self.n_units, reset_less=True) - ldi = [] - sti = [] + ldsti = [] for i in range(self.n_units): - pi = self.dports[i].pi - ldi.append(pi.is_ld_i & pi.busy_o) # accumulate ld-req signals - sti.append(pi.is_st_i & pi.busy_o) # accumulate st-req signals - # put the requests into the priority-pickers - comb += ldpick.i.eq(Cat(*ldi)) - comb += stpick.i.eq(Cat(*sti)) + pi = self.dports[i] + busy = (pi.is_ld_i | pi.is_st_i)# & pi.busy_o + ldsti.append(busy) # accumulate ld/st-req + # put the requests into the priority-picker + comb += pick.i.eq(Cat(*ldsti)) # hmm, have to select (record) the right port index nbits = log2_int(self.n_units, False) - ld_idx = Signal(nbits, reset_less=False) - st_idx = Signal(nbits, reset_less=False) + idx = Signal(nbits, reset_less=False) + # use these because of the sync-and-comb pass-through capability - latchregister(m, ldpick.o, ld_idx, idx_l.qn, name="ld_idx_l") - latchregister(m, stpick.o, st_idx, idx_l.qn, name="st_idx_l") + latchregister(m, pick.o, idx, idx_l.q, name="idx_l") # convenience variables to reference the "picked" port - ldport = self.dports[ld_idx].pi - stport = self.dports[st_idx].pi - # and the memory ports - rdport = self.mem.rdport - wrport = self.mem.wrport - - # Priority-Pickers pick one and only one request, capture its index. - # from that point on this code *only* "listens" to that port. - - sync += adrok_l.s.eq(0) - comb += adrok_l.r.eq(0) - with m.If(~ldpick.n): - comb += ld_active.s.eq(1) # activate LD mode - comb += idx_l.r.eq(1) # pick (and capture) the port index - with m.Elif(~stpick.n): - comb += st_active.s.eq(1) # activate ST mode - comb += idx_l.r.eq(1) # pick (and capture) the port index + port = self.dports[idx] + + # pick (and capture) the port index + with m.If(~pick.n): + comb += idx_l.s.eq(1) # from this point onwards, with the port "picked", it stays picked - # until ld_active (or st_active) are de-asserted. - - # if now in "LD" mode: wait for addr_ok, then send the address out - # to memory, acknowledge address, and send out LD data - with m.If(ld_active.q): - # set up LenExpander with the LD len and lower bits of addr - lsbaddr, msbaddr = self.splitaddr(ldport.addr.data) - comb += lenexp.len_i.eq(ldport.data_len) - comb += lenexp.addr_i.eq(lsbaddr) - with m.If(ldport.addr.ok & adrok_l.qn): - comb += rdport.addr.eq(msbaddr) # addr ok, send thru - comb += ldport.addr_ok_o.eq(1) # acknowledge addr ok - sync += adrok_l.s.eq(1) # and pull "ack" latch - - # if now in "ST" mode: likewise do the same but with "ST" - # to memory, acknowledge address, and send out LD data - with m.If(st_active.q): - # set up LenExpander with the ST len and lower bits of addr - lsbaddr, msbaddr = self.splitaddr(stport.addr.data) - comb += lenexp.len_i.eq(stport.data_len) - comb += lenexp.addr_i.eq(lsbaddr) - with m.If(stport.addr.ok): - comb += wrport.addr.eq(msbaddr) # addr ok, send thru - with m.If(adrok_l.qn): - comb += stport.addr_ok_o.eq(1) # acknowledge addr ok - sync += adrok_l.s.eq(1) # and pull "ack" latch - - # NOTE: in both these, below, the port itself takes care - # of de-asserting its "busy_o" signal, based on either ld.ok going - # high (by us, here) or by st.ok going high (by the LDSTCompUnit). - - # for LD mode, when addr has been "ok'd", assume that (because this - # is a "Memory" test-class) the memory read data is valid. + # until idx_l is deasserted comb += reset_l.s.eq(0) comb += reset_l.r.eq(0) - with m.If(ld_active.q & adrok_l.q): - # shift data down before pushing out. requires masking - # from the *byte*-expanded version of LenExpand output - lddata = Signal(self.regwid, reset_less=True) - # TODO: replace rdport.data with LoadStoreUnitInterface.x_load_data - # and also handle the ready/stall/busy protocol - comb += lddata.eq((rdport.data & lenexp.rexp_o) >> - (lenexp.addr_i*8)) - comb += ldport.ld.data.eq(lddata) # put data out - comb += ldport.ld.ok.eq(1) # indicate data valid - comb += reset_l.s.eq(1) # reset mode after 1 cycle - - # for ST mode, when addr has been "ok'd", wait for incoming "ST ok" - with m.If(st_active.q & stport.st.ok): - # shift data up before storing. lenexp *bit* version of mask is - # passed straight through as byte-level "write-enable" lines. - stdata = Signal(self.regwid, reset_less=True) - comb += stdata.eq(stport.st.data << (lenexp.addr_i*8)) - # TODO: replace with link to LoadStoreUnitInterface.x_store_data - # and also handle the ready/stall/busy protocol - comb += wrport.data.eq(stdata) # write st to mem - comb += wrport.en.eq(lenexp.lexp_o) # enable writes - comb += reset_l.s.eq(1) # reset mode after 1 cycle + + with m.If(idx_l.q): + comb += self.pimem.connect_port(port) + with m.If(~self.pimem.pi.pi.busy_o): + comb += reset_l.s.eq(1) # reset when no longer busy # ugly hack, due to simultaneous addr req-go acknowledge reset_delay = Signal(reset_less=True) sync += reset_delay.eq(reset_l.q) - with m.If(reset_delay): - comb += adrok_l.r.eq(1) # address reset # after waiting one cycle (reset_l is "sync" mode), reset the port with m.If(reset_l.q): - comb += idx_l.s.eq(1) # deactivate port-index selector - comb += ld_active.r.eq(1) # leave the ST active for 1 cycle - comb += st_active.r.eq(1) # leave the ST active for 1 cycle + comb += idx_l.r.eq(1) # deactivate port-index selector comb += reset_l.r.eq(1) # clear reset - comb += adrok_l.r.eq(1) # address reset return m @@ -407,29 +266,24 @@ class L0CacheBuffer(Elaboratable): class TstL0CacheBuffer(Elaboratable): def __init__(self, n_units=3, regwid=16, addrwid=4): - # TODO: replace with TestMemoryLoadStoreUnit - self.mem = TestMemory(regwid, addrwid, granularity=regwid//8) - self.l0 = L0CacheBuffer(n_units, self.mem, regwid, addrwid<<1) + self.pimem = TestMemoryPortInterface(regwid, addrwid<<1) + self.l0 = L0CacheBuffer(n_units, self.pimem, regwid, addrwid<<1) def elaborate(self, platform): m = Module() - m.submodules.mem = self.mem + m.submodules.pimem = self.pimem m.submodules.l0 = self.l0 return m def ports(self): yield from self.l0.ports() - yield self.mem.rdport.addr - yield self.mem.rdport.data - yield self.mem.wrport.addr - yield self.mem.wrport.data - # TODO: mem ports + yield from self.pimem def wait_busy(port, no=False): while True: - busy = yield port.pi.busy_o + busy = yield port.busy_o print("busy", no, busy) if bool(busy) == no: break @@ -438,7 +292,7 @@ def wait_busy(port, no=False): def wait_addr(port): while True: - addr_ok = yield port.pi.addr_ok_o + addr_ok = yield port.addr_ok_o print("addrok", addr_ok) if not addr_ok: break @@ -447,7 +301,7 @@ def wait_addr(port): def wait_ldok(port): while True: - ldok = yield port.pi.ld.ok + ldok = yield port.ld.ok print("ldok", ldok) if ldok: break @@ -456,7 +310,7 @@ def wait_ldok(port): def l0_cache_st(dut, addr, data, datalen): l0 = dut.l0 - mem = dut.mem + mem = dut.pimem port0 = l0.dports[0] port1 = l0.dports[1] @@ -464,50 +318,50 @@ def l0_cache_st(dut, addr, data, datalen): yield from wait_busy(port1, no=False) # wait until not busy # set up a ST on the port. address first: - yield port1.pi.is_st_i.eq(1) # indicate ST - yield port1.pi.data_len.eq(datalen) # ST length (1/2/4/8) + yield port1.is_st_i.eq(1) # indicate ST + yield port1.data_len.eq(datalen) # ST length (1/2/4/8) - yield port1.pi.addr.data.eq(addr) # set address - yield port1.pi.addr.ok.eq(1) # set ok + yield port1.addr.data.eq(addr) # set address + yield port1.addr.ok.eq(1) # set ok yield from wait_addr(port1) # wait until addr ok # yield # not needed, just for checking # yield # not needed, just for checking # assert "ST" for one cycle (required by the API) - yield port1.pi.st.data.eq(data) - yield port1.pi.st.ok.eq(1) + yield port1.st.data.eq(data) + yield port1.st.ok.eq(1) yield - yield port1.pi.st.ok.eq(0) + yield port1.st.ok.eq(0) # can go straight to reset. - yield port1.pi.is_st_i.eq(0) # end - yield port1.pi.addr.ok.eq(0) # set !ok + yield port1.is_st_i.eq(0) # end + yield port1.addr.ok.eq(0) # set !ok # yield from wait_busy(port1, False) # wait until not busy def l0_cache_ld(dut, addr, datalen, expected): l0 = dut.l0 - mem = dut.mem - port0 = l0.dports[0] - port1 = l0.dports[1] + mem = dut.pimem + port1 = l0.dports[0] + port2 = l0.dports[2] # have to wait until not busy yield from wait_busy(port1, no=False) # wait until not busy # set up a LD on the port. address first: - yield port1.pi.is_ld_i.eq(1) # indicate LD - yield port1.pi.data_len.eq(datalen) # LD length (1/2/4/8) + yield port1.is_ld_i.eq(1) # indicate LD + yield port1.data_len.eq(datalen) # LD length (1/2/4/8) - yield port1.pi.addr.data.eq(addr) # set address - yield port1.pi.addr.ok.eq(1) # set ok + yield port1.addr.data.eq(addr) # set address + yield port1.addr.ok.eq(1) # set ok yield from wait_addr(port1) # wait until addr ok yield from wait_ldok(port1) # wait until ld ok - data = yield port1.pi.ld.data + data = yield port1.ld.data # cleanup - yield port1.pi.is_ld_i.eq(0) # end - yield port1.pi.addr.ok.eq(0) # set !ok + yield port1.is_ld_i.eq(0) # end + yield port1.addr.ok.eq(0) # set !ok # yield from wait_busy(port1, no=False) # wait until not busy return data diff --git a/src/soc/experiment/pimem.py b/src/soc/experiment/pimem.py index a344606d..9374b13e 100644 --- a/src/soc/experiment/pimem.py +++ b/src/soc/experiment/pimem.py @@ -159,10 +159,24 @@ class LDSTPort(Elaboratable): return m + def connect_port(self, inport): + print ("connect_port", self.pi, inport) + return [self.pi.is_ld_i.eq(inport.is_ld_i), + self.pi.is_st_i.eq(inport.is_st_i), + self.pi.data_len.eq(inport.data_len), + self.pi.go_die_i.eq(inport.go_die_i), + self.pi.addr.eq(inport.addr), + self.pi.st.eq(inport.st), + inport.ld.eq(self.pi.ld), + inport.busy_o.eq(self.pi.busy_o), + inport.addr_ok_o.eq(self.pi.addr_ok_o), + inport.addr_exc_o.eq(self.pi.addr_exc_o), + ] + def __iter__(self): yield self.pi.is_ld_i yield self.pi.is_st_i - yield from self.pi.op.ports() + yield from self.pi.data_len yield self.pi.busy_o yield self.pi.go_die_i yield from self.pi.addr.ports() @@ -204,6 +218,9 @@ class TestMemoryPortInterface(Elaboratable): """ return addr[:self.addrbits], addr[self.addrbits:] + def connect_port(self, inport): + return self.pi.connect_port(inport) + def elaborate(self, platform): m = Module() comb, sync = m.d.comb, m.d.sync diff --git a/src/soc/fu/compunits/test/test_compunit.py b/src/soc/fu/compunits/test/test_compunit.py index efc8e9d5..37258ea3 100644 --- a/src/soc/fu/compunits/test/test_compunit.py +++ b/src/soc/fu/compunits/test/test_compunit.py @@ -104,7 +104,7 @@ def get_inp_indexed(cu, inp): return res def setup_test_memory(l0, sim): - mem = l0.mem.mem + mem = l0.pimem.mem.mem print ("before, init mem", mem.depth, mem.width, mem) for i in range(mem.depth): data = sim.mem.ld(i*8, 8, False) @@ -120,7 +120,7 @@ def setup_test_memory(l0, sim): def check_sim_memory(dut, l0, sim, code): - mem = l0.mem.mem + mem = l0.pimem.mem.mem print ("sim mem dump") for k, v in sim.mem.mem.items(): print (" %6x %016x" % (k, v)) @@ -161,7 +161,7 @@ class TestRunner(FHDLTestCase): from soc.experiment.l0_cache import TstL0CacheBuffer m.submodules.l0 = l0 = TstL0CacheBuffer(n_units=1, regwid=64, addrwid=3) - pi = l0.l0.dports[0].pi + pi = l0.l0.dports[0] m.submodules.cu = cu = self.fukls(pi, awid=3) m.d.comb += cu.ad.go.eq(cu.ad.rel) # link addr-go direct to rel m.d.comb += cu.st.go.eq(cu.st.rel) # link store-go direct to rel -- 2.30.2