X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fsoc%2Fexperiment%2Fpimem.py;h=bc1daee125871b18c5280863a440e1cf73c44e0e;hb=51b0617cbb4b6d1e14433fad6ef621428945a47a;hp=eb79f676ca6366d5ac24a78917822fe801dcbd0f;hpb=a71e38c76782da0f9ecf86e1c5e5eab697e07cab;p=soc.git diff --git a/src/soc/experiment/pimem.py b/src/soc/experiment/pimem.py index eb79f676..bc1daee1 100644 --- a/src/soc/experiment/pimem.py +++ b/src/soc/experiment/pimem.py @@ -12,31 +12,26 @@ Links: * https://bugs.libre-soc.org/show_bug.cgi?id=216 * https://libre-soc.org/3d_gpu/architecture/memory_and_cache/ +* https://bugs.libre-soc.org/show_bug.cgi?id=465 - exception handling """ from nmigen.compat.sim import run_simulation, Settle -from nmigen.cli import verilog, rtlil -from nmigen import Module, Signal, Mux, Elaboratable, Array, Cat +from nmigen.cli import rtlil +from nmigen import Module, Signal, Mux, Elaboratable, Cat, Const from nmutil.iocontrol import RecordObject from nmigen.utils import log2_int -from nmigen.hdl.rec import Record, Layout from nmutil.latch import SRLatch, latchregister -from soc.decoder.power_decoder2 import Data -from soc.decoder.power_enums import InternalOp -from soc.regfile.regfile import ortreereduce -from nmutil.util import treereduce - -from soc.decoder.power_decoder2 import Data -#from nmutil.picker import PriorityPicker -from nmigen.lib.coding import PriorityEncoder -from soc.scoreboard.addr_split import LDSTSplitter +from nmutil.util import rising_edge +from openpower.decoder.power_decoder2 import Data from soc.scoreboard.addr_match import LenExpand +from soc.experiment.mem_types import LDSTException # for testing purposes -from soc.experiment.testmem import TestMemory # TODO: replace with TMLSUI -# TODO: from soc.experiment.testmem import TestMemoryLoadStoreUnit +from soc.experiment.testmem import TestMemory +#from soc.scoreboard.addr_split import LDSTSplitter +from nmutil.util import Display import unittest @@ -66,13 +61,13 @@ class PortInterface(RecordObject): for the L0 Cache/Buffer to have an additional address latch (because the LDSTCompUnit already has it) - * addr_ok_o (or addr_exc_o) must be waited for. these will + * addr_ok_o (or exception.happened) must be waited for. these will be asserted *only* for one cycle and one cycle only. - * addr_exc_o will be asserted if there is no chance that the + * exception.happened will be asserted if there is no chance that the memory request may be fulfilled. - busy_o is deasserted on the same cycle as addr_exc_o is asserted. + busy_o is deasserted on the same cycle as exception.happened is asserted. * conversely: addr_ok_o must *ONLY* be asserted if there is a HUNDRED PERCENT guarantee that the memory request will be @@ -102,8 +97,8 @@ class PortInterface(RecordObject): RecordObject.__init__(self, name=name) # distinguish op type (ld/st) - self.is_ld_i = Signal(reset_less=True) - self.is_st_i = Signal(reset_less=True) + self.is_ld_i = Signal(reset_less=True) + self.is_st_i = Signal(reset_less=True) # LD/ST data length (TODO: other things may be needed) self.data_len = Signal(4, reset_less=True) @@ -114,105 +109,63 @@ class PortInterface(RecordObject): self.addr = Data(addrwid, "addr_i") # addr/addr-ok # addr is valid (TLB, L1 etc.) self.addr_ok_o = Signal(reset_less=True) - self.addr_exc_o = Signal(reset_less=True) # TODO, "type" of exception + self.exc_o = LDSTException("exc") + self.dar_o = Signal(64, reset_less=True) # LD/ST self.ld = Data(regwid, "ld_data_o") # ok to be set by L0 Cache/Buf self.st = Data(regwid, "st_data_i") # ok to be set by CompUnit + # additional "modes" + self.is_nc = Signal() # no cacheing + self.msr_pr = Signal() # 1==virtual, 0==privileged + self.is_dcbz_i = Signal(reset_less=True) -class LDSTPort(Elaboratable): - def __init__(self, idx, regwid=64, addrwid=48): - self.pi = PortInterface("ldst_port%d" % idx, regwid, addrwid) + # mmu + self.mmu_done = Signal() # keep for now - def elaborate(self, platform): - m = Module() - comb, sync = m.d.comb, m.d.sync - - # latches - m.submodules.busy_l = busy_l = SRLatch(False, name="busy") - m.submodules.cyc_l = cyc_l = SRLatch(True, name="cyc") - comb += cyc_l.s.eq(0) - comb += cyc_l.r.eq(0) - - # this is a little weird: we let the L0Cache/Buffer set - # the outputs: this module just monitors "state". - - # LD/ST requested activates "busy" - with m.If(self.pi.is_ld_i | self.pi.is_st_i): - comb += busy_l.s.eq(1) - - # monitor for an exception or the completion of LD. - with m.If(self.pi.addr_exc_o): - comb += busy_l.r.eq(1) - - # however ST needs one cycle before busy is reset - with m.If(self.pi.st.ok | self.pi.ld.ok): - comb += cyc_l.s.eq(1) - - with m.If(cyc_l.q): - comb += cyc_l.r.eq(1) - comb += busy_l.r.eq(1) - - # busy latch outputs to interface - comb += self.pi.busy_o.eq(busy_l.q) - - return m + # dcache + self.ldst_error = Signal() + ## Signalling ld/st error - NC cache hit, TLB miss, prot/RC failure + self.cache_paradox = Signal() def connect_port(self, inport): - print ("connect_port", self.pi, inport) - return [self.pi.is_ld_i.eq(inport.is_ld_i), - self.pi.is_st_i.eq(inport.is_st_i), - self.pi.data_len.eq(inport.data_len), - self.pi.go_die_i.eq(inport.go_die_i), - self.pi.addr.data.eq(inport.addr.data), - self.pi.addr.ok.eq(inport.addr.ok), - self.pi.st.eq(inport.st), - inport.ld.eq(self.pi.ld), - inport.busy_o.eq(self.pi.busy_o), - inport.addr_ok_o.eq(self.pi.addr_ok_o), - inport.addr_exc_o.eq(self.pi.addr_exc_o), + print("connect_port", self, inport) + return [self.is_ld_i.eq(inport.is_ld_i), + self.is_st_i.eq(inport.is_st_i), + self.is_nc.eq(inport.is_nc), + self.is_dcbz_i.eq(inport.is_dcbz_i), + self.data_len.eq(inport.data_len), + self.go_die_i.eq(inport.go_die_i), + self.addr.data.eq(inport.addr.data), + self.addr.ok.eq(inport.addr.ok), + self.st.eq(inport.st), + self.msr_pr.eq(inport.msr_pr), + inport.ld.eq(self.ld), + inport.busy_o.eq(self.busy_o), + inport.addr_ok_o.eq(self.addr_ok_o), + inport.exc_o.eq(self.exc_o), + inport.dar_o.eq(self.dar_o), + inport.mmu_done.eq(self.mmu_done), + inport.ldst_error.eq(self.ldst_error), + inport.cache_paradox.eq(self.cache_paradox) ] - def __iter__(self): - yield self.pi.is_ld_i - yield self.pi.is_st_i - yield from self.pi.data_len - yield self.pi.busy_o - yield self.pi.go_die_i - yield from self.pi.addr.ports() - yield self.pi.addr_ok_o - yield self.pi.addr_exc_o - yield from self.pi.ld.ports() - yield from self.pi.st.ports() +class PortInterfaceBase(Elaboratable): + """PortInterfaceBase - def ports(self): - return list(self) - - -class TestMemoryPortInterface(Elaboratable): - """TestMemoryPortInterface - - This is a test class for simple verification of the LDSTCompUnit - and for the simple core, to be able to run unit tests rapidly and - with less other code in the way. - - Versions of this which are *compatible* (conform with PortInterface) - will include augmented-Wishbone Bus versions, including ones that - connect to L1, L2, MMU etc. etc. however this is the "base lowest - possible version that complies with PortInterface". + Base class for PortInterface-compliant Memory read/writers """ def __init__(self, regwid=64, addrwid=4): - self.mem = TestMemory(regwid, addrwid, granularity=regwid//8) self.regwid = regwid self.addrwid = addrwid - self.pi = LDSTPort(0, regwid, addrwid) + self.pi = PortInterface("ldst_port0", regwid, addrwid) @property def addrbits(self): - return log2_int(self.mem.regwid//8) + return log2_int(self.regwid//8) def splitaddr(self, addr): """split the address into top and bottom bits of the memory granularity @@ -222,107 +175,125 @@ class TestMemoryPortInterface(Elaboratable): def connect_port(self, inport): return self.pi.connect_port(inport) + def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz): pass + def set_rd_addr(self, m, addr, mask, misalign, msr_pr): pass + def set_wr_data(self, m, data, wen): pass + def get_rd_data(self, m): pass + def elaborate(self, platform): m = Module() comb, sync = m.d.comb, m.d.sync - # add TestMemory as submodule - m.submodules.mem = self.mem - - # connect the ports as modules - m.submodules.port0 = self.pi - # state-machine latches m.submodules.st_active = st_active = SRLatch(False, name="st_active") + m.submodules.st_done = st_done = SRLatch(False, name="st_done") m.submodules.ld_active = ld_active = SRLatch(False, name="ld_active") m.submodules.reset_l = reset_l = SRLatch(True, name="reset") m.submodules.adrok_l = adrok_l = SRLatch(False, name="addr_acked") + m.submodules.busy_l = busy_l = SRLatch(False, name="busy") + m.submodules.cyc_l = cyc_l = SRLatch(True, name="cyc") + + self.busy_l = busy_l + + sync += st_done.s.eq(0) + comb += st_done.r.eq(0) + comb += st_active.r.eq(0) + comb += ld_active.r.eq(0) + comb += cyc_l.s.eq(0) + comb += cyc_l.r.eq(0) + comb += busy_l.s.eq(0) + comb += busy_l.r.eq(0) + sync += adrok_l.s.eq(0) + comb += adrok_l.r.eq(0) # expand ld/st binary length/addr[:3] into unary bitmap m.submodules.lenexp = lenexp = LenExpand(4, 8) lds = Signal(reset_less=True) sts = Signal(reset_less=True) - pi = self.pi.pi - comb += lds.eq(pi.is_ld_i & pi.busy_o) # ld-req signals - comb += sts.eq(pi.is_st_i & pi.busy_o) # st-req signals + pi = self.pi + comb += lds.eq(pi.is_ld_i) # ld-req signals + comb += sts.eq(pi.is_st_i) # st-req signals + pr = pi.msr_pr # MSR problem state: PR=1 ==> virt, PR==0 ==> priv - # convenience variables to reference the "picked" port - ldport = pi - stport = pi - # and the memory ports - rdport = self.mem.rdport - wrport = self.mem.wrport + # detect busy "edge" + busy_delay = Signal() + busy_edge = Signal() + sync += busy_delay.eq(pi.busy_o) + comb += busy_edge.eq(pi.busy_o & ~busy_delay) - # Priority-Pickers pick one and only one request, capture its index. - # from that point on this code *only* "listens" to that port. + # misalignment detection: bits at end of lenexpand are set. + # when using the L0CacheBuffer "data expander" which splits requests + # into *two* PortInterfaces, this acts as a "safety check". + misalign = Signal() + comb += misalign.eq(lenexp.lexp_o[8:].bool()) - sync += adrok_l.s.eq(0) - comb += adrok_l.r.eq(0) - with m.If(lds): - comb += ld_active.s.eq(1) # activate LD mode - with m.Elif(sts): - comb += st_active.s.eq(1) # activate ST mode - # from this point onwards, with the port "picked", it stays picked - # until ld_active (or st_active) are de-asserted. + # activate mode: only on "edge" + comb += ld_active.s.eq(rising_edge(m, lds)) # activate LD mode + comb += st_active.s.eq(rising_edge(m, sts)) # activate ST mode + + # LD/ST requested activates "busy" (only if not already busy) + with m.If(self.pi.is_ld_i | self.pi.is_st_i): + with m.If(self.pi.exc_o.happened): + comb += busy_l.s.eq(0) + sync += Display("fast exception") + with m.Else(): + comb += busy_l.s.eq(~busy_delay) # if now in "LD" mode: wait for addr_ok, then send the address out # to memory, acknowledge address, and send out LD data with m.If(ld_active.q): # set up LenExpander with the LD len and lower bits of addr - lsbaddr, msbaddr = self.splitaddr(ldport.addr.data) - comb += lenexp.len_i.eq(ldport.data_len) + lsbaddr, msbaddr = self.splitaddr(pi.addr.data) + comb += lenexp.len_i.eq(pi.data_len) comb += lenexp.addr_i.eq(lsbaddr) - with m.If(ldport.addr.ok & adrok_l.qn): - comb += rdport.addr.eq(msbaddr) # addr ok, send thru - comb += ldport.addr_ok_o.eq(1) # acknowledge addr ok + with m.If(pi.addr.ok & adrok_l.qn): + self.set_rd_addr(m, pi.addr.data, lenexp.lexp_o, misalign, pr) + comb += pi.addr_ok_o.eq(1) # acknowledge addr ok sync += adrok_l.s.eq(1) # and pull "ack" latch # if now in "ST" mode: likewise do the same but with "ST" # to memory, acknowledge address, and send out LD data with m.If(st_active.q): # set up LenExpander with the ST len and lower bits of addr - lsbaddr, msbaddr = self.splitaddr(stport.addr.data) - comb += lenexp.len_i.eq(stport.data_len) + lsbaddr, msbaddr = self.splitaddr(pi.addr.data) + comb += lenexp.len_i.eq(pi.data_len) comb += lenexp.addr_i.eq(lsbaddr) - with m.If(stport.addr.ok): - comb += wrport.addr.eq(msbaddr) # addr ok, send thru - with m.If(adrok_l.qn): - comb += stport.addr_ok_o.eq(1) # acknowledge addr ok + with m.If(pi.addr.ok): + self.set_wr_addr(m, pi.addr.data, lenexp.lexp_o, misalign, pr, + pi.is_dcbz_i) + with m.If(adrok_l.qn & self.pi.exc_o.happened==0): + comb += pi.addr_ok_o.eq(1) # acknowledge addr ok sync += adrok_l.s.eq(1) # and pull "ack" latch - # NOTE: in both these, below, the port itself takes care - # of de-asserting its "busy_o" signal, based on either ld.ok going - # high (by us, here) or by st.ok going high (by the LDSTCompUnit). - # for LD mode, when addr has been "ok'd", assume that (because this # is a "Memory" test-class) the memory read data is valid. comb += reset_l.s.eq(0) comb += reset_l.r.eq(0) + lddata = Signal(self.regwid, reset_less=True) + data, ldok = self.get_rd_data(m) + comb += lddata.eq((data & lenexp.rexp_o) >> + (lenexp.addr_i*8)) with m.If(ld_active.q & adrok_l.q): # shift data down before pushing out. requires masking # from the *byte*-expanded version of LenExpand output - lddata = Signal(self.regwid, reset_less=True) - # TODO: replace rdport.data with LoadStoreUnitInterface.x_load_data - # and also handle the ready/stall/busy protocol - comb += lddata.eq((rdport.data & lenexp.rexp_o) >> - (lenexp.addr_i*8)) - comb += ldport.ld.data.eq(lddata) # put data out - comb += ldport.ld.ok.eq(1) # indicate data valid - comb += reset_l.s.eq(1) # reset mode after 1 cycle + comb += pi.ld.data.eq(lddata) # put data out + comb += pi.ld.ok.eq(ldok) # indicate data valid + comb += reset_l.s.eq(ldok) # reset mode after 1 cycle # for ST mode, when addr has been "ok'd", wait for incoming "ST ok" - with m.If(st_active.q & stport.st.ok): + with m.If(st_active.q & pi.st.ok): # shift data up before storing. lenexp *bit* version of mask is # passed straight through as byte-level "write-enable" lines. stdata = Signal(self.regwid, reset_less=True) - comb += stdata.eq(stport.st.data << (lenexp.addr_i*8)) + comb += stdata.eq(pi.st.data << (lenexp.addr_i*8)) # TODO: replace with link to LoadStoreUnitInterface.x_store_data # and also handle the ready/stall/busy protocol - comb += wrport.data.eq(stdata) # write st to mem - comb += wrport.en.eq(lenexp.lexp_o) # enable writes - comb += reset_l.s.eq(1) # reset mode after 1 cycle + stok = self.set_wr_data(m, stdata, lenexp.lexp_o) + sync += st_done.s.eq(1) # store done trigger + with m.If(st_done.q): + comb += reset_l.s.eq(stok) # reset mode after 1 cycle # ugly hack, due to simultaneous addr req-go acknowledge reset_delay = Signal(reset_less=True) @@ -332,129 +303,77 @@ class TestMemoryPortInterface(Elaboratable): # after waiting one cycle (reset_l is "sync" mode), reset the port with m.If(reset_l.q): - comb += ld_active.r.eq(1) # leave the ST active for 1 cycle + comb += ld_active.r.eq(1) # leave the LD active for 1 cycle comb += st_active.r.eq(1) # leave the ST active for 1 cycle comb += reset_l.r.eq(1) # clear reset comb += adrok_l.r.eq(1) # address reset + comb += st_done.r.eq(1) # store done reset - return m - - def ports(self): - for p in self.dports: - yield from p.ports() - - -def wait_busy(port, no=False): - while True: - busy = yield port.pi.busy_o - print("busy", no, busy) - if bool(busy) == no: - break - yield - - -def wait_addr(port): - while True: - addr_ok = yield port.pi.addr_ok_o - print("addrok", addr_ok) - if not addr_ok: - break - yield - - -def wait_ldok(port): - while True: - ldok = yield port.pi.ld.ok - print("ldok", ldok) - if ldok: - break - yield - - -def l0_cache_st(dut, addr, data, datalen): - mem = dut.mem - port1 = dut.pi - - # have to wait until not busy - yield from wait_busy(port1, no=False) # wait until not busy - - # set up a ST on the port. address first: - yield port1.pi.is_st_i.eq(1) # indicate ST - yield port1.pi.data_len.eq(datalen) # ST length (1/2/4/8) - - yield port1.pi.addr.data.eq(addr) # set address - yield port1.pi.addr.ok.eq(1) # set ok - yield from wait_addr(port1) # wait until addr ok - # yield # not needed, just for checking - # yield # not needed, just for checking - # assert "ST" for one cycle (required by the API) - yield port1.pi.st.data.eq(data) - yield port1.pi.st.ok.eq(1) - yield - yield port1.pi.st.ok.eq(0) - - # can go straight to reset. - yield port1.pi.is_st_i.eq(0) # end - yield port1.pi.addr.ok.eq(0) # set !ok - # yield from wait_busy(port1, False) # wait until not busy - - -def l0_cache_ld(dut, addr, datalen, expected): + # monitor for an exception, clear busy immediately + with m.If(self.pi.exc_o.happened): + comb += busy_l.r.eq(1) + comb += reset_l.s.eq(1) # also reset whole unit - mem = dut.mem - port1 = dut.pi + # however ST needs one cycle before busy is reset + #with m.If(self.pi.st.ok | self.pi.ld.ok): + with m.If(reset_l.s): + comb += cyc_l.s.eq(1) - # have to wait until not busy - yield from wait_busy(port1, no=False) # wait until not busy + with m.If(cyc_l.q): + comb += cyc_l.r.eq(1) + comb += busy_l.r.eq(1) - # set up a LD on the port. address first: - yield port1.pi.is_ld_i.eq(1) # indicate LD - yield port1.pi.data_len.eq(datalen) # LD length (1/2/4/8) + # busy latch outputs to interface + comb += pi.busy_o.eq(busy_l.q) - yield port1.pi.addr.data.eq(addr) # set address - yield port1.pi.addr.ok.eq(1) # set ok - yield from wait_addr(port1) # wait until addr ok + return m - yield from wait_ldok(port1) # wait until ld ok - data = yield port1.pi.ld.data + def ports(self): + yield from self.pi.ports() - # cleanup - yield port1.pi.is_ld_i.eq(0) # end - yield port1.pi.addr.ok.eq(0) # set !ok - # yield from wait_busy(port1, no=False) # wait until not busy - return data +class TestMemoryPortInterface(PortInterfaceBase): + """TestMemoryPortInterface + This is a test class for simple verification of the LDSTCompUnit + and for the simple core, to be able to run unit tests rapidly and + with less other code in the way. -def l0_cache_ldst(arg, dut): - yield - addr = 0x2 - data = 0xbeef - data2 = 0xf00f - #data = 0x4 - yield from l0_cache_st(dut, 0x2, data, 2) - yield from l0_cache_st(dut, 0x4, data2, 2) - result = yield from l0_cache_ld(dut, 0x2, 2, data) - result2 = yield from l0_cache_ld(dut, 0x4, 2, data2) - yield - arg.assertEqual(data, result, "data %x != %x" % (result, data)) - arg.assertEqual(data2, result2, "data2 %x != %x" % (result2, data2)) + Versions of this which are *compatible* (conform with PortInterface) + will include augmented-Wishbone Bus versions, including ones that + connect to L1, L2, MMU etc. etc. however this is the "base lowest + possible version that complies with PortInterface". + """ + def __init__(self, regwid=64, addrwid=4): + super().__init__(regwid, addrwid) + # hard-code memory addressing width to 6 bits + self.mem = TestMemory(regwid, 5, granularity=regwid//8, init=False) + def set_wr_addr(self, m, addr, mask, misalign, msr_pr, is_dcbz): + lsbaddr, msbaddr = self.splitaddr(addr) + m.d.comb += self.mem.wrport.addr.eq(msbaddr) -class TestPIMem(unittest.TestCase): + def set_rd_addr(self, m, addr, mask, misalign, msr_pr): + lsbaddr, msbaddr = self.splitaddr(addr) + m.d.comb += self.mem.rdport.addr.eq(msbaddr) - def test_pi_mem(self): + def set_wr_data(self, m, data, wen): + m.d.comb += self.mem.wrport.data.eq(data) # write st to mem + m.d.comb += self.mem.wrport.en.eq(wen) # enable writes + return Const(1, 1) - dut = TestMemoryPortInterface(regwid=64) - #vl = rtlil.convert(dut, ports=dut.ports()) - #with open("test_basic_l0_cache.il", "w") as f: - # f.write(vl) + def get_rd_data(self, m): + return self.mem.rdport.data, Const(1, 1) - run_simulation(dut, l0_cache_ldst(self, dut), - vcd_name='test_pi_mem_basic.vcd') + def elaborate(self, platform): + m = super().elaborate(platform) + # add TestMemory as submodule + m.submodules.mem = self.mem -if __name__ == '__main__': - unittest.main(exit=False) + return m + def ports(self): + yield from super().ports() + # TODO: memory ports