X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fsoc%2Fexperiment%2Fdcache.py;h=a5548cfb6114859a35e13e0aaad81dbbb25c1a04;hb=1032bacb081c79b104cc31431c6c081a0773db2e;hp=5e797f419c1edd893ab697d924839ca9bba9ed48;hpb=ae266b309180b9b820cc8d4e34133c4e60ae17b2;p=soc.git diff --git a/src/soc/experiment/dcache.py b/src/soc/experiment/dcache.py index 5e797f41..a5548cfb 100644 --- a/src/soc/experiment/dcache.py +++ b/src/soc/experiment/dcache.py @@ -2,22 +2,38 @@ based on Anton Blanchard microwatt dcache.vhdl +note that the microwatt dcache wishbone interface expects "stall". +for simplicity at the moment this is hard-coded to cyc & ~ack. +see WB4 spec, p84, section 5.2.1 + +IMPORTANT: for store, the data is sampled the cycle AFTER the "valid" +is raised. sigh + +Links: + +* https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg +* https://bugs.libre-soc.org/show_bug.cgi?id=469 + """ +import sys + +from nmutil.gtkw import write_gtkw + +sys.setrecursionlimit(1000000) + from enum import Enum, unique from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const -try: - from nmigen.hdl.ast import Display -except ImportError: - def Display(*args): - return [] +from nmutil.util import Display + +from copy import deepcopy +from random import randint, seed -from random import randint +from nmigen_soc.wishbone.bus import Interface from nmigen.cli import main from nmutil.iocontrol import RecordObject -from nmutil.util import wrap from nmigen.utils import log2_int from soc.experiment.mem_types import (LoadStore1ToDCacheType, DCacheToLoadStore1Type, @@ -31,16 +47,19 @@ from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS, WBIOMasterOut, WBIOSlaveOut) from soc.experiment.cache_ram import CacheRam -from soc.experiment.plru import PLRU +#from soc.experiment.plru import PLRU +from nmutil.plru import PLRU # for test -from nmigen_soc.wishbone.sram import SRAM +from soc.bus.sram import SRAM from nmigen import Memory from nmigen.cli import rtlil -if True: - from nmigen.back.pysim import Simulator, Delay, Settle -else: - from nmigen.sim.cxxsim import Simulator, Delay, Settle + +# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell +# Also, check out the cxxsim nmigen branch, and latest yosys from git +from nmutil.sim_tmp_alternative import Simulator + +from nmutil.util import wrap # TODO: make these parameters of DCache at some point @@ -55,7 +74,7 @@ LOG_LENGTH = 0 # Non-zero to enable log data collection # BRAM organisation: We never access more than # -- WB_DATA_BITS at a time so to save # -- resources we make the array only that wide, and -# -- use consecutive indices for to make a cache "line" +# -- use consecutive indices to make a cache "line" # -- # -- ROW_SIZE is the width in bytes of the BRAM # -- (based on WB, so 64-bits) @@ -69,6 +88,10 @@ ROW_PER_LINE = LINE_SIZE // ROW_SIZE # to represent the full dcache BRAM_ROWS = NUM_LINES * ROW_PER_LINE +print ("ROW_SIZE", ROW_SIZE) +print ("ROW_PER_LINE", ROW_PER_LINE) +print ("BRAM_ROWS", BRAM_ROWS) +print ("NUM_WAYS", NUM_WAYS) # Bit fields counts in the address @@ -120,7 +143,7 @@ layout = """\ .. --------| | TAG_BITS (45) """ print (layout) -print ("Dcache TAG %d IDX %d ROW %d ROFF %d LOFF %d RLB %d" % \ +print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \ (TAG_BITS, INDEX_BITS, ROW_BITS, ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS)) print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS)) @@ -129,12 +152,14 @@ print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH)) TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS +print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH) + def CacheTagArray(): return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \ for x in range(NUM_LINES)) def CacheValidBitsArray(): - return Array(Signal(INDEX_BITS, name="cachevalid_%d" % x) \ + return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \ for x in range(NUM_LINES)) def RowPerLineValidArray(): @@ -149,10 +174,13 @@ TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS TLB_PTE_BITS = 64 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS; +def ispow2(x): + return (1< '0') & r1.reload_tag; + """ comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag) sync += cache_tags[r1.store_index].eq(ct) sync += r1.store_way.eq(replace_way) @@ -1263,10 +1335,13 @@ class DCache(Elaboratable): comb += req.dcbz.eq(r0.req.dcbz) comb += req.real_addr.eq(ra) - with m.If(~r0.req.dcbz): + with m.If(r0.req.dcbz): + # force data to 0 for dcbz + comb += req.data.eq(0) + with m.Elif(r0.d_valid): comb += req.data.eq(r0.req.data) with m.Else(): - comb += req.data.eq(0) + comb += req.data.eq(d_in.data) # Select all bytes for dcbz # and for cacheable loads @@ -1291,7 +1366,7 @@ class DCache(Elaboratable): with m.Switch(r1.state): with m.Case(State.IDLE): - sync += r1.wb.adr.eq(req.real_addr) + sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:]) sync += r1.wb.sel.eq(req.byte_sel) sync += r1.wb.dat.eq(req.data) sync += r1.dcbz.eq(req.dcbz) @@ -1300,7 +1375,7 @@ class DCache(Elaboratable): # for subsequent stores. sync += r1.store_index.eq(req_idx) sync += r1.store_row.eq(req_row) - sync += r1.end_row_ix.eq(get_row_of_line(req_row)) + sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1) sync += r1.reload_tag.eq(req_tag) sync += r1.req.same_tag.eq(1) @@ -1381,35 +1456,38 @@ class DCache(Elaboratable): # Requests are all sent if stb is 0 comb += ld_stbs_done.eq(~r1.wb.stb) - with m.If((~wb_in.stall) & r1.wb.stb): - # That was the last word? - # We are done sending. - # Clear stb and set ld_stbs_done - # so we can handle an eventual - # last ack on the same cycle. - with m.If(is_last_row_addr(r1.wb.adr, r1.end_row_ix)): + # If we are still sending requests, was one accepted? + with m.If((~bus.stall) & r1.wb.stb): + # That was the last word? We are done sending. + # Clear stb and set ld_stbs_done so we can handle an + # eventual last ack on the same cycle. + # sigh - reconstruct wb adr with 3 extra 0s at front + wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr) + with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)): sync += r1.wb.stb.eq(0) comb += ld_stbs_done.eq(1) # Calculate the next row address in the current cache line - rarange = Signal(LINE_OFF_BITS-ROW_OFF_BITS) - comb += rarange.eq(r1.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS]+1) - sync += r1.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange) + row = Signal(LINE_OFF_BITS-ROW_OFF_BITS) + comb += row.eq(r1.wb.adr) + sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1) # Incoming acks processing - sync += r1.forward_valid1.eq(wb_in.ack) - with m.If(wb_in.ack): - sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1) + sync += r1.forward_valid1.eq(bus.ack) + with m.If(bus.ack): + srow = Signal(ROW_LINE_BITS) + comb += srow.eq(r1.store_row) + sync += r1.rows_valid[srow].eq(1) # If this is the data we were looking for, # we can complete the request next cycle. # Compare the whole address in case the # request in r1.req is not the one that # started this refill. - with m.If(r1.full & r1.req.same_tag & + with m.If(req.valid & r1.req.same_tag & ((r1.dcbz & r1.req.dcbz) | (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) & - (r1.store_row == get_row(r1.req.real_addr))): + (r1.store_row == get_row(req.real_addr))): sync += r1.full.eq(0) sync += r1.slow_valid.eq(1) with m.If(~r1.mmu_req): @@ -1427,16 +1505,23 @@ class DCache(Elaboratable): # Cache line is now valid cv = Signal(INDEX_BITS) - comb += cv.eq(cache_valid_bits[r1.store_index]) + comb += cv.eq(cache_valids[r1.store_index]) comb += cv.bit_select(r1.store_way, 1).eq(1) - sync += cache_valid_bits[r1.store_index].eq(cv) + sync += cache_valids[r1.store_index].eq(cv) + sync += r1.state.eq(State.IDLE) + sync += Display("cache valid set %x " + "idx %d way %d", + cv, r1.store_index, r1.store_way) # Increment store row counter sync += r1.store_row.eq(next_row(r1.store_row)) with m.Case(State.STORE_WAIT_ACK): st_stbs_done = Signal() + acks = Signal(3) + adjust_acks = Signal(3) + comb += st_stbs_done.eq(~r1.wb.stb) comb += acks.eq(r1.acks_pending) @@ -1451,16 +1536,16 @@ class DCache(Elaboratable): sync += r1.acks_pending.eq(adjust_acks) # Clear stb when slave accepted request - with m.If(~wb_in.stall): + with m.If(~bus.stall): # See if there is another store waiting # to be done which is in the same real page. with m.If(req.valid): - ra = req.real_addr[0:SET_SIZE_BITS] - sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra) + _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS] + sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra) sync += r1.wb.dat.eq(req.data) sync += r1.wb.sel.eq(req.byte_sel) - with m.Elif((adjust_acks < 7) & req.same_tag & + with m.If((adjust_acks < 7) & req.same_tag & ((req.op == Op.OP_STORE_MISS) | (req.op == Op.OP_STORE_HIT))): sync += r1.wb.stb.eq(1) @@ -1480,7 +1565,7 @@ class DCache(Elaboratable): comb += st_stbs_done.eq(1) # Got ack ? See if complete. - with m.If(wb_in.ack): + with m.If(bus.ack): with m.If(st_stbs_done & (adjust_acks == 1)): sync += r1.state.eq(State.IDLE) sync += r1.wb.cyc.eq(0) @@ -1489,11 +1574,11 @@ class DCache(Elaboratable): with m.Case(State.NC_LOAD_WAIT_ACK): # Clear stb when slave accepted request - with m.If(~wb_in.stall): + with m.If(~bus.stall): sync += r1.wb.stb.eq(0) # Got ack ? complete. - with m.If(wb_in.ack): + with m.If(bus.ack): sync += r1.state.eq(State.IDLE) sync += r1.full.eq(0) sync += r1.slow_valid.eq(1) @@ -1511,22 +1596,23 @@ class DCache(Elaboratable): def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out): sync = m.d.sync - d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out + d_out, bus, log_out = self.d_out, self.bus, self.log_out sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3], stall_out, req_op[:3], d_out.valid, d_out.error, - r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall, - r1.wb.adr[3:6])) + r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall, + r1.real_adr[3:6])) def elaborate(self, platform): m = Module() comb = m.d.comb + d_in = self.d_in # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs cache_tags = CacheTagArray() cache_tag_set = Signal(TAG_RAM_WIDTH) - cache_valid_bits = CacheValidBitsArray() + cache_valids = CacheValidBitsArray() # TODO attribute ram_style : string; # TODO attribute ram_style of cache_tags : signal is "distributed"; @@ -1571,7 +1657,7 @@ class DCache(Elaboratable): use_forward1_next = Signal() use_forward2_next = Signal() - cache_out = CacheRamOut() + cache_out_row = Signal(WB_DATA_BITS) plru_victim = PLRUOut() replace_way = Signal(WAY_BITS) @@ -1601,12 +1687,23 @@ class DCache(Elaboratable): comb += self.m_out.stall.eq(0) # Hold off the request in r0 when r1 has an uncompleted request - comb += r0_stall.eq(r0_full & r1.full) - comb += r0_valid.eq(r0_full & ~r1.full) + comb += r0_stall.eq(r0_full & (r1.full | d_in.hold)) + comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold) comb += self.stall_out.eq(r0_stall) + + # deal with litex not doing wishbone pipeline mode + # XXX in wrong way. FIFOs are needed in the SRAM test + # so that stb/ack match up. same thing done in icache.py + comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack) + # Wire up wishbone request latch out of stage 1 - comb += self.wb_out.eq(r1.wb) + comb += self.bus.we.eq(r1.wb.we) + comb += self.bus.adr.eq(r1.wb.adr) + comb += self.bus.sel.eq(r1.wb.sel) + comb += self.bus.stb.eq(r1.wb.stb) + comb += self.bus.dat_w.eq(r1.wb.dat) + comb += self.bus.cyc.eq(r1.wb.cyc) # call sub-functions putting everything together, using shared # signals established above @@ -1624,7 +1721,7 @@ class DCache(Elaboratable): self.maybe_tlb_plrus(m, r1, tlb_plru_victim) self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags) self.dcache_request(m, r0, ra, req_index, req_row, req_tag, - r0_valid, r1, cache_valid_bits, replace_way, + r0_valid, r1, cache_valids, replace_way, use_forward1_next, use_forward2_next, req_hit_way, plru_victim, rc_ok, perm_attr, valid_ra, perm_ok, access_ok, req_op, req_go, @@ -1635,206 +1732,22 @@ class DCache(Elaboratable): r0_valid, r0, reservation) self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv, reservation, r0) - self.writeback_control(m, r1, cache_out) - self.rams(m, r1, early_req_row, cache_out, replace_way) + self.writeback_control(m, r1, cache_out_row) + self.rams(m, r1, early_req_row, cache_out_row, replace_way) self.dcache_fast_hit(m, req_op, r0_valid, r0, r1, req_hit_way, req_index, req_tag, access_ok, tlb_hit, tlb_hit_way, tlb_req_index) self.dcache_slow(m, r1, use_forward1_next, use_forward2_next, - cache_valid_bits, r0, replace_way, + cache_valids, r0, replace_way, req_hit_way, req_same_tag, r0_valid, req_op, cache_tags, req_go, ra) #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out) return m -def dcache_load(dut, addr, nc=0): - yield dut.d_in.load.eq(1) - yield dut.d_in.nc.eq(nc) - yield dut.d_in.addr.eq(addr) - yield dut.d_in.byte_sel.eq(~0) - yield dut.d_in.valid.eq(1) - yield - yield dut.d_in.valid.eq(0) - yield dut.d_in.byte_sel.eq(0) - yield - while not (yield dut.d_out.valid): - yield - data = yield dut.d_out.data - return data - - -def dcache_store(dut, addr, data, nc=0): - yield dut.d_in.load.eq(0) - yield dut.d_in.nc.eq(nc) - yield dut.d_in.data.eq(data) - yield dut.d_in.byte_sel.eq(~0) - yield dut.d_in.addr.eq(addr) - yield dut.d_in.valid.eq(1) - yield - yield dut.d_in.valid.eq(0) - yield dut.d_in.byte_sel.eq(0) - yield - while not (yield dut.d_out.valid): - yield - - -def dcache_random_sim(dut): - - # start with stack of zeros - sim_mem = [0] * 512 - - # clear stuff - yield dut.d_in.valid.eq(0) - yield dut.d_in.load.eq(0) - yield dut.d_in.priv_mode.eq(1) - yield dut.d_in.nc.eq(0) - yield dut.d_in.addr.eq(0) - yield dut.d_in.data.eq(0) - yield dut.m_in.valid.eq(0) - yield dut.m_in.addr.eq(0) - yield dut.m_in.pte.eq(0) - # wait 4 * clk_period - yield - yield - yield - yield - - print () - - for i in range(256): - addr = randint(0, 255) - data = randint(0, (1<<64)-1) - sim_mem[addr] = data - addr *= 8 - - print ("testing %x data %x" % (addr, data)) - - yield from dcache_load(dut, addr) - yield from dcache_store(dut, addr, data) - - addr = randint(0, 255) - sim_data = sim_mem[addr] - addr *= 8 - - data = yield from dcache_load(dut, addr) - assert data == sim_data, \ - "check %x data %x != %x" % (addr, data, sim_data) - - for addr in range(8): - data = yield from dcache_load(dut, addr*8) - assert data == sim_mem[addr], \ - "final check %x data %x != %x" % (addr*8, data, sim_mem[addr]) - -def dcache_sim(dut): - # clear stuff - yield dut.d_in.valid.eq(0) - yield dut.d_in.load.eq(0) - yield dut.d_in.priv_mode.eq(1) - yield dut.d_in.nc.eq(0) - yield dut.d_in.addr.eq(0) - yield dut.d_in.data.eq(0) - yield dut.m_in.valid.eq(0) - yield dut.m_in.addr.eq(0) - yield dut.m_in.pte.eq(0) - # wait 4 * clk_period - yield - yield - yield - yield - - # Cacheable read of address 4 - data = yield from dcache_load(dut, 0x4) - addr = yield dut.d_in.addr - assert data == 0x0000000100000000, \ - f"data @%x=%x expected 0x0000000100000000" % (addr, data) - - # Cacheable read of address 20 - data = yield from dcache_load(dut, 0x20) - addr = yield dut.d_in.addr - assert data == 0x0000000100000000, \ - f"data @%x=%x expected 0x0000000100000000" % (addr, data) - - # Cacheable read of address 30 - data = yield from dcache_load(dut, 0x530) - addr = yield dut.d_in.addr - assert data == 0x0000014D0000014C, \ - f"data @%x=%x expected 0000014D0000014C" % (addr, data) - - # 2nd Cacheable read of address 30 - data = yield from dcache_load(dut, 0x530) - addr = yield dut.d_in.addr - assert data == 0x0000014D0000014C, \ - f"data @%x=%x expected 0000014D0000014C" % (addr, data) - - # Non-cacheable read of address 100 - data = yield from dcache_load(dut, 0x100, nc=1) - addr = yield dut.d_in.addr - assert data == 0x0000004100000040, \ - f"data @%x=%x expected 0000004100000040" % (addr, data) - - # Store at address 530 - yield from dcache_store(dut, 0x530, 0x121) - - # Store at address 30 - yield from dcache_store(dut, 0x530, 0x12345678) - - # 3nd Cacheable read of address 530 - data = yield from dcache_load(dut, 0x530) - addr = yield dut.d_in.addr - assert data == 0x12345678, \ - f"data @%x=%x expected 0x12345678" % (addr, data) - - # 4th Cacheable read of address 30 - data = yield from dcache_load(dut, 0x20) - addr = yield dut.d_in.addr - assert data == 0x12345678, \ - f"data @%x=%x expected 0x12345678" % (addr, data) - - yield - yield - yield - yield - - -def test_dcache(mem, test_fn, test_name): - dut = DCache() - - memory = Memory(width=64, depth=16*64, init=mem) - sram = SRAM(memory=memory, granularity=8) - - m = Module() - m.submodules.dcache = dut - m.submodules.sram = sram - - m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc) - m.d.comb += sram.bus.stb.eq(dut.wb_out.stb) - m.d.comb += sram.bus.we.eq(dut.wb_out.we) - m.d.comb += sram.bus.sel.eq(dut.wb_out.sel) - m.d.comb += sram.bus.adr.eq(dut.wb_out.adr[3:]) - m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat) - - m.d.comb += dut.wb_in.ack.eq(sram.bus.ack) - m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r) - - # nmigen Simulation - sim = Simulator(m) - sim.add_clock(1e-6) - - sim.add_sync_process(wrap(test_fn(dut))) - with sim.write_vcd('test_dcache%s.vcd' % test_name): - sim.run() if __name__ == '__main__': dut = DCache() vl = rtlil.convert(dut, ports=[]) with open("test_dcache.il", "w") as f: f.write(vl) - - mem = [] - for i in range(0,512): - mem.append((i*2)| ((i*2+1)<<32)) - - test_dcache(mem, dcache_sim, "") - #test_dcache(None, dcache_random_sim, "random") -