"""DCache based on Anton Blanchard microwatt dcache.vhdl note that the microwatt dcache wishbone interface expects "stall". for simplicity at the moment this is hard-coded to cyc & ~ack. see WB4 spec, p84, section 5.2.1 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid" is raised. sigh Links: * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg * https://bugs.libre-soc.org/show_bug.cgi?id=469 """ import sys from nmutil.gtkw import write_gtkw sys.setrecursionlimit(1000000) from enum import Enum, unique from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const from nmutil.util import Display from copy import deepcopy from random import randint, seed from nmigen.cli import main from nmutil.iocontrol import RecordObject from nmigen.utils import log2_int from soc.experiment.mem_types import (LoadStore1ToDCacheType, DCacheToLoadStore1Type, MMUToDCacheType, DCacheToMMUType) from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS, WBAddrType, WBDataType, WBSelType, WBMasterOut, WBSlaveOut, WBMasterOutVector, WBSlaveOutVector, WBIOMasterOut, WBIOSlaveOut) from soc.experiment.cache_ram import CacheRam #from soc.experiment.plru import PLRU from nmutil.plru import PLRU # for test from soc.bus.sram import SRAM from nmigen import Memory from nmigen.cli import rtlil # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell # Also, check out the cxxsim nmigen branch, and latest yosys from git from nmutil.sim_tmp_alternative import Simulator from nmutil.util import wrap # TODO: make these parameters of DCache at some point LINE_SIZE = 64 # Line size in bytes NUM_LINES = 16 # Number of lines in a set NUM_WAYS = 4 # Number of ways TLB_SET_SIZE = 64 # L1 DTLB entries per set TLB_NUM_WAYS = 2 # L1 DTLB number of sets TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size) LOG_LENGTH = 0 # Non-zero to enable log data collection # BRAM organisation: We never access more than # -- WB_DATA_BITS at a time so to save # -- 
resources we make the array only that wide, and # -- use consecutive indices for to make a cache "line" # -- # -- ROW_SIZE is the width in bytes of the BRAM # -- (based on WB, so 64-bits) ROW_SIZE = WB_DATA_BITS // 8; # ROW_PER_LINE is the number of row (wishbone # transactions) in a line ROW_PER_LINE = LINE_SIZE // ROW_SIZE # BRAM_ROWS is the number of rows in BRAM needed # to represent the full dcache BRAM_ROWS = NUM_LINES * ROW_PER_LINE print ("ROW_SIZE", ROW_SIZE) print ("ROW_PER_LINE", ROW_PER_LINE) print ("BRAM_ROWS", BRAM_ROWS) print ("NUM_WAYS", NUM_WAYS) # Bit fields counts in the address # REAL_ADDR_BITS is the number of real address # bits that we store REAL_ADDR_BITS = 56 # ROW_BITS is the number of bits to select a row ROW_BITS = log2_int(BRAM_ROWS) # ROW_LINE_BITS is the number of bits to select # a row within a line ROW_LINE_BITS = log2_int(ROW_PER_LINE) # LINE_OFF_BITS is the number of bits for # the offset in a cache line LINE_OFF_BITS = log2_int(LINE_SIZE) # ROW_OFF_BITS is the number of bits for # the offset in a row ROW_OFF_BITS = log2_int(ROW_SIZE) # INDEX_BITS is the number if bits to # select a cache line INDEX_BITS = log2_int(NUM_LINES) # SET_SIZE_BITS is the log base 2 of the set size SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS # TAG_BITS is the number of bits of # the tag part of the address TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS # TAG_WIDTH is the width in bits of each way of the tag RAM TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8) # WAY_BITS is the number of bits to select a way WAY_BITS = log2_int(NUM_WAYS) # Example of layout for 32 lines of 64 bytes: layout = """\ .. tag |index| line | .. | row | | .. | |---| | ROW_LINE_BITS (3) .. | |--- - --| LINE_OFF_BITS (6) .. | |- --| ROW_OFF_BITS (3) .. |----- ---| | ROW_BITS (8) .. |-----| | INDEX_BITS (5) .. 
--------| | TAG_BITS (45) """
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
        (TAG_BITS, INDEX_BITS, ROW_BITS, ROW_OFF_BITS,
         LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

# total width of one tag-RAM word: one (byte-rounded) tag per way,
# packed side by side
TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)

def CacheTagArray():
    """One TAG_RAM_WIDTH-wide Signal per cache line, holding the packed
    tags of all NUM_WAYS ways for that line (indexed by line number)."""
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
                 for x in range(NUM_LINES))

def CacheValidBitsArray():
    """Per-line valid bits: one NUM_WAYS-wide Signal per cache line,
    one bit per way."""
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    """One valid bit per BRAM row within a single cache line
    (used to track partial line refill progress)."""
    return Array(Signal(name="rows_valid%d" % x) \
                 for x in range(ROW_PER_LINE))

# L1 TLB geometry
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
# EA tag = effective address above the page offset and set-index bits
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;

# NOTE(review): the source is garbled from here to the end of this span --
# the remainder of ispow2(), the RegStage/TLB helper definitions and the
# first part of the DCache class appear to have been elided between chunks,
# and the text below jumps into the middle of a method body.  Reproduced
# verbatim; do NOT treat as working code -- recover from upstream history.
def ispow2(x): return (1< '0') & r1.reload_tag; """ comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag) sync += cache_tags[r1.store_index].eq(ct) sync += r1.store_way.eq(replace_way) sync += r1.write_tag.eq(0) # Take request from r1.req if there is one there, # else from req_op, ra, etc.
with m.If(r1.full): comb += req.eq(r1.req) with m.Else(): comb += req.op.eq(req_op) comb += req.valid.eq(req_go) comb += req.mmu_req.eq(r0.mmu_req) comb += req.dcbz.eq(r0.req.dcbz) comb += req.real_addr.eq(ra) with m.If(r0.req.dcbz): # force data to 0 for dcbz comb += req.data.eq(0) with m.Elif(r0.d_valid): comb += req.data.eq(r0.req.data) with m.Else(): comb += req.data.eq(d_in.data) # Select all bytes for dcbz # and for cacheable loads with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)): comb += req.byte_sel.eq(~0) # all 1s with m.Else(): comb += req.byte_sel.eq(r0.req.byte_sel) comb += req.hit_way.eq(req_hit_way) comb += req.same_tag.eq(req_same_tag) # Store the incoming request from r0, # if it is a slow request # Note that r1.full = 1 implies req_op = OP_NONE with m.If((req_op == Op.OP_LOAD_MISS) | (req_op == Op.OP_LOAD_NC) | (req_op == Op.OP_STORE_MISS) | (req_op == Op.OP_STORE_HIT)): sync += r1.req.eq(req) sync += r1.full.eq(1) # Main state machine with m.Switch(r1.state): with m.Case(State.IDLE): sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:]) sync += r1.wb.sel.eq(req.byte_sel) sync += r1.wb.dat.eq(req.data) sync += r1.dcbz.eq(req.dcbz) # Keep track of our index and way # for subsequent stores. 
sync += r1.store_index.eq(req_idx) sync += r1.store_row.eq(req_row) sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1) sync += r1.reload_tag.eq(req_tag) sync += r1.req.same_tag.eq(1) with m.If(req.op == Op.OP_STORE_HIT): sync += r1.store_way.eq(req.hit_way) # Reset per-row valid bits, # ready for handling OP_LOAD_MISS for i in range(ROW_PER_LINE): sync += r1.rows_valid[i].eq(0) with m.If(req_op != Op.OP_NONE): sync += Display("cache op %d", req.op) with m.Switch(req.op): with m.Case(Op.OP_LOAD_HIT): # stay in IDLE state pass with m.Case(Op.OP_LOAD_MISS): sync += Display("cache miss real addr: %x " \ "idx: %x tag: %x", req.real_addr, req_row, req_tag) # Start the wishbone cycle sync += r1.wb.we.eq(0) sync += r1.wb.cyc.eq(1) sync += r1.wb.stb.eq(1) # Track that we had one request sent sync += r1.state.eq(State.RELOAD_WAIT_ACK) sync += r1.write_tag.eq(1) with m.Case(Op.OP_LOAD_NC): sync += r1.wb.cyc.eq(1) sync += r1.wb.stb.eq(1) sync += r1.wb.we.eq(0) sync += r1.state.eq(State.NC_LOAD_WAIT_ACK) with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS): with m.If(~req.dcbz): sync += r1.state.eq(State.STORE_WAIT_ACK) sync += r1.acks_pending.eq(1) sync += r1.full.eq(0) sync += r1.slow_valid.eq(1) with m.If(~req.mmu_req): sync += r1.ls_valid.eq(1) with m.Else(): sync += r1.mmu_done.eq(1) with m.If(req.op == Op.OP_STORE_HIT): sync += r1.write_bram.eq(1) with m.Else(): # dcbz is handled much like a load miss except # that we are writing to memory instead of reading sync += r1.state.eq(State.RELOAD_WAIT_ACK) with m.If(req.op == Op.OP_STORE_MISS): sync += r1.write_tag.eq(1) sync += r1.wb.we.eq(1) sync += r1.wb.cyc.eq(1) sync += r1.wb.stb.eq(1) # OP_NONE and OP_BAD do nothing # OP_BAD & OP_STCX_FAIL were # handled above already with m.Case(Op.OP_NONE): pass with m.Case(Op.OP_BAD): pass with m.Case(Op.OP_STCX_FAIL): pass with m.Case(State.RELOAD_WAIT_ACK): ld_stbs_done = Signal() # Requests are all sent if stb is 0 comb += ld_stbs_done.eq(~r1.wb.stb) # If we are still sending requests, 
was one accepted? with m.If((~wb_in.stall) & r1.wb.stb): # That was the last word? We are done sending. # Clear stb and set ld_stbs_done so we can handle an # eventual last ack on the same cycle. # sigh - reconstruct wb adr with 3 extra 0s at front wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr) with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)): sync += r1.wb.stb.eq(0) comb += ld_stbs_done.eq(1) # Calculate the next row address in the current cache line row = Signal(LINE_OFF_BITS-ROW_OFF_BITS) comb += row.eq(r1.wb.adr) sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1) # Incoming acks processing sync += r1.forward_valid1.eq(wb_in.ack) with m.If(wb_in.ack): srow = Signal(ROW_LINE_BITS) comb += srow.eq(r1.store_row) sync += r1.rows_valid[srow].eq(1) # If this is the data we were looking for, # we can complete the request next cycle. # Compare the whole address in case the # request in r1.req is not the one that # started this refill. with m.If(req.valid & r1.req.same_tag & ((r1.dcbz & r1.req.dcbz) | (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) & (r1.store_row == get_row(req.real_addr))): sync += r1.full.eq(0) sync += r1.slow_valid.eq(1) with m.If(~r1.mmu_req): sync += r1.ls_valid.eq(1) with m.Else(): sync += r1.mmu_done.eq(1) sync += r1.forward_sel.eq(~0) # all 1s sync += r1.use_forward1.eq(1) # Check for completion with m.If(ld_stbs_done & is_last_row(r1.store_row, r1.end_row_ix)): # Complete wishbone cycle sync += r1.wb.cyc.eq(0) # Cache line is now valid cv = Signal(INDEX_BITS) comb += cv.eq(cache_valids[r1.store_index]) comb += cv.bit_select(r1.store_way, 1).eq(1) sync += cache_valids[r1.store_index].eq(cv) sync += r1.state.eq(State.IDLE) sync += Display("cache valid set %x " "idx %d way %d", cv, r1.store_index, r1.store_way) # Increment store row counter sync += r1.store_row.eq(next_row(r1.store_row)) with m.Case(State.STORE_WAIT_ACK): st_stbs_done = Signal() acks = Signal(3) adjust_acks = Signal(3) comb += st_stbs_done.eq(~r1.wb.stb) comb += 
acks.eq(r1.acks_pending) with m.If(r1.inc_acks != r1.dec_acks): with m.If(r1.inc_acks): comb += adjust_acks.eq(acks + 1) with m.Else(): comb += adjust_acks.eq(acks - 1) with m.Else(): comb += adjust_acks.eq(acks) sync += r1.acks_pending.eq(adjust_acks) # Clear stb when slave accepted request with m.If(~wb_in.stall): # See if there is another store waiting # to be done which is in the same real page. with m.If(req.valid): _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS] sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra) sync += r1.wb.dat.eq(req.data) sync += r1.wb.sel.eq(req.byte_sel) with m.If((adjust_acks < 7) & req.same_tag & ((req.op == Op.OP_STORE_MISS) | (req.op == Op.OP_STORE_HIT))): sync += r1.wb.stb.eq(1) comb += st_stbs_done.eq(0) with m.If(req.op == Op.OP_STORE_HIT): sync += r1.write_bram.eq(1) sync += r1.full.eq(0) sync += r1.slow_valid.eq(1) # Store requests never come from the MMU sync += r1.ls_valid.eq(1) comb += st_stbs_done.eq(0) sync += r1.inc_acks.eq(1) with m.Else(): sync += r1.wb.stb.eq(0) comb += st_stbs_done.eq(1) # Got ack ? See if complete. with m.If(wb_in.ack): with m.If(st_stbs_done & (adjust_acks == 1)): sync += r1.state.eq(State.IDLE) sync += r1.wb.cyc.eq(0) sync += r1.wb.stb.eq(0) sync += r1.dec_acks.eq(1) with m.Case(State.NC_LOAD_WAIT_ACK): # Clear stb when slave accepted request with m.If(~wb_in.stall): sync += r1.wb.stb.eq(0) # Got ack ? complete. 
# NOTE(review): this source line is whitespace-mangled (many logical lines
# joined).  Formatting is restored below; every code token is unchanged.
# The first fragment is the tail of dcache_slow()'s NC_LOAD_WAIT_ACK state
# (the start of that method lies on earlier, partly-elided source lines).

                # non-cacheable load: the single wishbone ack completes
                # the request and returns to IDLE
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)
                    # report completion to loadstore1 or the MMU,
                    # whichever issued the request
                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)
                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    # close the wishbone cycle
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
        """Latch a bundle of internal state and wishbone handshake signals
        into self.log_out every cycle, for debug tracing.
        """
        sync = m.d.sync
        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
        # NOTE(review): req_op is neither a parameter of this method nor an
        # attribute read via self -- calling this method would raise
        # NameError.  The only call site (in elaborate) is commented out;
        # pass req_op in (or drop it from the Cat) before re-enabling.
        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3],
                               d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                               r1.real_adr[3:6]))

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valids = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";
        """note: these are passed to nmigen.hdl.Memory as "attributes". don't know how, just that they are.
""" dtlb_valid_bits = TLBValidBitsArray() dtlb_tags = TLBTagsArray() dtlb_ptes = TLBPtesArray() # TODO attribute ram_style of # dtlb_tags : signal is "distributed"; # TODO attribute ram_style of # dtlb_ptes : signal is "distributed"; r0 = RegStage0("r0") r0_full = Signal() r1 = RegStage1("r1") reservation = Reservation() # Async signals on incoming request req_index = Signal(INDEX_BITS) req_row = Signal(ROW_BITS) req_hit_way = Signal(WAY_BITS) req_tag = Signal(TAG_BITS) req_op = Signal(Op) req_data = Signal(64) req_same_tag = Signal() req_go = Signal() early_req_row = Signal(ROW_BITS) cancel_store = Signal() set_rsrv = Signal() clear_rsrv = Signal() r0_valid = Signal() r0_stall = Signal() use_forward1_next = Signal() use_forward2_next = Signal() cache_out_row = Signal(WB_DATA_BITS) plru_victim = PLRUOut() replace_way = Signal(WAY_BITS) # Wishbone read/write/cache write formatting signals bus_sel = Signal(8) # TLB signals tlb_tag_way = Signal(TLB_TAG_WAY_BITS) tlb_pte_way = Signal(TLB_PTE_WAY_BITS) tlb_valid_way = Signal(TLB_NUM_WAYS) tlb_req_index = Signal(TLB_SET_BITS) tlb_hit = Signal() tlb_hit_way = Signal(TLB_WAY_BITS) pte = Signal(TLB_PTE_BITS) ra = Signal(REAL_ADDR_BITS) valid_ra = Signal() perm_attr = PermAttr("dc_perms") rc_ok = Signal() perm_ok = Signal() access_ok = Signal() tlb_plru_victim = TLBPLRUOut() # we don't yet handle collisions between loadstore1 requests # and MMU requests comb += self.m_out.stall.eq(0) # Hold off the request in r0 when r1 has an uncompleted request comb += r0_stall.eq(r0_full & (r1.full | d_in.hold)) comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold) comb += self.stall_out.eq(r0_stall) # Wire up wishbone request latch out of stage 1 comb += self.wb_out.eq(r1.wb) # deal with litex not doing wishbone pipeline mode # XXX in wrong way. 
# NOTE(review): restored formatting of mangled source line; code tokens
# unchanged.  Continues the comment begun on the previous source line.
        # FIFOs are needed in the SRAM test
        # so that stb/ack match up
        # stall is hard-coded to cyc & ~ack (see module docstring / WB4
        # spec p84 s5.2.1) rather than using true pipeline-mode stall
        comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valids, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_pte_way,
                            tlb_hit, tlb_hit_way, tlb_valid_way,
                            cache_tag_set, cancel_store, req_same_tag,
                            r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valids, r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # dcache_log is disabled: see NOTE on the method (undefined req_op)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m


if __name__ == '__main__':
    # standalone conversion: emit the DCache as yosys RTLIL for inspection
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)