From 942f829bfdf9d65cfcdb5f732e81a5a920290832 Mon Sep 17 00:00:00 2001
From: Luke Kenneth Casson Leighton
Date: Fri, 18 Feb 2022 11:36:19 +0000
Subject: [PATCH] parameterise I-Cache similar to D-Cache. lots of "self." all
 over the place. yuk.

---
 src/soc/bus/sram.py          |   2 +-
 src/soc/experiment/icache.py | 458 ++++++++++++++++++-----------------
 2 files changed, 236 insertions(+), 224 deletions(-)

diff --git a/src/soc/bus/sram.py b/src/soc/bus/sram.py
index 9819302f..f0252114 100644
--- a/src/soc/bus/sram.py
+++ b/src/soc/bus/sram.py
@@ -60,7 +60,7 @@ class SRAM(Elaboratable):
                                  data_width=self.memory.width,
                                  granularity=granularity,
                                  features=features,
-                                 alignment=0,
+                                 #alignment=0,
                                  name=None)
         self.bus = bus
         self.granularity = bus.granularity
diff --git a/src/soc/experiment/icache.py b/src/soc/experiment/icache.py
index 57b738c6..58bdb6c5 100644
--- a/src/soc/experiment/icache.py
+++ b/src/soc/experiment/icache.py
@@ -72,197 +72,203 @@
 SIM = 0
 LOG_LENGTH = 0

 class ICacheConfig:
-    def __init__(self, self.LINE_SIZE = 64
-        self.NUM_LINE = 16 # Number of lines in a set
-        self.NUM_WAYS = 1, # Number of ways
-        self.TLB_SIZE = 64, # L1 ITLB number of entries
-        self.TLB_LG_PGSZ = 12): # L1 ITLB log_2(page_size)
-self.LINE_SIZE = 64
-self.NUM_LINE = 16 # Number of lines in a set
-self.NUM_WAYS = 1 # Number of ways
-self.TLB_SIZE = 64 # L1 ITLB number of entries
-self.TLB_LG_PGSZ = 12 # L1 ITLB log_2(page_size)
-
-# BRAM organisation: We never access more than wishbone_data_bits
-# at a time so to save resources we make the array only that wide,
-# and use consecutive indices for to make a cache "line"
-#
-# self.ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
-self.ROW_SIZE = WB_DATA_BITS // 8
-# Number of real address bits that we store
-self.REAL_ADDR_BITS = 56
-
-self.ROW_SIZE_BITS = self.ROW_SIZE * 8
-# ROW_PER_LINE is the number of row (wishbone) transactions in a line
-self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
-# BRAM_ROWS is the number of rows in BRAM
-# needed to represent the full icache
-self.BRAM_ROWS = self.NUM_LINE * self.ROW_PER_LINE
-# INSN_PER_ROW is the number of 32bit instructions per BRAM row
-self.INSN_PER_ROW = self.ROW_SIZE_BITS // 32
-
-# Bit fields counts in the address
-#
-# INSN_BITS is the number of bits to select an instruction in a row
-self.INSN_BITS = log2_int(self.INSN_PER_ROW)
-# ROW_BITS is the number of bits to select a row
-self.ROW_BITS = log2_int(self.BRAM_ROWS)
-# ROW_LINE_BITS is the number of bits to select a row within a line
-self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
-# LINE_OFF_BITS is the number of bits for the offset in a cache line
-self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
-# ROW_OFF_BITS is the number of bits for the offset in a row
-self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
-# INDEX_BITS is the number of bits to select a cache line
-self.INDEX_BITS = log2_int(self.NUM_LINE)
-# SET_SIZE_BITS is the log base 2 of the set size
-self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
-# TAG_BITS is the number of bits of the tag part of the address
-self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
-# TAG_WIDTH is the width in bits of each way of the tag RAM
-self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
-
-# WAY_BITS is the number of bits to select a way
-self.WAY_BITS = log2_int(self.NUM_WAYS)
-self.TAG_RAM_WIDTH = self.TAG_BITS * self.NUM_WAYS
-
-# L1 ITLB
-self.TL_BITS = log2_int(self.TLB_SIZE)
-self.TLB_EA_TAG_BITS = 64 - (self.TLB_LG_PGSZ + self.TL_BITS)
-self.TLB_PTE_BITS = 64
-
-print("self.BRAM_ROWS =", self.BRAM_ROWS)
-print("self.INDEX_BITS =", self.INDEX_BITS)
-print("self.INSN_BITS =", self.INSN_BITS)
-print("self.INSN_PER_ROW =", self.INSN_PER_ROW)
-print("self.LINE_SIZE =", self.LINE_SIZE)
-print("self.LINE_OFF_BITS =", self.LINE_OFF_BITS)
-print("LOG_LENGTH =", LOG_LENGTH)
-print("self.NUM_LINE =", self.NUM_LINE)
-print("self.NUM_WAYS =", self.NUM_WAYS)
-print("self.REAL_ADDR_BITS =", self.REAL_ADDR_BITS)
-print("self.ROW_BITS =", self.ROW_BITS)
-print("self.ROW_OFF_BITS =", self.ROW_OFF_BITS)
-print("self.ROW_LINE_BITS =", self.ROW_LINE_BITS)
-print("self.ROW_PER_LINE =", self.ROW_PER_LINE)
-print("self.ROW_SIZE =", self.ROW_SIZE)
-print("self.ROW_SIZE_BITS =", self.ROW_SIZE_BITS)
-print("self.SET_SIZE_BITS =", self.SET_SIZE_BITS)
-print("SIM =", SIM)
-print("self.TAG_BITS =", self.TAG_BITS)
-print("self.TAG_RAM_WIDTH =", self.TAG_RAM_WIDTH)
-print("self.TAG_BITS =", self.TAG_BITS)
-print("self.TL_BITS =", self.TL_BITS)
-print("self.TLB_EA_TAG_BITS =", self.TLB_EA_TAG_BITS)
-print("self.TLB_LG_PGSZ =", self.TLB_LG_PGSZ)
-print("self.TLB_PTE_BITS =", self.TLB_PTE_BITS)
-print("self.TLB_SIZE =", self.TLB_SIZE)
-print("self.WAY_BITS =", self.WAY_BITS)
-
-assert self.LINE_SIZE % self.ROW_SIZE == 0
-assert ispow2(self.LINE_SIZE), "self.LINE_SIZE not power of 2"
-assert ispow2(self.NUM_LINE), "self.NUM_LINE not power of 2"
-assert ispow2(self.ROW_PER_LINE), "self.ROW_PER_LINE not power of 2"
-assert ispow2(self.INSN_PER_ROW), "self.INSN_PER_ROW not power of 2"
-assert (self.ROW_BITS == (self.INDEX_BITS + self.ROW_LINE_BITS)), \
-    "geometry bits don't add up"
-assert (self.LINE_OFF_BITS == (self.ROW_OFF_BITS + self.ROW_LINE_BITS)), \
-    "geometry bits don't add up"
-assert (self.REAL_ADDR_BITS == (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS)), \
-    "geometry bits don't add up"
-assert (self.REAL_ADDR_BITS == (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS)), \
-    "geometry bits don't add up"
-
-# Example of layout for 32 lines of 64 bytes:
-#
-# ..  tag    |index| line  |
-# ..         |   row |     |
-# ..         |     |   | |00| zero               (2)
-# ..         |     |   |-|  | self.INSN_BITS     (1)
-# ..         |     |---|    | self.ROW_LINE_BITS (3)
-# ..         |     |--- - --| self.LINE_OFF_BITS (6)
-# ..         |         |- --| self.ROW_OFF_BITS  (3)
-# ..         |----- ---|    | self.ROW_BITS      (8)
-# ..         |-----|        | self.INDEX_BITS    (5)
-# ..  --------|             | self.TAG_BITS      (53)
-
-# The cache data BRAM organized as described above for each way
-#subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
-#
-def RowPerLineValidArray():
-    return Array(Signal(name="rows_valid_%d" %x) \
-                 for x in range(self.ROW_PER_LINE))
-
-
-# TODO to be passed to nigmen as ram attributes
-# attribute ram_style : string;
-# attribute ram_style of cache_tags : signal is "distributed";
-
-def TLBRecord(name):
-    tlb_layout = [ ('tag', self.TLB_EA_TAG_BITS),
-                   ('pte', self.TLB_PTE_BITS)
-                 ]
-    return Record(tlb_layout, name=name)
-
-def TLBArray():
-    return Array(TLBRecord("tlb%d" % x) for x in range(self.TLB_SIZE))
-
-# PLRU output interface
-def PLRUOut():
-    return Array(Signal(self.WAY_BITS, name="plru_out_%d" %x) \
-                 for x in range(self.NUM_LINE))
-
-# Return the cache line index (tag index) for an address
-def get_index(addr):
-    return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
-
-# Return the cache row index (data memory) for an address
-def get_row(addr):
-    return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
-
-# Return the index of a row within a line
-def get_row_of_line(row):
-    return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
-
-# Returns whether this is the last row of a line
-def is_last_row_addr(addr, last):
-    return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
-
-# Returns whether this is the last row of a line
-def is_last_row(row, last):
-    return get_row_of_line(row) == last
-
-# Return the next row in the current cache line. We use a dedicated
-# function in order to limit the size of the generated adder to be
-# only the bits within a cache line (3 bits with default settings)
-def next_row(row):
-    row_v = row[0:self.ROW_LINE_BITS] + 1
-    return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
-
-# Read the instruction word for the given address
-# in the current cache row
-def read_insn_word(addr, data):
-    word = addr[2:self.INSN_BITS+2]
-    return data.word_select(word, 32)
-
-# Get the tag value from the address
-def get_tag(addr):
-    return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
-
-# Read a tag from a tag memory row
-def read_tag(way, tagset):
-    return tagset.word_select(way, self.TAG_BITS)
-
-# Write a tag to tag memory row
-def write_tag(way, tagset, tag):
-    return read_tag(way, tagset).eq(tag)
-
-# Simple hash for direct-mapped TLB index
-def hash_ea(addr):
-    hsh = (addr[self.TLB_LG_PGSZ:self.TLB_LG_PGSZ + self.TL_BITS] ^
-           addr[self.TLB_LG_PGSZ + self.TL_BITS:self.TLB_LG_PGSZ + 2 * self.TL_BITS ] ^
-           addr[self.TLB_LG_PGSZ + 2 * self.TL_BITS:self.TLB_LG_PGSZ + 3 * self.TL_BITS])
-    return hsh
+    def __init__(self, LINE_SIZE = 64,
+                 NUM_LINES = 16,     # Number of lines in a set
+                 NUM_WAYS = 1,       # Number of ways
+                 TLB_SIZE = 64,      # L1 ITLB number of entries
+                 TLB_LG_PGSZ = 12):  # L1 ITLB log_2(page_size)
+        self.LINE_SIZE = LINE_SIZE
+        self.NUM_LINES = NUM_LINES
+        self.NUM_WAYS = NUM_WAYS
+        self.TLB_SIZE = TLB_SIZE
+        self.TLB_LG_PGSZ = TLB_LG_PGSZ
+
+        # BRAM organisation: We never access more than wishbone_data_bits
+        # at a time so to save resources we make the array only that wide,
+        # and use consecutive indices for to make a cache "line"
+        #
+        # self.ROW_SIZE is the width in bytes of the BRAM
+        # (based on WB, so 64-bits)
+        self.ROW_SIZE = WB_DATA_BITS // 8
+        # Number of real address bits that we store
+        self.REAL_ADDR_BITS = 56
+
+        self.ROW_SIZE_BITS = self.ROW_SIZE * 8
+        # ROW_PER_LINE is the number of row (wishbone) transactions in a line
+        self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
+        # BRAM_ROWS is the number of rows in BRAM
+        # needed to represent the full icache
+        self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
+        # INSN_PER_ROW is the number of 32bit instructions per BRAM row
+        self.INSN_PER_ROW = self.ROW_SIZE_BITS // 32
+
+        # Bit fields counts in the address
+        #
+        # INSN_BITS is the number of bits to select an instruction in a row
+        self.INSN_BITS = log2_int(self.INSN_PER_ROW)
+        # ROW_BITS is the number of bits to select a row
+        self.ROW_BITS = log2_int(self.BRAM_ROWS)
+        # ROW_LINE_BITS is the number of bits to select a row within a line
+        self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
+        # LINE_OFF_BITS is the number of bits for the offset in a cache line
+        self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
+        # ROW_OFF_BITS is the number of bits for the offset in a row
+        self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
+        # INDEX_BITS is the number of bits to select a cache line
+        self.INDEX_BITS = log2_int(self.NUM_LINES)
+        # SET_SIZE_BITS is the log base 2 of the set size
+        self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
+        # TAG_BITS is the number of bits of the tag part of the address
+        self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
+        # TAG_WIDTH is the width in bits of each way of the tag RAM
+        self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
+
+        # WAY_BITS is the number of bits to select a way
+        self.WAY_BITS = log2_int(self.NUM_WAYS)
+        self.TAG_RAM_WIDTH = self.TAG_BITS * self.NUM_WAYS
+
+        # L1 ITLB
+        self.TL_BITS = log2_int(self.TLB_SIZE)
+        self.TLB_EA_TAG_BITS = 64 - (self.TLB_LG_PGSZ + self.TL_BITS)
+        self.TLB_PTE_BITS = 64
+
+        print("self.BRAM_ROWS =", self.BRAM_ROWS)
+        print("self.INDEX_BITS =", self.INDEX_BITS)
+        print("self.INSN_BITS =", self.INSN_BITS)
+        print("self.INSN_PER_ROW =", self.INSN_PER_ROW)
+        print("self.LINE_SIZE =", self.LINE_SIZE)
+        print("self.LINE_OFF_BITS =", self.LINE_OFF_BITS)
+        print("LOG_LENGTH =", LOG_LENGTH)
+        print("self.NUM_LINES =", self.NUM_LINES)
+        print("self.NUM_WAYS =", self.NUM_WAYS)
+        print("self.REAL_ADDR_BITS =", self.REAL_ADDR_BITS)
+        print("self.ROW_BITS =", self.ROW_BITS)
+        print("self.ROW_OFF_BITS =", self.ROW_OFF_BITS)
+        print("self.ROW_LINE_BITS =", self.ROW_LINE_BITS)
+        print("self.ROW_PER_LINE =", self.ROW_PER_LINE)
+        print("self.ROW_SIZE =", self.ROW_SIZE)
+        print("self.ROW_SIZE_BITS =", self.ROW_SIZE_BITS)
+        print("self.SET_SIZE_BITS =", self.SET_SIZE_BITS)
+        print("SIM =", SIM)
+        print("self.TAG_BITS =", self.TAG_BITS)
+        print("self.TAG_RAM_WIDTH =", self.TAG_RAM_WIDTH)
+        print("self.TAG_BITS =", self.TAG_BITS)
+        print("self.TL_BITS =", self.TL_BITS)
+        print("self.TLB_EA_TAG_BITS =", self.TLB_EA_TAG_BITS)
+        print("self.TLB_LG_PGSZ =", self.TLB_LG_PGSZ)
+        print("self.TLB_PTE_BITS =", self.TLB_PTE_BITS)
+        print("self.TLB_SIZE =", self.TLB_SIZE)
+        print("self.WAY_BITS =", self.WAY_BITS)
+
+        assert self.LINE_SIZE % self.ROW_SIZE == 0
+        assert ispow2(self.LINE_SIZE), "self.LINE_SIZE not power of 2"
+        assert ispow2(self.NUM_LINES), "self.NUM_LINES not power of 2"
+        assert ispow2(self.ROW_PER_LINE), "self.ROW_PER_LINE not power of 2"
+        assert ispow2(self.INSN_PER_ROW), "self.INSN_PER_ROW not power of 2"
+        assert (self.ROW_BITS == (self.INDEX_BITS + self.ROW_LINE_BITS)), \
+            "geometry bits don't add up"
+        assert (self.LINE_OFF_BITS ==
+                (self.ROW_OFF_BITS + self.ROW_LINE_BITS)), \
+            "geometry bits don't add up"
+        assert (self.REAL_ADDR_BITS ==
+                (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS)), \
+            "geometry bits don't add up"
+        assert (self.REAL_ADDR_BITS ==
+                (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS)), \
+            "geometry bits don't add up"
+
+        # Example of layout for 32 lines of 64 bytes:
+        #
+        # ..  tag    |index| line  |
+        # ..         |   row |     |
+        # ..         |     |   | |00| zero               (2)
+        # ..         |     |   |-|  | self.INSN_BITS     (1)
+        # ..         |     |---|    | self.ROW_LINE_BITS (3)
+        # ..         |     |--- - --| self.LINE_OFF_BITS (6)
+        # ..         |         |- --| self.ROW_OFF_BITS  (3)
+        # ..         |----- ---|    | self.ROW_BITS      (8)
+        # ..         |-----|        | self.INDEX_BITS    (5)
+        # ..  --------|             | self.TAG_BITS      (53)
+
+    # The cache data BRAM organized as described above for each way
+    #subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
+    #
+    def RowPerLineValidArray(self):
+        return Array(Signal(name="rows_valid_%d" %x) \
+                     for x in range(self.ROW_PER_LINE))
+
+
+    # TODO to be passed to nigmen as ram attributes
+    # attribute ram_style : string;
+    # attribute ram_style of cache_tags : signal is "distributed";
+
+    def TLBRecord(self, name):
+        tlb_layout = [ ('tag', self.TLB_EA_TAG_BITS),
+                       ('pte', self.TLB_PTE_BITS)
+                     ]
+        return Record(tlb_layout, name=name)
+
+    def TLBArray(self):
+        return Array(self.TLBRecord("tlb%d" % x) for x in range(self.TLB_SIZE))
+
+    # PLRU output interface
+    def PLRUOut(self):
+        return Array(Signal(self.WAY_BITS, name="plru_out_%d" %x) \
+                     for x in range(self.NUM_LINES))
+
+    # Return the cache line index (tag index) for an address
+    def get_index(self, addr):
+        return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
+
+    # Return the cache row index (data memory) for an address
+    def get_row(self, addr):
+        return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
+
+    # Return the index of a row within a line
+    def get_row_of_line(self, row):
+        return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
+
+    # Returns whether this is the last row of a line
+    def is_last_row_addr(self, addr, last):
+        return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
+
+    # Returns whether this is the last row of a line
+    def is_last_row(self, row, last):
+        return self.get_row_of_line(row) == last
+
+    # Return the next row in the current cache line. We use a dedicated
+    # function in order to limit the size of the generated adder to be
+    # only the bits within a cache line (3 bits with default settings)
+    def next_row(self, row):
+        row_v = row[0:self.ROW_LINE_BITS] + 1
+        return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
+
+    # Read the instruction word for the given address
+    # in the current cache row
+    def read_insn_word(self, addr, data):
+        word = addr[2:self.INSN_BITS+2]
+        return data.word_select(word, 32)
+
+    # Get the tag value from the address
+    def get_tag(self, addr):
+        return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
+
+    # Read a tag from a tag memory row
+    def read_tag(self, way, tagset):
+        return tagset.word_select(way, self.TAG_BITS)
+
+    # Write a tag to tag memory row
+    def write_tag(self, way, tagset, tag):
+        return self.read_tag(way, tagset).eq(tag)
+
+    # Simple hash for direct-mapped TLB index
+    def hash_ea(self, addr):
+        hsh = (addr[self.TLB_LG_PGSZ:self.TLB_LG_PGSZ + self.TL_BITS] ^
+               addr[self.TLB_LG_PGSZ + self.TL_BITS:
+                    self.TLB_LG_PGSZ + 2 * self.TL_BITS ] ^
+               addr[self.TLB_LG_PGSZ + 2 * self.TL_BITS:
+                    self.TLB_LG_PGSZ + 3 * self.TL_BITS])
+        return hsh


 # Cache reload state machine
@@ -274,10 +280,10 @@ class State(Enum):

 class RegInternal(RecordObject):
-    def __init__(self):
+    def __init__(self, cfg):
         super().__init__()
         # Cache hit state (Latches for 1 cycle BRAM access)
-        self.hit_way = Signal(self.WAY_BITS)
+        self.hit_way = Signal(cfg.WAY_BITS)
         self.hit_nia = Signal(64)
         self.hit_smark = Signal()
         self.hit_valid = Signal()
@@ -286,22 +292,23 @@ class RegInternal(RecordObject):
         self.state = Signal(State, reset=State.IDLE)
         self.wb = WBMasterOut("wb")
         self.req_adr = Signal(64)
-        self.store_way = Signal(self.WAY_BITS)
-        self.store_index = Signal(self.INDEX_BITS)
-        self.store_row = Signal(self.ROW_BITS)
-        self.store_tag = Signal(self.TAG_BITS)
+        self.store_way = Signal(cfg.WAY_BITS)
+        self.store_index = Signal(cfg.INDEX_BITS)
+        self.store_row = Signal(cfg.ROW_BITS)
+        self.store_tag = Signal(cfg.TAG_BITS)
         self.store_valid = Signal()
-        self.end_row_ix = Signal(self.ROW_LINE_BITS)
-        self.rows_valid = RowPerLineValidArray()
+        self.end_row_ix = Signal(cfg.ROW_LINE_BITS)
+        self.rows_valid = cfg.RowPerLineValidArray()

         # TLB miss state
         self.fetch_failed = Signal()


-class ICache(FetchUnitInterface, Elaboratable):
+class ICache(FetchUnitInterface, Elaboratable, ICacheConfig):
     """64 bit direct mapped icache.
     All instructions are 4B aligned."""

     def __init__(self, pspec):
         FetchUnitInterface.__init__(self, pspec)
+        ICacheConfig.__init__(self)
         self.i_in = Fetch1ToICacheType(name="i_in")
         self.i_out = ICacheToDecode1Type(name="i_out")
@@ -359,7 +366,8 @@ class ICache(FetchUnitInterface, Elaboratable):
             d_out = Signal(self.ROW_SIZE_BITS, name="d_out_%d" % i)
             wr_sel = Signal(self.ROW_SIZE, name="wr_sel_%d" % i)

-            way = CacheRam(self.ROW_BITS, self.ROW_SIZE_BITS, TRACE=True, ram_num=i)
+            way = CacheRam(self.ROW_BITS, self.ROW_SIZE_BITS,
+                           TRACE=True, ram_num=i)
             m.submodules["cacheram_%d" % i] = way

             comb += way.rd_en.eq(do_read)
@@ -391,10 +399,10 @@ class ICache(FetchUnitInterface, Elaboratable):
             return

-        m.submodules.plrus = plru = PLRUs(self.NUM_LINE, self.WAY_BITS)
+        m.submodules.plrus = plru = PLRUs(self.NUM_LINES, self.WAY_BITS)
         comb += plru.way.eq(r.hit_way)
         comb += plru.valid.eq(r.hit_valid)
-        comb += plru.index.eq(get_index(r.hit_nia))
+        comb += plru.index.eq(self.get_index(r.hit_nia))
         comb += plru.isel.eq(r.store_index) # select victim
         comb += plru_victim.eq(plru.o_index) # selected victim
@@ -409,10 +417,10 @@ class ICache(FetchUnitInterface, Elaboratable):

         # use an *asynchronous* Memory read port here (combinatorial)
         m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
-        tlb = TLBRecord("tlb_rdport")
+        tlb = self.TLBRecord("tlb_rdport")
         pte, ttag = tlb.pte, tlb.tag

-        comb += tlb_req_index.eq(hash_ea(i_in.nia))
+        comb += tlb_req_index.eq(self.hash_ea(i_in.nia))
         comb += rd_tlb.addr.eq(tlb_req_index)
         comb += tlb.eq(rd_tlb.data)
@@ -443,7 +451,7 @@ class ICache(FetchUnitInterface, Elaboratable):
             wr_index = Signal(self.TL_BITS)
             wr_unary = Signal(self.TLB_SIZE)
-            comb += wr_index.eq(hash_ea(m_in.addr))
+            comb += wr_index.eq(self.hash_ea(m_in.addr))
             comb += wr_unary.eq(1<
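For reference, a minimal usage sketch (not itself part of the patch) of the ICacheConfig class the patch introduces. The keyword arguments and derived fields come straight from the new __init__ above; the import path is assumed from the file location src/soc/experiment/icache.py, and WB_DATA_BITS is assumed to be 64 (a 64-bit wishbone data bus, so ROW_SIZE is 8 bytes), as the comments in the hunk state.

    from soc.experiment.icache import ICacheConfig

    # default geometry from the new __init__ signature: 64-byte lines,
    # 16 lines per set, 1 way, 64-entry ITLB with 4 KiB pages
    cfg = ICacheConfig(LINE_SIZE=64, NUM_LINES=16, NUM_WAYS=1,
                       TLB_SIZE=64, TLB_LG_PGSZ=12)

    # derived values, per the formulas in ICacheConfig.__init__
    assert cfg.ROW_PER_LINE == 8       # 64-byte line / 8-byte wishbone row
    assert cfg.BRAM_ROWS == 128        # 16 lines * 8 rows per line
    assert cfg.INSN_PER_ROW == 2       # 64-bit row / 32-bit instruction
    assert cfg.TLB_EA_TAG_BITS == 46   # 64 - (12 + log2(64))

Because ICache now inherits from ICacheConfig and calls ICacheConfig.__init__(self), every self.ROW_BITS / self.NUM_LINES reference inside the cache resolves to per-instance values like these rather than to module-level constants.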