X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=src%2Fsoc%2Fexperiment%2Ficache.py;h=8010bf2eab37718fd4b1c9533434dccad2d317bb;hb=11489454be1aef4cee4970e50ebf6133492363a6;hp=d2aafaab90b8056223ac28ab259f6f5da6904271;hpb=3a064a6a2f307b5dd338f8f9fb4e7cdd11c3ac57;p=soc.git diff --git a/src/soc/experiment/icache.py b/src/soc/experiment/icache.py index d2aafaab..8010bf2e 100644 --- a/src/soc/experiment/icache.py +++ b/src/soc/experiment/icache.py @@ -18,17 +18,27 @@ TODO (in no specific order): cause muxes to be inferred for "partial writes". * Check if making the read size of PLRU a ROM helps utilization +Links: + +* https://bugs.libre-soc.org/show_bug.cgi?id=485 +* https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html + (discussion about brams for ECP5) + """ -from enum import Enum, unique -from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const) -from nmigen.cli import main -from nmigen.cli import rtlil + +from enum import (Enum, unique) +from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl, + Record) +from nmigen.cli import main, rtlil from nmutil.iocontrol import RecordObject -from nmutil.byterev import byte_reverse -from nmutil.mask import Mask from nmigen.utils import log2_int +from nmigen.lib.coding import Decoder from nmutil.util import Display +#from nmutil.plru import PLRU +from soc.experiment.plru import PLRU, PLRUs +from soc.experiment.cache_ram import CacheRam + from soc.experiment.mem_types import (Fetch1ToICacheType, ICacheToDecode1Type, MMUToICacheType) @@ -36,22 +46,21 @@ from soc.experiment.mem_types import (Fetch1ToICacheType, from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS, WBAddrType, WBDataType, WBSelType, WBMasterOut, WBSlaveOut, - WBMasterOutVector, WBSlaveOutVector, - WBIOMasterOut, WBIOSlaveOut) + ) + +from nmigen_soc.wishbone.bus import Interface +from soc.minerva.units.fetch import FetchUnitInterface -from soc.experiment.cache_ram import CacheRam -from soc.experiment.plru import PLRU # for test -from nmigen_soc.wishbone.sram import SRAM +from soc.bus.sram import SRAM from nmigen import Memory -from nmigen.cli import rtlil -if True: - from nmigen.back.pysim import Simulator, Delay, Settle -else: - from nmigen.sim.cxxsim import Simulator, Delay, Settle from nmutil.util import wrap +from nmigen.cli import main, rtlil +# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell +# Also, check out the cxxsim nmigen branch, and latest yosys from git +from nmutil.sim_tmp_alternative import Simulator, Settle SIM = 0 @@ -76,392 +85,192 @@ REAL_ADDR_BITS = 56 LOG_LENGTH = 0 ROW_SIZE_BITS = ROW_SIZE * 8 -# ROW_PER_LINE is the number of row -# (wishbone) transactions in a line +# ROW_PER_LINE is the number of row (wishbone) transactions in a line ROW_PER_LINE = LINE_SIZE // ROW_SIZE -# BRAM_ROWS is the number of rows in -# BRAM needed to represent the full icache +# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache BRAM_ROWS = NUM_LINES * ROW_PER_LINE -# INSN_PER_ROW is the number of 32bit -# instructions per BRAM row +# INSN_PER_ROW is the number of 32bit instructions per BRAM row INSN_PER_ROW = ROW_SIZE_BITS // 32 # Bit fields counts in the address # -# INSN_BITS is the number of bits to -# select an instruction in a row +# INSN_BITS is the number of bits to select an instruction in a row INSN_BITS = log2_int(INSN_PER_ROW) -# ROW_BITS is the number of bits to -# select a row +# ROW_BITS is the number of bits to select a row ROW_BITS = log2_int(BRAM_ROWS) -# 
-# ROW_LINEBITS is the number of bits to
-# select a row within a line
-ROW_LINE_BITS = log2_int(ROW_PER_LINE)
-# LINE_OFF_BITS is the number of bits for
-# the offset in a cache line
+# ROW_LINE_BITS is the number of bits to select a row within a line
+ROW_LINE_BITS = log2_int(ROW_PER_LINE)
+# LINE_OFF_BITS is the number of bits for the offset in a cache line
 LINE_OFF_BITS = log2_int(LINE_SIZE)
-# ROW_OFF_BITS is the number of bits for
-# the offset in a row
+# ROW_OFF_BITS is the number of bits for the offset in a row
 ROW_OFF_BITS = log2_int(ROW_SIZE)
-# INDEX_BITS is the number of bits to
-# select a cache line
+# INDEX_BITS is the number of bits to select a cache line
 INDEX_BITS = log2_int(NUM_LINES)
-# SET_SIZE_BITS is the log base 2 of
-# the set size
+# SET_SIZE_BITS is the log base 2 of the set size
 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
-# TAG_BITS is the number of bits of
-# the tag part of the address
+# TAG_BITS is the number of bits of the tag part of the address
 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
-# WAY_BITS is the number of bits to
-# select a way
+# TAG_WIDTH is the width in bits of each way of the tag RAM, rounded
+# up to the next byte multiple so the per-way slices used by
+# read_tag()/write_tag() stay byte-aligned
+TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
+
+# WAY_BITS is the number of bits to select a way
 WAY_BITS = log2_int(NUM_WAYS)
-TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
+TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 
-# -- L1 ITLB.
-# constant TLB_BITS : natural := log2(TLB_SIZE);
-# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
-# constant TLB_PTE_BITS : natural := 64;
+# L1 ITLB
 TLB_BITS = log2_int(TLB_SIZE)
 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
 TLB_PTE_BITS = 64
 
-# architecture rtl of icache is
-#constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
-#-- ROW_PER_LINE is the number of row (wishbone
-#-- transactions) in a line
-#constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
-#-- BRAM_ROWS is the number of rows in BRAM
-#-- needed to represent the full
-#-- icache
-#constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
-#-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
-#constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
-#-- Bit fields counts in the address
+print("BRAM_ROWS =", BRAM_ROWS)
+print("INDEX_BITS =", INDEX_BITS)
+print("INSN_BITS =", INSN_BITS)
+print("INSN_PER_ROW =", INSN_PER_ROW)
+print("LINE_SIZE =", LINE_SIZE)
+print("LINE_OFF_BITS =", LINE_OFF_BITS)
+print("LOG_LENGTH =", LOG_LENGTH)
+print("NUM_LINES =", NUM_LINES)
+print("NUM_WAYS =", NUM_WAYS)
+print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
+print("ROW_BITS =", ROW_BITS)
+print("ROW_OFF_BITS =", ROW_OFF_BITS)
+print("ROW_LINE_BITS =", ROW_LINE_BITS)
+print("ROW_PER_LINE =", ROW_PER_LINE)
+print("ROW_SIZE =", ROW_SIZE)
+print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
+print("SET_SIZE_BITS =", SET_SIZE_BITS)
+print("SIM =", SIM)
+print("TAG_BITS =", TAG_BITS)
+print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
+print("TAG_WIDTH =", TAG_WIDTH)
+print("TLB_BITS =", TLB_BITS)
+print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
+print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
+print("TLB_PTE_BITS =", TLB_PTE_BITS)
+print("TLB_SIZE =", TLB_SIZE)
+print("WAY_BITS =", WAY_BITS)
+
+# from microwatt/utils.vhdl
+def ispow2(n):
+    return n != 0 and (n & (n - 1)) == 0
+
+assert LINE_SIZE % ROW_SIZE == 0
+assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
+assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
+assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
+assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
+assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
+    "geometry bits don't add up"
+assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
+    "geometry bits don't add up"
+assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
+    "geometry bits don't add up"
+assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
+    "geometry bits don't add up"
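+
+# A worked example of the geometry maths above, matching the layout
+# sketched below (an illustration only: 32 lines of 64 bytes, the
+# 8-byte wishbone row size, and a 56-bit real address):
+#
+#     ROW_PER_LINE  = 64 // 8  = 8   ->  ROW_LINE_BITS = 3
+#     LINE_OFF_BITS = log2(64) = 6
+#     INDEX_BITS    = log2(32) = 5
+#     SET_SIZE_BITS = 6 + 5    = 11
+#     TAG_BITS      = 56 - 11  = 45
+#
+# so a real address splits as | tag (45) | index (5) | line offset (6) |.
+# (The "TAG_BITS (53)" in the diagram counts from the top of a full
+# 64-bit effective address; only the 45 real-address bits are stored.)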
+# Example of layout for 32 lines of 64 bytes:
 #
-#-- INSN_BITS is the number of bits to select
-#-- an instruction in a row
-#constant INSN_BITS : natural := log2(INSN_PER_ROW);
-#-- ROW_BITS is the number of bits to select a row
-#constant ROW_BITS : natural := log2(BRAM_ROWS);
-#-- ROW_LINEBITS is the number of bits to
-#-- select a row within a line
-#constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
-#-- LINE_OFF_BITS is the number of bits for the offset
-#-- in a cache line
-#constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
-#-- ROW_OFF_BITS is the number of bits for the offset in a row
-#constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
-#-- INDEX_BITS is the number of bits to select a cache line
-#constant INDEX_BITS : natural := log2(NUM_LINES);
-#-- SET_SIZE_BITS is the log base 2 of the set size
-#constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
-#-- TAG_BITS is the number of bits of the tag part of the address
-#constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
-#-- WAY_BITS is the number of bits to select a way
-#constant WAY_BITS : natural := log2(NUM_WAYS);
-
-#-- Example of layout for 32 lines of 64 bytes:
-#--
-#-- ..  tag    |index| line  |
-#-- ..         |   row   |   |
-#-- ..         |       | |00| zero          (2)
-#-- ..         |       |-|   | INSN_BITS     (1)
-#-- ..         |     |---|   | ROW_LINEBITS  (3)
-#-- ..         |     |--- - --| LINE_OFF_BITS (6)
-#-- ..         |         |- --| ROW_OFF_BITS  (3)
-#-- ..         |----- ---|   | ROW_BITS      (8)
-#-- ..         |-----|       | INDEX_BITS    (5)
-#-- .. --------|             | TAG_BITS      (53)
- # Example of layout for 32 lines of 64 bytes:
- #
- # ..  tag    |index| line  |
- # ..         |   row   |   |
- # ..         |       | |00| zero          (2)
- # ..         |       |-|   | INSN_BITS     (1)
- # ..         |     |---|   | ROW_LINEBITS  (3)
- # ..         |     |--- - --| LINE_OFF_BITS (6)
- # ..         |         |- --| ROW_OFF_BITS  (3)
- # ..         |----- ---|   | ROW_BITS      (8)
- # ..         |-----|       | INDEX_BITS    (5)
- # .. --------|             | TAG_BITS      (53)
-
-#subtype row_t is integer range 0 to BRAM_ROWS-1;
-#subtype index_t is integer range 0 to NUM_LINES-1;
-#subtype way_t is integer range 0 to NUM_WAYS-1;
-#subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
-#
-#-- The cache data BRAM organized as described above for each way
+# ..  tag    |index| line  |
+# ..         |   row   |   |
+# ..         |       | |00| zero          (2)
+# ..         |       |-|   | INSN_BITS     (1)
+# ..         |     |---|   | ROW_LINE_BITS (3)
+# ..         |     |--- - --| LINE_OFF_BITS (6)
+# ..         |         |- --| ROW_OFF_BITS  (3)
+# ..         |----- ---|   | ROW_BITS      (8)
+# ..         |-----|       | INDEX_BITS    (5)
+# .. --------|             | TAG_BITS      (53)
+
+# The cache data BRAM organized as described above for each way
 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
 #
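+# An illustration of how read_tag()/write_tag() below slice one row of
+# the tag RAM (a sketch assuming NUM_WAYS=2; each way's tag is padded
+# out to TAG_WIDTH bits, the next byte multiple, so the LUTRAM packing
+# stays byte-aligned):
+#
+#     |<------ TAG_WIDTH ------>|<------ TAG_WIDTH ------>|
+#     |  pad  |    way 1 tag    |  pad  |    way 0 tag    |
+#
+#     read_tag(way, tagset) == tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
+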
-#-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-#-- not handle a clean (commented) definition of the cache tags as a 3d
-#-- memory. For now, work around it by putting all the tags
-#subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
-# type cache_tags_set_t is array(way_t) of cache_tag_t;
-# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
-#constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
-#subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
-#type cache_tags_array_t is array(index_t) of cache_tags_set_t;
+# The cache tags LUTRAM has a row per set. Vivado is a pain and will
+# not handle a clean (commented) definition of the cache tags as a 3d
+# memory. For now, work around it by putting all the tags
 def CacheTagArray():
-    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))
-
-#-- The cache valid bits
-#subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
-#type cache_valids_t is array(index_t) of cache_way_valids_t;
-#type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
-def CacheValidBitsArray():
-    return Array(Signal() for x in range(ROW_PER_LINE))
+    tag_layout = [('valid', NUM_WAYS),
+                  ('tag', TAG_RAM_WIDTH),
+                 ]
+    return Array(Record(tag_layout, name="tag%d" % x)
+                 for x in range(NUM_LINES))
 
 def RowPerLineValidArray():
-    return Array(Signal() for x in range(ROW_PER_LINE))
+    return Array(Signal(name="rows_valid_%d" % x)
+                 for x in range(ROW_PER_LINE))
 
-#attribute ram_style : string;
-#attribute ram_style of cache_tags : signal is "distributed";
-    # TODO to be passed to nigmen as ram attributes
-    # attribute ram_style : string;
-    # attribute ram_style of cache_tags : signal is "distributed";
+# TODO to be passed to nmigen as ram attributes
+# attribute ram_style : string;
+# attribute ram_style of cache_tags : signal is "distributed";
+
+def TLBArray():
+    tlb_layout = [('valid', 1),
+                  ('tag', TLB_EA_TAG_BITS),
+                  ('pte', TLB_PTE_BITS)
+                 ]
+    return Array(Record(tlb_layout, name="tlb%d" % x)
+                 for x in range(TLB_SIZE))
 
-#subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
-#type tlb_valids_t is array(tlb_index_t) of std_ulogic;
-#subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
-#type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
-#subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
-#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
-def TLBValidBitsArray():
-    return Array(Signal() for x in range(TLB_SIZE))
-
-def TLBTagArray():
-    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))
-
-def TLBPTEArray():
-    return Array(Signal(TLB_PTE_BITS) for x in range(TLB_SIZE))
-
-
-#-- Cache RAM interface
-#type cache_ram_out_t is array(way_t) of cache_row_t;
 # Cache RAM interface
 def CacheRamOut():
-    return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))
+    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x)
+                 for x in range(NUM_WAYS))
 
-#-- PLRU output interface
-#type plru_out_t is array(index_t) of
-#    std_ulogic_vector(WAY_BITS-1 downto 0);
 # PLRU output interface
 def PLRUOut():
-    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
+    return Array(Signal(WAY_BITS, name="plru_out_%d" % x)
+                 for x in range(NUM_LINES))
 
-# -- Return the cache line index (tag index) for an address
-# function get_index(addr: std_ulogic_vector(63 downto 0))
-#     return index_t is
-# begin
-#     return to_integer(unsigned(
-#         addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
-#     ));
-# end;
 # Return the cache line index (tag index) for an address
 def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 
-# -- Return the cache row index (data memory) for an address
-# function get_row(addr: std_ulogic_vector(63 downto 0))
-#     return row_t is
-# begin
-#     return to_integer(unsigned(
-#         addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
-#     ));
-# end;
 # Return the cache row index (data memory) for an address
 def get_row(addr):
     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
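+
+# e.g. for a (hypothetical) real address 0x1F48 under the 32-lines-of-64-
+# bytes example geometry: get_index() takes addr[6:11] = 29 (the line) and
+# get_row() takes addr[3:11] = 233 = 29*8 + 1 (line 29, second row in it).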
 
-# -- Return the index of a row within a line
-# function get_row_of_line(row: row_t) return row_in_line_t is
-#  variable row_v : unsigned(ROW_BITS-1 downto 0);
-# begin
-#  row_v := to_unsigned(row, ROW_BITS);
-#  return row_v(ROW_LINEBITS-1 downto 0);
-# end;
 # Return the index of a row within a line
 def get_row_of_line(row):
-    row[:ROW_LINE_BITS]
-
-# -- Returns whether this is the last row of a line
-# function is_last_row_addr(addr: wishbone_addr_type;
-#  last: row_in_line_t
-# )
-#  return boolean is
-# begin
-#  return unsigned(
-#   addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
-#  ) = last;
-# end;
+    return row[:ROW_BITS][:ROW_LINE_BITS]
+
 # Returns whether this is the last row of a line
 def is_last_row_addr(addr, last):
     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 
-# -- Returns whether this is the last row of a line
-# function is_last_row(row: row_t;
-#  last: row_in_line_t) return boolean is
-# begin
-#  return get_row_of_line(row) = last;
-# end;
 # Returns whether this is the last row of a line
 def is_last_row(row, last):
     return get_row_of_line(row) == last
 
-# -- Return the address of the next row in the current cache line
-# function next_row_addr(addr: wishbone_addr_type)
-#  return std_ulogic_vector is
-#  variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
-#  variable result : wishbone_addr_type;
-# begin
-#  -- Is there no simpler way in VHDL to generate that 3 bits adder ?
-#  row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
-#  row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
-#  result := addr;
-#  result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
-#  return result;
-# end;
-# Return the address of the next row in the current cache line
-def next_row_addr(addr):
-    # TODO no idea what's going on here, looks like double assignments
-    # overriding earlier assignments ??? Help please!
-    pass
-
-# -- Return the next row in the current cache line. We use a dedicated
-# -- function in order to limit the size of the generated adder to be
-# -- only the bits within a cache line (3 bits with default settings)
-# function next_row(row: row_t) return row_t is
-#  variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
-#  variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
-#  variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
-# begin
-#  row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
-#  row_idx := row_v(ROW_LINEBITS-1 downto 0);
-#  row_v(ROW_LINEBITS-1 downto 0) :=
-#      std_ulogic_vector(unsigned(row_idx) + 1);
-#  return to_integer(unsigned(row_v));
-# end;
 # Return the next row in the current cache line. We use a dedicated
 # function in order to limit the size of the generated adder to be
 # only the bits within a cache line (3 bits with default settings)
 def next_row(row):
-    # TODO no idea what's going on here, looks like double assignments
-    # overriding earlier assignments ??? Help please!
-    pass
-
-# -- Read the instruction word for the given address in the
-# -- current cache row
-# function read_insn_word(addr: std_ulogic_vector(63 downto 0);
-#  data: cache_row_t) return std_ulogic_vector is
-#  variable word: integer range 0 to INSN_PER_ROW-1;
-# begin
-#  word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
-#  return data(31+word*32 downto word*32);
-# end;
+    row_v = row[0:ROW_LINE_BITS] + 1
+    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
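+
+# e.g. with the default ROW_LINE_BITS=3 (a sketch only): next_row(0b10111)
+# increments just the low three bits, giving 0b10000 -- the row index
+# wraps within its own cache line and the adder never exceeds three bits.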
 
 # Read the instruction word for the given address
 # in the current cache row
 def read_insn_word(addr, data):
-    word = addr[2:INSN_BITS+3]
-    return data[word * 32:32 + word * 32]
+    word = addr[2:INSN_BITS+2]
+    return data.word_select(word, 32)
 
-# -- Get the tag value from the address
-# function get_tag(
-#  addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
-# )
-#  return cache_tag_t is
-# begin
-#  return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
-# end;
 # Get the tag value from the address
 def get_tag(addr):
     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 
-# -- Read a tag from a tag memory row
-# function read_tag(way: way_t; tagset: cache_tags_set_t)
-#  return cache_tag_t is
-# begin
-#  return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
-# end;
 # Read a tag from a tag memory row
 def read_tag(way, tagset):
-    return tagset[way * TAG_BITS:(way + 1) * TAG_BITS]
+    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 
-# -- Write a tag to tag memory row
-# procedure write_tag(way: in way_t;
-#  tagset: inout cache_tags_set_t; tag: cache_tag_t) is
-# begin
-#  tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
-# end;
 # Write a tag to tag memory row
 def write_tag(way, tagset, tag):
-    tagset[way * TAG_BITS:(way + 1) * TAG_BITS] = tag
+    return read_tag(way, tagset).eq(tag)
 
-# -- Simple hash for direct-mapped TLB index
-# function hash_ea(addr: std_ulogic_vector(63 downto 0))
-#  return tlb_index_t is
-#  variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
-# begin
-#  hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
-#      xor addr(
-#       TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
-#       TLB_LG_PGSZ + TLB_BITS
-#      )
-#      xor addr(
-#       TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
-#       TLB_LG_PGSZ + 2 * TLB_BITS
-#      );
-#  return to_integer(unsigned(hash));
-# end;
 # Simple hash for direct-mapped TLB index
 def hash_ea(addr):
-    hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
-        TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
-    ] ^ addr[
-        TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
-    ]
+    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
+           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
+           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
     return hsh
 
-# begin
-#
-#     assert LINE_SIZE mod ROW_SIZE = 0;
-#     assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
-#  severity FAILURE;
-#     assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
-#  severity FAILURE;
-#     assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
-#  severity FAILURE;
-#     assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
-#  severity FAILURE;
-#     assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
-#  report "geometry bits don't add up" severity FAILURE;
-#     assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
-#  report "geometry bits don't add up" severity FAILURE;
-#     assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
-#  report "geometry bits don't add up" severity FAILURE;
-#     assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
-#  report "geometry bits don't add up"
severity FAILURE; -# -# sim_debug: if SIM generate -# debug: process -# begin -# report "ROW_SIZE = " & natural'image(ROW_SIZE); -# report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE); -# report "BRAM_ROWS = " & natural'image(BRAM_ROWS); -# report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW); -# report "INSN_BITS = " & natural'image(INSN_BITS); -# report "ROW_BITS = " & natural'image(ROW_BITS); -# report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS); -# report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS); -# report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS); -# report "INDEX_BITS = " & natural'image(INDEX_BITS); -# report "TAG_BITS = " & natural'image(TAG_BITS); -# report "WAY_BITS = " & natural'image(WAY_BITS); -# wait; -# end process; -# end generate; # Cache reload state machine @unique @@ -470,42 +279,23 @@ class State(Enum): CLR_TAG = 1 WAIT_ACK = 2 -# type reg_internal_t is record -# -- Cache hit state (Latches for 1 cycle BRAM access) -# hit_way : way_t; -# hit_nia : std_ulogic_vector(63 downto 0); -# hit_smark : std_ulogic; -# hit_valid : std_ulogic; -# -# -- Cache miss state (reload state machine) -# state : state_t; -# wb : wishbone_master_out; -# store_way : way_t; -# store_index : index_t; -# store_row : row_t; -# store_tag : cache_tag_t; -# store_valid : std_ulogic; -# end_row_ix : row_in_line_t; -# rows_valid : row_per_line_valid_t; -# -# -- TLB miss state -# fetch_failed : std_ulogic; -# end record; + class RegInternal(RecordObject): def __init__(self): super().__init__() # Cache hit state (Latches for 1 cycle BRAM access) - self.hit_way = Signal(NUM_WAYS) + self.hit_way = Signal(WAY_BITS) self.hit_nia = Signal(64) self.hit_smark = Signal() self.hit_valid = Signal() # Cache miss state (reload state machine) - self.state = Signal(State) - self.wb = WBMasterOut() - self.store_way = Signal(NUM_WAYS) - self.store_index = Signal(NUM_LINES) - self.store_row = Signal(BRAM_ROWS) + self.state = Signal(State, reset=State.IDLE) + self.wb = WBMasterOut("wb") + self.req_adr = Signal(64) + self.store_way = Signal(WAY_BITS) + self.store_index = Signal(INDEX_BITS) + self.store_row = Signal(ROW_BITS) self.store_tag = Signal(TAG_BITS) self.store_valid = Signal() self.end_row_ix = Signal(ROW_LINE_BITS) @@ -514,251 +304,133 @@ class RegInternal(RecordObject): # TLB miss state self.fetch_failed = Signal() -# -- 64 bit direct mapped icache. All instructions are 4B aligned. 
-# -# entity icache is -# generic ( -# SIM : boolean := false; -# -- Line size in bytes -# LINE_SIZE : positive := 64; -# -- BRAM organisation: We never access more -# -- than wishbone_data_bits -# -- at a time so to save resources we make the -# -- array only that wide, -# -- and use consecutive indices for to make a cache "line" -# -- -# -- ROW_SIZE is the width in bytes of the BRAM (based on WB, -# -- so 64-bits) -# ROW_SIZE : positive := wishbone_data_bits / 8; -# -- Number of lines in a set -# NUM_LINES : positive := 32; -# -- Number of ways -# NUM_WAYS : positive := 4; -# -- L1 ITLB number of entries (direct mapped) -# TLB_SIZE : positive := 64; -# -- L1 ITLB log_2(page_size) -# TLB_LG_PGSZ : positive := 12; -# -- Number of real address bits that we store -# REAL_ADDR_BITS : positive := 56; -# -- Non-zero to enable log data collection -# LOG_LENGTH : natural := 0 -# ); -# port ( -# clk : in std_ulogic; -# rst : in std_ulogic; -# -# i_in : in Fetch1ToIcacheType; -# i_out : out IcacheToDecode1Type; -# -# m_in : in MmuToIcacheType; -# -# stall_in : in std_ulogic; -# stall_out : out std_ulogic; -# flush_in : in std_ulogic; -# inval_in : in std_ulogic; -# -# wishbone_out : out wishbone_master_out; -# wishbone_in : in wishbone_slave_out; -# -# log_out : out std_ulogic_vector(53 downto 0) -# ); -# end entity icache; -# 64 bit direct mapped icache. All instructions are 4B aligned. -class ICache(Elaboratable): + +class ICache(FetchUnitInterface, Elaboratable): """64 bit direct mapped icache. All instructions are 4B aligned.""" - def __init__(self): - self.i_in = Fetch1ToICacheType() - self.i_out = ICacheToDecode1Type() + def __init__(self, pspec): + FetchUnitInterface.__init__(self, pspec) + self.i_in = Fetch1ToICacheType(name="i_in") + self.i_out = ICacheToDecode1Type(name="i_out") - self.m_in = MMUToICacheType() + self.m_in = MMUToICacheType(name="m_in") self.stall_in = Signal() self.stall_out = Signal() self.flush_in = Signal() self.inval_in = Signal() - self.wb_out = WBMasterOut() - self.wb_in = WBSlaveOut() + # standard naming (wired to non-standard for compatibility) + self.bus = Interface(addr_width=32, + data_width=64, + granularity=8, + features={'stall'}, + alignment=0, + name="icache_wb") self.log_out = Signal(54) + # use FetchUnitInterface, helps keep some unit tests running + self.use_fetch_iface = False + + def use_fetch_interface(self): + self.use_fetch_iface = True + + # Generate a cache RAM for each way + def rams(self, m, r, cache_out_row, use_previous, + replace_way, req_row): -# -- Generate a cache RAM for each way -# rams: for i in 0 to NUM_WAYS-1 generate -# signal do_read : std_ulogic; -# signal do_write : std_ulogic; -# signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0); -# signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); -# signal dout : cache_row_t; -# signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0); -# begin -# way: entity work.cache_ram -# generic map ( -# ROW_BITS => ROW_BITS, -# WIDTH => ROW_SIZE_BITS -# ) -# port map ( -# clk => clk, -# rd_en => do_read, -# rd_addr => rd_addr, -# rd_data => dout, -# wr_sel => wr_sel, -# wr_addr => wr_addr, -# wr_data => wishbone_in.dat -# ); -# process(all) -# begin -# do_read <= not (stall_in or use_previous); -# do_write <= '0'; -# if wishbone_in.ack = '1' and replace_way = i then -# do_write <= '1'; -# end if; -# cache_out(i) <= dout; -# rd_addr <= -# std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); -# wr_addr <= -# std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS)); -# for i in 0 to ROW_SIZE-1 loop 
-# wr_sel(i) <= do_write; -# end loop; -# end process; -# end generate; - def rams(self, m): comb = m.d.comb + sync = m.d.sync + + bus, stall_in = self.bus, self.stall_in + # read condition (for every cache ram) do_read = Signal() - do_write = Signal() + comb += do_read.eq(~(stall_in | use_previous)) + rd_addr = Signal(ROW_BITS) wr_addr = Signal(ROW_BITS) - _d_out = Signal(ROW_SIZE_BITS) - wr_sel = Signal(ROW_SIZE) + comb += rd_addr.eq(req_row) + comb += wr_addr.eq(r.store_row) + + # binary-to-unary converters: replace-way enabled by bus.ack, + # hit-way left permanently enabled + m.submodules.replace_way_e = re = Decoder(NUM_WAYS) + m.submodules.hit_way_e = he = Decoder(NUM_WAYS) + comb += re.i.eq(replace_way) + comb += re.n.eq(~bus.ack) + comb += he.i.eq(r.hit_way) for i in range(NUM_WAYS): - way = CacheRam(ROW_BITS, ROW_SIZE_BITS) + do_write = Signal(name="do_wr_%d" % i) + d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i) + wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i) + + way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i) + m.submodules["cacheram_%d" % i] = way + comb += way.rd_en.eq(do_read) comb += way.rd_addr.eq(rd_addr) - comb += way.rd_data.eq(_d_out) + comb += d_out.eq(way.rd_data_o) comb += way.wr_sel.eq(wr_sel) - comb += way.wr_add.eq(wr_addr) - comb += way.wr_data.eq(wb_in.dat) - - comb += do_read.eq(~(stall_in | use_previous)) - comb += do_write.eq(0) - - with m.If(wb_in.ack & (replace_way == i)): - do_write.eq(1) - - comb += cache_out[i].eq(_d_out) - comb += rd_addr.eq(Signal(req_row)) - comb += wr_addr.eq(Signal(r.store_row)) - for j in range(ROW_SIZE): - comb += wr_sel[j].eq(do_write) - -# -- Generate PLRUs -# maybe_plrus: if NUM_WAYS > 1 generate -# begin -# plrus: for i in 0 to NUM_LINES-1 generate -# -- PLRU interface -# signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); -# signal plru_acc_en : std_ulogic; -# signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); -# -# begin -# plru : entity work.plru -# generic map ( -# BITS => WAY_BITS -# ) -# port map ( -# clk => clk, -# rst => rst, -# acc => plru_acc, -# acc_en => plru_acc_en, -# lru => plru_out -# ); -# -# process(all) -# begin -# -- PLRU interface -# if get_index(r.hit_nia) = i then -# plru_acc_en <= r.hit_valid; -# else -# plru_acc_en <= '0'; -# end if; -# plru_acc <= -# std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS)); -# plru_victim(i) <= plru_out; -# end process; -# end generate; -# end generate; - def maybe_plrus(self, m): - comb += m.d.comb - - with m.If(NUM_WAYS > 1): - for i in range(NUM_LINES): - plru_acc = Signal(WAY_BITS) - plru_acc_en = Signal() - plru_out = Signal(WAY_BITS) - plru = PLRU(WAY_BITS) - comb += plru.acc.eq(plru_acc) - comb += plru.acc_en.eq(plru_acc_en) - comb += plru.lru.eq(plru_out) - - # PLRU interface - with m.If(get_index(r.hit_nia) == i): - comb += plru.acc_en.eq(r.hit_valid) - - with m.Else(): - comb += plru.acc_en.eq(0) - - comb += plru.acc.eq(r.hit_way) - comb += plru_victim[i].eq(plru.lru) - -# -- TLB hit detection and real address generation -# itlb_lookup : process(all) -# variable pte : tlb_pte_t; -# variable ttag : tlb_tag_t; -# begin -# tlb_req_index <= hash_ea(i_in.nia); -# pte := itlb_ptes(tlb_req_index); -# ttag := itlb_tags(tlb_req_index); -# if i_in.virt_mode = '1' then -# real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & -# i_in.nia(TLB_LG_PGSZ - 1 downto 0); -# if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then -# ra_valid <= itlb_valids(tlb_req_index); -# else -# ra_valid <= '0'; -# end if; -# eaa_priv <= pte(3); -# else -# 
real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0); -# ra_valid <= '1'; -# eaa_priv <= '1'; -# end if; -# -# -- no IAMR, so no KUEP support for now -# priv_fault <= eaa_priv and not i_in.priv_mode; -# access_ok <= ra_valid and not priv_fault; -# end process; + comb += way.wr_addr.eq(wr_addr) + comb += way.wr_data.eq(bus.dat_r) + + comb += do_write.eq(re.o[i]) + + with m.If(do_write): + sync += Display("cache write adr: %x data: %lx", + wr_addr, way.wr_data) + + with m.If(he.o[i]): + comb += cache_out_row.eq(d_out) + with m.If(do_read): + sync += Display("cache read adr: %x data: %x", + req_row, d_out) + + comb += wr_sel.eq(Repl(do_write, ROW_SIZE)) + + # Generate PLRUs + def maybe_plrus(self, m, r, plru_victim): + comb = m.d.comb + + if NUM_WAYS == 0: + return + + + m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS) + comb += plru.way.eq(r.hit_way) + comb += plru.valid.eq(r.hit_valid) + comb += plru.index.eq(get_index(r.hit_nia)) + comb += plru.isel.eq(r.store_index) # select victim + comb += plru_victim.eq(plru.o_index) # selected victim + # TLB hit detection and real address generation - def itlb_lookup(self, m): + def itlb_lookup(self, m, tlb_req_index, itlb, + real_addr, ra_valid, eaa_priv, + priv_fault, access_ok): + comb = m.d.comb + i_in = self.i_in + + pte = Signal(TLB_PTE_BITS) + ttag = Signal(TLB_EA_TAG_BITS) + comb += tlb_req_index.eq(hash_ea(i_in.nia)) - comb += pte.eq(itlb_ptes[tlb_req_index]) - comb += ttag.eq(itlb_tags[tlb_req_index]) + comb += pte.eq(itlb[tlb_req_index].pte) + comb += ttag.eq(itlb[tlb_req_index].tag) with m.If(i_in.virt_mode): comb += real_addr.eq(Cat( - i_in.nia[:TLB_LB_PGSZ], + i_in.nia[:TLB_LG_PGSZ], pte[TLB_LG_PGSZ:REAL_ADDR_BITS] )) with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]): - comb += ra_valid.eq(itlb_valid_bits[tlb_req_index]) + comb += ra_valid.eq(itlb[tlb_req_index].valid) - with m.Else(): - comb += ra_valid.eq(0) + comb += eaa_priv.eq(pte[3]) with m.Else(): comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS]) @@ -769,74 +441,45 @@ class ICache(Elaboratable): comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode) comb += access_ok.eq(ra_valid & ~priv_fault) -# -- iTLB update -# itlb_update: process(clk) -# variable wr_index : tlb_index_t; -# begin -# if rising_edge(clk) then -# wr_index := hash_ea(m_in.addr); -# if rst = '1' or -# (m_in.tlbie = '1' and m_in.doall = '1') then -# -- clear all valid bits -# for i in tlb_index_t loop -# itlb_valids(i) <= '0'; -# end loop; -# elsif m_in.tlbie = '1' then -# -- clear entry regardless of hit or miss -# itlb_valids(wr_index) <= '0'; -# elsif m_in.tlbld = '1' then -# itlb_tags(wr_index) <= -# m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS); -# itlb_ptes(wr_index) <= m_in.pte; -# itlb_valids(wr_index) <= '1'; -# end if; -# end if; -# end process; # iTLB update - def itlb_update(self, m): + def itlb_update(self, m, itlb): + comb = m.d.comb sync = m.d.sync + m_in = self.m_in + wr_index = Signal(TLB_SIZE) - sync += wr_index.eq(hash_ea(m_in.addr)) + comb += wr_index.eq(hash_ea(m_in.addr)) - with m.If('''TODO rst in nmigen''' | (m_in.tlbie & m_in.doall)): + with m.If(m_in.tlbie & m_in.doall): # Clear all valid bits for i in range(TLB_SIZE): - sync += itlb_vlaids[i].eq(0) + sync += itlb[i].valid.eq(0) with m.Elif(m_in.tlbie): # Clear entry regardless of hit or miss - sync += itlb_valid_bits[wr_index].eq(0) + sync += itlb[wr_index].valid.eq(0) with m.Elif(m_in.tlbld): - sync += itlb_tags[wr_index].eq( - m_in.addr[TLB_LG_PGSZ + TLB_BITS:64] - ) - sync += itlb_ptes[wr_index].eq(m_in.pte) - sync += 
itlb_valid_bits[wr_index].eq(1) - -# -- Cache hit detection, output to fetch2 and other misc logic -# icache_comb : process(all) + sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]) + sync += itlb[wr_index].pte.eq(m_in.pte) + sync += itlb[wr_index].valid.eq(1) + # Cache hit detection, output to fetch2 and other misc logic - def icache_comb(self, m): -# variable is_hit : std_ulogic; -# variable hit_way : way_t; + def icache_comb(self, m, use_previous, r, req_index, req_row, + req_hit_way, req_tag, real_addr, req_laddr, + cache_tags, access_ok, + req_is_hit, req_is_miss, replace_way, + plru_victim, cache_out_row): + comb = m.d.comb + i_in, i_out, bus = self.i_in, self.i_out, self.bus + flush_in, stall_out = self.flush_in, self.stall_out + is_hit = Signal() - hit_way = Signal(NUM_WAYS) -# begin -# -- i_in.sequential means that i_in.nia this cycle -# -- is 4 more than last cycle. If we read more -# -- than 32 bits at a time, had a cache hit last -# -- cycle, and we don't want the first 32-bit chunk -# -- then we can keep the data we read last cycle -# -- and just use that. -# if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then -# use_previous <= i_in.sequential and r.hit_valid; -# else -# use_previous <= '0'; -# end if; + hit_way = Signal(WAY_BITS) + # i_in.sequential means that i_in.nia this cycle is 4 more than # last cycle. If we read more than 32 bits at a time, had a # cache hit last cycle, and we don't want the first 32-bit chunk @@ -844,108 +487,58 @@ class ICache(Elaboratable): with m.If(i_in.nia[2:INSN_BITS+2] != 0): comb += use_previous.eq(i_in.sequential & r.hit_valid) - with m.Else(): - comb += use_previous.eq(0) - -# -- Extract line, row and tag from request -# req_index <= get_index(i_in.nia); -# req_row <= get_row(i_in.nia); -# req_tag <= get_tag(real_addr); # Extract line, row and tag from request comb += req_index.eq(get_index(i_in.nia)) comb += req_row.eq(get_row(i_in.nia)) comb += req_tag.eq(get_tag(real_addr)) -# -- Calculate address of beginning of cache row, will be -# -- used for cache miss processing if needed -# req_laddr <= -# (63 downto REAL_ADDR_BITS => '0') & -# real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) & -# (ROW_OFF_BITS-1 downto 0 => '0'); # Calculate address of beginning of cache row, will be # used for cache miss processing if needed comb += req_laddr.eq(Cat( - Const(0b0, ROW_OFF_BITS), + Const(0, ROW_OFF_BITS), real_addr[ROW_OFF_BITS:REAL_ADDR_BITS], - Const(0, REAL_ADDR_BITS) )) -# -- Test if pending request is a hit on any way -# hit_way := 0; -# is_hit := '0'; -# for i in way_t loop -# if i_in.req = '1' and -# (cache_valids(req_index)(i) = '1' or -# (r.state = WAIT_ACK and -# req_index = r.store_index and -# i = r.store_way and -# r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then -# if read_tag(i, cache_tags(req_index)) = req_tag then -# hit_way := i; -# is_hit := '1'; -# end if; -# end if; -# end loop; # Test if pending request is a hit on any way + hitcond = Signal() + comb += hitcond.eq((r.state == State.WAIT_ACK) + & (req_index == r.store_index) + & r.rows_valid[req_row % ROW_PER_LINE] + ) + # i_in.req asserts Decoder active + cvb = Signal(NUM_WAYS) + ctag = Signal(TAG_RAM_WIDTH) + comb += ctag.eq(cache_tags[req_index].tag) + comb += cvb.eq(cache_tags[req_index].valid) + m.submodules.store_way_e = se = Decoder(NUM_WAYS) + comb += se.i.eq(r.store_way) + comb += se.n.eq(~i_in.req) for i in range(NUM_WAYS): - with m.If(i_in.req & - (cache_valid_bits[req_index][i] | - ((r.state == State.WAIT_ACK) - & (req_index == 
r.store_index) - & (i == r.store_way) - & r.rows_valid[req_row % ROW_PER_LINE]))): - with m.If(read_tag(i, cahce_tags[req_index]) == req_tag): - comb += hit_way.eq(i) - comb += is_hit.eq(1) - -# -- Generate the "hit" and "miss" signals -# -- for the synchronous blocks -# if i_in.req = '1' and access_ok = '1' and flush_in = '0' -# and rst = '0' then -# req_is_hit <= is_hit; -# req_is_miss <= not is_hit; -# else -# req_is_hit <= '0'; -# req_is_miss <= '0'; -# end if; -# req_hit_way <= hit_way; + tagi = Signal(TAG_BITS, name="tag_i%d" % i) + hit_test = Signal(name="hit_test%d" % i) + is_tag_hit = Signal(name="is_tag_hit_%d" % i) + comb += tagi.eq(read_tag(i, ctag)) + comb += hit_test.eq(se.o[i]) + comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) & + (tagi == req_tag)) + with m.If(is_tag_hit): + comb += hit_way.eq(i) + comb += is_hit.eq(1) + # Generate the "hit" and "miss" signals # for the synchronous blocks - with m.If(i_in.rq & access_ok & ~flush_in): + with m.If(i_in.req & access_ok & ~flush_in): comb += req_is_hit.eq(is_hit) comb += req_is_miss.eq(~is_hit) - with m.Else(): - comb += req_is_hit.eq(0) - comb += req_is_miss.eq(0) - -# -- The way to replace on a miss -# if r.state = CLR_TAG then -# replace_way <= -# to_integer(unsigned(plru_victim(r.store_index))); -# else -# replace_way <= r.store_way; -# end if; + comb += req_hit_way.eq(hit_way) + # The way to replace on a miss with m.If(r.state == State.CLR_TAG): - comb += replace_way.eq(plru_victim[r.store_index]) - + comb += replace_way.eq(plru_victim) with m.Else(): comb += replace_way.eq(r.store_way) -# -- Output instruction from current cache row -# -- -# -- Note: This is a mild violation of our design principle of -# -- having pipeline stages output from a clean latch. In this -# -- case we output the result of a mux. The alternative would -# -- be output an entire row which I prefer not to do just yet -# -- as it would force fetch2 to know about some of the cache -# -- geometry information. -# i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way)); -# i_out.valid <= r.hit_valid; -# i_out.nia <= r.hit_nia; -# i_out.stop_mark <= r.hit_smark; -# i_out.fetch_failed <= r.fetch_failed; # Output instruction from current cache row # # Note: This is a mild violation of our design principle of @@ -954,56 +547,39 @@ class ICache(Elaboratable): # be output an entire row which I prefer not to do just yet # as it would force fetch2 to know about some of the cache # geometry information. 
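+        # (read_insn_word above is that mux: with the default geometry
+        # INSN_PER_ROW is 2, so it reduces to data.word_select(addr[2], 32),
+        # i.e. NIA bit 2 picks the upper or lower 32-bit instruction of
+        # the 64-bit BRAM row -- an explanatory sketch, not extra logic.)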
- comb += i_out.insn.eq( - read_insn_word(r.hit_nia, cache_out[r.hit_way]) - ) + comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row)) comb += i_out.valid.eq(r.hit_valid) comb += i_out.nia.eq(r.hit_nia) comb += i_out.stop_mark.eq(r.hit_smark) comb += i_out.fetch_failed.eq(r.fetch_failed) -# -- Stall fetch1 if we have a miss on cache or TLB -# -- or a protection fault -# stall_out <= not (is_hit and access_ok); # Stall fetch1 if we have a miss on cache or TLB # or a protection fault comb += stall_out.eq(~(is_hit & access_ok)) -# -- Wishbone requests output (from the cache miss reload machine) -# wishbone_out <= r.wb; # Wishbone requests output (from the cache miss reload machine) - comb += wb_out.eq(r.wb) -# end process; + comb += bus.we.eq(r.wb.we) + comb += bus.adr.eq(r.wb.adr) + comb += bus.sel.eq(r.wb.sel) + comb += bus.stb.eq(r.wb.stb) + comb += bus.dat_w.eq(r.wb.dat) + comb += bus.cyc.eq(r.wb.cyc) -# -- Cache hit synchronous machine -# icache_hit : process(clk) # Cache hit synchronous machine - def icache_hit(self, m): + def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way, + req_index, req_tag, real_addr): sync = m.d.sync -# begin -# if rising_edge(clk) then -# -- keep outputs to fetch2 unchanged on a stall -# -- except that flush or reset sets valid to 0 -# -- If use_previous, keep the same data as last -# -- cycle and use the second half -# if stall_in = '1' or use_previous = '1' then -# if rst = '1' or flush_in = '1' then -# r.hit_valid <= '0'; -# end if; + + i_in, stall_in = self.i_in, self.stall_in + flush_in = self.flush_in + # keep outputs to fetch2 unchanged on a stall # except that flush or reset sets valid to 0 # If use_previous, keep the same data as last # cycle and use the second half with m.If(stall_in | use_previous): - with m.If('''TODO rst nmigen''' | flush_in): + with m.If(flush_in): sync += r.hit_valid.eq(0) -# else -# -- On a hit, latch the request for the next cycle, -# -- when the BRAM data will be available on the -# -- cache_out output of the corresponding way -# r.hit_valid <= req_is_hit; -# if req_is_hit = '1' then -# r.hit_way <= req_hit_way; with m.Else(): # On a hit, latch the request for the next cycle, # when the BRAM data will be available on the @@ -1012,367 +588,210 @@ class ICache(Elaboratable): with m.If(req_is_hit): sync += r.hit_way.eq(req_hit_way) + sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x " + "way:%x RA:%x", i_in.nia, i_in.virt_mode, + i_in.stop_mark, req_index, req_tag, + req_hit_way, real_addr) -# report "cache hit nia:" & to_hstring(i_in.nia) & -# " IR:" & std_ulogic'image(i_in.virt_mode) & -# " SM:" & std_ulogic'image(i_in.stop_mark) & -# " idx:" & integer'image(req_index) & -# " tag:" & to_hstring(req_tag) & -# " way:" & integer'image(req_hit_way) & -# " RA:" & to_hstring(real_addr); - print(f"cache hit nia:{i_in.nia}, " \ - f"IR:{i_in.virt_mode}, " \ - f"SM:{i_in.stop_mark}, idx:{req_index}, " \ - f"tag:{req_tag}, way:{req_hit_way}, " \ - f"RA:{real_addr}") -# end if; -# end if; -# if stall_in = '0' then -# -- Send stop marks and NIA down regardless of validity -# r.hit_smark <= i_in.stop_mark; -# r.hit_nia <= i_in.nia; -# end if; with m.If(~stall_in): # Send stop marks and NIA down regardless of validity sync += r.hit_smark.eq(i_in.stop_mark) sync += r.hit_nia.eq(i_in.nia) -# end if; -# end process; -# -- Cache miss/reload synchronous machine -# icache_miss : process(clk) + def icache_miss_idle(self, m, r, req_is_miss, req_laddr, + req_index, req_tag, replace_way, real_addr): + comb = m.d.comb + 
+        sync = m.d.sync
+
+        i_in = self.i_in
+
+        # Reset per-row valid flags, only used in WAIT_ACK
+        for i in range(ROW_PER_LINE):
+            sync += r.rows_valid[i].eq(0)
+
+        # We need to read a cache line
+        with m.If(req_is_miss):
+            sync += Display(
+                     "cache miss nia:%x IR:%x SM:%x idx:%x "
+                     " way:%x tag:%x RA:%x", i_in.nia,
+                     i_in.virt_mode, i_in.stop_mark, req_index,
+                     replace_way, req_tag, real_addr)
+
+            # Keep track of our index and way for subsequent stores
+            st_row = Signal(ROW_BITS)
+            comb += st_row.eq(get_row(req_laddr))
+            sync += r.store_index.eq(req_index)
+            sync += r.store_row.eq(st_row)
+            sync += r.store_tag.eq(req_tag)
+            sync += r.store_valid.eq(1)
+            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
+
+            # Prep for first wishbone read.  We calculate the address
+            # of the start of the cache line and start the WB cycle.
+            sync += r.req_adr.eq(req_laddr)
+            sync += r.wb.cyc.eq(1)
+            sync += r.wb.stb.eq(1)
+
+            # Track that we had one request sent
+            sync += r.state.eq(State.CLR_TAG)
+
+    def icache_miss_clr_tag(self, m, r, replace_way,
+                            req_index,
+                            tagset, cache_tags):
+        comb = m.d.comb
+        sync = m.d.sync
+
+        # Get victim way from plru
+        sync += r.store_way.eq(replace_way)
+
+        # Force misses on that way while reloading that line
+        cv = Signal(NUM_WAYS)
+        comb += cv.eq(cache_tags[req_index].valid)
+        comb += cv.bit_select(replace_way, 1).eq(0)
+        sync += cache_tags[req_index].valid.eq(cv)
+
+        for i in range(NUM_WAYS):
+            with m.If(i == replace_way):
+                comb += tagset.eq(cache_tags[r.store_index].tag)
+                comb += write_tag(i, tagset, r.store_tag)
+                sync += cache_tags[r.store_index].tag.eq(tagset)
+
+        sync += r.state.eq(State.WAIT_ACK)
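+
+    # A sketch of the reload flow the three helpers implement (the
+    # states as defined in State above):
+    #
+    #   IDLE     : on a miss, latch index/row/tag and raise wb cyc+stb
+    #   CLR_TAG  : clear the victim way's valid bit -- the read-modify-
+    #              write through bit_select() above -- and store the
+    #              new tag
+    #   WAIT_ACK : step through the rows of the line, marking each row
+    #              valid as it is acked; on the last ack drop cyc and
+    #              return to IDLE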
+    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
+                             cache_tags, stbs_done):
+        comb = m.d.comb
+        sync = m.d.sync
+
+        bus = self.bus
+
+        # Requests are all sent if stb is 0
+        stbs_zero = Signal()
+        comb += stbs_zero.eq(r.wb.stb == 0)
+        comb += stbs_done.eq(stbs_zero)
+
+        # If we are still sending requests, was one accepted?
+        with m.If(~bus.stall & ~stbs_zero):
+            # That was the last word?  We are done sending.
+            # Clear stb and set stbs_done so we can handle
+            # an eventual last ack on the same cycle.
+            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
+                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
+                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
+                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
+                                r.wb.stb, stbs_zero, stbs_done)
+                sync += r.wb.stb.eq(0)
+                comb += stbs_done.eq(1)
+
+            # Calculate the next row address
+            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
+            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
+            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
+            sync += Display("RARANGE r.req_adr:%x rarange:%x "
+                            "stbs_zero:%x stbs_done:%x",
+                            r.req_adr, rarange, stbs_zero, stbs_done)
+
+        # Incoming acks processing
+        with m.If(bus.ack):
+            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
+                            "stbs_done:%x",
+                            bus.dat_r, stbs_zero, stbs_done)
+
+            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
+
+            # Check for completion
+            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
+                # Complete wishbone cycle
+                sync += r.wb.cyc.eq(0)
+                # be nice, clear addr
+                sync += r.req_adr.eq(0)
+
+                # Cache line is now valid
+                cv = Signal(NUM_WAYS)
+                comb += cv.eq(cache_tags[r.store_index].valid)
+                comb += cv.bit_select(replace_way, 1).eq(
+                         r.store_valid & ~inval_in)
+                sync += cache_tags[r.store_index].valid.eq(cv)
+
+                sync += r.state.eq(State.IDLE)
+
+            # move on to next request in row
+            # Increment store row counter
+            sync += r.store_row.eq(next_row(r.store_row))
+
     # Cache miss/reload synchronous machine
-    def icache_miss(self, m):
+    def icache_miss(self, m, r, req_is_miss,
+                    req_index, req_laddr, req_tag, replace_way,
+                    cache_tags, access_ok, real_addr):
         comb = m.d.comb
         sync = m.d.sync
 
-# variable tagset : cache_tags_set_t;
-# variable stbs_done : boolean;
+        i_in, bus, m_in = self.i_in, self.bus, self.m_in
+        stall_in, flush_in = self.stall_in, self.flush_in
+        inval_in = self.inval_in
+
         tagset = Signal(TAG_RAM_WIDTH)
         stbs_done = Signal()
 
-# begin
-# if rising_edge(clk) then
-#     -- On reset, clear all valid bits to force misses
-#     if rst = '1' then
-        # On reset, clear all valid bits to force misses
-        with m.If('''TODO rst nmigen'''):
-#         for i in index_t loop
-#             cache_valids(i) <= (others => '0');
-#         end loop;
-            for i in Signal(NUM_LINES):
-                sync += cache_valid_bits[i].eq(~1)
-
-#         r.state <= IDLE;
-#         r.wb.cyc <= '0';
-#         r.wb.stb <= '0';
-            sync += r.state.eq(State.IDLE)
-            sync += r.wb.cyc.eq(0)
-            sync += r.wb.stb.eq(0)
-
-#         -- We only ever do reads on wishbone
-#         r.wb.dat <= (others => '0');
-#         r.wb.sel <= "11111111";
-#         r.wb.we  <= '0';
-            # We only ever do reads on wishbone
-            sync += r.wb.dat.eq(~1)
-            sync += r.wb.sel.eq(Const(0b11111111, 8))
-            sync += r.wb.we.eq(0)
-
-#         -- Not useful normally but helps avoiding
-#         -- tons of sim warnings
-#         r.wb.adr <= (others => '0');
-            # Not useful normally but helps avoiding tons of sim warnings
-            sync += r.wb.adr.eq(~1)
-
-#     else
-        with m.Else():
-#         -- Process cache invalidations
-#         if inval_in = '1' then
-#             for i in index_t loop
-#                 cache_valids(i) <= (others => '0');
-#             end loop;
-#             r.store_valid <= '0';
-#         end if;
-            # Process cache invalidations
-            with m.If(inval_in):
-                for i in range(NUM_LINES):
-                    sync += cache_valid_bits[i].eq(~1)
-
-                sync += r.store_valid.eq(0)
-
-#         -- Main state machine
-#         case r.state is
-            # Main state machine
-            with m.Switch(r.state):
-
-#             when IDLE =>
-                with m.Case(State.IDLE):
-#                 -- Reset per-row valid flags,
-#                 -- only used in WAIT_ACK
-#                 for i in 0 to ROW_PER_LINE - 1 loop
-#                     r.rows_valid(i) <= '0';
-#                 end loop;
-                    # Reset per-row valid flags,
-                    # only used in WAIT_ACK
-                    for i in range(ROW_PER_LINE):
-                        sync
+= r.rows_valid[i].eq(0) - -# -- We need to read a cache line -# if req_is_miss = '1' then -# report "cache miss nia:" & to_hstring(i_in.nia) & -# " IR:" & std_ulogic'image(i_in.virt_mode) & -# " SM:" & std_ulogic'image(i_in.stop_mark) & -# " idx:" & integer'image(req_index) & -# " way:" & integer'image(replace_way) & -# " tag:" & to_hstring(req_tag) & -# " RA:" & to_hstring(real_addr); - # We need to read a cache line - with m.If(req_is_miss): - print(f"cache miss nia:{i_in.nia} " \ - f"IR:{i_in.virt_mode} " \ - f"SM:{i_in.stop_mark} " \ - F"idx:{req_index} " \ - f"way:{replace_way} tag:{req_tag} " \ - f"RA:{real_addr}") - -# -- Keep track of our index and way for -# -- subsequent stores -# r.store_index <= req_index; -# r.store_row <= get_row(req_laddr); -# r.store_tag <= req_tag; -# r.store_valid <= '1'; -# r.end_row_ix <= -# get_row_of_line(get_row(req_laddr)) - 1; - # Keep track of our index and way - # for subsequent stores - sync += r.store_index.eq(req_index) - sync += r.store_row.eq(get_row(req_laddr)) - sync += r.store_tag.eq(req_tag) - sync += r.store_valid.eq(1) - sync += r.end_row_ix.eq( - get_row_of_line( - get_row(req_laddr) - ) - 1 - ) - -# -- Prep for first wishbone read. We calculate the -# -- address of the start of the cache line and -# -- start the WB cycle. -# r.wb.adr <= req_laddr(r.wb.adr'left downto 0); -# r.wb.cyc <= '1'; -# r.wb.stb <= '1'; - # Prep for first wishbone read. - # We calculate the - # address of the start of the cache line and - # start the WB cycle. - sync += r.wb.adr.eq( - req_laddr[:r.wb.adr] - ) - -# -- Track that we had one request sent -# r.state <= CLR_TAG; - # Track that we had one request sent - sync += r.state.eq(State.CLR_TAG) -# end if; - -# when CLR_TAG | WAIT_ACK => - with m.Case(State.CLR_TAG, State.WAIT_ACK): -# if r.state = CLR_TAG then - with m.If(r.state == State.CLR_TAG): -# -- Get victim way from plru -# r.store_way <= replace_way; - # Get victim way from plru - sync += r.store_way.eq(replace_way) -# -# -- Force misses on that way while -# -- reloading that line -# cache_valids(req_index)(replace_way) <= '0'; - # Force misses on that way while - # realoading that line - sync += cache_valid_bits[ - req_index - ][replace_way].eq(0) - -# -- Store new tag in selected way -# for i in 0 to NUM_WAYS-1 loop -# if i = replace_way then -# tagset := cache_tags(r.store_index); -# write_tag(i, tagset, r.store_tag); -# cache_tags(r.store_index) <= tagset; -# end if; -# end loop; - for i in range(NUM_WAYS): - with m.If(i == replace_way): - comb += tagset.eq( - cache_tags[r.store_index] - ) - sync += write_tag( - i, tagset, r.store_tag - ) - sync += cache_tags(r.store_index).eq( - tagset - ) - -# r.state <= WAIT_ACK; - sync += r.state.eq(State.WAIT_ACK) -# end if; - -# -- Requests are all sent if stb is 0 -# stbs_done := r.wb.stb = '0'; - # Requests are all sent if stb is 0 - comb += stbs_done.eq(r.wb.stb == 0) - -# -- If we are still sending requests, -# -- was one accepted ? -# if wishbone_in.stall = '0' and not stbs_done then - # If we are still sending requests, - # was one accepted? - with m.If(~wb_in.stall & ~stbs_done): -# -- That was the last word ? We are done sending. -# -- Clear stb and set stbs_done so we can handle -# -- an eventual last ack on the same cycle. -# if is_last_row_addr(r.wb.adr, r.end_row_ix) then -# r.wb.stb <= '0'; -# stbs_done := true; -# end if; - # That was the last word ? - # We are done sending. - # Clear stb and set stbs_done - # so we can handle - # an eventual last ack on - # the same cycle. 
- with m.If(is_last_row_addr( - r.wb.adr, r.end_row_ix)): - sync += r.wb.stb.eq(0) - stbs_done.eq(1) - -# -- Calculate the next row address -# r.wb.adr <= next_row_addr(r.wb.adr); - # Calculate the next row address - sync += r.wb.adr.eq(next_row_addr(r.wb.adr)) -# end if; - -# -- Incoming acks processing -# if wishbone_in.ack = '1' then - # Incoming acks processing - with m.If(wb_in.ack): -# r.rows_valid(r.store_row mod ROW_PER_LINE) -# <= '1'; - sync += r.rows_valid[ - r.store_row & ROW_PER_LINE - ].eq(1) - -# -- Check for completion -# if stbs_done and -# is_last_row(r.store_row, r.end_row_ix) then - # Check for completion - with m.If(stbs_done & is_last_row( - r.store_row, r.end_row_ix)): -# -- Complete wishbone cycle -# r.wb.cyc <= '0'; - # Complete wishbone cycle - sync += r.wb.cyc.eq(0) - -# -- Cache line is now valid -# cache_valids(r.store_index)(replace_way) <= -# r.store_valid and not inval_in; - # Cache line is now valid - sync += cache_valid_bits[ - r.store_index - ][relace_way].eq( - r.store_valid & ~inval_in - ) - -# -- We are done -# r.state <= IDLE; - # We are done - sync += r.state.eq(State.IDLE) -# end if; - -# -- Increment store row counter -# r.store_row <= next_row(r.store_row); - # Increment store row counter - sync += store_row.eq(next_row(r.store_row)) -# end if; -# end case; -# end if; -# -# -- TLB miss and protection fault processing -# if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then -# r.fetch_failed <= '0'; -# elsif i_in.req = '1' and access_ok = '0' and -# stall_in = '0' then -# r.fetch_failed <= '1'; -# end if; - # TLB miss and protection fault processing - with m.If('''TODO nmigen rst''' | flush_in | m_in.tlbld): - sync += r.fetch_failed.eq(0) - - with m.Elif(i_in.req & ~access_ok & ~stall_in): - sync += r.fetch_failed.eq(1) -# end if; -# end process; - -# icache_log: if LOG_LENGTH > 0 generate - def icache_log(self, m, log_out): + comb += r.wb.sel.eq(-1) + comb += r.wb.adr.eq(r.req_adr[3:]) + + # Process cache invalidations + with m.If(inval_in): + for i in range(NUM_LINES): + sync += cache_tags[i].valid.eq(0) + sync += r.store_valid.eq(0) + + # Main state machine + with m.Switch(r.state): + + with m.Case(State.IDLE): + self.icache_miss_idle(m, r, req_is_miss, req_laddr, + req_index, req_tag, replace_way, + real_addr) + + with m.Case(State.CLR_TAG, State.WAIT_ACK): + with m.If(r.state == State.CLR_TAG): + self.icache_miss_clr_tag(m, r, replace_way, + req_index, tagset, cache_tags) + + self.icache_miss_wait_ack(m, r, replace_way, inval_in, + cache_tags, stbs_done) + + # TLB miss and protection fault processing + with m.If(flush_in | m_in.tlbld): + sync += r.fetch_failed.eq(0) + with m.Elif(i_in.req & ~access_ok & ~stall_in): + sync += r.fetch_failed.eq(1) + + # icache_log: if LOG_LENGTH > 0 generate + def icache_log(self, m, req_hit_way, ra_valid, access_ok, + req_is_miss, req_is_hit, lway, wstate, r): comb = m.d.comb sync = m.d.sync -# -- Output data to logger -# signal log_data : std_ulogic_vector(53 downto 0); -# begin -# data_log: process(clk) -# variable lway: way_t; -# variable wstate: std_ulogic; + bus, i_out = self.bus, self.i_out + log_out, stall_out = self.log_out, self.stall_out + # Output data to logger for i in range(LOG_LENGTH): - # Output data to logger log_data = Signal(54) - lway = Signal(NUM_WAYS) + lway = Signal(WAY_BITS) wstate = Signal() -# begin -# if rising_edge(clk) then -# lway := req_hit_way; -# wstate := '0'; - comb += lway.eq(req_hit_way) - comb += wstate.eq(0) + sync += lway.eq(req_hit_way) + sync += wstate.eq(0) -# if 
-#            if r.state /= IDLE then
-#                wstate := '1';
-#            end if;
         with m.If(r.state != State.IDLE):
-            comb += wstate.eq(1)
+            sync += wstate.eq(1)
 
-#            log_data <= i_out.valid &
-#                        i_out.insn &
-#                        wishbone_in.ack &
-#                        r.wb.adr(5 downto 3) &
-#                        r.wb.stb & r.wb.cyc &
-#                        wishbone_in.stall &
-#                        stall_out &
-#                        r.fetch_failed &
-#                        r.hit_nia(5 downto 2) &
-#                        wstate &
-#                        std_ulogic_vector(to_unsigned(lway, 3)) &
-#                        req_is_hit & req_is_miss &
-#                        access_ok &
-#                        ra_valid;
+            sync += log_data.eq(Cat(
+                     ra_valid, access_ok, req_is_miss, req_is_hit,
+                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
+                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
+                     r.req_adr[3:6], bus.ack, i_out.insn, i_out.valid
+                 ))
 
-#        end if;
-#    end process;
-#    log_out <= log_data;
             comb += log_out.eq(log_data)
-# end generate;
-# end;
 
 
     def elaborate(self, platform):
@@ -1381,292 +800,208 @@
 
         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
         cache_tags = CacheTagArray()
-        cache_valid_bits = CacheValidBitsArray()
-
-# signal itlb_valids : tlb_valids_t;
-# signal itlb_tags : tlb_tags_t;
-# signal itlb_ptes : tlb_ptes_t;
-# attribute ram_style of itlb_tags : signal is "distributed";
-# attribute ram_style of itlb_ptes : signal is "distributed";
-        itlb_valid_bits = TLBValidBitsArray()
-        itlb_tags = TLBTagArray()
-        itlb_ptes = TLBPTEArray()
+
+        # TLB Array
+        itlb = TLBArray()
+
+        # TODO to be passed to nmigen as ram attributes
+        # attribute ram_style of itlb_tags : signal is "distributed";
+        # attribute ram_style of itlb_ptes : signal is "distributed";
 
-# -- Privilege bit from PTE EAA field
-# signal eaa_priv : std_ulogic;
         # Privilege bit from PTE EAA field
         eaa_priv = Signal()
 
-# signal r : reg_internal_t;
         r = RegInternal()
 
-# -- Async signals on incoming request
-# signal req_index : index_t;
-# signal req_row : row_t;
-# signal req_hit_way : way_t;
-# signal req_tag : cache_tag_t;
-# signal req_is_hit : std_ulogic;
-# signal req_is_miss : std_ulogic;
-# signal req_laddr : std_ulogic_vector(63 downto 0);
         # Async signal on incoming request
-        req_index = Signal(NUM_LINES)
-        req_row = Signal(BRAM_ROWS)
-        req_hit_way = Signal(NUM_WAYS)
+        req_index = Signal(INDEX_BITS)
+        req_row = Signal(ROW_BITS)
+        req_hit_way = Signal(WAY_BITS)
         req_tag = Signal(TAG_BITS)
         req_is_hit = Signal()
         req_is_miss = Signal()
         req_laddr = Signal(64)
 
-# signal tlb_req_index : tlb_index_t;
-# signal real_addr : std_ulogic_vector(
-#     REAL_ADDR_BITS - 1 downto 0
-# );
-# signal ra_valid : std_ulogic;
-# signal priv_fault : std_ulogic;
-# signal access_ok : std_ulogic;
-# signal use_previous : std_ulogic;
-        tlb_req_index = Signal(TLB_SIZE)
+        tlb_req_index = Signal(TLB_BITS)
         real_addr = Signal(REAL_ADDR_BITS)
         ra_valid = Signal()
         priv_fault = Signal()
         access_ok = Signal()
         use_previous = Signal()
 
-# signal cache_out : cache_ram_out_t;
-        cache_out = CacheRamOut()
-
-# signal plru_victim : plru_out_t;
-# signal replace_way : way_t;
-        plru_victim = PLRUOut()
-        replace_way = Signal(NUM_WAYS)
+        cache_out_row = Signal(ROW_SIZE_BITS)
+
+        plru_victim = Signal(WAY_BITS)
+        replace_way = Signal(WAY_BITS)
+
+        # call sub-functions putting everything together,
+        # using shared signals established above
+        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
+        self.maybe_plrus(m, r, plru_victim)
+        self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
+                         ra_valid, eaa_priv, priv_fault,
+                         access_ok)
+        self.itlb_update(m, itlb)
+
self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
+                        req_tag, real_addr, req_laddr,
+                        cache_tags, access_ok, req_is_hit, req_is_miss,
+                        replace_way, plru_victim, cache_out_row)
+        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
+                        req_index, req_tag, real_addr)
+        self.icache_miss(m, r, req_is_miss, req_index,
+                         req_laddr, req_tag, replace_way, cache_tags,
+                         access_ok, real_addr)
+        #self.icache_log(m, req_hit_way, ra_valid, access_ok,
+        #                req_is_miss, req_is_hit, lway, wstate, r)
+
+        # don't connect up to FetchUnitInterface so that some unit tests
+        # can continue to operate
+        if not self.use_fetch_iface:
+            return m
+
+        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
+        # so needs checking and iterative revising
+        i_in, bus, i_out = self.i_in, self.bus, self.i_out
+        comb += i_in.req.eq(self.a_i_valid)
+        comb += i_in.nia.eq(self.a_pc_i)
+        comb += self.stall_in.eq(self.a_stall_i)
+        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
+        comb += self.f_badaddr_o.eq(i_out.nia)
+        comb += self.f_instr_o.eq(i_out.insn)
+        comb += self.f_busy_o.eq(~i_out.valid) # probably
+
+        # TODO, connect icache wb_in/wb_out to "standard" nmigen Wishbone bus
+        ibus = self.ibus
+        comb += ibus.adr.eq(self.bus.adr)
+        comb += ibus.dat_w.eq(self.bus.dat_w)
+        comb += ibus.sel.eq(self.bus.sel)
+        comb += ibus.cyc.eq(self.bus.cyc)
+        comb += ibus.stb.eq(self.bus.stb)
+        comb += ibus.we.eq(self.bus.we)
+
+        comb += self.bus.dat_r.eq(ibus.dat_r)
+        comb += self.bus.ack.eq(ibus.ack)
+        if hasattr(ibus, "stall"):
+            comb += self.bus.stall.eq(ibus.stall)
+        else:
+            # fake-up the wishbone stall signal to comply with pipeline mode
+            # same thing is done in dcache.py
+            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
 
         return m
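 
+# NOTE (editorial sketch, not part of this commit): the stall fallback at
+# the end of elaborate() above covers bus records that lack the optional
+# Wishbone pipelined-mode "stall" signal.  Holding stall asserted from cyc
+# until ack restricts the bus to one outstanding request at a time, which
+# a classic (non-pipelined) slave such as the unit-test SRAM can honour:
+#
+#     comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)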
x"0000000000000004"; -# -# wait for 30*clk_period; -# wait until rising_edge(clk); -# -# assert i_in.valid = '1' severity failure; -# assert i_in.insn = x"00000001" -# report "insn @" & to_hstring(i_out.nia) & -# "=" & to_hstring(i_in.insn) & -# " expected 00000001" -# severity failure; -# -# i_out.req <= '0'; -# -# wait until rising_edge(clk); -# -# -- hit -# i_out.req <= '1'; -# i_out.nia <= x"0000000000000008"; -# wait until rising_edge(clk); -# wait until rising_edge(clk); -# assert i_in.valid = '1' severity failure; -# assert i_in.insn = x"00000002" -# report "insn @" & to_hstring(i_out.nia) & -# "=" & to_hstring(i_in.insn) & -# " expected 00000002" -# severity failure; -# wait until rising_edge(clk); -# -# -- another miss -# i_out.req <= '1'; -# i_out.nia <= x"0000000000000040"; -# -# wait for 30*clk_period; -# wait until rising_edge(clk); -# -# assert i_in.valid = '1' severity failure; -# assert i_in.insn = x"00000010" -# report "insn @" & to_hstring(i_out.nia) & -# "=" & to_hstring(i_in.insn) & -# " expected 00000010" -# severity failure; -# -# -- test something that aliases -# i_out.req <= '1'; -# i_out.nia <= x"0000000000000100"; -# wait until rising_edge(clk); -# wait until rising_edge(clk); -# assert i_in.valid = '0' severity failure; -# wait until rising_edge(clk); -# -# wait for 30*clk_period; -# wait until rising_edge(clk); -# -# assert i_in.valid = '1' severity failure; -# assert i_in.insn = x"00000040" -# report "insn @" & to_hstring(i_out.nia) & -# "=" & to_hstring(i_in.insn) & -# " expected 00000040" -# severity failure; -# -# i_out.req <= '0'; -# -# std.env.finish; -# end process; -# end; def icache_sim(dut): - i_out, i_in, m_out, m_in = dut.i_out, dut.i_in, dut.m_out, dut.m_in - - yield i_out.req.eq(0) - yield i_out.nia.eq(~1) - yield i_out.stop_mark.eq(0) + i_in = dut.i_in + i_out = dut.i_out + m_out = dut.m_in + + yield i_in.priv_mode.eq(1) + yield i_in.req.eq(0) + yield i_in.nia.eq(0) + yield i_in.stop_mark.eq(0) yield m_out.tlbld.eq(0) yield m_out.tlbie.eq(0) - yield m_out.addr.eq(~1) - yield m_out.pte.eq(~1) + yield m_out.addr.eq(0) + yield m_out.pte.eq(0) yield yield yield yield - yield i_out.req.eq(1) - yield i_out.nia.eq(Const(0x0000000000000004, 64)) - for i in range(30): - yield + + # miss, stalls for a bit + yield i_in.req.eq(1) + yield i_in.nia.eq(Const(0x0000000000000004, 64)) yield - assert i_in.valid - assert i_in.insn == Const(0x00000001, 32), \ - ("insn @%x=%x expected 00000001" % i_out.nia, i_in.insn) - yield i_out.req.eq(0) + valid = yield i_out.valid + while not valid: + yield + valid = yield i_out.valid + yield i_in.req.eq(0) + + insn = yield i_out.insn + nia = yield i_out.nia + assert insn == 0x00000001, \ + "insn @%x=%x expected 00000001" % (nia, insn) + yield i_in.req.eq(0) yield # hit - yield i_out.req.eq(1) - yield i_out.nia.eq(Const(0x0000000000000008, 64)) + yield i_in.req.eq(1) + yield i_in.nia.eq(Const(0x0000000000000008, 64)) yield + valid = yield i_out.valid + while not valid: + yield + valid = yield i_out.valid + yield i_in.req.eq(0) + + nia = yield i_out.nia + insn = yield i_out.insn yield - assert i_in.valid - assert i_in.insn == Const(0x00000002, 32), \ - ("insn @%x=%x expected 00000002" % i_out.nia, i_in.insn) - yield + assert insn == 0x00000002, \ + "insn @%x=%x expected 00000002" % (nia, insn) # another miss - yield i_out.req(1) - yield i_out.nia.eq(Const(0x0000000000000040, 64)) - for i in range(30): - yield + yield i_in.req.eq(1) + yield i_in.nia.eq(Const(0x0000000000000040, 64)) yield - assert i_in.valid - assert i_in.insn == 
     # another miss
-    yield i_out.req(1)
-    yield i_out.nia.eq(Const(0x0000000000000040, 64))
-    for i in range(30):
-        yield
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000040, 64))
     yield
-    assert i_in.valid
-    assert i_in.insn == Const(0x00000010, 32), \
-        ("insn @%x=%x expected 00000010" % i_out.nia, i_in.insn)
-
-    # test something that aliases
-    yield i_out.req.eq(1)
-    yield i_out.nia.eq(Const(0x0000000000000100, 64))
+    valid = yield i_out.valid
+    while not valid:
+        yield
+        valid = yield i_out.valid
+    yield i_in.req.eq(0)
+
+    nia = yield i_in.nia
+    insn = yield i_out.insn
+    assert insn == 0x00000010, \
+        "insn @%x=%x expected 00000010" % (nia, insn)
+
+    # test something that aliases (this only works because
+    # the unit test SRAM is a depth of 512)
+    yield i_in.req.eq(1)
+    yield i_in.nia.eq(Const(0x0000000000000100, 64))
     yield
     yield
-    assert i_in.valid
+    valid = yield i_out.valid
+    assert not valid
    for i in range(30):
         yield
     yield
-    assert i_in.valid
-    assert i_in.insn == Const(0x00000040, 32), \
-        ("insn @%x=%x expected 00000040" % i_out.nia, i_in.insn)
-    yield i_out.req.eq(0)
+    valid = yield i_out.valid
+    insn = yield i_out.insn
+    nia = yield i_in.nia
+    assert valid
+    assert insn == 0x00000040, \
+        "insn @%x=%x expected 00000040" % (nia, insn)
+    yield i_in.req.eq(0)
+
+
+def test_icache(mem):
+    from soc.config.test.test_loadstore import TestMemPspec
+    pspec = TestMemPspec(addr_wid=32,
+                         mask_wid=8,
+                         reg_wid=64,
+                         )
+    dut = ICache(pspec)
+    memory = Memory(width=64, depth=512, init=mem)
+    sram = SRAM(memory=memory, granularity=8)
 
-def test_icache():
-    dut = ICache()
+    m = Module()
 
-    m = Module()
     m.submodules.icache = dut
+    m.submodules.sram = sram
+
+    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
+    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
+    m.d.comb += sram.bus.we.eq(dut.bus.we)
+    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
+    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
+    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
+
+    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
+    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
 
     # nmigen Simulation
     sim = Simulator(m)
@@ -1674,12 +1009,23 @@ def test_icache():
 
     sim.add_sync_process(wrap(icache_sim(dut)))
     with sim.write_vcd('test_icache.vcd'):
-        sim.run()
+        sim.run()
+
 
 if __name__ == '__main__':
-    dut = ICache()
+    from soc.config.test.test_loadstore import TestMemPspec
+    pspec = TestMemPspec(addr_wid=64,
+                         mask_wid=8,
+                         reg_wid=64,
+                         )
+    dut = ICache(pspec)
 
     vl = rtlil.convert(dut, ports=[])
     with open("test_icache.il", "w") as f:
         f.write(vl)
 
-    test_icache()
+    # set up memory with incrementing 32-bit values 0 1 2 ...,
+    # packed two per 64-bit word
+    mem = []
+    for i in range(512):
+        mem.append((i*2) | ((i*2+1)<<32))
+
+    test_icache(mem)
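+
+    # worked example of the packing above (editorial sketch, not part of
+    # this commit): each 64-bit word holds two 32-bit values, low half
+    # first, so the value fetched at byte address A is
+    # (mem[A//8] >> ((A % 8)*8)) & 0xffffffff, which is exactly what
+    # icache_sim asserts at nia 0x4, 0x8, 0x40 and 0x100:
+    assert (mem[0x004//8] >> ((0x004 % 8)*8)) & 0xffffffff == 0x00000001
+    assert (mem[0x008//8] >> ((0x008 % 8)*8)) & 0xffffffff == 0x00000002
+    assert (mem[0x040//8] >> ((0x040 % 8)*8)) & 0xffffffff == 0x00000010
+    assert (mem[0x100//8] >> ((0x100 % 8)*8)) & 0xffffffff == 0x00000040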