"""
from enum import Enum, unique
-from nmigen import (Module, Signal, Elaboratable, Cat, Signal)
-from nmigen.cli import main
-from nmigen.cli import rtlil
+from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const)
+from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
-from nmutil.byterev import byte_reverse
-from nmutil.mask import Mask
-from nmigen.util import log2_int
+from nmigen.utils import log2_int
+from nmutil.util import Display
+
+#from nmutil.plru import PLRU
+from soc.experiment.cache_ram import CacheRam
+from soc.experiment.plru import PLRU
+
+from soc.experiment.mem_types import (Fetch1ToICacheType,
+ ICacheToDecode1Type,
+ MMUToICacheType)
+
+from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
+ WB_SEL_BITS, WBAddrType, WBDataType,
+ WBSelType, WBMasterOut, WBSlaveOut,
+ WBMasterOutVector, WBSlaveOutVector,
+ WBIOMasterOut, WBIOSlaveOut)
+
+# for test
+from nmigen_soc.wishbone.sram import SRAM
+from nmigen import Memory
+from nmutil.util import wrap
+from nmigen.cli import main, rtlil
+if True:
+ from nmigen.back.pysim import Simulator, Delay, Settle
+else:
+ from nmigen.sim.cxxsim import Simulator, Delay, Settle
+
+
+SIM = 0
+LINE_SIZE = 64
+# BRAM organisation: We never access more than wishbone_data_bits
+# at a time so to save resources we make the array only that wide,
+# and use consecutive indices to make a cache "line"
+#
+# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
+ROW_SIZE = WB_DATA_BITS // 8
+# Number of lines in a set
+NUM_LINES = 32
+# Number of ways
+NUM_WAYS = 4
+# L1 ITLB number of entries (direct mapped)
+TLB_SIZE = 64
+# L1 ITLB log_2(page_size)
+TLB_LG_PGSZ = 12
+# Number of real address bits that we store
+REAL_ADDR_BITS = 56
+# Non-zero to enable log data collection
+LOG_LENGTH = 0
+
+ROW_SIZE_BITS = ROW_SIZE * 8
+# ROW_PER_LINE is the number of rows
+# (wishbone transactions) in a line
+ROW_PER_LINE = LINE_SIZE // ROW_SIZE
+# BRAM_ROWS is the number of rows in
+# BRAM needed to represent the full icache
+BRAM_ROWS = NUM_LINES * ROW_PER_LINE
+# INSN_PER_ROW is the number of 32bit
+# instructions per BRAM row
+INSN_PER_ROW = ROW_SIZE_BITS // 32
+
+print("ROW_SIZE", ROW_SIZE)
+print("ROW_SIZE_BITS", ROW_SIZE_BITS)
+print("ROW_PER_LINE", ROW_PER_LINE)
+print("BRAM_ROWS", BRAM_ROWS)
+print("INSN_PER_ROW", INSN_PER_ROW)
+
+# Bit fields counts in the address
+#
+# INSN_BITS is the number of bits to
+# select an instruction in a row
+INSN_BITS = log2_int(INSN_PER_ROW)
+# ROW_BITS is the number of bits to
+# select a row
+ROW_BITS = log2_int(BRAM_ROWS)
+# ROW_LINEBITS is the number of bits to
+# select a row within a line
+ROW_LINE_BITS = log2_int(ROW_PER_LINE)
+# LINE_OFF_BITS is the number of bits for
+# the offset in a cache line
+LINE_OFF_BITS = log2_int(LINE_SIZE)
+# ROW_OFF_BITS is the number of bits for
+# the offset in a row
+ROW_OFF_BITS = log2_int(ROW_SIZE)
+# INDEX_BITS is the number of bits to
+# select a cache line
+INDEX_BITS = log2_int(NUM_LINES)
+# SET_SIZE_BITS is the log base 2 of
+# the set size
+SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
+# TAG_BITS is the number of bits of
+# the tag part of the address
+TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
+# WAY_BITS is the number of bits to
+# select a way
+WAY_BITS = log2_int(NUM_WAYS)
+TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
+# -- L1 ITLB.
+# constant TLB_BITS : natural := log2(TLB_SIZE);
+# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
+# constant TLB_PTE_BITS : natural := 64;
+TLB_BITS = log2_int(TLB_SIZE)
+TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
+TLB_PTE_BITS = 64
-from soc.experiment.mem_types import Fetch1ToICacheType,
- ICacheToDecode1Type,
- MMUToICacheType
-from experiment.wb_types import WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
- WBAddrType, WBDataType, WBSelType,
- WbMasterOut, WBSlaveOut,
- WBMasterOutVector, WBSlaveOutVector,
- WBIOMasterOut, WBIOSlaveOut
+print("INSN_BITS", INSN_BITS)
+print("ROW_BITS", ROW_BITS)
+print("ROW_LINE_BITS", ROW_LINE_BITS)
+print("LINE_OFF_BITS", LINE_OFF_BITS)
+print("ROW_OFF_BITS", ROW_OFF_BITS)
+print("INDEX_BITS", INDEX_BITS)
+print("SET_SIZE_BITS", SET_SIZE_BITS)
+print("TAG_BITS", TAG_BITS)
+print("WAY_BITS", WAY_BITS)
+print("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
+print("TLB_BITS", TLB_BITS)
+print("TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
+print("TLB_PTE_BITS", TLB_PTE_BITS)
-# Cache reload state machine
-@unique
-class State(Enum):
- IDLE
- CLR_TAG
- WAIT_ACK
-# type reg_internal_t is record
-# -- Cache hit state (Latches for 1 cycle BRAM access)
-# hit_way : way_t;
-# hit_nia : std_ulogic_vector(63 downto 0);
-# hit_smark : std_ulogic;
-# hit_valid : std_ulogic;
-#
-# -- Cache miss state (reload state machine)
-# state : state_t;
-# wb : wishbone_master_out;
-# store_way : way_t;
-# store_index : index_t;
-# store_row : row_t;
-# store_tag : cache_tag_t;
-# store_valid : std_ulogic;
-# end_row_ix : row_in_line_t;
-# rows_valid : row_per_line_valid_t;
-#
-# -- TLB miss state
-# fetch_failed : std_ulogic;
-# end record;
-class RegInternal(RecordObject):
- def __init__(self):
- super().__init__()
- # Cache hit state (Latches for 1 cycle BRAM access)
- self.hit_way = Signal(NUM_WAYS)
- self.hit_nia = Signal(64)
- self.hit_smark = Signal()
- self.hit_valid = Signal()
-
- # Cache miss state (reload state machine)
- self.state = State()
- self.wb = WBMasterOut()
- self.store_way = Signal(NUM_WAYS)
- self.store_index = Signal(NUM_LINES)
- self.store_row = Signal(BRAM_ROWS)
- self.store_tag = Signal(TAG_BITS)
- self.store_valid = Signal()
- self.end_row_ix = Signal(ROW_LINE_BITS)
- self.rows_valid = RowPerLineValidArray()
- # TLB miss state
- self.fetch_failed = Signal()
-
-# -- 64 bit direct mapped icache. All instructions are 4B aligned.
-#
-# entity icache is
-# generic (
-# SIM : boolean := false;
-# -- Line size in bytes
-# LINE_SIZE : positive := 64;
-# -- BRAM organisation: We never access more than wishbone_data_bits
-# -- at a time so to save resources we make the array only that wide,
-# -- and use consecutive indices for to make a cache "line"
-# --
-# -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
-# -- so 64-bits)
-# ROW_SIZE : positive := wishbone_data_bits / 8;
-# -- Number of lines in a set
-# NUM_LINES : positive := 32;
-# -- Number of ways
-# NUM_WAYS : positive := 4;
-# -- L1 ITLB number of entries (direct mapped)
-# TLB_SIZE : positive := 64;
-# -- L1 ITLB log_2(page_size)
-# TLB_LG_PGSZ : positive := 12;
-# -- Number of real address bits that we store
-# REAL_ADDR_BITS : positive := 56;
-# -- Non-zero to enable log data collection
-# LOG_LENGTH : natural := 0
-# );
-# port (
-# clk : in std_ulogic;
-# rst : in std_ulogic;
-#
-# i_in : in Fetch1ToIcacheType;
-# i_out : out IcacheToDecode1Type;
-#
-# m_in : in MmuToIcacheType;
+# architecture rtl of icache is
+#constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
+#-- ROW_PER_LINE is the number of row (wishbone
+#-- transactions) in a line
+#constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
+#-- BRAM_ROWS is the number of rows in BRAM
+#-- needed to represent the full
+#-- icache
+#constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
+#-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
+#constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
+#-- Bit fields counts in the address
#
-# stall_in : in std_ulogic;
-# stall_out : out std_ulogic;
-# flush_in : in std_ulogic;
-# inval_in : in std_ulogic;
+#-- INSN_BITS is the number of bits to select
+#-- an instruction in a row
+#constant INSN_BITS : natural := log2(INSN_PER_ROW);
+#-- ROW_BITS is the number of bits to select a row
+#constant ROW_BITS : natural := log2(BRAM_ROWS);
+#-- ROW_LINEBITS is the number of bits to
+#-- select a row within a line
+#constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
+#-- LINE_OFF_BITS is the number of bits for the offset
+#-- in a cache line
+#constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
+#-- ROW_OFF_BITS is the number of bits for the offset in a row
+#constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
+#-- INDEX_BITS is the number of bits to select a cache line
+#constant INDEX_BITS : natural := log2(NUM_LINES);
+#-- SET_SIZE_BITS is the log base 2 of the set size
+#constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
+#-- TAG_BITS is the number of bits of the tag part of the address
+#constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
+#-- WAY_BITS is the number of bits to select a way
+#constant WAY_BITS : natural := log2(NUM_WAYS);
+
+#-- Example of layout for 32 lines of 64 bytes:
+#--
+#-- .. tag |index| line |
+#-- .. | row | |
+#-- .. | | | |00| zero (2)
+#-- .. | | |-| | INSN_BITS (1)
+#-- .. | |---| | ROW_LINEBITS (3)
+#-- .. | |--- - --| LINE_OFF_BITS (6)
+#-- .. | |- --| ROW_OFF_BITS (3)
+#-- .. |----- ---| | ROW_BITS (8)
+#-- .. |-----| | INDEX_BITS (5)
+#-- .. --------| | TAG_BITS (53)
+ # Example of layout for 32 lines of 64 bytes:
+ #
+ # .. tag |index| line |
+ # .. | row | |
+ # .. | | | |00| zero (2)
+ # .. | | |-| | INSN_BITS (1)
+ # .. | |---| | ROW_LINEBITS (3)
+ # .. | |--- - --| LINE_OFF_BITS (6)
+ # .. | |- --| ROW_OFF_BITS (3)
+ # .. |----- ---| | ROW_BITS (8)
+ # .. |-----| | INDEX_BITS (5)
+ # .. --------| | TAG_BITS (53)
+
+#subtype row_t is integer range 0 to BRAM_ROWS-1;
+#subtype index_t is integer range 0 to NUM_LINES-1;
+#subtype way_t is integer range 0 to NUM_WAYS-1;
+#subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
#
-# wishbone_out : out wishbone_master_out;
-# wishbone_in : in wishbone_slave_out;
+#-- The cache data BRAM organized as described above for each way
+#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
-# log_out : out std_ulogic_vector(53 downto 0)
-# );
-# end entity icache;
-# 64 bit direct mapped icache. All instructions are 4B aligned.
-class ICache(Elaboratable):
- """64 bit direct mapped icache. All instructions are 4B aligned."""
- def __init__(self):
- self.SIM = 0
- self.LINE_SIZE = 64
- # BRAM organisation: We never access more than wishbone_data_bits
- # at a time so to save resources we make the array only that wide,
- # and use consecutive indices for to make a cache "line"
- #
- # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
- self.ROW_SIZE = WB_DATA_BITS / 8
- # Number of lines in a set
- self.NUM_LINES = 32
- # Number of ways
- self.NUM_WAYS = 4
- # L1 ITLB number of entries (direct mapped)
- self.TLB_SIZE = 64
- # L1 ITLB log_2(page_size)
- self.TLB_LG_PGSZ = 12
- # Number of real address bits that we store
- self.REAL_ADDR_BITS = 56
- # Non-zero to enable log data collection
- self.LOG_LENGTH = 0
-
- self.i_in = Fetch1ToICacheType()
- self.i_out = ICacheToDecode1Type()
-
- self.m_in = MMUToICacheType()
-
- self.stall_in = Signal()
- self.stall_out = Signal()
- self.flush_in = Signal()
- self.inval_in = Signal()
-
- self.wb_out = WBMasterOut()
- self.wb_in = WBSlaveOut()
-
- self.log_out = Signal(54)
+#-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
+#-- not handle a clean (commented) definition of the cache tags as a 3d
+#-- memory. For now, work around it by putting all the tags
+#subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
+# type cache_tags_set_t is array(way_t) of cache_tag_t;
+# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
+#constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
+#subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
+#type cache_tags_array_t is array(index_t) of cache_tags_set_t;
+def CacheTagArray():
+ return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x) \
+ for x in range(NUM_LINES))
+
+#-- The cache valid bits
+#subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
+#type cache_valids_t is array(index_t) of cache_way_valids_t;
+#type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
+def CacheValidBitsArray():
+ return Array(Signal(NUM_WAYS, name="cahcevalid_%d" %x) \
+ for x in range(NUM_LINES))
+
+def RowPerLineValidArray():
+ return Array(Signal(name="rows_valid_%d" %x) \
+ for x in range(ROW_PER_LINE))
+
+
+#attribute ram_style : string;
+#attribute ram_style of cache_tags : signal is "distributed";
+    # TODO to be passed to nmigen as ram attributes
+ # attribute ram_style : string;
+ # attribute ram_style of cache_tags : signal is "distributed";
+
+
+#subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
+#type tlb_valids_t is array(tlb_index_t) of std_ulogic;
+#subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
+#type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
+#subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
+#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
+def TLBValidBitsArray():
+ return Array(Signal(name="tlbvalid_%d" %x) \
+ for x in range(TLB_SIZE))
+
+def TLBTagArray():
+ return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x) \
+ for x in range(TLB_SIZE))
+
+def TLBPtesArray():
+ return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" %x) \
+ for x in range(TLB_SIZE))
+
+
+#-- Cache RAM interface
+#type cache_ram_out_t is array(way_t) of cache_row_t;
+# Cache RAM interface
+def CacheRamOut():
+ return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
+ for x in range(NUM_WAYS))
+
+#-- PLRU output interface
+#type plru_out_t is array(index_t) of
+# std_ulogic_vector(WAY_BITS-1 downto 0);
+# PLRU output interface
+def PLRUOut():
+ return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
+ for x in range(NUM_LINES))
# -- Return the cache line index (tag index) for an address
# function get_index(addr: std_ulogic_vector(63 downto 0))
# addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
# ));
# end;
- # Return the cache line index (tag index) for an address
- def get_index(addr):
- return addr[LINE_OFF_BITS:SET_SIZE_BITS]
+# Return the cache line index (tag index) for an address
+def get_index(addr):
+ return addr[LINE_OFF_BITS:SET_SIZE_BITS]
# -- Return the cache row index (data memory) for an address
-# function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is
+# function get_row(addr: std_ulogic_vector(63 downto 0))
+# return row_t is
# begin
# return to_integer(unsigned(
# addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
# ));
# end;
- # Return the cache row index (data memory) for an address
- def get_row(addr):
- return addr[ROW_OFF_BITS:SET_SIZE_BITS]
+# Return the cache row index (data memory) for an address
+def get_row(addr):
+ return addr[ROW_OFF_BITS:SET_SIZE_BITS]
# -- Return the index of a row within a line
# function get_row_of_line(row: row_t) return row_in_line_t is
# row_v := to_unsigned(row, ROW_BITS);
# return row_v(ROW_LINEBITS-1 downto 0);
# end;
- # Return the index of a row within a line
- def get_row_of_line(row):
- row[:ROW_LINE_BITS]
+# Return the index of a row within a line
+def get_row_of_line(row):
+ return row[:ROW_LINE_BITS]
# -- Returns whether this is the last row of a line
-# function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t)
+# function is_last_row_addr(addr: wishbone_addr_type;
+# last: row_in_line_t
+# )
# return boolean is
# begin
-# return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
+# return unsigned(
+# addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
+# ) = last;
# end;
- # Returns whether this is the last row of a line
- def is_last_row_addr(addr, last):
- return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
+# Returns whether this is the last row of a line
+def is_last_row_addr(addr, last):
+ return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
# -- Returns whether this is the last row of a line
-# function is_last_row(row: row_t; last: row_in_line_t) return boolean is
+# function is_last_row(row: row_t;
+# last: row_in_line_t) return boolean is
# begin
# return get_row_of_line(row) = last;
# end;
- # Returns whether this is the last row of a line
- def is_last_row(row, last):
- return get_row_of_line(row) == last
-
-# -- Return the address of the next row in the current cache line
-# function next_row_addr(addr: wishbone_addr_type)
-# return std_ulogic_vector is
-# variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
-# variable result : wishbone_addr_type;
-# begin
-# -- Is there no simpler way in VHDL to generate that 3 bits adder ?
-# row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
-# row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
-# result := addr;
-# result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
-# return result;
-# end;
- # Return the address of the next row in the current cache line
- def next_row_addr(addr):
- # TODO no idea what's going on here, looks like double assignments
- # overriding earlier assignments ??? Help please!
+# Returns whether this is the last row of a line
+def is_last_row(row, last):
+ return get_row_of_line(row) == last
# -- Return the next row in the current cache line. We use a dedicated
# -- function in order to limit the size of the generated adder to be
# std_ulogic_vector(unsigned(row_idx) + 1);
# return to_integer(unsigned(row_v));
# end;
- # Return the next row in the current cache line. We use a dedicated
- # function in order to limit the size of the generated adder to be
- # only the bits within a cache line (3 bits with default settings)
- def next_row(row):
- # TODO no idea what's going on here, looks like double assignments
- # overriding earlier assignments ??? Help please!
-
+# Return the next row in the current cache line. We use a dedicated
+# function in order to limit the size of the generated adder to be
+# only the bits within a cache line (3 bits with default settings)
+def next_row(row):
+ row_v = row[0:ROW_LINE_BITS] + 1
+ return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
# -- Read the instruction word for the given address in the
# -- current cache row
# function read_insn_word(addr: std_ulogic_vector(63 downto 0);
# word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
# return data(31+word*32 downto word*32);
# end;
- # Read the instruction word for the given address
- # in the current cache row
- def read_insn_word(addr, data):
- word = addr[2:INSN_BITS+3]
- return data[word * 32:32 + word * 32]
+# Read the instruction word for the given address
+# in the current cache row
+def read_insn_word(addr, data):
+ word = addr[2:INSN_BITS+3]
+ return data.word_select(word, 32)
# -- Get the tag value from the address
-# function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0))
+# function get_tag(
+# addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
+# )
# return cache_tag_t is
# begin
# return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
# end;
- # Get the tag value from the address
- def get_tag(addr):
- return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
+# Get the tag value from the address
+def get_tag(addr):
+ return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
# -- Read a tag from a tag memory row
# function read_tag(way: way_t; tagset: cache_tags_set_t)
# begin
# return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
# end;
- # Read a tag from a tag memory row
- def read_tag(way, tagset):
- return tagset[way * TAG_BITS:(way + 1) * TAG_BITS]
+# Read a tag from a tag memory row
+def read_tag(way, tagset):
+ return tagset[way * TAG_BITS:(way + 1) * TAG_BITS]
# -- Write a tag to tag memory row
-# procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
-# tag: cache_tag_t) is
+# procedure write_tag(way: in way_t;
+# tagset: inout cache_tags_set_t; tag: cache_tag_t) is
# begin
# tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
# end;
- # Write a tag to tag memory row
- def write_tag(way, tagset, tag):
- tagset[way * TAG_BITS:(way + 1) * TAG_BITS] = tag
+# Write a tag to tag memory row
+def write_tag(way, tagset, tag):
+ return tagset[way * TAG_BITS:(way + 1) * TAG_BITS].eq(tag)
# -- Simple hash for direct-mapped TLB index
# function hash_ea(addr: std_ulogic_vector(63 downto 0))
# );
# return to_integer(unsigned(hash));
# end;
- # Simple hash for direct-mapped TLB index
- def hash_ea(addr):
- hash = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS]
- ^ addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS]
- ^ addr[TLB_LG_PGSZ + 2 * TLB_BITS: TLB_LG_PGSZE + 3 * TLB_BITS]
- return hash
-
- def elaborate(self, platform):
-# architecture rtl of icache is
-# constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
-# -- ROW_PER_LINE is the number of row (wishbone transactions) in a line
-# constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
-# -- BRAM_ROWS is the number of rows in BRAM needed to represent the full
-# -- icache
-# constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
-# -- INSN_PER_ROW is the number of 32bit instructions per BRAM row
-# constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
-# -- Bit fields counts in the address
-#
-# -- INSN_BITS is the number of bits to select an instruction in a row
-# constant INSN_BITS : natural := log2(INSN_PER_ROW);
-# -- ROW_BITS is the number of bits to select a row
-# constant ROW_BITS : natural := log2(BRAM_ROWS);
-# -- ROW_LINEBITS is the number of bits to select a row within a line
-# constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
-# -- LINE_OFF_BITS is the number of bits for the offset in a cache line
-# constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
-# -- ROW_OFF_BITS is the number of bits for the offset in a row
-# constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
-# -- INDEX_BITS is the number of bits to select a cache line
-# constant INDEX_BITS : natural := log2(NUM_LINES);
-# -- SET_SIZE_BITS is the log base 2 of the set size
-# constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
-# -- TAG_BITS is the number of bits of the tag part of the address
-# constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
-# -- WAY_BITS is the number of bits to select a way
-# constant WAY_BITS : natural := log2(NUM_WAYS);
-
- ROW_SIZE_BITS = ROW_SIZE * 8
- # ROW_PER_LINE is the number of row
- # (wishbone) transactions in a line
- ROW_PER_LINE = LINE_SIZE / ROW_SIZE
- # BRAM_ROWS is the number of rows in
- # BRAM needed to represent the full icache
- BRAM_ROWS = NUM_LINES * ROW_PER_LINE
- # INSN_PER_ROW is the number of 32bit
- # instructions per BRAM row
- INSN_PER_ROW = ROW_SIZE_BITS / 32
-
- # Bit fields counts in the address
- #
- # INSN_BITS is the number of bits to
- # select an instruction in a row
- INSN_BITS = log2_int(INSN_PER_ROW)
- # ROW_BITS is the number of bits to
- # select a row
- ROW_BITS = log2_int(BRAM_ROWS)
- # ROW_LINEBITS is the number of bits to
- # select a row within a line
- ROW_LINE_BITS = log2_int(ROW_PER_LINE)
- # LINE_OFF_BITS is the number of bits for
- # the offset in a cache line
- LINE_OFF_BITS = log2_int(LINE_SIZE)
- # ROW_OFF_BITS is the number of bits for
- # the offset in a row
- ROW_OFF_BITS = log2_int(ROW_SIZE)
- # INDEX_BITS is the number of bits to
- # select a cache line
- INDEX_BITS = log2_int(NUM_LINES)
- # SET_SIZE_BITS is the log base 2 of
- # the set size
- SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
- # TAG_BITS is the number of bits of
- # the tag part of the address
- TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
- # WAY_BITS is the number of bits to
- # select a way
- WAY_BITS = log2_int(NUM_WAYS)
- TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
-
-# -- Example of layout for 32 lines of 64 bytes:
-# --
-# -- .. tag |index| line |
-# -- .. | row | |
-# -- .. | | | |00| zero (2)
-# -- .. | | |-| | INSN_BITS (1)
-# -- .. | |---| | ROW_LINEBITS (3)
-# -- .. | |--- - --| LINE_OFF_BITS (6)
-# -- .. | |- --| ROW_OFF_BITS (3)
-# -- .. |----- ---| | ROW_BITS (8)
-# -- .. |-----| | INDEX_BITS (5)
-# -- .. --------| | TAG_BITS (53)
- # Example of layout for 32 lines of 64 bytes:
- #
- # .. tag |index| line |
- # .. | row | |
- # .. | | | |00| zero (2)
- # .. | | |-| | INSN_BITS (1)
- # .. | |---| | ROW_LINEBITS (3)
- # .. | |--- - --| LINE_OFF_BITS (6)
- # .. | |- --| ROW_OFF_BITS (3)
- # .. |----- ---| | ROW_BITS (8)
- # .. |-----| | INDEX_BITS (5)
- # .. --------| | TAG_BITS (53)
-
-# subtype row_t is integer range 0 to BRAM_ROWS-1;
-# subtype index_t is integer range 0 to NUM_LINES-1;
-# subtype way_t is integer range 0 to NUM_WAYS-1;
-# subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
-#
-# -- The cache data BRAM organized as described above for each way
-# subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
-#
-# -- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-# -- not handle a clean (commented) definition of the cache tags as a 3d
-# -- memory. For now, work around it by putting all the tags
-# subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
-# -- type cache_tags_set_t is array(way_t) of cache_tag_t;
-# -- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
-# constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
-# subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
-# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
- def CacheTagArray():
- return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))
-
-# -- The cache valid bits
-# subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
-# type cache_valids_t is array(index_t) of cache_way_valids_t;
-# type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
- def CacheValidBitsArray():
- return Array(Signal() for x in ROW_PER_LINE)
-
- def RowPerLineValidArray():
- return Array(Signal() for x in range ROW_PER_LINE)
-
-# -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
-# signal cache_tags : cache_tags_array_t;
-# signal cache_valids : cache_valids_t;
- # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
- cache_tags = CacheTagArray()
- cache_valid_bits = CacheValidBitsArray()
-
-# attribute ram_style : string;
-# attribute ram_style of cache_tags : signal is "distributed";
- # TODO to be passed to nigmen as ram attributes
- # attribute ram_style : string;
- # attribute ram_style of cache_tags : signal is "distributed";
-
-# -- L1 ITLB.
-# constant TLB_BITS : natural := log2(TLB_SIZE);
-# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
-# constant TLB_PTE_BITS : natural := 64;
- TLB_BITS = log2_int(TLB_SIZE)
- TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
- TLB_PTE_BITS = 64
-
-# subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
-# type tlb_valids_t is array(tlb_index_t) of std_ulogic;
-# subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
-# type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
-# subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
-# type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
- def TLBValidBitsArray():
- return Array(Signal() for x in range(TLB_SIZE))
-
- def TLBTagArray():
- return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))
-
- def TLBPTEArray():
- return Array(Signal(LTB_PTE_BITS) for x in range(TLB_SIZE))
-
-# signal itlb_valids : tlb_valids_t;
-# signal itlb_tags : tlb_tags_t;
-# signal itlb_ptes : tlb_ptes_t;
-# attribute ram_style of itlb_tags : signal is "distributed";
-# attribute ram_style of itlb_ptes : signal is "distributed";
- itlb_valid_bits = TLBValidBitsArray()
- itlb_tags = TLBTagArray()
- itlb_ptes = TLBPTEArray()
- # TODO to be passed to nmigen as ram attributes
- # attribute ram_style of itlb_tags : signal is "distributed";
- # attribute ram_style of itlb_ptes : signal is "distributed";
-
-# -- Privilege bit from PTE EAA field
-# signal eaa_priv : std_ulogic;
- # Privilege bit from PTE EAA field
- eaa_priv = Signal()
-
-
-# signal r : reg_internal_t;
- r = RegInternal()
-
-# -- Async signals on incoming request
-# signal req_index : index_t;
-# signal req_row : row_t;
-# signal req_hit_way : way_t;
-# signal req_tag : cache_tag_t;
-# signal req_is_hit : std_ulogic;
-# signal req_is_miss : std_ulogic;
-# signal req_laddr : std_ulogic_vector(63 downto 0);
- # Async signal on incoming request
- req_index = Signal(NUM_LINES)
- req_row = Signal(BRAM_ROWS)
- req_hit_way = Signal(NUM_WAYS)
- req_tag = Signal(TAG_BITS)
- req_is_hit = Signal()
- req_is_miss = Signal()
- req_laddr = Signal(64)
-
-# signal tlb_req_index : tlb_index_t;
-# signal real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
-# signal ra_valid : std_ulogic;
-# signal priv_fault : std_ulogic;
-# signal access_ok : std_ulogic;
-# signal use_previous : std_ulogic;
- tlb_req_index = Signal(TLB_SIZE)
- real_addr = Signal(REAL_ADDR_BITS)
- ra_valid = Signal()
- priv_fault = Signal()
- access_ok = Signal()
- use_previous = Signal()
-
-# -- Cache RAM interface
-# type cache_ram_out_t is array(way_t) of cache_row_t;
-# signal cache_out : cache_ram_out_t;
- # Cache RAM interface
- def CacheRamOut():
- return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))
-
- cache_out = CacheRamOut()
-
-# -- PLRU output interface
-# type plru_out_t is array(index_t) of
-# std_ulogic_vector(WAY_BITS-1 downto 0);
-# signal plru_victim : plru_out_t;
-# signal replace_way : way_t;
- # PLRU output interface
- def PLRUOut():
- return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
-
- plru_victim = PLRUOut()
- replace_way = Signal(NUM_WAYS)
+# Simple hash for direct-mapped TLB index
+def hash_ea(addr):
+ hsh = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^ addr[
+ TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS
+ ] ^ addr[
+ TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS
+ ]
+ return hsh
# begin
#
# end process;
# end generate;
+# Cache reload state machine
+@unique
+class State(Enum):
+ IDLE = 0
+ CLR_TAG = 1
+ WAIT_ACK = 2
+
+# type reg_internal_t is record
+# -- Cache hit state (Latches for 1 cycle BRAM access)
+# hit_way : way_t;
+# hit_nia : std_ulogic_vector(63 downto 0);
+# hit_smark : std_ulogic;
+# hit_valid : std_ulogic;
+#
+# -- Cache miss state (reload state machine)
+# state : state_t;
+# wb : wishbone_master_out;
+# store_way : way_t;
+# store_index : index_t;
+# store_row : row_t;
+# store_tag : cache_tag_t;
+# store_valid : std_ulogic;
+# end_row_ix : row_in_line_t;
+# rows_valid : row_per_line_valid_t;
+#
+# -- TLB miss state
+# fetch_failed : std_ulogic;
+# end record;
+class RegInternal(RecordObject):
+ def __init__(self):
+ super().__init__()
+ # Cache hit state (Latches for 1 cycle BRAM access)
+ self.hit_way = Signal(NUM_WAYS)
+ self.hit_nia = Signal(64)
+ self.hit_smark = Signal()
+ self.hit_valid = Signal()
+
+ # Cache miss state (reload state machine)
+ self.state = Signal(State, reset=State.IDLE)
+ self.wb = WBMasterOut("wb")
+ self.store_way = Signal(NUM_WAYS)
+ self.store_index = Signal(NUM_LINES)
+ self.store_row = Signal(BRAM_ROWS)
+ self.store_tag = Signal(TAG_BITS)
+ self.store_valid = Signal()
+ self.end_row_ix = Signal(ROW_LINE_BITS)
+ self.rows_valid = RowPerLineValidArray()
+
+ # TLB miss state
+ self.fetch_failed = Signal()
+
+# -- 64 bit direct mapped icache. All instructions are 4B aligned.
+#
+# entity icache is
+# generic (
+# SIM : boolean := false;
+# -- Line size in bytes
+# LINE_SIZE : positive := 64;
+# -- BRAM organisation: We never access more
+# -- than wishbone_data_bits
+# -- at a time so to save resources we make the
+# -- array only that wide,
+# -- and use consecutive indices for to make a cache "line"
+# --
+# -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
+# -- so 64-bits)
+# ROW_SIZE : positive := wishbone_data_bits / 8;
+# -- Number of lines in a set
+# NUM_LINES : positive := 32;
+# -- Number of ways
+# NUM_WAYS : positive := 4;
+# -- L1 ITLB number of entries (direct mapped)
+# TLB_SIZE : positive := 64;
+# -- L1 ITLB log_2(page_size)
+# TLB_LG_PGSZ : positive := 12;
+# -- Number of real address bits that we store
+# REAL_ADDR_BITS : positive := 56;
+# -- Non-zero to enable log data collection
+# LOG_LENGTH : natural := 0
+# );
+# port (
+# clk : in std_ulogic;
+# rst : in std_ulogic;
+#
+# i_in : in Fetch1ToIcacheType;
+# i_out : out IcacheToDecode1Type;
+#
+# m_in : in MmuToIcacheType;
+#
+# stall_in : in std_ulogic;
+# stall_out : out std_ulogic;
+# flush_in : in std_ulogic;
+# inval_in : in std_ulogic;
+#
+# wishbone_out : out wishbone_master_out;
+# wishbone_in : in wishbone_slave_out;
+#
+# log_out : out std_ulogic_vector(53 downto 0)
+# );
+# end entity icache;
+# 64 bit direct mapped icache. All instructions are 4B aligned.
+class ICache(Elaboratable):
+ """64 bit direct mapped icache. All instructions are 4B aligned."""
+ def __init__(self):
+ self.i_in = Fetch1ToICacheType(name="i_in")
+ self.i_out = ICacheToDecode1Type(name="i_out")
+
+ self.m_in = MMUToICacheType(name="m_in")
+
+ self.stall_in = Signal()
+ self.stall_out = Signal()
+ self.flush_in = Signal()
+ self.inval_in = Signal()
+
+ self.wb_out = WBMasterOut(name="wb_out")
+ self.wb_in = WBSlaveOut(name="wb_in")
+
+ self.log_out = Signal(54)
+
+
# -- Generate a cache RAM for each way
# rams: for i in 0 to NUM_WAYS-1 generate
# signal do_read : std_ulogic;
# do_write <= '1';
# end if;
# cache_out(i) <= dout;
-# rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
-# wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
+# rd_addr <=
+# std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
+# wr_addr <=
+# std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
# for i in 0 to ROW_SIZE-1 loop
# wr_sel(i) <= do_write;
# end loop;
# end process;
# end generate;
- def rams(self, m):
+ def rams(self, m, r, cache_out, use_previous, replace_way, req_row):
comb = m.d.comb
+ wb_in, stall_in = self.wb_in, self.stall_in
+
do_read = Signal()
do_write = Signal()
rd_addr = Signal(ROW_BITS)
_d_out = Signal(ROW_SIZE_BITS)
wr_sel = Signal(ROW_SIZE)
- for i in range(NUM_WAYS)
+ for i in range(NUM_WAYS):
way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
comb += way.rd_en.eq(do_read)
comb += way.rd_addr.eq(rd_addr)
- comb += way.rd_data.eq(_d_out)
+ comb += way.rd_data_o.eq(_d_out)
comb += way.wr_sel.eq(wr_sel)
- comb += way.wr_add.eq(wr_addr)
- comb += way.wr_data.eq('''TODO ?? wishbone_in.data ??''')
+ comb += way.wr_addr.eq(wr_addr)
+ comb += way.wr_data.eq(wb_in.dat)
comb += do_read.eq(~(stall_in | use_previous))
- comb += do_write.eq(0)
- with m.If(wb_in.ack & replace_way == i):
- do_write.eq(1)
+ with m.If(wb_in.ack & (replace_way == i)):
+ comb += do_write.eq(1)
comb += cache_out[i].eq(_d_out)
- comb += rd_addr.eq(Signal(req_row))
- comb += wr_addr.eq(Signal(r.store_row))
+ comb += rd_addr.eq(req_row)
+ comb += wr_addr.eq(r.store_row)
for j in range(ROW_SIZE):
comb += wr_sel[j].eq(do_write)
# end process;
# end generate;
# end generate;
- def maybe_plrus(self, m):
- comb += m.d.comb
+ def maybe_plrus(self, m, r, plru_victim):
+ comb = m.d.comb
with m.If(NUM_WAYS > 1):
- plru_acc = Signal(WAY_BITS)
- plru_acc_en = Signal()
- plru_out = Signal(WAY_BITS)
-
for i in range(NUM_LINES):
- plru = PLRU(WAY_BITS)
- comb += plru.acc.eq(plru_acc)
+ plru_acc_i = Signal(WAY_BITS)
+ plru_acc_en = Signal()
+ plru_out = Signal(WAY_BITS)
+ plru = PLRU(WAY_BITS)
+ comb += plru.acc_i.eq(plru_acc_i)
comb += plru.acc_en.eq(plru_acc_en)
- comb += plru.lru.eq(plru_out)
+ comb += plru.lru_o.eq(plru_out)
# PLRU interface
with m.If(get_index(r.hit_nia) == i):
comb += plru.acc_en.eq(r.hit_valid)
- with m.Else():
- comb += plru.acc_en.eq(0)
-
- comb += plru.acc.eq(r.hit_way)
- comb += plru_victim[i].eq(plru.lru)
+ comb += plru.acc_i.eq(r.hit_way)
+ comb += plru_victim[i].eq(plru.lru_o)
# -- TLB hit detection and real address generation
# itlb_lookup : process(all)
# access_ok <= ra_valid and not priv_fault;
# end process;
# TLB hit detection and real address generation
- def itlb_lookup(self, m):
+ def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
+ real_addr, itlb_valid_bits, ra_valid, eaa_priv,
+ priv_fault, access_ok):
comb = m.d.comb
+ i_in = self.i_in
+
+ pte = Signal(TLB_PTE_BITS)
+ ttag = Signal(TLB_EA_TAG_BITS)
+
comb += tlb_req_index.eq(hash_ea(i_in.nia))
comb += pte.eq(itlb_ptes[tlb_req_index])
comb += ttag.eq(itlb_tags[tlb_req_index])
with m.If(i_in.virt_mode):
comb += real_addr.eq(Cat(
- i_in.nia[:TLB_LB_PGSZ],
+ i_in.nia[:TLB_LG_PGSZ],
pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
))
with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
- with m.Else():
- comb += ra_valid.eq(0)
+ comb += eaa_priv.eq(pte[3])
with m.Else():
comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
# begin
# if rising_edge(clk) then
# wr_index := hash_ea(m_in.addr);
-# if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then
+# if rst = '1' or
+# (m_in.tlbie = '1' and m_in.doall = '1') then
# -- clear all valid bits
# for i in tlb_index_t loop
# itlb_valids(i) <= '0';
# -- clear entry regardless of hit or miss
# itlb_valids(wr_index) <= '0';
# elsif m_in.tlbld = '1' then
-# itlb_tags(wr_index) <= m_in.addr(
-# 63 downto TLB_LG_PGSZ + TLB_BITS
-# );
+# itlb_tags(wr_index) <=
+# m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
# itlb_ptes(wr_index) <= m_in.pte;
# itlb_valids(wr_index) <= '1';
# end if;
# end if;
# end process;
# iTLB update
- def itlb_update(self, m):
+ def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
+ comb = m.d.comb
sync = m.d.sync
+ m_in = self.m_in
+
wr_index = Signal(TLB_SIZE)
sync += wr_index.eq(hash_ea(m_in.addr))
- with m.If('''TODO rst in nmigen''' | (m_in.tlbie & m_in.doall)):
+ with m.If(m_in.tlbie & m_in.doall):
# Clear all valid bits
for i in range(TLB_SIZE):
- sync += itlb_vlaids[i].eq(0)
+ sync += itlb_valid_bits[i].eq(0)
with m.Elif(m_in.tlbie):
# Clear entry regardless of hit or miss
# -- Cache hit detection, output to fetch2 and other misc logic
# icache_comb : process(all)
# Cache hit detection, output to fetch2 and other misc logic
- def icache_comb(self, m):
+ def icache_comb(self, m, use_previous, r, req_index, req_row,
+ req_tag, real_addr, req_laddr, cache_valid_bits,
+ cache_tags, access_ok, req_is_hit,
+ req_is_miss, replace_way, plru_victim, cache_out):
# variable is_hit : std_ulogic;
# variable hit_way : way_t;
comb = m.d.comb
+ i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
+ flush_in, stall_out = self.flush_in, self.stall_out
+
is_hit = Signal()
hit_way = Signal(NUM_WAYS)
# begin
-# -- i_in.sequential means that i_in.nia this cycle is 4 more than
-# -- last cycle. If we read more than 32 bits at a time, had a
-# -- cache hit last cycle, and we don't want the first 32-bit chunk
-# -- then we can keep the data we read last cycle and just use that.
+# -- i_in.sequential means that i_in.nia this cycle
+# -- is 4 more than last cycle. If we read more
+# -- than 32 bits at a time, had a cache hit last
+# -- cycle, and we don't want the first 32-bit chunk
+# -- then we can keep the data we read last cycle
+# -- and just use that.
# if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
# use_previous <= i_in.sequential and r.hit_valid;
# else
with m.If(i_in.nia[2:INSN_BITS+2] != 0):
comb += use_previous.eq(i_in.sequential & r.hit_valid)
- with m.else():
- comb += use_previous.eq(0)
-
# -- Extract line, row and tag from request
# req_index <= get_index(i_in.nia);
# req_row <= get_row(i_in.nia);
# -- Calculate address of beginning of cache row, will be
# -- used for cache miss processing if needed
-# req_laddr <= (63 downto REAL_ADDR_BITS => '0') &
-# real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
-# (ROW_OFF_BITS-1 downto 0 => '0');
+# req_laddr <=
+# (63 downto REAL_ADDR_BITS => '0') &
+# real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
+# (ROW_OFF_BITS-1 downto 0 => '0');
# Calculate address of beginning of cache row, will be
# used for cache miss processing if needed
comb += req_laddr.eq(Cat(
Const(0b0, ROW_OFF_BITS),
real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
- Const(0, REAL_ADDR_BITS)
+ Const(0b0, 8)
))
# -- Test if pending request is a hit on any way
((r.state == State.WAIT_ACK)
& (req_index == r.store_index)
& (i == r.store_way)
- & r.rows_valid[req_row % ROW_PER_LINE])):
- with m.If(read_tag(i, cahce_tags[req_index]) == req_tag):
+ & r.rows_valid[req_row % ROW_PER_LINE]))):
+ with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
comb += hit_way.eq(i)
comb += is_hit.eq(1)
-# -- Generate the "hit" and "miss" signals for the synchronous blocks
+# -- Generate the "hit" and "miss" signals
+# -- for the synchronous blocks
# if i_in.req = '1' and access_ok = '1' and flush_in = '0'
# and rst = '0' then
# req_is_hit <= is_hit;
# req_is_miss <= '0';
# end if;
# req_hit_way <= hit_way;
- # Generate the "hit" and "miss" signals for the synchronous blocks
- with m.If(i_in.rq & access_ok & ~flush_in & '''TODO nmigen rst'''):
+ # Generate the "hit" and "miss" signals
+ # for the synchronous blocks
+ with m.If(i_in.req & access_ok & ~flush_in):
comb += req_is_hit.eq(is_hit)
comb += req_is_miss.eq(~is_hit)
# -- The way to replace on a miss
# if r.state = CLR_TAG then
-# replace_way <= to_integer(unsigned(plru_victim(r.store_index)));
+# replace_way <=
+# to_integer(unsigned(plru_victim(r.store_index)));
# else
# replace_way <= r.store_way;
# end if;
# be output an entire row which I prefer not to do just yet
# as it would force fetch2 to know about some of the cache
# geometry information.
- comb += i_out.insn.eq(
- read_insn_word(r.hit_nia, cache_out[r.hit_way])
- )
+ comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out[r.hit_way]))
comb += i_out.valid.eq(r.hit_valid)
comb += i_out.nia.eq(r.hit_nia)
comb += i_out.stop_mark.eq(r.hit_smark)
# -- Cache hit synchronous machine
# icache_hit : process(clk)
# Cache hit synchronous machine
- def icache_hit(self, m):
+ def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
+ req_index, req_tag, real_addr):
sync = m.d.sync
+
+ i_in, stall_in = self.i_in, self.stall_in
+ flush_in = self.flush_in
+
# begin
# if rising_edge(clk) then
# -- keep outputs to fetch2 unchanged on a stall
# If use_previous, keep the same data as last
# cycle and use the second half
with m.If(stall_in | use_previous):
- with m.If('''TODO rst nmigen''' | flush_in):
+ with m.If(flush_in):
sync += r.hit_valid.eq(0)
# else
# -- On a hit, latch the request for the next cycle,
# " tag:" & to_hstring(req_tag) &
# " way:" & integer'image(req_hit_way) &
# " RA:" & to_hstring(real_addr);
- print(f"cache hit nia:{i_in.nia}, IR:{i_in.virt_mode}, " \
+ # XXX NO do not use f"" use %d and %x. see dcache.py Display
+ print(f"cache hit nia:{i_in.nia}, " \
+ f"IR:{i_in.virt_mode}, " \
f"SM:{i_in.stop_mark}, idx:{req_index}, " \
- f"tag:{req_tag}, way:{req_hit_way}, RA:{real_addr}")
+ f"tag:{req_tag}, way:{req_hit_way}, " \
+ f"RA:{real_addr}")
# end if;
# end if;
# if stall_in = '0' then
# -- Cache miss/reload synchronous machine
# icache_miss : process(clk)
# Cache miss/reload synchronous machine
- def icache_miss(self, m):
+ def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
+ req_index, req_laddr, req_tag, replace_way,
+ cache_tags, access_ok, real_addr):
comb = m.d.comb
sync = m.d.sync
+ i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
+ stall_in, flush_in = self.stall_in, self.flush_in
+ inval_in = self.inval_in
+
# variable tagset : cache_tags_set_t;
# variable stbs_done : boolean;
# -- On reset, clear all valid bits to force misses
# if rst = '1' then
# On reset, clear all valid bits to force misses
- with m.If('''TODO rst nmigen'''):
# for i in index_t loop
# cache_valids(i) <= (others => '0');
# end loop;
- for i in Signal(NUM_LINES):
- sync += cache_valid_bits[i].eq(~1)
-
# r.state <= IDLE;
# r.wb.cyc <= '0';
# r.wb.stb <= '0';
- sync += r.state.eq(State.IDLE)
- sync += r.wb.cyc.eq(0)
- sync += r.wb.stb.eq(0)
-
# -- We only ever do reads on wishbone
# r.wb.dat <= (others => '0');
# r.wb.sel <= "11111111";
# r.wb.we <= '0';
- # We only ever do reads on wishbone
- sync += r.wb.dat.eq(~1)
- sync += r.wb.sel.eq(Const(0b11111111, 8))
- sync += r.wb.we.eq(0)
-# -- Not useful normally but helps avoiding tons of sim warnings
+# -- Not useful normally but helps avoiding
+# -- tons of sim warnings
# r.wb.adr <= (others => '0');
- # Not useful normally but helps avoiding tons of sim warnings
- sync += r.wb.adr.eq(~1)
# else
- with m.Else():
+
# -- Process cache invalidations
# if inval_in = '1' then
# for i in index_t loop
# end loop;
# r.store_valid <= '0';
# end if;
- # Process cache invalidations
- with m.If(inval_in):
- for i in range(NUM_LINES):
- sync += cache_valid_bits[i].eq(~1)
-
- sync += r.store_valid.eq(0)
+ # Process cache invalidations
+ with m.If(inval_in):
+ for i in range(NUM_LINES):
+ sync += cache_valid_bits[i].eq(~1)
+ sync += r.store_valid.eq(0)
# -- Main state machine
# case r.state is
- # Main state machine
- with m.Switch(r.state):
-
-# when IDLE =>
- with m.Case(State.IDLE):
-# -- Reset per-row valid flags, only used in WAIT_ACK
-# for i in 0 to ROW_PER_LINE - 1 loop
-# r.rows_valid(i) <= '0';
-# end loop;
- # Reset per-row valid flags, onlyy used in WAIT_ACK
- for i in range(ROW_PER_LINE):
- sync += r.rows_valid[i].eq(0)
-
-# -- We need to read a cache line
-# if req_is_miss = '1' then
-# report "cache miss nia:" & to_hstring(i_in.nia) &
-# " IR:" & std_ulogic'image(i_in.virt_mode) &
-# " SM:" & std_ulogic'image(i_in.stop_mark) &
-# " idx:" & integer'image(req_index) &
-# " way:" & integer'image(replace_way) &
-# " tag:" & to_hstring(req_tag) &
-# " RA:" & to_hstring(real_addr);
- # We need to read a cache line
- with m.If(req_is_miss):
- print(f"cache miss nia:{i_in.nia} " \
- f"IR:{i_in.virt_mode} " \
- f"SM:{i_in.stop_mark} idx:{req_index} " \
- f"way:{replace_way} tag:{req_tag} " \
- f"RA:{real_addr}")
-
-# -- Keep track of our index and way for
-# -- subsequent stores
-# r.store_index <= req_index;
-# r.store_row <= get_row(req_laddr);
-# r.store_tag <= req_tag;
-# r.store_valid <= '1';
-# r.end_row_ix <=
-# get_row_of_line(get_row(req_laddr)) - 1;
- # Keep track of our index and way
- # for subsequent stores
- sync += r.store_index.eq(req_index)
- sync += r.store_row.eq(get_row(req_laddr))
- sync += r.store_tag.eq(req_tag)
- sync += r.store_valid.eq(1)
- sync += r.end_row_ix.eq(
- get_row_of_line(get_row(req_laddr)) - 1
- )
-
-# -- Prep for first wishbone read. We calculate the
-# -- address of the start of the cache line and
-# -- start the WB cycle.
-# r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
-# r.wb.cyc <= '1';
-# r.wb.stb <= '1';
- # Prep for first wishbone read. We calculate the
- # address of the start of the cache line and
- # start the WB cycle.
- sync += r.wb.adr.eq(
- req_laddr[:r.wb.adr'''left?''']
- )
-
-# -- Track that we had one request sent
-# r.state <= CLR_TAG;
- # Track that we had one request sent
- sync += r.state.eq(State.CLR_TAG)
-# end if;
-
-# when CLR_TAG | WAIT_ACK =>
- with m.Case(State.CLR_TAG, State.WAIT_ACK):
-# if r.state = CLR_TAG then
- with m.If(r.state == State.CLR_TAG):
-# -- Get victim way from plru
-# r.store_way <= replace_way;
- # Get victim way from plru
- sync += r.store_way.eq(replace_way)
+ # Main state machine
+ with m.Switch(r.state):
+
+# when IDLE =>
+ with m.Case(State.IDLE):
+# -- Reset per-row valid flags,
+# -- only used in WAIT_ACK
+# for i in 0 to ROW_PER_LINE - 1 loop
+# r.rows_valid(i) <= '0';
+# end loop;
+ # Reset per-row valid flags,
+ # only used in WAIT_ACK
+ for i in range(ROW_PER_LINE):
+ sync += r.rows_valid[i].eq(0)
+
+# -- We need to read a cache line
+# if req_is_miss = '1' then
+# report "cache miss nia:" & to_hstring(i_in.nia) &
+# " IR:" & std_ulogic'image(i_in.virt_mode) &
+# " SM:" & std_ulogic'image(i_in.stop_mark) &
+# " idx:" & integer'image(req_index) &
+# " way:" & integer'image(replace_way) &
+# " tag:" & to_hstring(req_tag) &
+# " RA:" & to_hstring(real_addr);
+ # We need to read a cache line
+ with m.If(req_is_miss):
+ # XXX no, do not use "f". use sync += Display
+ # and use %d for integer, %x for hex.
+ print(f"cache miss nia:{i_in.nia} " \
+ f"IR:{i_in.virt_mode} " \
+ f"SM:{i_in.stop_mark} " \
+ F"idx:{req_index} " \
+ f"way:{replace_way} tag:{req_tag} " \
+ f"RA:{real_addr}")
+
+# -- Keep track of our index and way for
+# -- subsequent stores
+# r.store_index <= req_index;
+# r.store_row <= get_row(req_laddr);
+# r.store_tag <= req_tag;
+# r.store_valid <= '1';
+# r.end_row_ix <=
+# get_row_of_line(get_row(req_laddr)) - 1;
+ # Keep track of our index and way
+ # for subsequent stores
+ sync += r.store_index.eq(req_index)
+ sync += r.store_row.eq(get_row(req_laddr))
+ sync += r.store_tag.eq(req_tag)
+ sync += r.store_valid.eq(1)
+ sync += r.end_row_ix.eq(
+ get_row_of_line(
+ get_row(req_laddr)
+ ) - 1
+ )
+
+# -- Prep for first wishbone read. We calculate the
+# -- address of the start of the cache line and
+# -- start the WB cycle.
+# r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
+# r.wb.cyc <= '1';
+# r.wb.stb <= '1';
+ # Prep for first wishbone read.
+ # We calculate the
+ # address of the start of the cache line and
+ # start the WB cycle.
+ sync += r.wb.adr.eq(req_laddr)
+ sync += r.wb.cyc.eq(1)
+ sync += r.wb.stb.eq(1)
+
+# -- Track that we had one request sent
+# r.state <= CLR_TAG;
+ # Track that we had one request sent
+ sync += r.state.eq(State.CLR_TAG)
+# end if;
+
+# when CLR_TAG | WAIT_ACK =>
+ with m.Case(State.CLR_TAG, State.WAIT_ACK):
+# if r.state = CLR_TAG then
+ with m.If(r.state == State.CLR_TAG):
+# -- Get victim way from plru
+# r.store_way <= replace_way;
+ # Get victim way from plru
+ sync += r.store_way.eq(replace_way)
#
-# -- Force misses on that way while reloading that line
-# cache_valids(req_index)(replace_way) <= '0';
- # Force misses on that way while
- # realoading that line
- sync += cache_valid_bits[
- req_index
- ][replace_way].eq(0)
-
-# -- Store new tag in selected way
-# for i in 0 to NUM_WAYS-1 loop
-# if i = replace_way then
-# tagset := cache_tags(r.store_index);
-# write_tag(i, tagset, r.store_tag);
-# cache_tags(r.store_index) <= tagset;
-# end if;
-# end loop;
- for i in range(NUM_WAYS):
- with m.If(i == replace_way):
- comb += tagset.eq(
- cache_tags[r.store_index]
- )
- sync += write_tag(i, tagset, r.store_tag)
- sync += cache_tags(r.store_index).eq(
- tagset
- )
-
-# r.state <= WAIT_ACK;
- sync += r.state.eq(State.WAIT_ACK)
-# end if;
-
-# -- Requests are all sent if stb is 0
-# stbs_done := r.wb.stb = '0';
- # Requests are all sent if stb is 0
- comb += stbs_done.eq(r.wb.stb == 0)
-
-# -- If we are still sending requests, was one accepted ?
-# if wishbone_in.stall = '0' and not stbs_done then
- # If we are still sending requests, was one accepted?
- with m.If(~wb_in.stall & ~stbs_done):
-# -- That was the last word ? We are done sending.
-# -- Clear stb and set stbs_done so we can handle
-# -- an eventual last ack on the same cycle.
-# if is_last_row_addr(r.wb.adr, r.end_row_ix) then
-# r.wb.stb <= '0';
-# stbs_done := true;
-# end if;
- # That was the last word ? We are done sending.
- # Clear stb and set stbs_done so we can handle
- # an eventual last ack on the same cycle.
- with m.If(is_last_row_addr(
- r.wb.adr, r.end_row_ix)):
- sync += r.wb.stb.eq(0)
- stbs_done.eq(1)
-
-# -- Calculate the next row address
-# r.wb.adr <= next_row_addr(r.wb.adr);
- # Calculate the next row address
- sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
-# end if;
-
-# -- Incoming acks processing
-# if wishbone_in.ack = '1' then
- # Incoming acks processing
- with m.If(wb_in.ack):
-# r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1';
- sync += r.rows_valid[
- r.store_row & ROW_PER_LINE
- ].eq(1)
-
-# -- Check for completion
-# if stbs_done and
-# is_last_row(r.store_row, r.end_row_ix) then
- # Check for completion
- with m.If(stbs_done & is_last_row(
- r.store_row, r.end_row_ix)):
-# -- Complete wishbone cycle
-# r.wb.cyc <= '0';
- # Complete wishbone cycle
- sync += r.wb.cyc.eq(0)
-
-# -- Cache line is now valid
-# cache_valids(r.store_index)(replace_way) <=
-# r.store_valid and not inval_in;
- # Cache line is now valid
- sync += cache_valid_bits[
- r.store_index
- ][relace_way].eq(
- r.store_valid & ~inval_in
- )
-
-# -- We are done
-# r.state <= IDLE;
- # We are done
- sync += r.state.eq(State.IDLE)
-# end if;
-
-# -- Increment store row counter
-# r.store_row <= next_row(r.store_row);
- # Increment store row counter
- sync += store_row.eq(next_row(r.store_row))
-# end if;
-# end case;
-# end if;
+# -- Force misses on that way while
+# -- reloading that line
+# cache_valids(req_index)(replace_way) <= '0';
                    # Force misses on that way while
                    # reloading that line
+ cv = Signal(INDEX_BITS)
+ comb += cv.eq(cache_valid_bits[req_index])
+ comb += cv.bit_select(replace_way, 1).eq(0)
+ sync += cache_valid_bits[req_index].eq(cv)
+
+# -- Store new tag in selected way
+# for i in 0 to NUM_WAYS-1 loop
+# if i = replace_way then
+# tagset := cache_tags(r.store_index);
+# write_tag(i, tagset, r.store_tag);
+# cache_tags(r.store_index) <= tagset;
+# end if;
+# end loop;
+ for i in range(NUM_WAYS):
+ with m.If(i == replace_way):
+ sync += tagset.eq(cache_tags[r.store_index])
+ sync += write_tag(i, tagset, r.store_tag)
+ sync += cache_tags[r.store_index].eq(tagset)
+
+# r.state <= WAIT_ACK;
+ sync += r.state.eq(State.WAIT_ACK)
+# end if;
+
+# -- Requests are all sent if stb is 0
+# stbs_done := r.wb.stb = '0';
+ # Requests are all sent if stb is 0
+ sync += stbs_done.eq(r.wb.stb == 0)
+
+# -- If we are still sending requests,
+# -- was one accepted ?
+# if wishbone_in.stall = '0' and not stbs_done then
+ # If we are still sending requests,
+ # was one accepted?
+ with m.If(~wb_in.stall & ~stbs_done):
+# -- That was the last word ? We are done sending.
+# -- Clear stb and set stbs_done so we can handle
+# -- an eventual last ack on the same cycle.
+# if is_last_row_addr(r.wb.adr, r.end_row_ix) then
+# r.wb.stb <= '0';
+# stbs_done := true;
+# end if;
+ # That was the last word ?
+ # We are done sending.
+ # Clear stb and set stbs_done
+ # so we can handle
+ # an eventual last ack on
+ # the same cycle.
+ with m.If(is_last_row_addr(r.wb.adr, r.end_row_ix)):
+ sync += r.wb.stb.eq(0)
+ sync += stbs_done.eq(1)
+
+# -- Calculate the next row address
+# r.wb.adr <= next_row_addr(r.wb.adr);
+ # Calculate the next row address
+ rarange = r.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS]
+ sync += r.wb.adr.eq(rarange + 1)
+# end if;
+
+# -- Incoming acks processing
+# if wishbone_in.ack = '1' then
+ # Incoming acks processing
+ with m.If(wb_in.ack):
+# r.rows_valid(r.store_row mod ROW_PER_LINE)
+# <= '1';
+ sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
+
+# -- Check for completion
+# if stbs_done and
+# is_last_row(r.store_row, r.end_row_ix) then
+ # Check for completion
+ with m.If(stbs_done &
+ is_last_row(r.store_row, r.end_row_ix)):
+# -- Complete wishbone cycle
+# r.wb.cyc <= '0';
+ # Complete wishbone cycle
+ sync += r.wb.cyc.eq(0)
+
+# -- Cache line is now valid
+# cache_valids(r.store_index)(replace_way) <=
+# r.store_valid and not inval_in;
+ # Cache line is now valid
+ cv = Signal(INDEX_BITS)
+ comb += cv.eq(cache_valid_bits[r.store_index])
+ comb += cv.bit_select(replace_way, 1).eq(
+ r.store_valid & ~inval_in
+ )
+ sync += cache_valid_bits[r.store_index].eq(cv)
+
+# -- We are done
+# r.state <= IDLE;
+ # We are done
+ sync += r.state.eq(State.IDLE)
+# end if;
+
+# -- Increment store row counter
+# r.store_row <= next_row(r.store_row);
+ # Increment store row counter
+ sync += r.store_row.eq(next_row(r.store_row))
+# end if;
+# end case;
+# end if;
#
# -- TLB miss and protection fault processing
# if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
# r.fetch_failed <= '0';
-# elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
+# elsif i_in.req = '1' and access_ok = '0' and
+# stall_in = '0' then
# r.fetch_failed <= '1';
# end if;
- # TLB miss and protection fault processing
- with m.If('''TODO nmigen rst''' | flush_in | m_in.tlbld):
- sync += r.fetch_failed.eq(0)
+ # TLB miss and protection fault processing
+ with m.If(flush_in | m_in.tlbld):
+ sync += r.fetch_failed.eq(0)
- with m.Elif(i_in.req & ~access_ok & ~stall_in):
- sync += r.fetch_failed.eq(1)
+ with m.Elif(i_in.req & ~access_ok & ~stall_in):
+ sync += r.fetch_failed.eq(1)
# end if;
# end process;
# icache_log: if LOG_LENGTH > 0 generate
- def icache_log(self, m, log_out):
+ def icache_log(self, m, req_hit_way, ra_valid, access_ok,
+ req_is_miss, req_is_hit, lway, wstate, r):
comb = m.d.comb
sync = m.d.sync
+ wb_in, i_out = self.wb_in, self.i_out
+ log_out, stall_out = self.log_out, self.stall_out
+
# -- Output data to logger
# signal log_data : std_ulogic_vector(53 downto 0);
# begin
# variable lway: way_t;
# variable wstate: std_ulogic;
# Output data to logger
- for i in range(LOG_LENGTH)
+ for i in range(LOG_LENGTH):
# Output data to logger
log_data = Signal(54)
lway = Signal(NUM_WAYS)
# if rising_edge(clk) then
# lway := req_hit_way;
# wstate := '0';
- comb += lway.eq(req_hit_way)
- comb += wstate.eq(0)
+ sync += lway.eq(req_hit_way)
+ sync += wstate.eq(0)
# if r.state /= IDLE then
# wstate := '1';
# end if;
with m.If(r.state != State.IDLE):
- comb += wstate.eq(1)
+ sync += wstate.eq(1)
# log_data <= i_out.valid &
# i_out.insn &
# ra_valid;
sync += log_data.eq(Cat(
ra_valid, access_ok, req_is_miss, req_is_hit,
- lway '''truncate to 3 bits?''', wstate, r.hit_nia[2:6],
+ lway, wstate, r.hit_nia[2:6],
r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
i_out.valid
comb += log_out.eq(log_data)
# end generate;
# end;
+
+ def elaborate(self, platform):
+
+ m = Module()
+ comb = m.d.comb
+
+ # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
+ cache_tags = CacheTagArray()
+ cache_valid_bits = CacheValidBitsArray()
+
+# signal itlb_valids : tlb_valids_t;
+# signal itlb_tags : tlb_tags_t;
+# signal itlb_ptes : tlb_ptes_t;
+# attribute ram_style of itlb_tags : signal is "distributed";
+# attribute ram_style of itlb_ptes : signal is "distributed";
+ itlb_valid_bits = TLBValidBitsArray()
+ itlb_tags = TLBTagArray()
+ itlb_ptes = TLBPtesArray()
+ # TODO to be passed to nmigen as ram attributes
+ # attribute ram_style of itlb_tags : signal is "distributed";
+ # attribute ram_style of itlb_ptes : signal is "distributed";
+
+# -- Privilege bit from PTE EAA field
+# signal eaa_priv : std_ulogic;
+ # Privilege bit from PTE EAA field
+ eaa_priv = Signal()
+
+# signal r : reg_internal_t;
+ r = RegInternal()
+
+# -- Async signals on incoming request
+# signal req_index : index_t;
+# signal req_row : row_t;
+# signal req_hit_way : way_t;
+# signal req_tag : cache_tag_t;
+# signal req_is_hit : std_ulogic;
+# signal req_is_miss : std_ulogic;
+# signal req_laddr : std_ulogic_vector(63 downto 0);
+        # Async signals on incoming request
+ req_index = Signal(NUM_LINES)
+ req_row = Signal(BRAM_ROWS)
+ req_hit_way = Signal(NUM_WAYS)
+ req_tag = Signal(TAG_BITS)
+ req_is_hit = Signal()
+ req_is_miss = Signal()
+ req_laddr = Signal(64)
+
+# signal tlb_req_index : tlb_index_t;
+# signal real_addr : std_ulogic_vector(
+# REAL_ADDR_BITS - 1 downto 0
+# );
+# signal ra_valid : std_ulogic;
+# signal priv_fault : std_ulogic;
+# signal access_ok : std_ulogic;
+# signal use_previous : std_ulogic;
+ tlb_req_index = Signal(TLB_SIZE)
+ real_addr = Signal(REAL_ADDR_BITS)
+ ra_valid = Signal()
+ priv_fault = Signal()
+ access_ok = Signal()
+ use_previous = Signal()
+
+# signal cache_out : cache_ram_out_t;
+ cache_out = CacheRamOut()
+
+# signal plru_victim : plru_out_t;
+# signal replace_way : way_t;
+ plru_victim = PLRUOut()
+ replace_way = Signal(NUM_WAYS)
+
+ # call sub-functions putting everything together, using shared
+ # signals established above
+ self.rams(m, r, cache_out, use_previous, replace_way, req_row)
+ self.maybe_plrus(m, r, plru_victim)
+ self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
+ real_addr, itlb_valid_bits, ra_valid, eaa_priv,
+ priv_fault, access_ok)
+ self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
+ self.icache_comb(m, use_previous, r, req_index, req_row,
+ req_tag, real_addr, req_laddr, cache_valid_bits,
+ cache_tags, access_ok, req_is_hit, req_is_miss,
+ replace_way, plru_victim, cache_out)
+ self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
+ req_index, req_tag, real_addr)
+ self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
+ req_laddr, req_tag, replace_way, cache_tags,
+ access_ok, real_addr)
+ #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
+ # req_is_miss, req_is_hit, lway, wstate, r)
+
+ return m
+
+
+# icache_tb.vhdl
+#
+# library ieee;
+# use ieee.std_logic_1164.all;
+#
+# library work;
+# use work.common.all;
+# use work.wishbone_types.all;
+#
+# entity icache_tb is
+# end icache_tb;
+#
+# architecture behave of icache_tb is
+# signal clk : std_ulogic;
+# signal rst : std_ulogic;
+#
+# signal i_out : Fetch1ToIcacheType;
+# signal i_in : IcacheToDecode1Type;
+#
+# signal m_out : MmuToIcacheType;
+#
+# signal wb_bram_in : wishbone_master_out;
+# signal wb_bram_out : wishbone_slave_out;
+#
+# constant clk_period : time := 10 ns;
+# begin
+# icache0: entity work.icache
+# generic map(
+# LINE_SIZE => 64,
+# NUM_LINES => 4
+# )
+# port map(
+# clk => clk,
+# rst => rst,
+# i_in => i_out,
+# i_out => i_in,
+# m_in => m_out,
+# stall_in => '0',
+# flush_in => '0',
+# inval_in => '0',
+# wishbone_out => wb_bram_in,
+# wishbone_in => wb_bram_out
+# );
+#
+# -- BRAM Memory slave
+# bram0: entity work.wishbone_bram_wrapper
+# generic map(
+# MEMORY_SIZE => 1024,
+# RAM_INIT_FILE => "icache_test.bin"
+# )
+# port map(
+# clk => clk,
+# rst => rst,
+# wishbone_in => wb_bram_in,
+# wishbone_out => wb_bram_out
+# );
+#
+# clk_process: process
+# begin
+# clk <= '0';
+# wait for clk_period/2;
+# clk <= '1';
+# wait for clk_period/2;
+# end process;
+#
+# rst_process: process
+# begin
+# rst <= '1';
+# wait for 2*clk_period;
+# rst <= '0';
+# wait;
+# end process;
+#
+# stim: process
+# begin
+# i_out.req <= '0';
+# i_out.nia <= (others => '0');
+# i_out.stop_mark <= '0';
+#
+# m_out.tlbld <= '0';
+# m_out.tlbie <= '0';
+# m_out.addr <= (others => '0');
+# m_out.pte <= (others => '0');
+#
+# wait until rising_edge(clk);
+# wait until rising_edge(clk);
+# wait until rising_edge(clk);
+# wait until rising_edge(clk);
+#
+# i_out.req <= '1';
+# i_out.nia <= x"0000000000000004";
+#
+# wait for 30*clk_period;
+# wait until rising_edge(clk);
+#
+# assert i_in.valid = '1' severity failure;
+# assert i_in.insn = x"00000001"
+# report "insn @" & to_hstring(i_out.nia) &
+# "=" & to_hstring(i_in.insn) &
+# " expected 00000001"
+# severity failure;
+#
+# i_out.req <= '0';
+#
+# wait until rising_edge(clk);
+#
+# -- hit
+# i_out.req <= '1';
+# i_out.nia <= x"0000000000000008";
+# wait until rising_edge(clk);
+# wait until rising_edge(clk);
+# assert i_in.valid = '1' severity failure;
+# assert i_in.insn = x"00000002"
+# report "insn @" & to_hstring(i_out.nia) &
+# "=" & to_hstring(i_in.insn) &
+# " expected 00000002"
+# severity failure;
+# wait until rising_edge(clk);
+#
+# -- another miss
+# i_out.req <= '1';
+# i_out.nia <= x"0000000000000040";
+#
+# wait for 30*clk_period;
+# wait until rising_edge(clk);
+#
+# assert i_in.valid = '1' severity failure;
+# assert i_in.insn = x"00000010"
+# report "insn @" & to_hstring(i_out.nia) &
+# "=" & to_hstring(i_in.insn) &
+# " expected 00000010"
+# severity failure;
+#
+# -- test something that aliases
+# i_out.req <= '1';
+# i_out.nia <= x"0000000000000100";
+# wait until rising_edge(clk);
+# wait until rising_edge(clk);
+# assert i_in.valid = '0' severity failure;
+# wait until rising_edge(clk);
+#
+# wait for 30*clk_period;
+# wait until rising_edge(clk);
+#
+# assert i_in.valid = '1' severity failure;
+# assert i_in.insn = x"00000040"
+# report "insn @" & to_hstring(i_out.nia) &
+# "=" & to_hstring(i_in.insn) &
+# " expected 00000040"
+# severity failure;
+#
+# i_out.req <= '0';
+#
+# std.env.finish;
+# end process;
+# end;
+def icache_sim(dut):
+    """Simulation process driving the ICache under test.
+
+    Issues one instruction fetch at address 0x4 (a cache miss), waits
+    for the wishbone refill, then checks the returned instruction word.
+    The remaining scenarios (hit, second miss, aliasing line) are kept
+    below as commented-out code.
+
+    NOTE(review): each bare ``yield`` advances exactly one clock cycle,
+    so the statement order here is timing-critical.
+    """
+    # Port naming is from the testbench's point of view:
+    #   i_out drives requests INTO the cache  (dut.i_in),
+    #   i_in  reads responses OUT of the cache (dut.i_out),
+    #   m_out drives the MMU-to-icache port    (dut.m_in).
+    i_out = dut.i_in
+    i_in = dut.i_out
+    m_out = dut.m_in
+
+    # Drive all inputs to an idle/quiescent state.
+    # NOTE(review): i_in.valid is a DUT *output*; driving it here looks
+    # suspect — confirm whether this assignment has any effect.
+    yield i_in.valid.eq(0)
+    yield i_out.priv_mode.eq(1)
+    yield i_out.req.eq(0)
+    yield i_out.nia.eq(0)
+    yield i_out.stop_mark.eq(0)
+    yield m_out.tlbld.eq(0)
+    yield m_out.tlbie.eq(0)
+    yield m_out.addr.eq(0)
+    yield m_out.pte.eq(0)
+    # Let the design settle for a few clock cycles.
+    yield
+    yield
+    yield
+    yield
+    # First fetch: address 0x4 — a miss, so allow ~30 cycles for the
+    # wishbone line refill to complete before sampling the response.
+    yield i_out.req.eq(1)
+    yield i_out.nia.eq(Const(0x0000000000000004, 64))
+    for i in range(30):
+        yield
+    yield
+    valid = yield i_in.valid
+    nia = yield i_out.nia
+    insn = yield i_in.insn
+    print(f"valid? {valid}")
+    # Memory image (see test_icache) stores word index*2 at each 32-bit
+    # slot, so address 0x4 is expected to hold instruction 0x00000001.
+    assert valid
+    assert insn == 0x00000001, \
+        "insn @%x=%x expected 00000001" % (nia, insn)
+    yield i_out.req.eq(0)
+    yield
+
+# # hit
+# yield i_out.req.eq(1)
+# yield i_out.nia.eq(Const(0x0000000000000008, 64))
+# yield
+# yield
+# valid = yield i_in.valid
+# insn = yield i_in.insn
+# #assert valid
+# #assert insn == 0x00000002, \
+# #("insn @%x=%x expected 00000002" % i_out.nia, i_in.insn)
+# yield
+#
+# # another miss
+# yield i_out.req.eq(1)
+# yield i_out.nia.eq(Const(0x0000000000000040, 64))
+# for i in range(30):
+# yield
+# yield
+# valid = yield i_in.valid
+# insn = yield i_in.insn
+# #assert valid
+# #assert insn == 0x00000010, \
+# #("insn @%x=%x expected 00000010" % i_out.nia, i_in.insn)
+#
+# # test something that aliases
+# yield i_out.req.eq(1)
+# yield i_out.nia.eq(Const(0x0000000000000100, 64))
+# yield
+# yield
+# #assert i_in.valid == Const(1, 1)
+# for i in range(30):
+# yield
+# yield
+# valid = yield i_in.valid
+# insn = yield i_in.insn
+# #assert valid
+# #assert insn == 0x00000040, \
+# #("insn @%x=%x expected 00000040" % i_out.nia, i_in.insn)
+# yield i_out.req.eq(0)
+
+
+def test_icache(mem):
+ dut = ICache()
+
+ memory = Memory(width=64, depth=16*64, init=mem)
+ sram = SRAM(memory=memory, granularity=8)
+
+ m = Module()
+
+ m.submodules.icache = dut
+ m.submodules.sram = sram
+
+ m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
+ m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
+ m.d.comb += sram.bus.we.eq(dut.wb_out.we)
+ m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
+ m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
+ m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
+
+ m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
+ m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
+
+ # nmigen Simulation
+ sim = Simulator(m)
+ sim.add_clock(1e-6)
+
+ sim.add_sync_process(wrap(icache_sim(dut)))
+ with sim.write_vcd('test_icache.vcd'):
+ sim.run()
+
+if __name__ == '__main__':
+ dut = ICache()
+ vl = rtlil.convert(dut, ports=[])
+ with open("test_icache.il", "w") as f:
+ f.write(vl)
+
+ mem = []
+ for i in range(0,512):
+ mem.append((i*2)| ((i*2+1)<<32))
+
+ test_icache(mem)
+