+"""Dcache
+
+based on Anton Blanchard microwatt dcache.vhdl
+
+"""
+
from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const)
from nmigen.cli import main
from nmigen.iocontrol import RecordObject
from nmigen.util import log2_int

from experiment.mem_types import (LoadStore1ToDcacheType,
                                  DcacheToLoadStore1Type,
                                  MmuToDcacheType,
                                  DcacheToMmuType)

from experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBAddrType, WBDataType, WBSelType,
                                 WBMasterOut, WBSlaveOut, WBMasterOutVector,
                                 WBSlaveOutVector, WBIOMasterOut,
                                 WBIOSlaveOut)
+
# --
# -- Set associative dcache write-through
# --
# use work.wishbone_types.all;
#
# entity dcache is
+class Dcache(Elaboratable):
# generic (
# -- Line size in bytes
# LINE_SIZE : positive := 64;
# -- Non-zero to enable log data collection
# LOG_LENGTH : natural := 0
# );
+ def __init__(self):
+ # Line size in bytes
+ self.LINE_SIZE = 64
+ # Number of lines in a set
+ self.NUM_LINES = 32
+ # Number of ways
+ self.NUM_WAYS = 4
+ # L1 DTLB entries per set
+ self.TLB_SET_SIZE = 64
+ # L1 DTLB number of sets
+ self.TLB_NUM_WAYS = 2
+ # L1 DTLB log_2(page_size)
+ self.TLB_LG_PGSZ = 12
+ # Non-zero to enable log data collection
+ self.LOG_LENGTH = 0
# port (
# clk : in std_ulogic;
# rst : in std_ulogic;
#
# log_out : out std_ulogic_vector(19 downto 0)
# );
+ self.d_in = LoadStore1ToDcacheType()
+ self.d_out = DcacheToLoadStore1Type()
+
+ self.m_in = MmuToDcacheType()
+ self.m_out = DcacheToMmuType()
+
+ self.stall_out = Signal()
+
+ self.wb_out = WBMasterOut()
+ self.wb_in = WBSlaveOut()
+
+ self.log_out = Signal(20)
# end entity dcache;
-#
+
# architecture rtl of dcache is
+ def elaborate(self, platform):
+ LINE_SIZE = self.LINE_SIZE
+ NUM_LINES = self.NUM_LINES
+ NUM_WAYS = self.NUM_WAYS
+ TLB_SET_SIZE = self.TLB_SET_SIZE
+ TLB_NUM_WAYS = self.TLB_NUM_WAYS
+ TLB_LG_PGSZ = self.TLB_LG_PGSZ
+ LOG_LENGTH = self.LOG_LENGTH
+
# -- BRAM organisation: We never access more than
# -- wishbone_data_bits at a time so to save
# -- resources we make the array only that wide, and
# -- ROW_SIZE is the width in bytes of the BRAM
# -- (based on WB, so 64-bits)
# constant ROW_SIZE : natural := wishbone_data_bits / 8;
+ # BRAM organisation: We never access more than
+ # -- wishbone_data_bits at a time so to save
+ # -- resources we make the array only that wide, and
+ # -- use consecutive indices for to make a cache "line"
+ # --
+ # -- ROW_SIZE is the width in bytes of the BRAM
+ # -- (based on WB, so 64-bits)
+ ROW_SIZE = wishbone_data_bits / 8;
+
# -- ROW_PER_LINE is the number of row (wishbone
# -- transactions) in a line
# constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
# -- BRAM_ROWS is the number of rows in BRAM needed
# -- to represent the full dcache
# constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
-#
+ # ROW_PER_LINE is the number of row (wishbone
+ # transactions) in a line
+ ROW_PER_LINE = LINE_SIZE / ROW_SIZE
+ # BRAM_ROWS is the number of rows in BRAM needed
+ # to represent the full dcache
+ BRAM_ROWS = NUM_LINES * ROW_PER_LINE
+
# -- Bit fields counts in the address
#
# -- REAL_ADDR_BITS is the number of real address
# - ((TAG_BITS + 7) mod 8);
# -- WAY_BITS is the number of bits to select a way
# constant WAY_BITS : natural := log2(NUM_WAYS);
-#
+ # Bit fields counts in the address
+
+ # REAL_ADDR_BITS is the number of real address
+ # bits that we store
+ REAL_ADDR_BITS = 56
+ # ROW_BITS is the number of bits to select a row
+ ROW_BITS = log2_int(BRAM_ROWS)
+ # ROW_LINE_BITS is the number of bits to select
+ # a row within a line
+ ROW_LINE_BITS = log2_int(ROW_PER_LINE)
+ # LINE_OFF_BITS is the number of bits for
+ # the offset in a cache line
+ LINE_OFF_BITS = log2_int(LINE_SIZE)
+ # ROW_OFF_BITS is the number of bits for
+ # the offset in a row
+ ROW_OFF_BITS = log2_int(ROW_SIZE)
+ # INDEX_BITS is the number if bits to
+ # select a cache line
+ INDEX_BITS = log2_int(NUM_LINES)
+ # SET_SIZE_BITS is the log base 2 of the set size
+ SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
+ # TAG_BITS is the number of bits of
+ # the tag part of the address
+ TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
+ # TAG_WIDTH is the width in bits of each way of the tag RAM
+ TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
+ # WAY_BITS is the number of bits to select a way
+ WAY_BITS = log2_int(NUM_WAYS)
+
# -- Example of layout for 32 lines of 64 bytes:
# --
# -- .. tag |index| line |
# -- .. |----- ---| | ROW_BITS (8)
# -- .. |-----| | INDEX_BITS (5)
# -- .. --------| | TAG_BITS (45)
-#
+ # Example of layout for 32 lines of 64 bytes:
+ #
+ # .. tag |index| line |
+ # .. | row | |
+ # .. | |---| | ROW_LINE_BITS (3)
+ # .. | |--- - --| LINE_OFF_BITS (6)
+ # .. | |- --| ROW_OFF_BITS (3)
+ # .. |----- ---| | ROW_BITS (8)
+ # .. |-----| | INDEX_BITS (5)
+ # .. --------| | TAG_BITS (45)
+
+
# subtype row_t is integer range 0 to BRAM_ROWS-1;
# subtype index_t is integer range 0 to NUM_LINES-1;
# subtype way_t is integer range 0 to NUM_WAYS-1;
-# subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
-#
+# subtype row_in_line_t is unsigned(ROW_LINE_BITS-1 downto 0);
+ def Row():
+ return Signal(BRAM_ROWS)
+
+ def Index():
+ return Signal(NUM_LINES)
+
+ def Way():
+ return Signal(NUM_WAYS)
+
+ def RowInLine():
+ return Signal(ROW_LINE_BITS)
+
# -- The cache data BRAM organized as described above for each way
# subtype cache_row_t is
# std_ulogic_vector(wishbone_data_bits-1 downto 0);
-#
+ # The cache data BRAM organized as described above for each way
+ def CacheRow():
+ return Signal(WB_DATA_BITS)
+
# -- The cache tags LUTRAM has a row per set.
# -- Vivado is a pain and will not handle a
# -- clean (commented) definition of the cache
# -- tags as a 3d memory. For now, work around
# -- it by putting all the tags
# subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
+ # The cache tags LUTRAM has a row per set.
+ # Vivado is a pain and will not handle a
+ # clean (commented) definition of the cache
+ # tags as a 3d memory. For now, work around
+ # it by putting all the tags
+ def CacheTag():
+ return Signal(TAG_BITS)
+
# -- type cache_tags_set_t is array(way_t) of cache_tag_t;
# -- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
# constant TAG_RAM_WIDTH : natural := TAG_WIDTH * NUM_WAYS;
# subtype cache_tags_set_t is
# std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
-#
+ # type cache_tags_set_t is array(way_t) of cache_tag_t;
+ # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
+ TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
+
+ def CacheTagSet():
+ return Signal(TAG_RAM_WIDTH)
+
+ def CacheTagArray():
+ return Array(CacheTagSet() for x in range(Index()))
+
# -- The cache valid bits
# subtype cache_way_valids_t is
# std_ulogic_vector(NUM_WAYS-1 downto 0);
# type cache_valids_t is array(index_t) of cache_way_valids_t;
# type row_per_line_valid_t is
# array(0 to ROW_PER_LINE - 1) of std_ulogic;
-#
+ # The cache valid bits
+ def CacheWayValidBits():
+ return Signal(NUM_WAYS)
+ def CacheValidBits():
+ return Array(CacheWayValidBits() for x in range(Index()))
+ def RowPerLineValid():
+ return Array(Signal() for x in range(ROW_PER_LINE))
+
# -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
# signal cache_tags : cache_tags_array_t;
# signal cache_tag_set : cache_tags_set_t;
#
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";
-#
+ # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
+ cache_tags = CacheTagArray()
+ cache_tag_set = CacheTagSet()
+ cache_valid_bits = CacheValidBits()
+
+ # TODO attribute ram_style : string;
+ # TODO attribute ram_style of cache_tags : signal is "distributed";
+
# -- L1 TLB.
# constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE);
# constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS);
# constant TLB_PTE_BITS : natural := 64;
# constant TLB_PTE_WAY_BITS : natural :=
# TLB_NUM_WAYS * TLB_PTE_BITS;
-#
+ # L1 TLB
+ TLB_SET_BITS = log2_int(TLB_SET_SIZE)
+ TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
+ TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
+ TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
+ TLB_PTE_BITS = 64
+ TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
+
# subtype tlb_way_t is integer range 0 to TLB_NUM_WAYS - 1;
# subtype tlb_index_t is integer range 0 to TLB_SET_SIZE - 1;
# subtype tlb_way_valids_t is
# std_ulogic_vector(TLB_PTE_WAY_BITS-1 downto 0);
# type tlb_ptes_t is array(tlb_index_t) of tlb_way_ptes_t;
# type hit_way_set_t is array(tlb_way_t) of way_t;
-#
+ def TLBWay():
+ return Signal(TLB_NUM_WAYS)
+
+ def TLBWayValidBits():
+ return Signal(TLB_NUM_WAYS)
+
+ def TLBValidBits():
+ return Array(TLBValidBits() for x in range(TLB_SET_SIZE))
+
+ def TLBTag():
+ return Signal(TLB_EA_TAG_BITS)
+
+ def TLBWayTags():
+ return Signal(TLB_TAG_WAY_BITS)
+
+ def TLBTags():
+ return Array(TLBWayTags() for x in range (TLB_SET_SIZE))
+
+ def TLBPte():
+ return Signal(TLB_PTE_BITS)
+
+ def TLBWayPtes():
+ return Signal(TLB_PTE_WAY_BITS)
+
+ def TLBPtes():
+ return Array(TLBWayPtes() for x in range(TLB_SET_SIZE))
+
+ def HitWaySet():
+ return Array(Way() for x in range(TLB_NUM_WAYS))
+
# signal dtlb_valids : tlb_valids_t;
# signal dtlb_tags : tlb_tags_t;
# signal dtlb_ptes : tlb_ptes_t;
+
+"""note: these are passed to nmigen.hdl.Memory as "attributes". don't
+ know how, just that they are.
+"""
# attribute ram_style of dtlb_tags : signal is "distributed";
# attribute ram_style of dtlb_ptes : signal is "distributed";
-#
+ dtlb_valids = tlb_valids_t;
+ dtlb_tags = tlb_tags_t;
+ dtlb_ptes = tlb_ptes_t;
+ # TODO attribute ram_style of dtlb_tags : signal is "distributed";
+ # TODO attribute ram_style of dtlb_ptes : signal is "distributed";
+
+
# -- Record for storing permission, attribute, etc. bits from a PTE
# type perm_attr_t is record
# reference : std_ulogic;
# rd_perm : std_ulogic;
# wr_perm : std_ulogic;
# end record;
-#
+ # Record for storing permission, attribute, etc. bits from a PTE
+ class PermAttr(RecordObject):
+ def __init__(self):
+ super().__init__()
+ self.reference = Signal()
+ self.changed = Signal()
+ self.nocache = Signal()
+ self.priv = Signal()
+ self.rd_perm = Signal()
+ self.wr_perm = Signal()
+
# function extract_perm_attr(
# pte : std_ulogic_vector(TLB_PTE_BITS - 1 downto 0))
# return perm_attr_t is
# pa.wr_perm := pte(1);
# return pa;
# end;
-#
+ def extract_perm_attr(pte=Signal(TLB_PTE_BITS)):
+ pa = PermAttr()
+ pa.reference = pte[8]
+ pa.changed = pte[7]
+ pa.nocache = pte[5]
+ pa.priv = pte[3]
+ pa.rd_perm = pte[2]
+ pa.wr_perm = pte[1]
+ return pa;
+
# constant real_mode_perm_attr : perm_attr_t :=
# (nocache => '0', others => '1');
-#
+ REAL_MODE_PERM_ATTR = PermAttr()
+ REAL_MODE_PERM_ATTR.reference = 1
+ REAL_MODE_PERM_ATTR.changed = 1
+ REAL_MODE_PERM_ATTR.priv = 1
+ REAL_MODE_PERM_ATTR.rd_perm = 1
+ REAL_MODE_PERM_ATTR.wr_perm = 1
+
# -- Type of operation on a "valid" input
# type op_t is
# (
# OP_STORE_HIT, -- Store hitting cache
# OP_STORE_MISS -- Store missing cache
# );
-#
+ # Type of operation on a "valid" input
+ @unique
+ class OP(Enum):
+ OP_NONE = 0
+ OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
+ OP_STCX_FAIL = 2 # conditional store w/o reservation
+ OP_LOAD_HIT = 3 # Cache hit on load
+ OP_LOAD_MISS = 4 # Load missing cache
+ OP_LOAD_NC = 5 # Non-cachable load
+ OP_STORE_HIT = 6 # Store hitting cache
+ OP_STORE_MISS = 7 # Store missing cache
+
# -- Cache state machine
# type state_t is
# (
# STORE_WAIT_ACK, -- Store wait ack
# NC_LOAD_WAIT_ACK -- Non-cachable load wait ack
# );
-#
-# --
+ # Cache state machine
+ @unique
+ class State(Enum):
+ IDLE = 0 # Normal load hit processing
+ RELOAD_WAIT_ACK = 1 # Cache reload wait ack
+ STORE_WAIT_ACK = 2 # Store wait ack
+ NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
+
# -- Dcache operations:
# --
# -- In order to make timing, we use the BRAMs with
# tlbld : std_ulogic;
# mmu_req : std_ulogic; -- indicates source of request
# end record;
-#
+# Dcache operations:
+#
+# In order to make timing, we use the BRAMs with
+# an output buffer, which means that the BRAM
+# output is delayed by an extra cycle.
+#
+# Thus, the dcache has a 2-stage internal pipeline
+# for cache hits with no stalls.
+#
+# All other operations are handled via stalling
+# in the first stage.
+#
+# The second stage can thus complete a hit at the same
+# time as the first stage emits a stall for a complex op.
+#
+ # Stage 0 register, basically contains just the latched request
+ class RegStage0(RecordObject):
+ def __init__(self):
+ super().__init__()
+ self.req = LoadStore1ToDcacheType()
+ self.tlbie = Signal()
+ self.doall = Signal()
+ self.tlbld = Signal()
+ self.mmu_req = Signal() # indicates source of request
+
# signal r0 : reg_stage_0_t;
# signal r0_full : std_ulogic;
-#
+ r0 = RegStage0()
+ r0_full = Signal()
+
# type mem_access_request_t is record
# op : op_t;
# valid : std_ulogic;
# same_tag : std_ulogic;
# mmu_req : std_ulogic;
# end record;
-#
+ class MemAccessRequest(RecordObject):
+ def __init__(self):
+ super().__init__()
+ self.op = Op()
+ self.valid = Signal()
+ self.dcbz = Signal()
+ self.real_addr = Signal(REAL_ADDR_BITS)
+ self.data = Signal(64)
+ self.byte_sel = Signal(8)
+ self.hit_way = Way()
+ self.same_tag = Signal()
+ self.mmu_req = Signal()
+
# -- First stage register, contains state for stage 1 of load hits
# -- and for the state machine used by all other operations
# type reg_stage_1_t is record
# -- Signal to complete a failed stcx.
# stcx_fail : std_ulogic;
# end record;
-#
+# First stage register, contains state for stage 1 of load hits
+# and for the state machine used by all other operations
+ class RegStage1(RecordObject):
+ def __init__(self):
+ super().__init__()
+ # Info about the request
+ self.full = Signal() # have uncompleted request
+ self.mmu_req = Signal() # request is from MMU
+ self.req = MemAccessRequest()
+
+ # Cache hit state
+ self.hit_way = Way()
+ self.hit_load_valid = Signal()
+ self.hit_index = Index()
+ self.cache_hit = Signal()
+
+ # TLB hit state
+ self.tlb_hit = Signal()
+ self.tlb_hit_way = TLBWay()
+ self.tlb_hit_index = Signal(TLB_SET_SIZE)
+ self.
+ # 2-stage data buffer for data forwarded from writes to reads
+ self.forward_data1 = Signal(64)
+ self.forward_data2 = Signal(64)
+ self.forward_sel1 = Signal(8)
+ self.forward_valid1 = Signal()
+ self.forward_way1 = Way()
+ self.forward_row1 = Row()
+ self.use_forward1 = Signal()
+ self.forward_sel = Signal(8)
+
+ # Cache miss state (reload state machine)
+ self.state = State()
+ self.dcbz = Signal()
+ self.write_bram = Signal()
+ self.write_tag = Signal()
+ self.slow_valid = Signal()
+ self.wb = WishboneMasterOut()
+ self.reload_tag = CacheTag()
+ self.store_way = Way()
+ self.store_row = Row()
+ self.store_index = Index()
+ self.end_row_ix = RowInLine()
+ self.rows_valid = RowPerLineValid()
+ self.acks_pending = Signal(3)
+ self.inc_acks = Signal()
+ self.dec_acks = Signal()
+
+ # Signals to complete (possibly with error)
+ self.ls_valid = Signal()
+ self.ls_error = Signal()
+ self.mmu_done = Signal()
+ self.mmu_error = Signal()
+ self.cache_paradox = Signal()
+
+ # Signal to complete a failed stcx.
+ self.stcx_fail = Signal()
+
# signal r1 : reg_stage_1_t;
-#
+ r1 = RegStage1()
+
# -- Reservation information
# --
# type reservation_t is record
# valid : std_ulogic;
# addr : std_ulogic_vector(63 downto LINE_OFF_BITS);
# end record;
-#
+# Reservation information
+
+ class Reservation(RecordObject):
+ def __init__(self):
+ super().__init__()
+ valid = Signal()
+ # TODO LINE_OFF_BITS is 6
+ addr = Signal(63 downto LINE_OFF_BITS)
+
# signal reservation : reservation_t;
-#
+ reservation = Reservation()
+
# -- Async signals on incoming request
# signal req_index : index_t;
# signal req_row : row_t;
# signal req_data : std_ulogic_vector(63 downto 0);
# signal req_same_tag : std_ulogic;
# signal req_go : std_ulogic;
-#
+ # Async signals on incoming request
+ req_index = Index()
+ req_row = Row()
+ req_hit_way = Way()
+ req_tag = CacheTag()
+ req_op = Op()
+ req_data = Signal(64)
+ req_same_tag = Signal()
+ req_go = Signal()
+
# signal early_req_row : row_t;
#
# signal cancel_store : std_ulogic;
#
# signal use_forward1_next : std_ulogic;
# signal use_forward2_next : std_ulogic;
-#
+ early_req_row = Row()
+
+ cancel_store = Signal()
+ set_rsrv = Signal()
+ clear_rsrv = Signal()
+
+ r0_valid = Signal()
+ r0_stall = Signal()
+
+ use_forward1_next = Signal()
+ use_forward2_next = Signal()
+
# -- Cache RAM interface
# type cache_ram_out_t is array(way_t) of cache_row_t;
# signal cache_out : cache_ram_out_t;
-#
+ # Cache RAM interface
+ def CacheRamOut():
+ return Array(CacheRow() for x in range(NUM_WAYS))
+
+ cache_out = CacheRamOut()
+
# -- PLRU output interface
# type plru_out_t is array(index_t) of
# std_ulogic_vector(WAY_BITS-1 downto 0);
# signal plru_victim : plru_out_t;
# signal replace_way : way_t;
-#
+ # PLRU output interface
+ def PLRUOut():
+ return Array(Signal(WAY_BITS) for x in range(Index()))
+
+ plru_victim = PLRUOut()
+ replace_way = Way()
+
# -- Wishbone read/write/cache write formatting signals
# signal bus_sel : std_ulogic_vector(7 downto 0);
-#
+ # Wishbone read/write/cache write formatting signals
+ bus_sel = Signal(8)
+
# -- TLB signals
# signal tlb_tag_way : tlb_way_tags_t;
# signal tlb_pte_way : tlb_way_ptes_t;
# signal rc_ok : std_ulogic;
# signal perm_ok : std_ulogic;
# signal access_ok : std_ulogic;
-#
+ # TLB signals
+ tlb_tag_way = TLBWayTags()
+ tlb_pte_way = TLBWayPtes()
+ tlb_valid_way = TLBWayValidBits()
+ tlb_req_index = Signal(TLB_SET_SIZE)
+ tlb_hit = Signal()
+ tlb_hit_way = TLBWay()
+ pte = TLBPte()
+ ra = Signal(REAL_ADDR_BITS)
+ valid_ra = Signal()
+ perm_attr = PermAttr()
+ rc_ok = Signal()
+ perm_ok = Signal()
+ access_ok = Signal()
+
# -- TLB PLRU output interface
# type tlb_plru_out_t is array(tlb_index_t) of
# std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
# signal tlb_plru_victim : tlb_plru_out_t;
-#
-# --
+ # TLB PLRU output interface
+ TLBPLRUOut():
+ return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
+
+ tlb_plru_victim = TLBPLRUOut()
+
# -- Helper functions to decode incoming requests
-# --
#
# -- Return the cache line index (tag index) for an address
# function get_index(addr: std_ulogic_vector) return index_t is
# unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))
# );
# end;
+# Helper functions to decode incoming requests
#
+ # Return the cache line index (tag index) for an address
+ def get_index(addr):
+ return addr[LINE_OFF_BITS:SET_SIZE_BITS]
+
# -- Return the cache row index (data memory) for an address
# function get_row(addr: std_ulogic_vector) return row_t is
# begin
# unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))
# );
# end;
-#
+ # Return the cache row index (data memory) for an address
+ def get_row(addr):
+ return addr[ROW_OFF_BITS:SET_SIZE_BITS]
+
# -- Return the index of a row within a line
# function get_row_of_line(row: row_t) return row_in_line_t is
# variable row_v : unsigned(ROW_BITS-1 downto 0);
# row_v := to_unsigned(row, ROW_BITS);
# return row_v(ROW_LINEBITS-1 downto 0);
# end;
-#
+ # Return the index of a row within a line
+ def get_row_of_line(row):
+ row_v = Signal(ROW_BITS)
+ row_v = Signal(row)
+ return row_v[0:ROW_LINE_BITS]
+
# -- Returns whether this is the last row of a line
# function is_last_row_addr(addr: wishbone_addr_type;
# last: row_in_line_t) return boolean is
# return
# unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
# end;
-#
+ # Returns whether this is the last row of a line
+ def is_last_row_addr(addr, last):
+ return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
+
# -- Returns whether this is the last row of a line
# function is_last_row(row: row_t; last: row_in_line_t)
# return boolean is
# begin
# return get_row_of_line(row) = last;
# end;
-#
+ # Returns whether this is the last row of a line
+ def is_last_row(row, last):
+ return get_row_of_line(row) == last
+
# -- Return the address of the next row in the current cache line
# function next_row_addr(addr: wishbone_addr_type)
# return std_ulogic_vector is
# result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
# return result;
# end;
-#
+ # Return the address of the next row in the current cache line
+ def next_row_addr(addr):
+ row_idx = Signal(ROW_LINE_BITS)
+ result = WBAddrType()
+ # Is there no simpler way in VHDL to
+ # generate that 3 bits adder ?
+ row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS]
+ row_idx = Signal(row_idx + 1)
+ result = addr
+ result[ROW_OFF_BITS:LINE_OFF_BITS] = row_idx
+ return result
+
# -- Return the next row in the current cache line. We use a
# -- dedicated function in order to limit the size of the
# -- generated adder to be only the bits within a cache line
# std_ulogic_vector(unsigned(row_idx) + 1);
# return to_integer(unsigned(row_v));
# end;
-#
+# Return the next row in the current cache line. We use a
+# dedicated function in order to limit the size of the
+# generated adder to be only the bits within a cache line
+# (3 bits with default settings)
+ def next_row(row)
+ row_v = Signal(ROW_BITS)
+ row_idx = Signal(ROW_LINE_BITS)
+ result = Signal(ROW_BITS)
+
+ row_v = Signal(row)
+ row_idx = row_v[ROW_LINE_BITS]
+ row_v[0:ROW_LINE_BITS] = Signal(row_idx + 1)
+ return row_v
+
# -- Get the tag value from the address
# function get_tag(addr: std_ulogic_vector) return cache_tag_t is
# begin
# return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
# end;
-#
+ # Get the tag value from the address
+ def get_tag(addr):
+ return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
+
# -- Read a tag from a tag memory row
# function read_tag(way: way_t; tagset: cache_tags_set_t)
# return cache_tag_t is
# return tagset(way * TAG_WIDTH + TAG_BITS
# - 1 downto way * TAG_WIDTH);
# end;
-#
+ # Read a tag from a tag memory row
+ def read_tag(way, tagset):
+ return tagset[way *TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]
+
# -- Read a TLB tag from a TLB tag memory row
# function read_tlb_tag(way: tlb_way_t; tags: tlb_way_tags_t)
# return tlb_tag_t is
# j := way * TLB_EA_TAG_BITS;
# return tags(j + TLB_EA_TAG_BITS - 1 downto j);
# end;
-#
+ # Read a TLB tag from a TLB tag memory row
+ def read_tlb_tag(way, tags):
+ j = Signal()
+
+ j = way * TLB_EA_TAG_BITS
+ return tags[j:j + TLB_EA_TAG_BITS]
+
# -- Write a TLB tag to a TLB tag memory row
# procedure write_tlb_tag(way: tlb_way_t; tags: inout tlb_way_tags_t;
# tag: tlb_tag_t) is
# j := way * TLB_EA_TAG_BITS;
# tags(j + TLB_EA_TAG_BITS - 1 downto j) := tag;
# end;
-#
+ # Write a TLB tag to a TLB tag memory row
+ def write_tlb_tag(way, tags), tag):
+ j = Signal()
+
+ j = way * TLB_EA_TAG_BITS
+ tags[j:j + TLB_EA_TAG_BITS] = tag
+
# -- Read a PTE from a TLB PTE memory row
# function read_tlb_pte(way: tlb_way_t; ptes: tlb_way_ptes_t)
# return tlb_pte_t is
# j := way * TLB_PTE_BITS;
# return ptes(j + TLB_PTE_BITS - 1 downto j);
# end;
-#
+ # Read a PTE from a TLB PTE memory row
+ def read_tlb_pte(way, ptes):
+ j = Signal()
+
+ j = way * TLB_PTE_BITS
+ return ptes[j:j + TLB_PTE_BITS]
+
# procedure write_tlb_pte(way: tlb_way_t;
# ptes: inout tlb_way_ptes_t; newpte: tlb_pte_t) is
# variable j : integer;
# j := way * TLB_PTE_BITS;
# ptes(j + TLB_PTE_BITS - 1 downto j) := newpte;
# end;
-#
+ def write_tlb_pte(way, ptes),
+ newpte=TLBPte()):
+
+ j = Signal()
+
+ j = way * TLB_PTE_BITS
+ return ptes[j:j + TLB_PTE_BITS] = newpte
+
# begin
#
+"""these, because they are constants, can actually be done *as*
+ python asserts:
+ assert LINE_SIZE % ROWSIZE == 0, "line size not ...."
+"""
# assert LINE_SIZE mod ROW_SIZE = 0
# report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
# assert ispow2(LINE_SIZE)
# severity FAILURE;
# assert SET_SIZE_BITS <= TLB_LG_PGSZ
# report "Set indexed by virtual address" severity FAILURE;
-#
+ assert (LINE_SIZE % ROW_SIZE) == 0 "LINE_SIZE not " \
+ "multiple of ROW_SIZE -!- severity FAILURE"
+
+ assert (LINE_SIZE % 2) == 0 "LINE_SIZE not power of" \
+ "2 -!- severity FAILURE"
+
+ assert (NUM_LINES % 2) == 0 "NUM_LINES not power of
+ 2 -!- severity FAILURE"
+
+ assert (ROW_PER_LINE % 2) == 0 "ROW_PER_LINE not
+ power of 2 -!- severity FAILURE"
+
+ assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)
+ "geometry bits don't add up -!- severity FAILURE"
+
+ assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
+ "geometry bits don't add up -!- severity FAILURE"
+
+ assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS
+ + LINE_OFF_BITS) "geometry bits don't add up -!-
+ severity FAILURE"
+
+ assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)
+ "geometry bits don't add up -!- severity FAILURE"
+
+ assert 64 == wishbone_data_bits "Can't yet handle a
+ wishbone width that isn't 64-bits -!- severity FAILURE"
+
+ assert SET_SIZE_BITS <= TLB_LG_PGSZ "Set indexed by
+ virtual address -!- severity FAILURE"
+
# -- Latch the request in r0.req as long as we're not stalling
# stage_0 : process(clk)
+# Latch the request in r0.req as long as we're not stalling
+class Stage0(Elaboratable):
+ def __init__(self):
+ pass
+
+ def elaborate(self, platform):
+ m = Module()
+
+ comb = m.d.comb
+ sync = m.d.sync
+
# variable r : reg_stage_0_t;
+ r = RegStage0()
+ comb += r
+
# begin
# if rising_edge(clk) then
# assert (d_in.valid and m_in.valid) = '0'
# report "request collision loadstore vs MMU";
+ assert ~(d_in.valid & m_in.valid) "request collision
+ loadstore vs MMU"
+
# if m_in.valid = '1' then
+ with m.If(m_in.valid):
# r.req.valid := '1';
# r.req.load := not (m_in.tlbie or m_in.tlbld);
# r.req.dcbz := '0';
# r.doall := m_in.doall;
# r.tlbld := m_in.tlbld;
# r.mmu_req := '1';
+ sync += r.req.valid.eq(1)
+ sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
+ sync += r.req.priv_mode.eq(1)
+ sync += r.req.addr.eq(m_in.addr)
+ sync += r.req.data.eq(m_in.pte)
+ sync += r.req.byte_sel.eq(1)
+ sync += r.tlbie.eq(m_in.tlbie)
+ sync += r.doall.eq(m_in.doall)
+ sync += r.tlbld.eq(m_in.tlbld)
+ sync += r.mmu_req.eq(1)
# else
+ with m.Else():
# r.req := d_in;
# r.tlbie := '0';
# r.doall := '0';
# r.tlbld := '0';
# r.mmu_req := '0';
+ sync += r.req.eq(d_in)
# end if;
# if rst = '1' then
# r0_full <= '0';
# elsif r1.full = '0' or r0_full = '0' then
+ with m.If(~r1.full | ~r0_full):
# r0 <= r;
# r0_full <= r.req.valid;
+ sync += r0.eq(r)
+ sync += r0_full.eq(r.req.valid)
# end if;
# end if;
# end process;
# -- we don't yet handle collisions between loadstore1 requests
# -- and MMU requests
# m_out.stall <= '0';
-#
+# we don't yet handle collisions between loadstore1 requests
+# and MMU requests
+comb += m_out.stall.eq(0)
+
# -- Hold off the request in r0 when r1 has an uncompleted request
# r0_stall <= r0_full and r1.full;
# r0_valid <= r0_full and not r1.full;
# stall_out <= r0_stall;
-#
+# Hold off the request in r0 when r1 has an uncompleted request
+comb += r0_stall.eq(r0_full & r1.full)
+comb += r0_valid.eq(r0_full & ~r1.full)
+comb += stall_out.eq(r0_stall)
+
# -- TLB
# -- Operates in the second cycle on the request latched in r0.req.
# -- TLB updates write the entry at the end of the second cycle.
# tlb_read : process(clk)
+# TLB
+# Operates in the second cycle on the request latched in r0.req.
+# TLB updates write the entry at the end of the second cycle.
+class TLBRead(Elaboratable):
+ def __init__(self):
+ pass
+
+ def elaborate(self, platform):
+ m = Module()
+
+ comb = m.d.comb
+ sync = m.d.sync
+
# variable index : tlb_index_t;
# variable addrbits :
# std_ulogic_vector(TLB_SET_BITS - 1 downto 0);
+ index = TLB_SET_SIZE
+ addrbits = Signal(TLB_SET_BITS)
+
+ comb += index
+ comb += addrbits
+
# begin
# if rising_edge(clk) then
# if m_in.valid = '1' then
+ with m.If(m_in.valid):
# addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS
# - 1 downto TLB_LG_PGSZ);
+ sync += addrbits.eq(m_in.addr[
+ TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_SET_BITS
+ ])
# else
+ with m.Else():
# addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS
# - 1 downto TLB_LG_PGSZ);
+ sync += addrbits.eq(d_in.addr[
+ TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_SET_BITS
+ ])
# end if;
+
# index := to_integer(unsigned(addrbits));
+ sync += index.eq(addrbits)
# -- If we have any op and the previous op isn't finished,
# -- then keep the same output for next cycle.
# if r0_stall = '0' then
-# tlb_valid_way <= dtlb_valids(index);
-# tlb_tag_way <= dtlb_tags(index);
-# tlb_pte_way <= dtlb_ptes(index);
+# If we have any op and the previous op isn't finished,
+# then keep the same output for next cycle.
+ with m.If(~r0_stall):
+ sync += tlb_valid_way.eq(dtlb_valids[index])
+ sync += tlb_tag_way.eq(dtlb_tags[index])
+ sync += tlb_pte_way.eq(dtlb_ptes[index])
# end if;
# end if;
# end process;
-#
+
# -- Generate TLB PLRUs
# maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate
+# Generate TLB PLRUs
+class MaybeTLBPLRUs(Elaboratable):
+ def __init__(self):
+ pass
+
+ def elaborate(self, platform):
+ m = Module()
+
+ comb = m.d.comb
+ sync = m.d.sync
+
+ with m.If(TLB_NUM_WAYS > 1):
# begin
+# TODO understand how to conver generate statements
# tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate
# -- TLB PLRU interface
# signal tlb_plru_acc :
# end process;
# end generate;
# end generate;
+# end TODO
#
# tlb_search : process(all)
+class TLBSearch(Elaboratable):
+ def __init__(self):
+ pass
+
+ def elborate(self, platform):
+ m = Module()
+
+ comb = m.d.comb
+ sync = m.d.sync
+
# variable hitway : tlb_way_t;
# variable hit : std_ulogic;
# variable eatag : tlb_tag_t;
+ hitway = TLBWay()
+ hit = Signal()
+ eatag = TLBTag()
+
+ comb += hitway
+ comb += hit
+ comb += eatag
+
# begin
# tlb_req_index <=
# to_integer(unsigned(r0.req.addr(
# end loop;
# tlb_hit <= hit and r0_valid;
# tlb_hit_way <= hitway;
+ comb += tlb_req_index.eq(r0.req.addr[
+ TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_SET_BITS
+ ])
+
+ comb += eatag.eq(r0.req.addr[
+ TLB_LG_PGSZ + TLB_SET_BITS:64
+ ])
+
+ for i in TLBWay():
+ with m.If(tlb_valid_way(i)
+ & read_tlb_tag(i, tlb_tag_way) == eatag):
+
+ comb += hitway.eq(i)
+ comb += hit.eq(1)
+
+ comb += tlb_hit.eq(hit & r0_valid)
+ comb += tlb_hit_way.eq(hitway)
+
# if tlb_hit = '1' then
+ with m.If(tlb_hit):
# pte <= read_tlb_pte(hitway, tlb_pte_way);
+ comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
# else
+ with m.Else():
# pte <= (others => '0');
+ comb += pte.eq(0)
# end if;
# valid_ra <= tlb_hit or not r0.req.virt_mode;
+ comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
# if r0.req.virt_mode = '1' then
+ with m.If(r0.req.virt_mode):
# ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
# r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) &
# (ROW_OFF_BITS-1 downto 0 => '0');
# perm_attr <= extract_perm_attr(pte);
+ comb += ra.eq(Cat(
+ Const(ROW_OFF_BITS, ROW_OFF_BITS),
+ r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
+ pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
+ ))
+ comb += perm_attr.eq(extract_perm_attr(pte))
# else
+ with m.Else():
# ra <= r0.req.addr(
# REAL_ADDR_BITS - 1 downto ROW_OFF_BITS
# ) & (ROW_OFF_BITS-1 downto 0 => '0');
+ comb += ra.eq(Cat(
+ Const(ROW_OFF_BITS, ROW_OFF_BITS),
+ r0.rq.addr[ROW_OFF_BITS:REAL_ADDR_BITS]
+ )
+
# perm_attr <= real_mode_perm_attr;
+ comb += perm_attr.eq(real_mode_perm_attr)
# end if;
# end process;
-#
+
# tlb_update : process(clk)
+class TLBUpdate(Elaboratable):
+ def __init__(self):
+ pass
+
+ def elaborate(self, platform):
+ m = Module()
+
+ comb = m.d.comb
+ sync = m.d.sync
+
# variable tlbie : std_ulogic;
# variable tlbwe : std_ulogic;
# variable repl_way : tlb_way_t;
# variable eatag : tlb_tag_t;
# variable tagset : tlb_way_tags_t;
# variable pteset : tlb_way_ptes_t;
+ tlbie = Signal()
+ tlbwe = Signal()
+ repl_way = TLBWay()
+ eatag = TLBTag()
+ tagset = TLBWayTags()
+ pteset = TLBWayPtes()
+
+ comb += tlbie
+ comb += tlbwe
+ comb += repl_way
+ comb += eatag
+ comb += tagset
+ comb += pteset
+
# begin
# if rising_edge(clk) then
# tlbie := r0_valid and r0.tlbie;
-# tlbwe := r0_valid and r0.tlbld;
+# tlbwe := r0_valid and r0.tlbldoi;
+ sync += tlbie.eq(r0_valid & r0.tlbie)
+ sync += tlbwe.eq(r0_valid & r0.tlbldoi)
+
# if rst = '1' or (tlbie = '1' and r0.doall = '1') then
+# with m.If (TODO understand how signal resets work in nmigen)
# -- clear all valid bits at once
# for i in tlb_index_t loop
# dtlb_valids(i) <= (others => '0');
# end loop;
+ # clear all valid bits at once
+ for i in range(TLB_SET_SIZE):
+ sync += dtlb_valids[i].eq(0)
# elsif tlbie = '1' then
+ with m.Elif(tlbie):
# if tlb_hit = '1' then
+ with m.If(tlb_hit):
# dtlb_valids(tlb_req_index)(tlb_hit_way) <= '0';
+ sync += dtlb_valids[tlb_req_index][tlb_hit_way].eq(0)
# end if;
# elsif tlbwe = '1' then
+ with m.Elif(tlbwe):
# if tlb_hit = '1' then
+ with m.If(tlb_hit):
# repl_way := tlb_hit_way;
+ sync += repl_way.eq(tlb_hit_way)
# else
+ with m.Else():
# repl_way := to_integer(unsigned(
# tlb_plru_victim(tlb_req_index)));
+ sync += repl_way.eq(tlb_plru_victim[tlb_req_index])
# end if;
# eatag := r0.req.addr(
# 63 downto TLB_LG_PGSZ + TLB_SET_BITS
# write_tlb_pte(repl_way, pteset, r0.req.data);
# dtlb_ptes(tlb_req_index) <= pteset;
# dtlb_valids(tlb_req_index)(repl_way) <= '1';
+ sync += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
+ sync += tagset.eq(tlb_tag_way)
+ sync += write_tlb_tag(repl_way, tagset, eatag)
+ sync += dtlb_tags[tlb_req_index].eq(tagset)
+ sync += pteset.eq(tlb_pte_way)
+ sync += write_tlb_pte(repl_way, pteset, r0.req.data)
+ sync += dtlb_ptes[tlb_req_index].eq(pteset)
+ sync += dtlb_valids[tlb_req_index][repl_way].eq(1)
# end if;
# end if;
# end process;
-#
+
# -- Generate PLRUs
# maybe_plrus: if NUM_WAYS > 1 generate
+class MaybePLRUs(Elaboratable):
+ def __init__(self):
+ pass
+
+ def elaborate(self, platform):
+ m = Module()
+
+ comb = m.d.comb
+ sync = m.d.sync
+
# begin
+ # TODO learn translation of generate into nmgien @lkcl
# plrus: for i in 0 to NUM_LINES-1 generate
# -- PLRU interface
# signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
# signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
#
# begin
+ # TODO learn tranlation of entity, generic map, port map in
+ # nmigen @lkcl
# plru : entity work.plru
# generic map (
# BITS => WAY_BITS