3 based on Anton Blanchard microwatt icache.vhdl
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
22 from enum
import Enum
, unique
23 from nmigen
import (Module
, Signal
, Elaboratable
, Cat
, Array
, Const
)
24 from nmigen
.cli
import main
25 from nmigen
.cli
import rtlil
26 from nmutil
.iocontrol
import RecordObject
27 from nmutil
.byterev
import byte_reverse
28 from nmutil
.mask
import Mask
29 from nmigen
.utils
import log2_int
30 from nmutil
.util
import Display
32 from soc
.experiment
.mem_types
import (Fetch1ToICacheType
,
36 from soc
.experiment
.wb_types
import (WB_ADDR_BITS
, WB_DATA_BITS
,
37 WB_SEL_BITS
, WBAddrType
, WBDataType
,
38 WBSelType
, WBMasterOut
, WBSlaveOut
,
39 WBMasterOutVector
, WBSlaveOutVector
,
40 WBIOMasterOut
, WBIOSlaveOut
)
42 from soc
.experiment
.cache_ram
import CacheRam
43 from soc
.experiment
.plru
import PLRU
46 from nmigen_soc
.wishbone
.sram
import SRAM
47 from nmigen
import Memory
48 from nmigen
.cli
import rtlil
50 from nmigen
.back
.pysim
import Simulator
, Delay
, Settle
52 from nmigen
.sim
.cxxsim
import Simulator
, Delay
, Settle
53 from nmutil
.util
import wrap
59 # BRAM organisation: We never access more than wishbone_data_bits
60 # at a time so to save resources we make the array only that wide,
61 # and use consecutive indices to make a cache "line"
63 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
64 ROW_SIZE
= WB_DATA_BITS
// 8
65 # Number of lines in a set
69 # L1 ITLB number of entries (direct mapped)
71 # L1 ITLB log_2(page_size)
73 # Number of real address bits that we store
75 # Non-zero to enable log data collection
78 ROW_SIZE_BITS
= ROW_SIZE
* 8
79 # ROW_PER_LINE is the number of row
80 # (wishbone) transactions in a line
81 ROW_PER_LINE
= LINE_SIZE
// ROW_SIZE
82 # BRAM_ROWS is the number of rows in
83 # BRAM needed to represent the full icache
84 BRAM_ROWS
= NUM_LINES
* ROW_PER_LINE
85 # INSN_PER_ROW is the number of 32bit
86 # instructions per BRAM row
87 INSN_PER_ROW
= ROW_SIZE_BITS
// 32
89 # Bit fields counts in the address
91 # INSN_BITS is the number of bits to
92 # select an instruction in a row
93 INSN_BITS
= log2_int(INSN_PER_ROW
)
94 # ROW_BITS is the number of bits to
96 ROW_BITS
= log2_int(BRAM_ROWS
)
97 # ROW_LINE_BITS is the number of bits to
98 # select a row within a line
99 ROW_LINE_BITS
= log2_int(ROW_PER_LINE
)
100 # LINE_OFF_BITS is the number of bits for
101 # the offset in a cache line
102 LINE_OFF_BITS
= log2_int(LINE_SIZE
)
103 # ROW_OFF_BITS is the number of bits for
104 # the offset in a row
105 ROW_OFF_BITS
= log2_int(ROW_SIZE
)
106 # INDEX_BITS is the number of bits to
107 # select a cache line
108 INDEX_BITS
= log2_int(NUM_LINES
)
109 # SET_SIZE_BITS is the log base 2 of
111 SET_SIZE_BITS
= LINE_OFF_BITS
+ INDEX_BITS
112 # TAG_BITS is the number of bits of
113 # the tag part of the address
114 TAG_BITS
= REAL_ADDR_BITS
- SET_SIZE_BITS
115 # WAY_BITS is the number of bits to
117 WAY_BITS
= log2_int(NUM_WAYS
)
118 TAG_RAM_WIDTH
= TAG_BITS
* NUM_WAYS
121 # constant TLB_BITS : natural := log2(TLB_SIZE);
122 # constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
123 # constant TLB_PTE_BITS : natural := 64;
124 TLB_BITS
= log2_int(TLB_SIZE
)
125 TLB_EA_TAG_BITS
= 64 - (TLB_LG_PGSZ
+ TLB_BITS
)
128 # architecture rtl of icache is
129 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
130 #-- ROW_PER_LINE is the number of row (wishbone
131 #-- transactions) in a line
132 #constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
133 #-- BRAM_ROWS is the number of rows in BRAM
134 #-- needed to represent the full
136 #constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
137 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
138 #constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
139 #-- Bit fields counts in the address
141 #-- INSN_BITS is the number of bits to select
142 #-- an instruction in a row
143 #constant INSN_BITS : natural := log2(INSN_PER_ROW);
144 #-- ROW_BITS is the number of bits to select a row
145 #constant ROW_BITS : natural := log2(BRAM_ROWS);
146 #-- ROW_LINEBITS is the number of bits to
147 #-- select a row within a line
148 #constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
149 #-- LINE_OFF_BITS is the number of bits for the offset
151 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
152 #-- ROW_OFF_BITS is the number of bits for the offset in a row
153 #constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
154 #-- INDEX_BITS is the number of bits to select a cache line
155 #constant INDEX_BITS : natural := log2(NUM_LINES);
156 #-- SET_SIZE_BITS is the log base 2 of the set size
157 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
158 #-- TAG_BITS is the number of bits of the tag part of the address
159 #constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
160 #-- WAY_BITS is the number of bits to select a way
161 #constant WAY_BITS : natural := log2(NUM_WAYS);
163 #-- Example of layout for 32 lines of 64 bytes:
165 #-- .. tag |index| line |
167 #-- .. | | | |00| zero (2)
168 #-- .. | | |-| | INSN_BITS (1)
169 #-- .. | |---| | ROW_LINEBITS (3)
170 #-- .. | |--- - --| LINE_OFF_BITS (6)
171 #-- .. | |- --| ROW_OFF_BITS (3)
172 #-- .. |----- ---| | ROW_BITS (8)
173 #-- .. |-----| | INDEX_BITS (5)
174 #-- .. --------| | TAG_BITS (53)
175 # Example of layout for 32 lines of 64 bytes:
177 # .. tag |index| line |
179 # .. | | | |00| zero (2)
180 # .. | | |-| | INSN_BITS (1)
181 # .. | |---| | ROW_LINEBITS (3)
182 # .. | |--- - --| LINE_OFF_BITS (6)
183 # .. | |- --| ROW_OFF_BITS (3)
184 # .. |----- ---| | ROW_BITS (8)
185 # .. |-----| | INDEX_BITS (5)
186 # .. --------| | TAG_BITS (53)
188 #subtype row_t is integer range 0 to BRAM_ROWS-1;
189 #subtype index_t is integer range 0 to NUM_LINES-1;
190 #subtype way_t is integer range 0 to NUM_WAYS-1;
191 #subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
193 #-- The cache data BRAM organized as described above for each way
194 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
196 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
197 #-- not handle a clean (commented) definition of the cache tags as a 3d
198 #-- memory. For now, work around it by putting all the tags
199 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
200 # type cache_tags_set_t is array(way_t) of cache_tag_t;
201 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
202 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
203 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
204 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
206 return Array(Signal(TAG_RAM_WIDTH
) for x
in range(NUM_LINES
))
208 #-- The cache valid bits
209 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
210 #type cache_valids_t is array(index_t) of cache_way_valids_t;
211 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    """Build the cache valid-bit storage.

    Returns an nmigen Array holding NUM_LINES Signals, each NUM_WAYS
    bits wide (one valid bit per way of the line).
    """
    # Built as a list first; the Signals are deliberately not bound to
    # a local name so that their auto-generated names stay unchanged.
    per_line_valids = [Signal(NUM_WAYS) for _ in range(NUM_LINES)]
    return Array(per_line_valids)
def RowPerLineValidArray():
    """Build the per-row valid flags of one cache line.

    Returns an nmigen Array of ROW_PER_LINE single-bit Signals.
    """
    row_flags = [Signal() for _ in range(ROW_PER_LINE)]
    return Array(row_flags)
219 #attribute ram_style : string;
220 #attribute ram_style of cache_tags : signal is "distributed";
221 # TODO to be passed to nmigen as ram attributes
222 # attribute ram_style : string;
223 # attribute ram_style of cache_tags : signal is "distributed";
226 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
227 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
228 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
229 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
230 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
231 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """Build the ITLB valid-bit storage.

    Returns an nmigen Array of TLB_SIZE single-bit Signals, one per
    TLB entry.
    """
    entry_valids = [Signal() for _ in range(TLB_SIZE)]
    return Array(entry_valids)
236 return Array(Signal(TLB_EA_TAG_BITS
) for x
in range(TLB_SIZE
))
239 return Array(Signal(TLB_PTE_BITS
) for x
in range(TLB_SIZE
))
242 #-- Cache RAM interface
243 #type cache_ram_out_t is array(way_t) of cache_row_t;
244 # Cache RAM interface
246 return Array(Signal(ROW_SIZE_BITS
) for x
in range(NUM_WAYS
))
248 #-- PLRU output interface
249 #type plru_out_t is array(index_t) of
250 # std_ulogic_vector(WAY_BITS-1 downto 0);
251 # PLRU output interface
253 return Array(Signal(WAY_BITS
) for x
in range(NUM_LINES
))
255 # -- Return the cache line index (tag index) for an address
256 # function get_index(addr: std_ulogic_vector(63 downto 0))
259 # return to_integer(unsigned(
260 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
263 # Return the cache line index (tag index) for an address
265 return addr
[LINE_OFF_BITS
:SET_SIZE_BITS
]
267 # -- Return the cache row index (data memory) for an address
268 # function get_row(addr: std_ulogic_vector(63 downto 0))
271 # return to_integer(unsigned(
272 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
275 # Return the cache row index (data memory) for an address
277 return addr
[ROW_OFF_BITS
:SET_SIZE_BITS
]
279 # -- Return the index of a row within a line
280 # function get_row_of_line(row: row_t) return row_in_line_t is
281 # variable row_v : unsigned(ROW_BITS-1 downto 0);
283 # row_v := to_unsigned(row, ROW_BITS);
284 # return row_v(ROW_LINEBITS-1 downto 0);
286 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of a row within its cache line.

    The result is the low ROW_LINE_BITS bits of the BRAM row number
    ``row``.
    """
    in_line_bits = slice(0, ROW_LINE_BITS)
    return row[in_line_bits]
290 # -- Returns whether this is the last row of a line
291 # function is_last_row_addr(addr: wishbone_addr_type;
292 # last: row_in_line_t
297 # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
300 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """Tell whether ``addr`` points at the last row of a line.

    Compares address bits [ROW_OFF_BITS, LINE_OFF_BITS) -- the row
    number inside the line -- against ``last`` and returns the
    comparison expression.
    """
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
304 # -- Returns whether this is the last row of a line
305 # function is_last_row(row: row_t;
306 # last: row_in_line_t) return boolean is
308 # return get_row_of_line(row) = last;
310 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """Tell whether BRAM row ``row`` is the last row of its line.

    The in-line row index (see get_row_of_line) is compared with
    ``last``; the comparison expression is returned.
    """
    row_in_line = get_row_of_line(row)
    return row_in_line == last
314 # -- Return the address of the next row in the current cache line
315 # function next_row_addr(addr: wishbone_addr_type)
316 # return std_ulogic_vector is
317 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
318 # variable result : wishbone_addr_type;
320 # -- Is there no simpler way in VHDL to generate that 3 bits adder ?
321 # row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
322 # row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
324 # result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
327 # Return the address of the next row in the current cache line
def next_row_addr(addr):
    """Advance ``addr`` to the next row of the current cache line.

    Only bits [ROW_OFF_BITS, LINE_OFF_BITS) take part in the addition,
    so the increment wraps inside the line and the adder stays small.

    Returns the nmigen assignment that writes the incremented row field
    back into ``addr``; it does not return a new address value.
    NOTE(review): the VHDL function returns the resulting address
    instead -- confirm that callers add this statement to a domain.
    """
    row_field = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_field.eq(row_field + 1)
332 # -- Return the next row in the current cache line. We use a dedicated
333 # -- function in order to limit the size of the generated adder to be
334 # -- only the bits within a cache line (3 bits with default settings)
335 # function next_row(row: row_t) return row_t is
336 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
337 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
338 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
340 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
341 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
342 # row_v(ROW_LINEBITS-1 downto 0) :=
343 # std_ulogic_vector(unsigned(row_idx) + 1);
344 # return to_integer(unsigned(row_v));
346 # Return the next row in the current cache line. We use a dedicated
347 # function in order to limit the size of the generated adder to be
348 # only the bits within a cache line (3 bits with default settings)
350 row_idx
= row
[:ROW_LINE_BITS
]
351 return row
[:ROW_LINE_BITS
].eq(row_idx
+ 1)
353 # -- Read the instruction word for the given address in the
354 # -- current cache row
355 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
356 # data: cache_row_t) return std_ulogic_vector is
357 # variable word: integer range 0 to INSN_PER_ROW-1;
359 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
360 # return data(31+word*32 downto word*32);
362 # Read the instruction word for the given address
363 # in the current cache row
def read_insn_word(addr, data):
    """Pick the 32-bit instruction addressed by ``addr`` out of a row.

    addr: instruction address; bits [2, INSN_BITS+2) select the word
          inside the row (the two low bits are always zero because
          instructions are 4-byte aligned).
    data: one cache row (ROW_SIZE_BITS wide).

    Returns the selected 32-bit slice of ``data``.
    """
    # The selector must be exactly INSN_BITS wide, matching the VHDL
    # ``addr(INSN_BITS+2-1 downto 2)``.  The previous upper bound of
    # INSN_BITS+3 also pulled in the lowest row-select bit, which could
    # index past the end of the row.
    word = addr[2:INSN_BITS + 2]
    return data.word_select(word, 32)
368 # -- Get the tag value from the address
370 # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
372 # return cache_tag_t is
374 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
376 # Get the tag value from the address
378 return addr
[SET_SIZE_BITS
:REAL_ADDR_BITS
]
380 # -- Read a tag from a tag memory row
381 # function read_tag(way: way_t; tagset: cache_tags_set_t)
382 # return cache_tag_t is
384 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
386 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Extract the tag of one way from a tag memory row.

    way:    way number, used as a slice bound.
    tagset: a tag row holding the tags of all ways, TAG_BITS each,
            way 0 in the least significant bits.

    Returns the TAG_BITS-wide slice belonging to ``way``.
    """
    first_bit = way * TAG_BITS
    return tagset[first_bit:first_bit + TAG_BITS]
390 # -- Write a tag to tag memory row
391 # procedure write_tag(way: in way_t;
392 # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
394 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
396 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Build the assignment that stores ``tag`` into a tag memory row.

    way:    way number, used as a slice bound.
    tagset: the tag row to update (TAG_BITS per way, way 0 lowest).
    tag:    the new TAG_BITS-wide tag value.

    Returns the nmigen assignment statement; the caller must add it to
    a domain (e.g. ``comb += write_tag(...)``), in the same way as the
    statements returned by next_row_addr / next_row.

    Python slice assignment (``tagset[a:b] = tag``) is not supported on
    nmigen values and generates no hardware, so the write is expressed
    with ``.eq()`` instead.
    """
    first_bit = way * TAG_BITS
    return tagset[first_bit:first_bit + TAG_BITS].eq(tag)
400 # -- Simple hash for direct-mapped TLB index
401 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
402 # return tlb_index_t is
403 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
405 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
407 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
408 # TLB_LG_PGSZ + TLB_BITS
411 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
412 # TLB_LG_PGSZ + 2 * TLB_BITS
414 # return to_integer(unsigned(hash));
416 # Simple hash for direct-mapped TLB index
418 hsh
= addr
[TLB_LG_PGSZ
:TLB_LG_PGSZ
+ TLB_BITS
] ^ addr
[
419 TLB_LG_PGSZ
+ TLB_BITS
:TLB_LG_PGSZ
+ 2 * TLB_BITS
421 TLB_LG_PGSZ
+ 2 * TLB_BITS
:TLB_LG_PGSZ
+ 3 * TLB_BITS
427 # assert LINE_SIZE mod ROW_SIZE = 0;
428 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
430 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
432 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
434 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
436 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
437 # report "geometry bits don't add up" severity FAILURE;
438 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
439 # report "geometry bits don't add up" severity FAILURE;
440 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
441 # report "geometry bits don't add up" severity FAILURE;
442 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
443 # report "geometry bits don't add up" severity FAILURE;
445 # sim_debug: if SIM generate
448 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
449 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
450 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
451 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
452 # report "INSN_BITS = " & natural'image(INSN_BITS);
453 # report "ROW_BITS = " & natural'image(ROW_BITS);
454 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
455 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
456 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
457 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
458 # report "TAG_BITS = " & natural'image(TAG_BITS);
459 # report "WAY_BITS = " & natural'image(WAY_BITS);
464 # Cache reload state machine
471 # type reg_internal_t is record
472 # -- Cache hit state (Latches for 1 cycle BRAM access)
474 # hit_nia : std_ulogic_vector(63 downto 0);
475 # hit_smark : std_ulogic;
476 # hit_valid : std_ulogic;
478 # -- Cache miss state (reload state machine)
480 # wb : wishbone_master_out;
482 # store_index : index_t;
484 # store_tag : cache_tag_t;
485 # store_valid : std_ulogic;
486 # end_row_ix : row_in_line_t;
487 # rows_valid : row_per_line_valid_t;
490 # fetch_failed : std_ulogic;
class RegInternal(RecordObject):
    """Internal registered state of the icache.

    Holds the one-cycle hit latches, the state of the miss/reload
    state machine and the fetch-failed flag.

    Way, line and row fields store *indices*, so they are sized with
    the corresponding log2 widths (WAY_BITS, INDEX_BITS, ROW_BITS)
    rather than with the element counts.
    """
    # NOTE(review): the __init__ header and super().__init__() call were
    # missing from the text under review and are restored here -- confirm.
    def __init__(self):
        super().__init__()

        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.wb = WBMasterOut()
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        self.fetch_failed = Signal()
515 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
519 # SIM : boolean := false;
520 # -- Line size in bytes
521 # LINE_SIZE : positive := 64;
522 # -- BRAM organisation: We never access more
523 # -- than wishbone_data_bits
524 # -- at a time so to save resources we make the
525 # -- array only that wide,
526 # -- and use consecutive indices for to make a cache "line"
528 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
530 # ROW_SIZE : positive := wishbone_data_bits / 8;
531 # -- Number of lines in a set
532 # NUM_LINES : positive := 32;
534 # NUM_WAYS : positive := 4;
535 # -- L1 ITLB number of entries (direct mapped)
536 # TLB_SIZE : positive := 64;
537 # -- L1 ITLB log_2(page_size)
538 # TLB_LG_PGSZ : positive := 12;
539 # -- Number of real address bits that we store
540 # REAL_ADDR_BITS : positive := 56;
541 # -- Non-zero to enable log data collection
542 # LOG_LENGTH : natural := 0
545 # clk : in std_ulogic;
546 # rst : in std_ulogic;
548 # i_in : in Fetch1ToIcacheType;
549 # i_out : out IcacheToDecode1Type;
551 # m_in : in MmuToIcacheType;
553 # stall_in : in std_ulogic;
554 # stall_out : out std_ulogic;
555 # flush_in : in std_ulogic;
556 # inval_in : in std_ulogic;
558 # wishbone_out : out wishbone_master_out;
559 # wishbone_in : in wishbone_slave_out;
561 # log_out : out std_ulogic_vector(53 downto 0)
564 # 64 bit set-associative icache (NUM_WAYS ways). All instructions are 4B aligned.
565 class ICache(Elaboratable
):
566 """64 bit direct mapped icache. All instructions are 4B aligned."""
568 self
.i_in
= Fetch1ToICacheType()
569 self
.i_out
= ICacheToDecode1Type()
571 self
.m_in
= MMUToICacheType()
573 self
.stall_in
= Signal()
574 self
.stall_out
= Signal()
575 self
.flush_in
= Signal()
576 self
.inval_in
= Signal()
578 self
.wb_out
= WBMasterOut()
579 self
.wb_in
= WBSlaveOut()
581 self
.log_out
= Signal(54)
584 # -- Generate a cache RAM for each way
585 # rams: for i in 0 to NUM_WAYS-1 generate
586 # signal do_read : std_ulogic;
587 # signal do_write : std_ulogic;
588 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
589 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
590 # signal dout : cache_row_t;
591 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
593 # way: entity work.cache_ram
595 # ROW_BITS => ROW_BITS,
596 # WIDTH => ROW_SIZE_BITS
601 # rd_addr => rd_addr,
604 # wr_addr => wr_addr,
605 # wr_data => wishbone_in.dat
609 # do_read <= not (stall_in or use_previous);
611 # if wishbone_in.ack = '1' and replace_way = i then
614 # cache_out(i) <= dout;
616 # std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
618 # std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
619 # for i in 0 to ROW_SIZE-1 loop
620 # wr_sel(i) <= do_write;
624 def rams(self
, m
, r
, cache_out
, use_previous
, replace_way
, req_row
):
627 wb_in
, stall_in
= self
.wb_in
, self
.stall_in
631 rd_addr
= Signal(ROW_BITS
)
632 wr_addr
= Signal(ROW_BITS
)
633 _d_out
= Signal(ROW_SIZE_BITS
)
634 wr_sel
= Signal(ROW_SIZE
)
636 for i
in range(NUM_WAYS
):
637 way
= CacheRam(ROW_BITS
, ROW_SIZE_BITS
)
638 comb
+= way
.rd_en
.eq(do_read
)
639 comb
+= way
.rd_addr
.eq(rd_addr
)
640 comb
+= way
.rd_data_o
.eq(_d_out
)
641 comb
+= way
.wr_sel
.eq(wr_sel
)
642 comb
+= way
.wr_addr
.eq(wr_addr
)
643 comb
+= way
.wr_data
.eq(wb_in
.dat
)
645 comb
+= do_read
.eq(~
(stall_in | use_previous
))
646 comb
+= do_write
.eq(0)
648 with m
.If(wb_in
.ack
& (replace_way
== i
)):
649 comb
+= do_write
.eq(1)
651 comb
+= cache_out
[i
].eq(_d_out
)
652 comb
+= rd_addr
.eq(req_row
)
653 comb
+= wr_addr
.eq(r
.store_row
)
654 for j
in range(ROW_SIZE
):
655 comb
+= wr_sel
[j
].eq(do_write
)
658 # maybe_plrus: if NUM_WAYS > 1 generate
660 # plrus: for i in 0 to NUM_LINES-1 generate
662 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
663 # signal plru_acc_en : std_ulogic;
664 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
667 # plru : entity work.plru
675 # acc_en => plru_acc_en,
682 # if get_index(r.hit_nia) = i then
683 # plru_acc_en <= r.hit_valid;
685 # plru_acc_en <= '0';
688 # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
689 # plru_victim(i) <= plru_out;
693 def maybe_plrus(self
, m
, r
, plru_victim
):
696 with m
.If(NUM_WAYS
> 1):
697 for i
in range(NUM_LINES
):
698 plru_acc_i
= Signal(WAY_BITS
)
699 plru_acc_en
= Signal()
700 plru_out
= Signal(WAY_BITS
)
701 plru
= PLRU(WAY_BITS
)
702 comb
+= plru
.acc_i
.eq(plru_acc_i
)
703 comb
+= plru
.acc_en
.eq(plru_acc_en
)
704 comb
+= plru
.lru_o
.eq(plru_out
)
707 with m
.If(get_index(r
.hit_nia
) == i
):
708 comb
+= plru
.acc_en
.eq(r
.hit_valid
)
711 comb
+= plru
.acc_en
.eq(0)
713 comb
+= plru
.acc_i
.eq(r
.hit_way
)
714 comb
+= plru_victim
[i
].eq(plru
.lru_o
)
716 # -- TLB hit detection and real address generation
717 # itlb_lookup : process(all)
718 # variable pte : tlb_pte_t;
719 # variable ttag : tlb_tag_t;
721 # tlb_req_index <= hash_ea(i_in.nia);
722 # pte := itlb_ptes(tlb_req_index);
723 # ttag := itlb_tags(tlb_req_index);
724 # if i_in.virt_mode = '1' then
725 # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
726 # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
727 # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
728 # ra_valid <= itlb_valids(tlb_req_index);
732 # eaa_priv <= pte(3);
734 # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
739 # -- no IAMR, so no KUEP support for now
740 # priv_fault <= eaa_priv and not i_in.priv_mode;
741 # access_ok <= ra_valid and not priv_fault;
743 # TLB hit detection and real address generation
744 def itlb_lookup(self
, m
, tlb_req_index
, itlb_ptes
, itlb_tags
,
745 real_addr
, itlb_valid_bits
, ra_valid
, eaa_priv
,
746 priv_fault
, access_ok
):
751 pte
= Signal(TLB_PTE_BITS
)
752 ttag
= Signal(TLB_EA_TAG_BITS
)
754 comb
+= tlb_req_index
.eq(hash_ea(i_in
.nia
))
755 comb
+= pte
.eq(itlb_ptes
[tlb_req_index
])
756 comb
+= ttag
.eq(itlb_tags
[tlb_req_index
])
758 with m
.If(i_in
.virt_mode
):
759 comb
+= real_addr
.eq(Cat(
760 i_in
.nia
[:TLB_LG_PGSZ
],
761 pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]
764 with m
.If(ttag
== i_in
.nia
[TLB_LG_PGSZ
+ TLB_BITS
:64]):
765 comb
+= ra_valid
.eq(itlb_valid_bits
[tlb_req_index
])
768 comb
+= ra_valid
.eq(0)
771 comb
+= real_addr
.eq(i_in
.nia
[:REAL_ADDR_BITS
])
772 comb
+= ra_valid
.eq(1)
773 comb
+= eaa_priv
.eq(1)
775 # No IAMR, so no KUEP support for now
776 comb
+= priv_fault
.eq(eaa_priv
& ~i_in
.priv_mode
)
777 comb
+= access_ok
.eq(ra_valid
& ~priv_fault
)
780 # itlb_update: process(clk)
781 # variable wr_index : tlb_index_t;
783 # if rising_edge(clk) then
784 # wr_index := hash_ea(m_in.addr);
786 # (m_in.tlbie = '1' and m_in.doall = '1') then
787 # -- clear all valid bits
788 # for i in tlb_index_t loop
789 # itlb_valids(i) <= '0';
791 # elsif m_in.tlbie = '1' then
792 # -- clear entry regardless of hit or miss
793 # itlb_valids(wr_index) <= '0';
794 # elsif m_in.tlbld = '1' then
795 # itlb_tags(wr_index) <=
796 # m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
797 # itlb_ptes(wr_index) <= m_in.pte;
798 # itlb_valids(wr_index) <= '1';
803 def itlb_update(self
, m
, itlb_valid_bits
, itlb_tags
, itlb_ptes
):
809 wr_index
= Signal(TLB_SIZE
)
810 comb
+= wr_index
.eq(hash_ea(m_in
.addr
))
812 with m
.If(m_in
.tlbie
& m_in
.doall
):
813 # Clear all valid bits
814 for i
in range(TLB_SIZE
):
815 sync
+= itlb_valid_bits
[i
].eq(0)
817 with m
.Elif(m_in
.tlbie
):
818 # Clear entry regardless of hit or miss
819 sync
+= itlb_valid_bits
[wr_index
].eq(0)
821 with m
.Elif(m_in
.tlbld
):
822 sync
+= itlb_tags
[wr_index
].eq(
823 m_in
.addr
[TLB_LG_PGSZ
+ TLB_BITS
:64]
825 sync
+= itlb_ptes
[wr_index
].eq(m_in
.pte
)
826 sync
+= itlb_valid_bits
[wr_index
].eq(1)
828 # -- Cache hit detection, output to fetch2 and other misc logic
829 # icache_comb : process(all)
830 # Cache hit detection, output to fetch2 and other misc logic
831 def icache_comb(self
, m
, use_previous
, r
, req_index
, req_row
,
832 req_tag
, real_addr
, req_laddr
, cache_valid_bits
,
833 cache_tags
, access_ok
, req_is_hit
,
834 req_is_miss
, replace_way
, plru_victim
, cache_out
):
835 # variable is_hit : std_ulogic;
836 # variable hit_way : way_t;
839 i_in
, i_out
, wb_out
= self
.i_in
, self
.i_out
, self
.wb_out
840 flush_in
, stall_out
= self
.flush_in
, self
.stall_out
843 hit_way
= Signal(NUM_WAYS
)
845 # -- i_in.sequential means that i_in.nia this cycle
846 # -- is 4 more than last cycle. If we read more
847 # -- than 32 bits at a time, had a cache hit last
848 # -- cycle, and we don't want the first 32-bit chunk
849 # -- then we can keep the data we read last cycle
850 # -- and just use that.
851 # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
852 # use_previous <= i_in.sequential and r.hit_valid;
854 # use_previous <= '0';
856 # i_in.sequential means that i_in.nia this cycle is 4 more than
857 # last cycle. If we read more than 32 bits at a time, had a
858 # cache hit last cycle, and we don't want the first 32-bit chunk
859 # then we can keep the data we read last cycle and just use that.
860 with m
.If(i_in
.nia
[2:INSN_BITS
+2] != 0):
861 comb
+= use_previous
.eq(i_in
.sequential
& r
.hit_valid
)
864 comb
+= use_previous
.eq(0)
866 # -- Extract line, row and tag from request
867 # req_index <= get_index(i_in.nia);
868 # req_row <= get_row(i_in.nia);
869 # req_tag <= get_tag(real_addr);
870 # Extract line, row and tag from request
871 comb
+= req_index
.eq(get_index(i_in
.nia
))
872 comb
+= req_row
.eq(get_row(i_in
.nia
))
873 comb
+= req_tag
.eq(get_tag(real_addr
))
875 # -- Calculate address of beginning of cache row, will be
876 # -- used for cache miss processing if needed
878 # (63 downto REAL_ADDR_BITS => '0') &
879 # real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
880 # (ROW_OFF_BITS-1 downto 0 => '0');
881 # Calculate address of beginning of cache row, will be
882 # used for cache miss processing if needed
883 comb
+= req_laddr
.eq(Cat(
884 Const(0b0, ROW_OFF_BITS
),
885 real_addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
],
886 Const(0, REAL_ADDR_BITS
)
889 # -- Test if pending request is a hit on any way
892 # for i in way_t loop
893 # if i_in.req = '1' and
894 # (cache_valids(req_index)(i) = '1' or
895 # (r.state = WAIT_ACK and
896 # req_index = r.store_index and
897 # i = r.store_way and
898 # r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
899 # if read_tag(i, cache_tags(req_index)) = req_tag then
905 # Test if pending request is a hit on any way
906 for i
in range(NUM_WAYS
):
908 (cache_valid_bits
[req_index
][i
] |
909 ((r
.state
== State
.WAIT_ACK
)
910 & (req_index
== r
.store_index
)
912 & r
.rows_valid
[req_row
% ROW_PER_LINE
]))):
913 with m
.If(read_tag(i
, cache_tags
[req_index
]) == req_tag
):
914 comb
+= hit_way
.eq(i
)
917 # -- Generate the "hit" and "miss" signals
918 # -- for the synchronous blocks
919 # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
921 # req_is_hit <= is_hit;
922 # req_is_miss <= not is_hit;
925 # req_is_miss <= '0';
927 # req_hit_way <= hit_way;
928 # Generate the "hit" and "miss" signals
929 # for the synchronous blocks
930 with m
.If(i_in
.req
& access_ok
& ~flush_in
):
931 comb
+= req_is_hit
.eq(is_hit
)
932 comb
+= req_is_miss
.eq(~is_hit
)
935 comb
+= req_is_hit
.eq(0)
936 comb
+= req_is_miss
.eq(0)
938 # -- The way to replace on a miss
939 # if r.state = CLR_TAG then
941 # to_integer(unsigned(plru_victim(r.store_index)));
943 # replace_way <= r.store_way;
945 # The way to replace on a miss
946 with m
.If(r
.state
== State
.CLR_TAG
):
947 comb
+= replace_way
.eq(plru_victim
[r
.store_index
])
950 comb
+= replace_way
.eq(r
.store_way
)
952 # -- Output instruction from current cache row
954 # -- Note: This is a mild violation of our design principle of
955 # -- having pipeline stages output from a clean latch. In this
956 # -- case we output the result of a mux. The alternative would
957 # -- be output an entire row which I prefer not to do just yet
958 # -- as it would force fetch2 to know about some of the cache
959 # -- geometry information.
960 # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
961 # i_out.valid <= r.hit_valid;
962 # i_out.nia <= r.hit_nia;
963 # i_out.stop_mark <= r.hit_smark;
964 # i_out.fetch_failed <= r.fetch_failed;
965 # Output instruction from current cache row
967 # Note: This is a mild violation of our design principle of
968 # having pipeline stages output from a clean latch. In this
969 # case we output the result of a mux. The alternative would
970 # be output an entire row which I prefer not to do just yet
971 # as it would force fetch2 to know about some of the cache
972 # geometry information.
973 comb
+= i_out
.insn
.eq(
974 read_insn_word(r
.hit_nia
, cache_out
[r
.hit_way
])
976 comb
+= i_out
.valid
.eq(r
.hit_valid
)
977 comb
+= i_out
.nia
.eq(r
.hit_nia
)
978 comb
+= i_out
.stop_mark
.eq(r
.hit_smark
)
979 comb
+= i_out
.fetch_failed
.eq(r
.fetch_failed
)
981 # -- Stall fetch1 if we have a miss on cache or TLB
982 # -- or a protection fault
983 # stall_out <= not (is_hit and access_ok);
984 # Stall fetch1 if we have a miss on cache or TLB
985 # or a protection fault
986 comb
+= stall_out
.eq(~
(is_hit
& access_ok
))
988 # -- Wishbone requests output (from the cache miss reload machine)
989 # wishbone_out <= r.wb;
990 # Wishbone requests output (from the cache miss reload machine)
991 comb
+= wb_out
.eq(r
.wb
)
994 # -- Cache hit synchronous machine
995 # icache_hit : process(clk)
996 # Cache hit synchronous machine
def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
               req_index, req_tag, real_addr):
    """Cache hit synchronous machine (VHDL process ``icache_hit``).

    Adds clocked statements to module ``m`` that latch the outcome of
    the current lookup into ``r`` for use on the following cycle.

    * ``use_previous``, ``req_is_hit``, ``req_hit_way``: lookup results
      for the current request.
    * ``req_index``, ``req_tag``, ``real_addr``: only used for the
      debug trace emitted on a hit.
    """
    sync = m.d.sync

    i_in, stall_in = self.i_in, self.stall_in
    flush_in = self.flush_in

    # Keep outputs to fetch2 unchanged on a stall, except that a flush
    # sets valid to 0.  If use_previous, keep the same data as last
    # cycle and use the second half.
    with m.If(stall_in | use_previous):
        with m.If(flush_in):
            sync += r.hit_valid.eq(0)
    with m.Else():
        # On a hit, latch the request for the next cycle, when the
        # BRAM data will be available on the cache_out output of the
        # corresponding way.
        sync += r.hit_valid.eq(req_is_hit)

        with m.If(req_is_hit):
            sync += r.hit_way.eq(req_hit_way)
            # A Python print() here would run once, at elaboration
            # time, and show Signal objects rather than values; emit
            # the trace from the clocked domain instead.
            sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x "
                            "tag:%x way:%x RA:%x",
                            i_in.nia, i_in.virt_mode, i_in.stop_mark,
                            req_index, req_tag, req_hit_way, real_addr)

    with m.If(~stall_in):
        # Send stop marks and NIA down regardless of validity
        sync += r.hit_smark.eq(i_in.stop_mark)
        sync += r.hit_nia.eq(i_in.nia)
1063 # -- Cache miss/reload synchronous machine
1064 # icache_miss : process(clk)
1065 # Cache miss/reload synchronous machine
def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                req_index, req_laddr, req_tag, replace_way,
                cache_tags, access_ok, real_addr):
    """Cache miss/reload synchronous machine (VHDL ``icache_miss``).

    Adds to module ``m`` the state machine (``r.state``: IDLE ->
    CLR_TAG -> WAIT_ACK -> IDLE) that reloads one cache line over
    wishbone, plus cache-invalidate handling and the ``fetch_failed``
    flag.

    * ``cache_valid_bits``, ``cache_tags``: per-line storage, indexed
      by line number.
    * ``req_is_miss``, ``req_index``, ``req_laddr``, ``req_tag``:
      description of the current request.
    * ``replace_way``: victim way to refill.
    * ``access_ok``: request passed translation/protection checks.
    * ``real_addr``: only used for the debug trace.
    """
    comb = m.d.comb
    sync = m.d.sync

    i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
    stall_in, flush_in = self.stall_in, self.flush_in
    inval_in = self.inval_in

    tagset = Signal(TAG_RAM_WIDTH)
    stbs_zero = Signal()
    stbs_done = Signal()

    # We only ever do reads on wishbone: select all byte lanes
    comb += r.wb.sel.eq(-1)

    # Process cache invalidations: clear every valid bit (VHDL
    # "others => '0'"), not all-but-bit-0.
    with m.If(inval_in):
        for i in range(NUM_LINES):
            sync += cache_valid_bits[i].eq(0)
        sync += r.store_valid.eq(0)

    # Main state machine
    with m.Switch(r.state):

        with m.Case(State.IDLE):
            # Reset per-row valid flags, only used in WAIT_ACK
            for i in range(ROW_PER_LINE):
                sync += r.rows_valid[i].eq(0)

            # We need to read a cache line
            with m.If(req_is_miss):
                sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
                                "way:%x tag:%x RA:%x",
                                i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index,
                                replace_way, req_tag, real_addr)

                # Keep track of our index and way for subsequent stores
                sync += r.store_index.eq(req_index)
                sync += r.store_row.eq(get_row(req_laddr))
                sync += r.store_tag.eq(req_tag)
                sync += r.store_valid.eq(1)
                sync += r.end_row_ix.eq(
                    get_row_of_line(get_row(req_laddr)) - 1)

                # Prep for first wishbone read.  We calculate the
                # address of the start of the cache line and start
                # the WB cycle.  A slice bound must be an integer,
                # so take the low len(r.wb.adr) bits.
                sync += r.wb.adr.eq(req_laddr[:len(r.wb.adr)])
                sync += r.wb.cyc.eq(1)
                sync += r.wb.stb.eq(1)

                # Track that we had one request sent
                sync += r.state.eq(State.CLR_TAG)

        with m.Case(State.CLR_TAG, State.WAIT_ACK):
            with m.If(r.state == State.CLR_TAG):
                # Get victim way from plru
                sync += r.store_way.eq(replace_way)

                # Force misses on that way while reloading that line.
                # replace_way is a Signal, so the bit has to be picked
                # with bit_select(); plain [] only accepts integers.
                sync += cache_valid_bits[req_index].bit_select(
                    replace_way, 1).eq(0)

                # Store new tag in selected way.
                # NOTE(review): assumes write_tag() returns an
                # assignment to the tag field of `tagset` -- confirm
                # against its definition.
                for i in range(NUM_WAYS):
                    with m.If(i == replace_way):
                        comb += tagset.eq(cache_tags[r.store_index])
                        comb += write_tag(i, tagset, r.store_tag)
                        sync += cache_tags[r.store_index].eq(tagset)

                sync += r.state.eq(State.WAIT_ACK)

            # Requests are all sent if stb is 0.  stbs_zero is the raw
            # register test; stbs_done may additionally be raised below
            # in the same cycle.  Gating the branch on stbs_zero (not
            # stbs_done) avoids a combinational loop.
            comb += stbs_zero.eq(r.wb.stb == 0)
            comb += stbs_done.eq(stbs_zero)

            # If we are still sending requests, was one accepted ?
            with m.If(~wb_in.stall & ~stbs_zero):
                # That was the last word ?  We are done sending.
                # Clear stb and set stbs_done so we can handle an
                # eventual last ack on the same cycle.
                with m.If(is_last_row_addr(r.wb.adr, r.end_row_ix)):
                    sync += r.wb.stb.eq(0)
                    comb += stbs_done.eq(1)

                # Calculate the next row address
                sync += r.wb.adr.eq(next_row_addr(r.wb.adr))

            # Incoming acks processing
            with m.If(wb_in.ack):
                # VHDL: r.store_row mod ROW_PER_LINE
                sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

                # Check for completion
                with m.If(stbs_done &
                          is_last_row(r.store_row, r.end_row_ix)):
                    # Complete wishbone cycle
                    sync += r.wb.cyc.eq(0)

                    # Cache line is now valid (unless invalidated
                    # while it was being loaded).
                    # NOTE(review): the VHDL reference indexes with
                    # replace_way here rather than r.store_way.
                    sync += cache_valid_bits[r.store_index].bit_select(
                        replace_way, 1).eq(r.store_valid & ~inval_in)

                    # We are done
                    sync += r.state.eq(State.IDLE)

                # Increment store row counter
                sync += r.store_row.eq(next_row(r.store_row))

    # TLB miss and protection fault processing.
    # NOTE(review): the VHDL also clears on rst; here that relies on
    # the sync domain reset returning r.fetch_failed to its reset
    # value -- confirm that value is 0.
    with m.If(flush_in | m_in.tlbld):
        sync += r.fetch_failed.eq(0)
    with m.Elif(i_in.req & ~access_ok & ~stall_in):
        sync += r.fetch_failed.eq(1)
1339 # icache_log: if LOG_LENGTH > 0 generate
def icache_log(self, m, req_hit_way, ra_valid, access_ok,
               req_is_miss, req_is_hit, lway, wstate, r):
    """Drive the 54-bit debug log output (VHDL ``icache_log`` generate).

    Nothing is added to ``m`` when ``LOG_LENGTH`` is 0.  The ``lway``
    and ``wstate`` arguments are accepted for interface compatibility
    only: local signals of the same names are created and used.
    """
    comb = m.d.comb
    sync = m.d.sync

    wb_in, i_out = self.wb_in, self.i_out
    log_out, stall_out = self.log_out, self.stall_out

    # VHDL: "if LOG_LENGTH > 0 generate" -- one copy of the logic is
    # enough, so this is a condition rather than a loop.
    if LOG_LENGTH > 0:
        # Output data to logger
        log_data = Signal(54)
        # The log layout allots exactly 3 bits to the way number
        # (VHDL: to_unsigned(lway, 3)); wider would shift later fields.
        lway = Signal(3)
        wstate = Signal()

        # lway/wstate are VHDL process variables, i.e. combinatorial.
        # Drive each from one domain only: a signal assigned in both
        # comb and sync is a driver conflict.
        comb += lway.eq(req_hit_way)
        comb += wstate.eq(r.state != State.IDLE)

        # Cat() is LSB-first, so fields are listed in the reverse of
        # the VHDL "&" concatenation order.
        sync += log_data.eq(Cat(
            ra_valid, access_ok, req_is_miss, req_is_hit,
            lway, wstate, r.hit_nia[2:6],
            r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
            r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
            i_out.valid))

        comb += log_out.eq(log_data)
def elaborate(self, platform):
    """Build the icache module.

    Creates the storage arrays and the signals shared between the
    sub-functions, then lets each sub-function add its logic to the
    module, which is returned.
    """
    m = Module()

    # ---- storage -------------------------------------------------
    # Hopefully "cache_rows" is a BRAM, the rest is LUTs
    cache_tags = CacheTagArray()
    cache_valid_bits = CacheValidBitsArray()

    # TODO to be passed to nmigen as ram attributes
    # attribute ram_style of itlb_tags : signal is "distributed";
    # attribute ram_style of itlb_ptes : signal is "distributed";
    itlb_valid_bits = TLBValidBitsArray()
    itlb_tags = TLBTagArray()
    itlb_ptes = TLBPTEArray()

    # ---- internal state ------------------------------------------
    # Privilege bit from PTE EAA field
    eaa_priv = Signal()
    r = RegInternal()

    # ---- async signals on the incoming request -------------------
    req_index = Signal(NUM_LINES)
    req_row = Signal(BRAM_ROWS)
    req_hit_way = Signal(NUM_WAYS)
    req_tag = Signal(TAG_BITS)
    req_is_hit = Signal()
    req_is_miss = Signal()
    req_laddr = Signal(64)

    # ---- translation / permission results ------------------------
    tlb_req_index = Signal(TLB_SIZE)
    real_addr = Signal(REAL_ADDR_BITS)
    ra_valid = Signal()
    priv_fault = Signal()
    access_ok = Signal()
    use_previous = Signal()

    # ---- RAM read data and replacement selection -----------------
    cache_out = CacheRamOut()
    plru_victim = PLRUOut()
    replace_way = Signal(NUM_WAYS)

    # Put everything together: each call below adds its share of the
    # logic to `m`, communicating through the signals created above.
    self.rams(m, r, cache_out, use_previous, replace_way, req_row)
    self.maybe_plrus(m, r, plru_victim)
    self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
                     real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                     priv_fault, access_ok)
    self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
    self.icache_comb(m, use_previous, r, req_index, req_row,
                     req_tag, real_addr, req_laddr, cache_valid_bits,
                     cache_tags, access_ok, req_is_hit, req_is_miss,
                     replace_way, plru_victim, cache_out)
    self.icache_hit(m,
                    use_previous=use_previous,
                    r=r,
                    req_is_hit=req_is_hit,
                    req_hit_way=req_hit_way,
                    req_index=req_index,
                    req_tag=req_tag,
                    real_addr=real_addr)
    self.icache_miss(m,
                     cache_valid_bits=cache_valid_bits,
                     r=r,
                     req_is_miss=req_is_miss,
                     req_index=req_index,
                     req_laddr=req_laddr,
                     req_tag=req_tag,
                     replace_way=replace_way,
                     cache_tags=cache_tags,
                     access_ok=access_ok,
                     real_addr=real_addr)
    #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
    #                req_is_miss, req_is_hit, lway, wstate, r)

    return m
1497 # use ieee.std_logic_1164.all;
1500 # use work.common.all;
1501 # use work.wishbone_types.all;
1503 # entity icache_tb is
1506 # architecture behave of icache_tb is
1507 # signal clk : std_ulogic;
1508 # signal rst : std_ulogic;
1510 # signal i_out : Fetch1ToIcacheType;
1511 # signal i_in : IcacheToDecode1Type;
1513 # signal m_out : MmuToIcacheType;
1515 # signal wb_bram_in : wishbone_master_out;
1516 # signal wb_bram_out : wishbone_slave_out;
1518 # constant clk_period : time := 10 ns;
1520 # icache0: entity work.icache
1534 # wishbone_out => wb_bram_in,
1535 # wishbone_in => wb_bram_out
1538 # -- BRAM Memory slave
1539 # bram0: entity work.wishbone_bram_wrapper
1541 # MEMORY_SIZE => 1024,
1542 # RAM_INIT_FILE => "icache_test.bin"
1547 # wishbone_in => wb_bram_in,
1548 # wishbone_out => wb_bram_out
1551 # clk_process: process
1554 # wait for clk_period/2;
1556 # wait for clk_period/2;
1559 # rst_process: process
1562 # wait for 2*clk_period;
1570 # i_out.nia <= (others => '0');
1571 # i_out.stop_mark <= '0';
1573 # m_out.tlbld <= '0';
1574 # m_out.tlbie <= '0';
1575 # m_out.addr <= (others => '0');
1576 # m_out.pte <= (others => '0');
1578 # wait until rising_edge(clk);
1579 # wait until rising_edge(clk);
1580 # wait until rising_edge(clk);
1581 # wait until rising_edge(clk);
1584 # i_out.nia <= x"0000000000000004";
1586 # wait for 30*clk_period;
1587 # wait until rising_edge(clk);
1589 # assert i_in.valid = '1' severity failure;
1590 # assert i_in.insn = x"00000001"
1591 # report "insn @" & to_hstring(i_out.nia) &
1592 # "=" & to_hstring(i_in.insn) &
1593 # " expected 00000001"
1598 # wait until rising_edge(clk);
1602 # i_out.nia <= x"0000000000000008";
1603 # wait until rising_edge(clk);
1604 # wait until rising_edge(clk);
1605 # assert i_in.valid = '1' severity failure;
1606 # assert i_in.insn = x"00000002"
1607 # report "insn @" & to_hstring(i_out.nia) &
1608 # "=" & to_hstring(i_in.insn) &
1609 # " expected 00000002"
1611 # wait until rising_edge(clk);
1615 # i_out.nia <= x"0000000000000040";
1617 # wait for 30*clk_period;
1618 # wait until rising_edge(clk);
1620 # assert i_in.valid = '1' severity failure;
1621 # assert i_in.insn = x"00000010"
1622 # report "insn @" & to_hstring(i_out.nia) &
1623 # "=" & to_hstring(i_in.insn) &
1624 # " expected 00000010"
1627 # -- test something that aliases
1629 # i_out.nia <= x"0000000000000100";
1630 # wait until rising_edge(clk);
1631 # wait until rising_edge(clk);
1632 # assert i_in.valid = '0' severity failure;
1633 # wait until rising_edge(clk);
1635 # wait for 30*clk_period;
1636 # wait until rising_edge(clk);
1638 # assert i_in.valid = '1' severity failure;
1639 # assert i_in.insn = x"00000040"
1640 # report "insn @" & to_hstring(i_out.nia) &
1641 # "=" & to_hstring(i_in.insn) &
1642 # " expected 00000040"
def icache_sim(dut):
    """Simulation process exercising the icache (port of icache_tb).

    Signal names follow the VHDL testbench, where they are named from
    the testbench's point of view: ``i_out``/``m_out`` are what the
    testbench drives (the DUT's ``i_in``/``m_in`` ports) and ``i_in``
    is what it observes (the DUT's ``i_out`` port).

    A bare ``yield`` waits for one clock; ``yield sig`` reads a value;
    ``yield sig.eq(v)`` drives one.  Raises AssertionError when the
    DUT returns an unexpected instruction or valid flag.
    """
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    def wait_clocks(count):
        for _ in range(count):
            yield

    def check_insn(expected):
        # Values must be read with `yield`; comparing the Signal
        # objects themselves only builds an expression.
        valid = yield i_in.valid
        nia = yield i_out.nia
        insn = yield i_in.insn
        assert valid, "insn @%x not valid" % nia
        assert insn == expected, \
            "insn @%x=%x expected %08x" % (nia, insn, expected)

    # Quiescent inputs (VHDL: others => '0')
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield from wait_clocks(4)

    # miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(0x0000000000000004)
    yield from wait_clocks(31)
    yield from check_insn(0x00000001)
    yield i_out.req.eq(0)
    yield

    # hit
    yield i_out.req.eq(1)
    yield i_out.nia.eq(0x0000000000000008)
    yield from wait_clocks(2)
    yield from check_insn(0x00000002)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(0x0000000000000040)
    yield from wait_clocks(31)
    yield from check_insn(0x00000010)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(0x0000000000000100)
    yield from wait_clocks(2)
    # NOTE(review): the reference testbench expects a miss two clocks
    # after the request; confirm the same latency holds for this port.
    valid = yield i_in.valid
    assert not valid, "aliasing fetch must miss first"
    yield from wait_clocks(32)
    yield from check_insn(0x00000040)
    yield i_out.req.eq(0)
1714 m
.submodules
.icache
= dut
1720 sim
.add_sync_process(wrap(icache_sim(dut
)))
1721 with sim
.write_vcd('test_icache.vcd'):
1724 if __name__
== '__main__':
1726 vl
= rtlil
.convert(dut
, ports
=[])
1727 with
open("test_icache.il", "w") as f
: