icache.py add missing funciton bodies, add missing return statment, fix
[soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
22 from enum import Enum, unique
23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const)
24 from nmigen.cli import main
25 from nmigen.cli import rtlil
26 from nmutil.iocontrol import RecordObject
27 from nmutil.byterev import byte_reverse
28 from nmutil.mask import Mask
29 from nmigen.utils import log2_int
30 from nmutil.util import Display
31
32 from soc.experiment.mem_types import (Fetch1ToICacheType,
33 ICacheToDecode1Type,
34 MMUToICacheType)
35
36 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
37 WB_SEL_BITS, WBAddrType, WBDataType,
38 WBSelType, WBMasterOut, WBSlaveOut,
39 WBMasterOutVector, WBSlaveOutVector,
40 WBIOMasterOut, WBIOSlaveOut)
41
42 from soc.experiment.cache_ram import CacheRam
43 from soc.experiment.plru import PLRU
44
45 # for test
46 from nmigen_soc.wishbone.sram import SRAM
47 from nmigen import Memory
48 from nmigen.cli import rtlil
49 if True:
50 from nmigen.back.pysim import Simulator, Delay, Settle
51 else:
52 from nmigen.sim.cxxsim import Simulator, Delay, Settle
53 from nmutil.util import wrap
54
55
56
57 SIM = 0
58 LINE_SIZE = 64
59 # BRAM organisation: We never access more than wishbone_data_bits
60 # at a time so to save resources we make the array only that wide,
61 # and use consecutive indices to make a cache "line"
62 #
63 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
64 ROW_SIZE = WB_DATA_BITS // 8
65 # Number of lines in a set
66 NUM_LINES = 32
67 # Number of ways
68 NUM_WAYS = 4
69 # L1 ITLB number of entries (direct mapped)
70 TLB_SIZE = 64
71 # L1 ITLB log_2(page_size)
72 TLB_LG_PGSZ = 12
73 # Number of real address bits that we store
74 REAL_ADDR_BITS = 56
75 # Non-zero to enable log data collection
76 LOG_LENGTH = 0
77
78 ROW_SIZE_BITS = ROW_SIZE * 8
79 # ROW_PER_LINE is the number of row
80 # (wishbone) transactions in a line
81 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
82 # BRAM_ROWS is the number of rows in
83 # BRAM needed to represent the full icache
84 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
85 # INSN_PER_ROW is the number of 32bit
86 # instructions per BRAM row
87 INSN_PER_ROW = ROW_SIZE_BITS // 32
88
89 # Bit fields counts in the address
90 #
91 # INSN_BITS is the number of bits to
92 # select an instruction in a row
93 INSN_BITS = log2_int(INSN_PER_ROW)
94 # ROW_BITS is the number of bits to
95 # select a row
96 ROW_BITS = log2_int(BRAM_ROWS)
97 # ROW_LINEBITS is the number of bits to
98 # select a row within a line
99 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
100 # LINE_OFF_BITS is the number of bits for
101 # the offset in a cache line
102 LINE_OFF_BITS = log2_int(LINE_SIZE)
103 # ROW_OFF_BITS is the number of bits for
104 # the offset in a row
105 ROW_OFF_BITS = log2_int(ROW_SIZE)
106 # INDEX_BITS is the number of bits to
107 # select a cache line
108 INDEX_BITS = log2_int(NUM_LINES)
109 # SET_SIZE_BITS is the log base 2 of
110 # the set size
111 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
112 # TAG_BITS is the number of bits of
113 # the tag part of the address
114 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
115 # WAY_BITS is the number of bits to
116 # select a way
117 WAY_BITS = log2_int(NUM_WAYS)
118 TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
119
120 # -- L1 ITLB.
121 # constant TLB_BITS : natural := log2(TLB_SIZE);
122 # constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
123 # constant TLB_PTE_BITS : natural := 64;
124 TLB_BITS = log2_int(TLB_SIZE)
125 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
126 TLB_PTE_BITS = 64
127
128 # architecture rtl of icache is
129 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
130 #-- ROW_PER_LINE is the number of row (wishbone
131 #-- transactions) in a line
132 #constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
133 #-- BRAM_ROWS is the number of rows in BRAM
134 #-- needed to represent the full
135 #-- icache
136 #constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
137 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
138 #constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
139 #-- Bit fields counts in the address
140 #
141 #-- INSN_BITS is the number of bits to select
142 #-- an instruction in a row
143 #constant INSN_BITS : natural := log2(INSN_PER_ROW);
144 #-- ROW_BITS is the number of bits to select a row
145 #constant ROW_BITS : natural := log2(BRAM_ROWS);
146 #-- ROW_LINEBITS is the number of bits to
147 #-- select a row within a line
148 #constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
149 #-- LINE_OFF_BITS is the number of bits for the offset
150 #-- in a cache line
151 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
152 #-- ROW_OFF_BITS is the number of bits for the offset in a row
153 #constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
154 #-- INDEX_BITS is the number of bits to select a cache line
155 #constant INDEX_BITS : natural := log2(NUM_LINES);
156 #-- SET_SIZE_BITS is the log base 2 of the set size
157 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
158 #-- TAG_BITS is the number of bits of the tag part of the address
159 #constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
160 #-- WAY_BITS is the number of bits to select a way
161 #constant WAY_BITS : natural := log2(NUM_WAYS);
162
163 #-- Example of layout for 32 lines of 64 bytes:
164 #--
165 #-- .. tag |index| line |
166 #-- .. | row | |
167 #-- .. | | | |00| zero (2)
168 #-- .. | | |-| | INSN_BITS (1)
169 #-- .. | |---| | ROW_LINEBITS (3)
170 #-- .. | |--- - --| LINE_OFF_BITS (6)
171 #-- .. | |- --| ROW_OFF_BITS (3)
172 #-- .. |----- ---| | ROW_BITS (8)
173 #-- .. |-----| | INDEX_BITS (5)
174 #-- .. --------| | TAG_BITS (53)
175 # Example of layout for 32 lines of 64 bytes:
176 #
177 # .. tag |index| line |
178 # .. | row | |
179 # .. | | | |00| zero (2)
180 # .. | | |-| | INSN_BITS (1)
181 # .. | |---| | ROW_LINEBITS (3)
182 # .. | |--- - --| LINE_OFF_BITS (6)
183 # .. | |- --| ROW_OFF_BITS (3)
184 # .. |----- ---| | ROW_BITS (8)
185 # .. |-----| | INDEX_BITS (5)
186 # .. --------| | TAG_BITS (53)
187
188 #subtype row_t is integer range 0 to BRAM_ROWS-1;
189 #subtype index_t is integer range 0 to NUM_LINES-1;
190 #subtype way_t is integer range 0 to NUM_WAYS-1;
191 #subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
192 #
193 #-- The cache data BRAM organized as described above for each way
194 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
195 #
196 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
197 #-- not handle a clean (commented) definition of the cache tags as a 3d
198 #-- memory. For now, work around it by putting all the tags
199 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
200 # type cache_tags_set_t is array(way_t) of cache_tag_t;
201 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
202 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
203 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
204 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    """Build the cache tag storage.

    Returns an Array holding NUM_LINES signals, each TAG_RAM_WIDTH bits
    wide (TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS, i.e. one tag slot per way).
    """
    tag_rows = [Signal(TAG_RAM_WIDTH) for _ in range(NUM_LINES)]
    return Array(tag_rows)
207
208 #-- The cache valid bits
209 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
210 #type cache_valids_t is array(index_t) of cache_way_valids_t;
211 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    """Build the cache valid-bit storage.

    Returns an Array holding NUM_LINES signals, each NUM_WAYS bits wide
    (one valid bit per way of the line).
    """
    valid_rows = [Signal(NUM_WAYS) for _ in range(NUM_LINES)]
    return Array(valid_rows)
214
def RowPerLineValidArray():
    """Build the per-row valid flags of a line being reloaded.

    Returns an Array of ROW_PER_LINE single-bit signals.
    """
    row_flags = [Signal() for _ in range(ROW_PER_LINE)]
    return Array(row_flags)
217
218
219 #attribute ram_style : string;
220 #attribute ram_style of cache_tags : signal is "distributed";
221 # TODO to be passed to nmigen as ram attributes
222 # attribute ram_style : string;
223 # attribute ram_style of cache_tags : signal is "distributed";
224
225
226 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
227 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
228 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
229 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
230 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
231 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """Build the iTLB valid bits: an Array of TLB_SIZE single-bit signals."""
    valid_flags = [Signal() for _ in range(TLB_SIZE)]
    return Array(valid_flags)
234
def TLBTagArray():
    """Build the iTLB tag storage.

    Returns an Array of TLB_SIZE signals, each TLB_EA_TAG_BITS wide.
    """
    tag_entries = [Signal(TLB_EA_TAG_BITS) for _ in range(TLB_SIZE)]
    return Array(tag_entries)
237
def TLBPTEArray():
    """Build the iTLB PTE storage.

    Returns an Array of TLB_SIZE signals, each TLB_PTE_BITS wide.
    """
    pte_entries = [Signal(TLB_PTE_BITS) for _ in range(TLB_SIZE)]
    return Array(pte_entries)
240
241
242 #-- Cache RAM interface
243 #type cache_ram_out_t is array(way_t) of cache_row_t;
244 # Cache RAM interface
def CacheRamOut():
    """Build the cache RAM read-data bundle.

    Returns an Array of NUM_WAYS signals, each ROW_SIZE_BITS wide
    (one BRAM row per way).
    """
    way_rows = [Signal(ROW_SIZE_BITS) for _ in range(NUM_WAYS)]
    return Array(way_rows)
247
248 #-- PLRU output interface
249 #type plru_out_t is array(index_t) of
250 # std_ulogic_vector(WAY_BITS-1 downto 0);
251 # PLRU output interface
def PLRUOut():
    """Build the PLRU victim outputs.

    Returns an Array of NUM_LINES signals, each WAY_BITS wide
    (one way number per cache line).
    """
    victims = [Signal(WAY_BITS) for _ in range(NUM_LINES)]
    return Array(victims)
254
255 # -- Return the cache line index (tag index) for an address
256 # function get_index(addr: std_ulogic_vector(63 downto 0))
257 # return index_t is
258 # begin
259 # return to_integer(unsigned(
260 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
261 # ));
262 # end;
263 # Return the cache line index (tag index) for an address
def get_index(addr):
    """Return the cache line index (tag index) field of ``addr``.

    The field spans bits LINE_OFF_BITS (inclusive) up to SET_SIZE_BITS
    (exclusive).
    """
    first_bit, end_bit = LINE_OFF_BITS, SET_SIZE_BITS
    return addr[first_bit:end_bit]
266
267 # -- Return the cache row index (data memory) for an address
268 # function get_row(addr: std_ulogic_vector(63 downto 0))
269 # return row_t is
270 # begin
271 # return to_integer(unsigned(
272 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
273 # ));
274 # end;
275 # Return the cache row index (data memory) for an address
def get_row(addr):
    """Return the cache row index (data memory) field of ``addr``.

    The field spans bits ROW_OFF_BITS (inclusive) up to SET_SIZE_BITS
    (exclusive).
    """
    first_bit, end_bit = ROW_OFF_BITS, SET_SIZE_BITS
    return addr[first_bit:end_bit]
278
279 # -- Return the index of a row within a line
280 # function get_row_of_line(row: row_t) return row_in_line_t is
281 # variable row_v : unsigned(ROW_BITS-1 downto 0);
282 # begin
283 # row_v := to_unsigned(row, ROW_BITS);
284 # return row_v(ROW_LINEBITS-1 downto 0);
285 # end;
286 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of a row within its line.

    This is the low ROW_LINE_BITS bits of ``row``.
    """
    return row[0:ROW_LINE_BITS]
289
290 # -- Returns whether this is the last row of a line
291 # function is_last_row_addr(addr: wishbone_addr_type;
292 # last: row_in_line_t
293 # )
294 # return boolean is
295 # begin
296 # return unsigned(
297 # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
298 # ) = last;
299 # end;
300 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """Return whether ``addr`` points at row ``last`` of its line.

    Compares the row-in-line field of the address (bits ROW_OFF_BITS up
    to, but excluding, LINE_OFF_BITS) with ``last``.
    """
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
303
304 # -- Returns whether this is the last row of a line
305 # function is_last_row(row: row_t;
306 # last: row_in_line_t) return boolean is
307 # begin
308 # return get_row_of_line(row) = last;
309 # end;
310 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """Return whether ``row`` is row ``last`` of its line.

    Compares the low ROW_LINE_BITS bits of ``row`` with ``last``.
    """
    row_in_line = row[:ROW_LINE_BITS]
    return row_in_line == last
313
314 # -- Return the address of the next row in the current cache line
315 # function next_row_addr(addr: wishbone_addr_type)
316 # return std_ulogic_vector is
317 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
318 # variable result : wishbone_addr_type;
319 # begin
320 # -- Is there no simpler way in VHDL to generate that 3 bits adder ?
321 # row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
322 # row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
323 # result := addr;
324 # result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
325 # return result;
326 # end;
327 # Return the address of the next row in the current cache line
def next_row_addr(addr):
    """Advance the row-in-line field of ``addr`` by one.

    Only bits ROW_OFF_BITS up to (excluding) LINE_OFF_BITS take part in
    the addition, which keeps the adder as narrow as the row index.

    Returns whatever ``.eq`` on that bit-field returns, i.e. an assignment
    of the incremented field back into ``addr`` rather than a new address.
    NOTE(review): the VHDL reference returns the updated address as a
    value; confirm that callers expect a statement here.
    """
    row_field = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_field.eq(row_field + 1)
331
332 # -- Return the next row in the current cache line. We use a dedicated
333 # -- function in order to limit the size of the generated adder to be
334 # -- only the bits within a cache line (3 bits with default settings)
335 # function next_row(row: row_t) return row_t is
336 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
337 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
338 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
339 # begin
340 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
341 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
342 # row_v(ROW_LINEBITS-1 downto 0) :=
343 # std_ulogic_vector(unsigned(row_idx) + 1);
344 # return to_integer(unsigned(row_v));
345 # end;
346 # Return the next row in the current cache line. We use a dedicated
347 # function in order to limit the size of the generated adder to be
348 # only the bits within a cache line (3 bits with default settings)
def next_row(row):
    """Advance the row-in-line part of ``row`` by one.

    Only the low ROW_LINE_BITS bits take part in the addition, limiting
    the adder to the bits within a cache line.

    Returns whatever ``.eq`` on that bit-field returns, i.e. an assignment
    of the incremented field back into ``row`` rather than a new row.
    NOTE(review): the VHDL reference returns the new row as a value;
    confirm that callers expect a statement here.
    """
    low_bits = row[:ROW_LINE_BITS]
    return low_bits.eq(low_bits + 1)
352
353 # -- Read the instruction word for the given address in the
354 # -- current cache row
355 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
356 # data: cache_row_t) return std_ulogic_vector is
357 # variable word: integer range 0 to INSN_PER_ROW-1;
358 # begin
359 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
360 # return data(31+word*32 downto word*32);
361 # end;
362 # Read the instruction word for the given address
363 # in the current cache row
def read_insn_word(addr, data):
    """Read the instruction word for ``addr`` from the cache row ``data``.

    The word selector is address bits 2 .. INSN_BITS+1, exactly INSN_BITS
    bits wide (VHDL: ``addr(INSN_BITS+2-1 downto 2)``). Bits 1:0 are
    skipped because instructions are 4-byte aligned.

    Returns the 32-bit slice of ``data`` chosen by that selector.
    """
    # Upper bound is INSN_BITS + 2: one more bit would index past the
    # INSN_PER_ROW words actually present in a row.
    word = addr[2:INSN_BITS + 2]
    return data.word_select(word, 32)
367
368 # -- Get the tag value from the address
369 # function get_tag(
370 # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
371 # )
372 # return cache_tag_t is
373 # begin
374 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
375 # end;
376 # Get the tag value from the address
def get_tag(addr):
    """Return the tag field of a real address.

    The tag spans bits SET_SIZE_BITS (inclusive) up to REAL_ADDR_BITS
    (exclusive).
    """
    first_bit, end_bit = SET_SIZE_BITS, REAL_ADDR_BITS
    return addr[first_bit:end_bit]
379
380 # -- Read a tag from a tag memory row
381 # function read_tag(way: way_t; tagset: cache_tags_set_t)
382 # return cache_tag_t is
383 # begin
384 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
385 # end;
386 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Read the tag of ``way`` from a tag memory row.

    Tags are packed back to back in ``tagset``; way ``w`` occupies bits
    ``w * TAG_BITS`` up to (excluding) ``(w + 1) * TAG_BITS``.
    """
    first_bit = way * TAG_BITS
    end_bit = (way + 1) * TAG_BITS
    return tagset[first_bit:end_bit]
389
390 # -- Write a tag to tag memory row
391 # procedure write_tag(way: in way_t;
392 # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
393 # begin
394 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
395 # end;
396 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Write ``tag`` into the slot of ``way`` in a tag memory row.

    Way ``w`` occupies bits ``w * TAG_BITS`` up to (excluding)
    ``(w + 1) * TAG_BITS`` of ``tagset`` (same layout as read_tag).

    Returns the assignment produced by ``.eq`` on that slot, so the
    caller can add it to a domain (e.g. ``sync += write_tag(...)``).
    Plain item assignment builds no HDL statement and returned nothing.
    """
    first_bit = way * TAG_BITS
    end_bit = (way + 1) * TAG_BITS
    return tagset[first_bit:end_bit].eq(tag)
399
400 # -- Simple hash for direct-mapped TLB index
401 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
402 # return tlb_index_t is
403 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
404 # begin
405 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
406 # xor addr(
407 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
408 # TLB_LG_PGSZ + TLB_BITS
409 # )
410 # xor addr(
411 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
412 # TLB_LG_PGSZ + 2 * TLB_BITS
413 # );
414 # return to_integer(unsigned(hash));
415 # end;
416 # Simple hash for direct-mapped TLB index
def hash_ea(addr):
    """Simple hash producing the direct-mapped TLB index for ``addr``.

    XORs together the three consecutive TLB_BITS-wide fields of the
    address that start at bit TLB_LG_PGSZ.
    """
    base = TLB_LG_PGSZ
    fields = [
        addr[base + k * TLB_BITS:base + (k + 1) * TLB_BITS]
        for k in range(3)
    ]
    return fields[0] ^ fields[1] ^ fields[2]
424
425 # begin
426 #
427 # assert LINE_SIZE mod ROW_SIZE = 0;
428 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
429 # severity FAILURE;
430 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
431 # severity FAILURE;
432 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
433 # severity FAILURE;
434 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
435 # severity FAILURE;
436 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
437 # report "geometry bits don't add up" severity FAILURE;
438 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
439 # report "geometry bits don't add up" severity FAILURE;
440 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
441 # report "geometry bits don't add up" severity FAILURE;
442 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
443 # report "geometry bits don't add up" severity FAILURE;
444 #
445 # sim_debug: if SIM generate
446 # debug: process
447 # begin
448 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
449 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
450 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
451 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
452 # report "INSN_BITS = " & natural'image(INSN_BITS);
453 # report "ROW_BITS = " & natural'image(ROW_BITS);
454 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
455 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
456 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
457 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
458 # report "TAG_BITS = " & natural'image(TAG_BITS);
459 # report "WAY_BITS = " & natural'image(WAY_BITS);
460 # wait;
461 # end process;
462 # end generate;
463
464 # Cache reload state machine
@unique
class State(Enum):
    """States of the cache reload state machine."""
    IDLE = 0      # VHDL reference name: IDLE
    CLR_TAG = 1   # VHDL reference name: CLR_TAG
    WAIT_ACK = 2  # VHDL reference name: WAIT_ACK
470
471 # type reg_internal_t is record
472 # -- Cache hit state (Latches for 1 cycle BRAM access)
473 # hit_way : way_t;
474 # hit_nia : std_ulogic_vector(63 downto 0);
475 # hit_smark : std_ulogic;
476 # hit_valid : std_ulogic;
477 #
478 # -- Cache miss state (reload state machine)
479 # state : state_t;
480 # wb : wishbone_master_out;
481 # store_way : way_t;
482 # store_index : index_t;
483 # store_row : row_t;
484 # store_tag : cache_tag_t;
485 # store_valid : std_ulogic;
486 # end_row_ix : row_in_line_t;
487 # rows_valid : row_per_line_valid_t;
488 #
489 # -- TLB miss state
490 # fetch_failed : std_ulogic;
491 # end record;
class RegInternal(RecordObject):
    """Internal registered state of the icache (VHDL ``reg_internal_t``).

    Index-like fields are sized by the number of *bits* needed to hold
    the index (WAY_BITS, INDEX_BITS, ROW_BITS), matching the VHDL integer
    ranges way_t / index_t / row_t, not by the number of elements.
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)       # way_t: 0..NUM_WAYS-1
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.wb = WBMasterOut()
        self.store_way = Signal(WAY_BITS)     # way_t: 0..NUM_WAYS-1
        self.store_index = Signal(INDEX_BITS)  # index_t: 0..NUM_LINES-1
        self.store_row = Signal(ROW_BITS)     # row_t: 0..BRAM_ROWS-1
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()
514
515 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
516 #
517 # entity icache is
518 # generic (
519 # SIM : boolean := false;
520 # -- Line size in bytes
521 # LINE_SIZE : positive := 64;
522 # -- BRAM organisation: We never access more
523 # -- than wishbone_data_bits
524 # -- at a time so to save resources we make the
525 # -- array only that wide,
526 # -- and use consecutive indices for to make a cache "line"
527 # --
528 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
529 # -- so 64-bits)
530 # ROW_SIZE : positive := wishbone_data_bits / 8;
531 # -- Number of lines in a set
532 # NUM_LINES : positive := 32;
533 # -- Number of ways
534 # NUM_WAYS : positive := 4;
535 # -- L1 ITLB number of entries (direct mapped)
536 # TLB_SIZE : positive := 64;
537 # -- L1 ITLB log_2(page_size)
538 # TLB_LG_PGSZ : positive := 12;
539 # -- Number of real address bits that we store
540 # REAL_ADDR_BITS : positive := 56;
541 # -- Non-zero to enable log data collection
542 # LOG_LENGTH : natural := 0
543 # );
544 # port (
545 # clk : in std_ulogic;
546 # rst : in std_ulogic;
547 #
548 # i_in : in Fetch1ToIcacheType;
549 # i_out : out IcacheToDecode1Type;
550 #
551 # m_in : in MmuToIcacheType;
552 #
553 # stall_in : in std_ulogic;
554 # stall_out : out std_ulogic;
555 # flush_in : in std_ulogic;
556 # inval_in : in std_ulogic;
557 #
558 # wishbone_out : out wishbone_master_out;
559 # wishbone_in : in wishbone_slave_out;
560 #
561 # log_out : out std_ulogic_vector(53 downto 0)
562 # );
563 # end entity icache;
564 # 64 bit set associative icache (NUM_WAYS ways). All instructions are 4B aligned.
565 class ICache(Elaboratable):
566 """64 bit set associative icache. All instructions are 4B aligned."""
def __init__(self):
    """Create the icache ports (mirrors the VHDL entity port list)."""
    # Fetch1 request in, decode1 response out
    self.i_in = Fetch1ToICacheType()
    self.i_out = ICacheToDecode1Type()

    # Input from the MMU
    self.m_in = MMUToICacheType()

    # Single-bit control inputs/outputs
    self.stall_in = Signal()
    self.stall_out = Signal()
    self.flush_in = Signal()
    self.inval_in = Signal()

    # Wishbone master interface
    self.wb_out = WBMasterOut()
    self.wb_in = WBSlaveOut()

    # 54-bit log output
    self.log_out = Signal(54)
582
583
584 # -- Generate a cache RAM for each way
585 # rams: for i in 0 to NUM_WAYS-1 generate
586 # signal do_read : std_ulogic;
587 # signal do_write : std_ulogic;
588 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
589 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
590 # signal dout : cache_row_t;
591 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
592 # begin
593 # way: entity work.cache_ram
594 # generic map (
595 # ROW_BITS => ROW_BITS,
596 # WIDTH => ROW_SIZE_BITS
597 # )
598 # port map (
599 # clk => clk,
600 # rd_en => do_read,
601 # rd_addr => rd_addr,
602 # rd_data => dout,
603 # wr_sel => wr_sel,
604 # wr_addr => wr_addr,
605 # wr_data => wishbone_in.dat
606 # );
607 # process(all)
608 # begin
609 # do_read <= not (stall_in or use_previous);
610 # do_write <= '0';
611 # if wishbone_in.ack = '1' and replace_way = i then
612 # do_write <= '1';
613 # end if;
614 # cache_out(i) <= dout;
615 # rd_addr <=
616 # std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
617 # wr_addr <=
618 # std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
619 # for i in 0 to ROW_SIZE-1 loop
620 # wr_sel(i) <= do_write;
621 # end loop;
622 # end process;
623 # end generate;
def rams(self, m, r, cache_out, use_previous, replace_way, req_row):
    """Generate one cache data RAM per way and wire up its ports.

    Parameters:
        m: the Module being built; each RAM is added to m.submodules.
        r: internal register state; ``r.store_row`` is the write address.
        cache_out: per-way read data, driven here from each RAM.
        use_previous: when set (or when stalled), the read is suppressed.
        replace_way: way written when the wishbone acknowledges.
        req_row: row address used for reads.

    Every way gets its own control/address/data signals, as in the VHDL
    ``generate`` loop. Sharing a single set across ways would give
    ``do_write`` several drivers with different conditions.
    """
    comb = m.d.comb

    wb_in, stall_in = self.wb_in, self.stall_in

    for i in range(NUM_WAYS):
        do_read = Signal(name="do_rd_%d" % i)
        do_write = Signal(name="do_wr_%d" % i)
        rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
        wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
        d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
        wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)

        way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
        # The RAM must be registered as a submodule or it is never
        # elaborated into the design.
        setattr(m.submodules, "cacheram_%d" % i, way)

        comb += way.rd_en.eq(do_read)
        comb += way.rd_addr.eq(rd_addr)
        # rd_data_o is the RAM's output: it drives d_out, not the
        # other way round.
        comb += d_out.eq(way.rd_data_o)
        comb += way.wr_sel.eq(wr_sel)
        comb += way.wr_addr.eq(wr_addr)
        comb += way.wr_data.eq(wb_in.dat)

        comb += do_read.eq(~(stall_in | use_previous))
        # Write only the way selected for replacement, on wishbone ack
        comb += do_write.eq(wb_in.ack & (replace_way == i))

        comb += cache_out[i].eq(d_out)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)
        # All byte lanes of the row are written together
        for j in range(ROW_SIZE):
            comb += wr_sel[j].eq(do_write)
656
657 # -- Generate PLRUs
658 # maybe_plrus: if NUM_WAYS > 1 generate
659 # begin
660 # plrus: for i in 0 to NUM_LINES-1 generate
661 # -- PLRU interface
662 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
663 # signal plru_acc_en : std_ulogic;
664 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
665 #
666 # begin
667 # plru : entity work.plru
668 # generic map (
669 # BITS => WAY_BITS
670 # )
671 # port map (
672 # clk => clk,
673 # rst => rst,
674 # acc => plru_acc,
675 # acc_en => plru_acc_en,
676 # lru => plru_out
677 # );
678 #
679 # process(all)
680 # begin
681 # -- PLRU interface
682 # if get_index(r.hit_nia) = i then
683 # plru_acc_en <= r.hit_valid;
684 # else
685 # plru_acc_en <= '0';
686 # end if;
687 # plru_acc <=
688 # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
689 # plru_victim(i) <= plru_out;
690 # end process;
691 # end generate;
692 # end generate;
def maybe_plrus(self, m, r, plru_victim):
    """Generate one PLRU per cache line when there is more than one way.

    Parameters:
        m: the Module being built; each PLRU is added to m.submodules.
        r: internal register state; ``r.hit_nia``, ``r.hit_valid`` and
           ``r.hit_way`` describe the latched hit.
        plru_victim: per-line victim way, driven from each PLRU's lru_o.
    """
    # NUM_WAYS is a Python constant, so this is decided at elaboration
    # time (VHDL: "if NUM_WAYS > 1 generate"), not with a hardware m.If.
    if NUM_WAYS <= 1:
        return

    comb = m.d.comb

    for i in range(NUM_LINES):
        plru = PLRU(WAY_BITS)
        # Register the PLRU so that it is elaborated into the design
        setattr(m.submodules, "plru_%d" % i, plru)

        # PLRU interface: record an access only when the latched hit
        # falls on this line.
        with m.If(get_index(r.hit_nia) == i):
            comb += plru.acc_en.eq(r.hit_valid)

        with m.Else():
            comb += plru.acc_en.eq(0)

        comb += plru.acc_i.eq(r.hit_way)
        # lru_o is the PLRU's output: read it, never drive it
        comb += plru_victim[i].eq(plru.lru_o)
715
716 # -- TLB hit detection and real address generation
717 # itlb_lookup : process(all)
718 # variable pte : tlb_pte_t;
719 # variable ttag : tlb_tag_t;
720 # begin
721 # tlb_req_index <= hash_ea(i_in.nia);
722 # pte := itlb_ptes(tlb_req_index);
723 # ttag := itlb_tags(tlb_req_index);
724 # if i_in.virt_mode = '1' then
725 # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
726 # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
727 # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
728 # ra_valid <= itlb_valids(tlb_req_index);
729 # else
730 # ra_valid <= '0';
731 # end if;
732 # eaa_priv <= pte(3);
733 # else
734 # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
735 # ra_valid <= '1';
736 # eaa_priv <= '1';
737 # end if;
738 #
739 # -- no IAMR, so no KUEP support for now
740 # priv_fault <= eaa_priv and not i_in.priv_mode;
741 # access_ok <= ra_valid and not priv_fault;
742 # end process;
743 # TLB hit detection and real address generation
def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                priv_fault, access_ok):
    """TLB hit detection and real address generation.

    Drives (combinatorially):
        tlb_req_index: hash of ``i_in.nia`` selecting the TLB entry.
        real_addr: in virtual mode, the PTE bits TLB_LG_PGSZ and up
            joined with the page offset of ``i_in.nia``; otherwise the
            low REAL_ADDR_BITS of ``i_in.nia``.
        ra_valid: in virtual mode, the entry's valid bit if its tag
            matches, else 0; always 1 in real mode.
        eaa_priv: PTE bit 3 in virtual mode, 1 in real mode.
        priv_fault: ``eaa_priv & ~i_in.priv_mode``.
        access_ok: ``ra_valid & ~priv_fault``.
    """
    comb = m.d.comb

    i_in = self.i_in

    pte = Signal(TLB_PTE_BITS)
    ttag = Signal(TLB_EA_TAG_BITS)

    comb += tlb_req_index.eq(hash_ea(i_in.nia))
    comb += pte.eq(itlb_ptes[tlb_req_index])
    comb += ttag.eq(itlb_tags[tlb_req_index])

    with m.If(i_in.virt_mode):
        # Cat() is LSB first: page offset, then the PTE's page number
        comb += real_addr.eq(Cat(
            i_in.nia[:TLB_LG_PGSZ],
            pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
        ))

        with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
            comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

        with m.Else():
            comb += ra_valid.eq(0)

        # VHDL: eaa_priv <= pte(3). Without this the privilege bit of
        # the PTE is ignored in virtual mode.
        comb += eaa_priv.eq(pte[3])

    with m.Else():
        comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
        comb += ra_valid.eq(1)
        comb += eaa_priv.eq(1)

    # No IAMR, so no KUEP support for now
    comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
    comb += access_ok.eq(ra_valid & ~priv_fault)
778
779 # -- iTLB update
780 # itlb_update: process(clk)
781 # variable wr_index : tlb_index_t;
782 # begin
783 # if rising_edge(clk) then
784 # wr_index := hash_ea(m_in.addr);
785 # if rst = '1' or
786 # (m_in.tlbie = '1' and m_in.doall = '1') then
787 # -- clear all valid bits
788 # for i in tlb_index_t loop
789 # itlb_valids(i) <= '0';
790 # end loop;
791 # elsif m_in.tlbie = '1' then
792 # -- clear entry regardless of hit or miss
793 # itlb_valids(wr_index) <= '0';
794 # elsif m_in.tlbld = '1' then
795 # itlb_tags(wr_index) <=
796 # m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
797 # itlb_ptes(wr_index) <= m_in.pte;
798 # itlb_valids(wr_index) <= '1';
799 # end if;
800 # end if;
801 # end process;
802 # iTLB update
def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
    """iTLB update: invalidate or load entries on MMU request.

    Priority, highest first:
        1. ``m_in.tlbie & m_in.doall``: clear every valid bit.
        2. ``m_in.tlbie``: clear the valid bit of the addressed entry.
        3. ``m_in.tlbld``: store tag and PTE, and mark the entry valid.

    The entry is selected by ``hash_ea(m_in.addr)``.
    """
    comb = m.d.comb
    sync = m.d.sync

    m_in = self.m_in

    # An index into TLB_SIZE entries needs TLB_BITS bits (the width of
    # the hash), not TLB_SIZE bits.
    wr_index = Signal(TLB_BITS)
    comb += wr_index.eq(hash_ea(m_in.addr))

    with m.If(m_in.tlbie & m_in.doall):
        # Clear all valid bits
        for i in range(TLB_SIZE):
            sync += itlb_valid_bits[i].eq(0)

    with m.Elif(m_in.tlbie):
        # Clear entry regardless of hit or miss
        sync += itlb_valid_bits[wr_index].eq(0)

    with m.Elif(m_in.tlbld):
        sync += itlb_tags[wr_index].eq(
            m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
        )
        sync += itlb_ptes[wr_index].eq(m_in.pte)
        sync += itlb_valid_bits[wr_index].eq(1)
827
828 # -- Cache hit detection, output to fetch2 and other misc logic
829 # icache_comb : process(all)
830 # Cache hit detection, output to fetch2 and other misc logic
831 def icache_comb(self, m, use_previous, r, req_index, req_row,
832 req_tag, real_addr, req_laddr, cache_valid_bits,
833 cache_tags, access_ok, req_is_hit,
834 req_is_miss, replace_way, plru_victim, cache_out):
835 # variable is_hit : std_ulogic;
836 # variable hit_way : way_t;
837 comb = m.d.comb
838
839 i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
840 flush_in, stall_out = self.flush_in, self.stall_out
841
842 is_hit = Signal()
843 hit_way = Signal(NUM_WAYS)
844 # begin
845 # -- i_in.sequential means that i_in.nia this cycle
846 # -- is 4 more than last cycle. If we read more
847 # -- than 32 bits at a time, had a cache hit last
848 # -- cycle, and we don't want the first 32-bit chunk
849 # -- then we can keep the data we read last cycle
850 # -- and just use that.
851 # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
852 # use_previous <= i_in.sequential and r.hit_valid;
853 # else
854 # use_previous <= '0';
855 # end if;
856 # i_in.sequential means that i_in.nia this cycle is 4 more than
857 # last cycle. If we read more than 32 bits at a time, had a
858 # cache hit last cycle, and we don't want the first 32-bit chunk
859 # then we can keep the data we read last cycle and just use that.
860 with m.If(i_in.nia[2:INSN_BITS+2] != 0):
861 comb += use_previous.eq(i_in.sequential & r.hit_valid)
862
863 with m.Else():
864 comb += use_previous.eq(0)
865
866 # -- Extract line, row and tag from request
867 # req_index <= get_index(i_in.nia);
868 # req_row <= get_row(i_in.nia);
869 # req_tag <= get_tag(real_addr);
870 # Extract line, row and tag from request
871 comb += req_index.eq(get_index(i_in.nia))
872 comb += req_row.eq(get_row(i_in.nia))
873 comb += req_tag.eq(get_tag(real_addr))
874
875 # -- Calculate address of beginning of cache row, will be
876 # -- used for cache miss processing if needed
877 # req_laddr <=
878 # (63 downto REAL_ADDR_BITS => '0') &
879 # real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
880 # (ROW_OFF_BITS-1 downto 0 => '0');
881 # Calculate address of beginning of cache row, will be
882 # used for cache miss processing if needed
883 comb += req_laddr.eq(Cat(
884 Const(0b0, ROW_OFF_BITS),
885 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
886 Const(0, REAL_ADDR_BITS)
887 ))
888
889 # -- Test if pending request is a hit on any way
890 # hit_way := 0;
891 # is_hit := '0';
892 # for i in way_t loop
893 # if i_in.req = '1' and
894 # (cache_valids(req_index)(i) = '1' or
895 # (r.state = WAIT_ACK and
896 # req_index = r.store_index and
897 # i = r.store_way and
898 # r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
899 # if read_tag(i, cache_tags(req_index)) = req_tag then
900 # hit_way := i;
901 # is_hit := '1';
902 # end if;
903 # end if;
904 # end loop;
905 # Test if pending request is a hit on any way
906 for i in range(NUM_WAYS):
907 with m.If(i_in.req &
908 (cache_valid_bits[req_index][i] |
909 ((r.state == State.WAIT_ACK)
910 & (req_index == r.store_index)
911 & (i == r.store_way)
912 & r.rows_valid[req_row % ROW_PER_LINE]))):
913 with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
914 comb += hit_way.eq(i)
915 comb += is_hit.eq(1)
916
917 # -- Generate the "hit" and "miss" signals
918 # -- for the synchronous blocks
919 # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
920 # and rst = '0' then
921 # req_is_hit <= is_hit;
922 # req_is_miss <= not is_hit;
923 # else
924 # req_is_hit <= '0';
925 # req_is_miss <= '0';
926 # end if;
927 # req_hit_way <= hit_way;
928 # Generate the "hit" and "miss" signals
929 # for the synchronous blocks
930 with m.If(i_in.req & access_ok & ~flush_in):
931 comb += req_is_hit.eq(is_hit)
932 comb += req_is_miss.eq(~is_hit)
933
934 with m.Else():
935 comb += req_is_hit.eq(0)
936 comb += req_is_miss.eq(0)
937
938 # -- The way to replace on a miss
939 # if r.state = CLR_TAG then
940 # replace_way <=
941 # to_integer(unsigned(plru_victim(r.store_index)));
942 # else
943 # replace_way <= r.store_way;
944 # end if;
945 # The way to replace on a miss
946 with m.If(r.state == State.CLR_TAG):
947 comb += replace_way.eq(plru_victim[r.store_index])
948
949 with m.Else():
950 comb += replace_way.eq(r.store_way)
951
952 # -- Output instruction from current cache row
953 # --
954 # -- Note: This is a mild violation of our design principle of
955 # -- having pipeline stages output from a clean latch. In this
956 # -- case we output the result of a mux. The alternative would
957 # -- be output an entire row which I prefer not to do just yet
958 # -- as it would force fetch2 to know about some of the cache
959 # -- geometry information.
960 # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
961 # i_out.valid <= r.hit_valid;
962 # i_out.nia <= r.hit_nia;
963 # i_out.stop_mark <= r.hit_smark;
964 # i_out.fetch_failed <= r.fetch_failed;
965 # Output instruction from current cache row
966 #
967 # Note: This is a mild violation of our design principle of
968 # having pipeline stages output from a clean latch. In this
969 # case we output the result of a mux. The alternative would
970 # be output an entire row which I prefer not to do just yet
971 # as it would force fetch2 to know about some of the cache
972 # geometry information.
973 comb += i_out.insn.eq(
974 read_insn_word(r.hit_nia, cache_out[r.hit_way])
975 )
976 comb += i_out.valid.eq(r.hit_valid)
977 comb += i_out.nia.eq(r.hit_nia)
978 comb += i_out.stop_mark.eq(r.hit_smark)
979 comb += i_out.fetch_failed.eq(r.fetch_failed)
980
981 # -- Stall fetch1 if we have a miss on cache or TLB
982 # -- or a protection fault
983 # stall_out <= not (is_hit and access_ok);
984 # Stall fetch1 if we have a miss on cache or TLB
985 # or a protection fault
986 comb += stall_out.eq(~(is_hit & access_ok))
987
988 # -- Wishbone requests output (from the cache miss reload machine)
989 # wishbone_out <= r.wb;
990 # Wishbone requests output (from the cache miss reload machine)
991 comb += wb_out.eq(r.wb)
992 # end process;
993
994 # -- Cache hit synchronous machine
995 # icache_hit : process(clk)
996 # Cache hit synchronous machine
def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
               req_index, req_tag, real_addr):
    """Cache hit synchronous machine.

    Latches the hit result of the current request into ``r`` so that the
    output stage can present the instruction on the following cycle.

    Arguments:
    * m: the nmigen Module the statements are added to
    * use_previous: when set, last cycle's data is reused, so the hit
      registers are held
    * r: internal register record; hit_valid, hit_way, hit_smark and
      hit_nia are written here
    * req_is_hit / req_hit_way: hit indication and way of the request
    * req_index, req_tag, real_addr: only used for the debug message
    """
    sync = m.d.sync

    i_in, stall_in = self.i_in, self.stall_in
    flush_in = self.flush_in

    # Keep outputs to fetch2 unchanged on a stall, except that a flush
    # sets valid to 0.  If use_previous, keep the same data as last
    # cycle and use the second half.
    # NOTE(review): the VHDL also clears hit_valid on rst; presumably
    # the sync domain reset covers that here - confirm.
    with m.If(stall_in | use_previous):
        with m.If(flush_in):
            sync += r.hit_valid.eq(0)

    with m.Else():
        # On a hit, latch the request for the next cycle, when the BRAM
        # data will be available on the cache_out output of the
        # corresponding way.
        sync += r.hit_valid.eq(req_is_hit)

        with m.If(req_is_hit):
            sync += r.hit_way.eq(req_hit_way)

            # Report the hit at simulation time.  A Python print() here
            # would run once, at elaboration, and show Signal objects
            # rather than their values.
            sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x "
                            "tag:%x way:%x RA:%x",
                            i_in.nia, i_in.virt_mode, i_in.stop_mark,
                            req_index, req_tag, req_hit_way, real_addr)

    with m.If(~stall_in):
        # Send stop marks and NIA down regardless of validity
        sync += r.hit_smark.eq(i_in.stop_mark)
        sync += r.hit_nia.eq(i_in.nia)
1062
1063 # -- Cache miss/reload synchronous machine
1064 # icache_miss : process(clk)
1065 # Cache miss/reload synchronous machine
def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                req_index, req_laddr, req_tag, replace_way,
                cache_tags, access_ok, real_addr):
    """Cache miss/reload synchronous machine.

    Implements the IDLE -> CLR_TAG -> WAIT_ACK state machine held in
    ``r.state``.  On a miss it records the index/tag being reloaded,
    starts a wishbone read at ``req_laddr``, clears the valid bit of the
    victim way, stores the new tag, and then counts rows in as acks
    arrive until the line is complete.  It also handles cache
    invalidation (``inval_in``) and the ``r.fetch_failed`` flag.

    Arguments:
    * m: the nmigen Module the statements are added to
    * cache_valid_bits: per-line valid bits, indexed [line], one bit
      per way
    * r: internal register record (state, store_*, rows_valid, wb, ...)
    * req_is_miss, req_index, req_laddr, req_tag: decoded request
    * replace_way: way selected for replacement
    * cache_tags: per-line tag storage
    * access_ok: request passed the TLB/protection checks
    * real_addr: only used for the debug message

    NOTE(review): the VHDL has an explicit ``rst`` branch that clears
    the valid bits and wishbone outputs; this translation presumably
    relies on the sync domain reset values instead - confirm.
    """
    comb = m.d.comb
    sync = m.d.sync

    i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
    stall_in, flush_in = self.stall_in, self.flush_in
    inval_in = self.inval_in

    # Combinational temporaries (VHDL process variables)
    tagset = Signal(TAG_RAM_WIDTH)
    stbs_done = Signal()

    # We only ever do reads on wishbone: select all byte lanes
    comb += r.wb.sel.eq(~0)  # set to all 1s

    # Process cache invalidations: clear every valid bit of every line
    with m.If(inval_in):
        for i in range(NUM_LINES):
            sync += cache_valid_bits[i].eq(0)
        sync += r.store_valid.eq(0)

    # Main state machine
    with m.Switch(r.state):

        with m.Case(State.IDLE):
            # Reset per-row valid flags, only used in WAIT_ACK
            for i in range(ROW_PER_LINE):
                sync += r.rows_valid[i].eq(0)

            # We need to read a cache line
            with m.If(req_is_miss):
                # Simulation-time report (a Python print() would only
                # run once at elaboration and show Signal objects).
                sync += Display("cache miss nia:%x IR:%x SM:%x idx:%x "
                                "way:%x tag:%x RA:%x",
                                i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index,
                                replace_way, req_tag, real_addr)

                # Keep track of our index and way for subsequent stores
                sync += r.store_index.eq(req_index)
                sync += r.store_row.eq(get_row(req_laddr))
                sync += r.store_tag.eq(req_tag)
                sync += r.store_valid.eq(1)
                sync += r.end_row_ix.eq(
                    get_row_of_line(get_row(req_laddr)) - 1
                )

                # Prep for first wishbone read.  We calculate the
                # address of the start of the cache line and start the
                # WB cycle.  The assignment truncates req_laddr to the
                # width of r.wb.adr.
                sync += r.wb.adr.eq(req_laddr)
                sync += r.wb.cyc.eq(1)
                sync += r.wb.stb.eq(1)

                # Track that we had one request sent
                sync += r.state.eq(State.CLR_TAG)

        with m.Case(State.CLR_TAG, State.WAIT_ACK):
            with m.If(r.state == State.CLR_TAG):
                # Get victim way from plru
                sync += r.store_way.eq(replace_way)

                # Force misses on that way while reloading that line.
                # replace_way is a Signal, so the bit is picked with
                # bit_select rather than Python indexing.
                sync += cache_valid_bits[req_index].bit_select(
                    replace_way, 1
                ).eq(0)

                # Store new tag in selected way.  tagset is a
                # combinational copy of the line's tags with the new
                # tag patched in, which is then written back.
                # NOTE(review): assumes write_tag() returns an
                # assignment into tagset - confirm against its
                # definition.
                for i in range(NUM_WAYS):
                    with m.If(i == replace_way):
                        comb += tagset.eq(cache_tags[r.store_index])
                        comb += write_tag(i, tagset, r.store_tag)
                        sync += cache_tags[r.store_index].eq(tagset)

                sync += r.state.eq(State.WAIT_ACK)

            # Requests are all sent if stb is 0
            comb += stbs_done.eq(r.wb.stb == 0)

            # If we are still sending requests, was one accepted?
            with m.If(~wb_in.stall & ~stbs_done):
                # That was the last word?  We are done sending.
                # Clear stb and set stbs_done so we can handle an
                # eventual last ack on the same cycle.
                with m.If(is_last_row_addr(r.wb.adr, r.end_row_ix)):
                    sync += r.wb.stb.eq(0)
                    comb += stbs_done.eq(1)

                # Calculate the next row address
                sync += r.wb.adr.eq(next_row_addr(r.wb.adr))

            # Incoming acks processing
            with m.If(wb_in.ack):
                sync += r.rows_valid[
                    r.store_row % ROW_PER_LINE
                ].eq(1)

                # Check for completion
                with m.If(stbs_done
                          & is_last_row(r.store_row, r.end_row_ix)):
                    # Complete wishbone cycle
                    sync += r.wb.cyc.eq(0)

                    # Cache line is now valid (unless it was
                    # invalidated while being loaded)
                    sync += cache_valid_bits[r.store_index].bit_select(
                        replace_way, 1
                    ).eq(r.store_valid & ~inval_in)

                    # We are done
                    sync += r.state.eq(State.IDLE)

                # Increment store row counter
                sync += r.store_row.eq(next_row(r.store_row))

    # TLB miss and protection fault processing
    with m.If(flush_in | m_in.tlbld):
        sync += r.fetch_failed.eq(0)

    with m.Elif(i_in.req & ~access_ok & ~stall_in):
        sync += r.fetch_failed.eq(1)
1338
1339 # icache_log: if LOG_LENGTH > 0 generate
def icache_log(self, m, req_hit_way, ra_valid, access_ok,
               req_is_miss, req_is_hit, lway, wstate, r):
    """Output data to the logger (only generated when LOG_LENGTH > 0).

    Packs a 54-bit snapshot of the cache state into ``log_data`` every
    clock and drives ``self.log_out`` with it.

    The ``lway`` and ``wstate`` arguments are not read: as in the
    original code, fresh local signals of the same name are created.
    """
    comb = m.d.comb
    sync = m.d.sync

    wb_in, i_out = self.wb_in, self.i_out
    log_out, stall_out = self.log_out, self.stall_out

    # VHDL: "if LOG_LENGTH > 0 generate" - build the logger once, not
    # once per log entry.
    if LOG_LENGTH <= 0:
        return

    log_data = Signal(54)
    # The log record reserves exactly 3 bits for the way
    # (to_unsigned(lway, 3) in the VHDL); a wider signal would shift
    # every field above it out of position.
    lway = Signal(3)
    wstate = Signal()

    # lway and wstate are VHDL process variables, i.e. combinational
    # temporaries: drive them from comb only (driving wstate from both
    # comb and sync is a driver conflict).
    comb += lway.eq(req_hit_way)
    comb += wstate.eq(r.state != State.IDLE)

    # Cat() is LSB first, so the fields are listed in the reverse of
    # the order of the VHDL concatenation.
    sync += log_data.eq(Cat(
        ra_valid, access_ok, req_is_miss, req_is_hit,
        lway, wstate, r.hit_nia[2:6],
        r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
        r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
        i_out.valid
    ))
    comb += log_out.eq(log_data)
1401
def elaborate(self, platform):
    """Create the ICache module.

    Declares the storage and the signals shared between the
    sub-functions (rams, PLRUs, iTLB lookup/update, combinatorial
    logic, hit and miss machines), then calls each sub-function to add
    its logic to the module, which is returned.
    """
    m = Module()

    # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
    cache_tags = CacheTagArray()
    cache_valid_bits = CacheValidBitsArray()

    # iTLB storage.
    # TODO to be passed to nmigen as ram attributes
    #   attribute ram_style of itlb_tags : signal is "distributed";
    #   attribute ram_style of itlb_ptes : signal is "distributed";
    itlb_valid_bits = TLBValidBitsArray()
    itlb_tags = TLBTagArray()
    itlb_ptes = TLBPTEArray()

    # Privilege bit from PTE EAA field
    eaa_priv = Signal()

    # Internal registers
    r = RegInternal()

    # Async signals on incoming request
    req_index = Signal(NUM_LINES)
    req_row = Signal(BRAM_ROWS)
    req_hit_way = Signal(NUM_WAYS)
    req_tag = Signal(TAG_BITS)
    req_is_hit = Signal()
    req_is_miss = Signal()
    req_laddr = Signal(64)

    # Address translation results
    tlb_req_index = Signal(TLB_SIZE)
    real_addr = Signal(REAL_ADDR_BITS)
    ra_valid = Signal()
    priv_fault = Signal()
    access_ok = Signal()
    use_previous = Signal()

    # Cache RAM read data
    cache_out = CacheRamOut()

    # Replacement: PLRU victims and the way finally selected
    plru_victim = PLRUOut()
    replace_way = Signal(NUM_WAYS)

    # Call the sub-functions, putting everything together using the
    # shared signals established above.
    self.rams(m, r, cache_out, use_previous, replace_way, req_row)
    self.maybe_plrus(m, r, plru_victim)
    self.itlb_lookup(
        m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
        itlb_valid_bits, ra_valid, eaa_priv, priv_fault, access_ok,
    )
    self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
    self.icache_comb(
        m, use_previous, r, req_index, req_row, req_tag, real_addr,
        req_laddr, cache_valid_bits, cache_tags, access_ok,
        req_is_hit, req_is_miss, replace_way, plru_victim, cache_out,
    )
    self.icache_hit(
        m, use_previous, r, req_is_hit, req_hit_way, req_index,
        req_tag, real_addr,
    )
    self.icache_miss(
        m, cache_valid_bits, r, req_is_miss, req_index, req_laddr,
        req_tag, replace_way, cache_tags, access_ok, real_addr,
    )
    # The logger (self.icache_log) is not hooked up yet.

    return m
1492
1493
1494 # icache_tb.vhdl
1495 #
1496 # library ieee;
1497 # use ieee.std_logic_1164.all;
1498 #
1499 # library work;
1500 # use work.common.all;
1501 # use work.wishbone_types.all;
1502 #
1503 # entity icache_tb is
1504 # end icache_tb;
1505 #
1506 # architecture behave of icache_tb is
1507 # signal clk : std_ulogic;
1508 # signal rst : std_ulogic;
1509 #
1510 # signal i_out : Fetch1ToIcacheType;
1511 # signal i_in : IcacheToDecode1Type;
1512 #
1513 # signal m_out : MmuToIcacheType;
1514 #
1515 # signal wb_bram_in : wishbone_master_out;
1516 # signal wb_bram_out : wishbone_slave_out;
1517 #
1518 # constant clk_period : time := 10 ns;
1519 # begin
1520 # icache0: entity work.icache
1521 # generic map(
1522 # LINE_SIZE => 64,
1523 # NUM_LINES => 4
1524 # )
1525 # port map(
1526 # clk => clk,
1527 # rst => rst,
1528 # i_in => i_out,
1529 # i_out => i_in,
1530 # m_in => m_out,
1531 # stall_in => '0',
1532 # flush_in => '0',
1533 # inval_in => '0',
1534 # wishbone_out => wb_bram_in,
1535 # wishbone_in => wb_bram_out
1536 # );
1537 #
1538 # -- BRAM Memory slave
1539 # bram0: entity work.wishbone_bram_wrapper
1540 # generic map(
1541 # MEMORY_SIZE => 1024,
1542 # RAM_INIT_FILE => "icache_test.bin"
1543 # )
1544 # port map(
1545 # clk => clk,
1546 # rst => rst,
1547 # wishbone_in => wb_bram_in,
1548 # wishbone_out => wb_bram_out
1549 # );
1550 #
1551 # clk_process: process
1552 # begin
1553 # clk <= '0';
1554 # wait for clk_period/2;
1555 # clk <= '1';
1556 # wait for clk_period/2;
1557 # end process;
1558 #
1559 # rst_process: process
1560 # begin
1561 # rst <= '1';
1562 # wait for 2*clk_period;
1563 # rst <= '0';
1564 # wait;
1565 # end process;
1566 #
1567 # stim: process
1568 # begin
1569 # i_out.req <= '0';
1570 # i_out.nia <= (others => '0');
1571 # i_out.stop_mark <= '0';
1572 #
1573 # m_out.tlbld <= '0';
1574 # m_out.tlbie <= '0';
1575 # m_out.addr <= (others => '0');
1576 # m_out.pte <= (others => '0');
1577 #
1578 # wait until rising_edge(clk);
1579 # wait until rising_edge(clk);
1580 # wait until rising_edge(clk);
1581 # wait until rising_edge(clk);
1582 #
1583 # i_out.req <= '1';
1584 # i_out.nia <= x"0000000000000004";
1585 #
1586 # wait for 30*clk_period;
1587 # wait until rising_edge(clk);
1588 #
1589 # assert i_in.valid = '1' severity failure;
1590 # assert i_in.insn = x"00000001"
1591 # report "insn @" & to_hstring(i_out.nia) &
1592 # "=" & to_hstring(i_in.insn) &
1593 # " expected 00000001"
1594 # severity failure;
1595 #
1596 # i_out.req <= '0';
1597 #
1598 # wait until rising_edge(clk);
1599 #
1600 # -- hit
1601 # i_out.req <= '1';
1602 # i_out.nia <= x"0000000000000008";
1603 # wait until rising_edge(clk);
1604 # wait until rising_edge(clk);
1605 # assert i_in.valid = '1' severity failure;
1606 # assert i_in.insn = x"00000002"
1607 # report "insn @" & to_hstring(i_out.nia) &
1608 # "=" & to_hstring(i_in.insn) &
1609 # " expected 00000002"
1610 # severity failure;
1611 # wait until rising_edge(clk);
1612 #
1613 # -- another miss
1614 # i_out.req <= '1';
1615 # i_out.nia <= x"0000000000000040";
1616 #
1617 # wait for 30*clk_period;
1618 # wait until rising_edge(clk);
1619 #
1620 # assert i_in.valid = '1' severity failure;
1621 # assert i_in.insn = x"00000010"
1622 # report "insn @" & to_hstring(i_out.nia) &
1623 # "=" & to_hstring(i_in.insn) &
1624 # " expected 00000010"
1625 # severity failure;
1626 #
1627 # -- test something that aliases
1628 # i_out.req <= '1';
1629 # i_out.nia <= x"0000000000000100";
1630 # wait until rising_edge(clk);
1631 # wait until rising_edge(clk);
1632 # assert i_in.valid = '0' severity failure;
1633 # wait until rising_edge(clk);
1634 #
1635 # wait for 30*clk_period;
1636 # wait until rising_edge(clk);
1637 #
1638 # assert i_in.valid = '1' severity failure;
1639 # assert i_in.insn = x"00000040"
1640 # report "insn @" & to_hstring(i_out.nia) &
1641 # "=" & to_hstring(i_in.insn) &
1642 # " expected 00000040"
1643 # severity failure;
1644 #
1645 # i_out.req <= '0';
1646 #
1647 # std.env.finish;
1648 # end process;
1649 # end;
def _expect_insn(result, nia, expected):
    """Read the icache output port and check it returned ``expected``.

    This is a simulator sub-process: ``yield <signal>`` reads the
    current value of that signal.
    """
    valid = yield result.valid
    insn = yield result.insn
    assert valid, "insn @%x not valid" % nia
    assert insn == expected, \
        "insn @%x=%x expected %08x" % (nia, insn, expected)


def icache_sim(dut):
    """Simulation stimulus, following the icache_tb.vhdl testbench.

    Drives the request port (``dut.i_in``) and the MMU port
    (``dut.m_in``), and checks the instruction returned on
    ``dut.i_out`` for: a first miss, a hit in the same line, a miss on
    another line, and an address that aliases with an already loaded
    line.

    Yield protocol: a bare ``yield`` waits one clock, ``yield
    sig.eq(v)`` drives a signal, and ``value = yield sig`` reads one.
    """
    request, result, mmu = dut.i_in, dut.i_out, dut.m_in

    # Everything idle/zero to start with ("others => '0'" in the VHDL)
    yield request.req.eq(0)
    yield request.nia.eq(0)
    yield request.stop_mark.eq(0)
    yield mmu.tlbld.eq(0)
    yield mmu.tlbie.eq(0)
    yield mmu.addr.eq(0)
    yield mmu.pte.eq(0)
    for _ in range(4):
        yield

    # first access: miss, wait for the line to be loaded
    yield request.req.eq(1)
    yield request.nia.eq(0x0000000000000004)
    for _ in range(31):
        yield
    yield from _expect_insn(result, 0x04, 0x00000001)
    yield request.req.eq(0)
    yield

    # hit
    yield request.req.eq(1)
    yield request.nia.eq(0x0000000000000008)
    yield
    yield
    yield from _expect_insn(result, 0x08, 0x00000002)
    yield

    # another miss
    yield request.req.eq(1)
    yield request.nia.eq(0x0000000000000040)
    for _ in range(31):
        yield
    yield from _expect_insn(result, 0x40, 0x00000010)

    # test something that aliases: it must miss first ...
    yield request.req.eq(1)
    yield request.nia.eq(0x0000000000000100)
    yield
    yield
    valid = yield result.valid
    assert not valid, "insn @100 unexpectedly valid (should miss)"

    # ... and be valid once the line has been reloaded
    for _ in range(31):
        yield
    yield from _expect_insn(result, 0x100, 0x00000040)
    yield request.req.eq(0)
1708
1709
def test_icache():
    """Simulate an ICache with the icache_sim stimulus, dumping a VCD.

    NOTE(review): unlike the VHDL testbench (bram0), no wishbone slave
    is attached to the cache here - confirm reloads can complete.
    """
    icache = ICache()

    top = Module()
    top.submodules.icache = icache

    # nmigen Simulation
    simulator = Simulator(top)
    simulator.add_clock(1e-6)
    simulator.add_sync_process(wrap(icache_sim(icache)))

    with simulator.write_vcd('test_icache.vcd'):
        simulator.run()
1723
if __name__ == '__main__':
    # Convert the cache to RTLIL and save it, then run the simulation.
    icache = ICache()
    il_text = rtlil.convert(icache, ports=[])
    with open("test_icache.il", "w") as il_file:
        il_file.write(il_text)

    test_icache()