icache.py fixed all errors that raised python exceptions, now runs sim, sim doesn't...
[soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
* Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
22 from enum import Enum, unique
23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const)
24 from nmigen.cli import main
25 from nmigen.cli import rtlil
26 from nmutil.iocontrol import RecordObject
27 from nmutil.byterev import byte_reverse
28 from nmutil.mask import Mask
29 from nmigen.utils import log2_int
30 from nmutil.util import Display
31
32 from soc.experiment.mem_types import (Fetch1ToICacheType,
33 ICacheToDecode1Type,
34 MMUToICacheType)
35
36 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
37 WB_SEL_BITS, WBAddrType, WBDataType,
38 WBSelType, WBMasterOut, WBSlaveOut,
39 WBMasterOutVector, WBSlaveOutVector,
40 WBIOMasterOut, WBIOSlaveOut)
41
42 from soc.experiment.cache_ram import CacheRam
43 from soc.experiment.plru import PLRU
44
45 # for test
46 from nmigen_soc.wishbone.sram import SRAM
47 from nmigen import Memory
48 from nmigen.cli import rtlil
49 if True:
50 from nmigen.back.pysim import Simulator, Delay, Settle
51 else:
52 from nmigen.sim.cxxsim import Simulator, Delay, Settle
53 from nmutil.util import wrap
54
55
56
# Simulation flag (mirrors the VHDL generic; not referenced in this chunk)
SIM = 0
# Cache line size in bytes
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 32
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

# Width of one BRAM row in bits
ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row
# (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
# One tag-RAM row holds the concatenated tags of all ways of one line
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# -- L1 ITLB.
# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64
127
128 # architecture rtl of icache is
129 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
130 #-- ROW_PER_LINE is the number of row (wishbone
131 #-- transactions) in a line
132 #constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
133 #-- BRAM_ROWS is the number of rows in BRAM
134 #-- needed to represent the full
135 #-- icache
136 #constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
137 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
138 #constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
139 #-- Bit fields counts in the address
140 #
141 #-- INSN_BITS is the number of bits to select
142 #-- an instruction in a row
143 #constant INSN_BITS : natural := log2(INSN_PER_ROW);
144 #-- ROW_BITS is the number of bits to select a row
145 #constant ROW_BITS : natural := log2(BRAM_ROWS);
146 #-- ROW_LINEBITS is the number of bits to
147 #-- select a row within a line
148 #constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
149 #-- LINE_OFF_BITS is the number of bits for the offset
150 #-- in a cache line
151 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
152 #-- ROW_OFF_BITS is the number of bits for the offset in a row
153 #constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
154 #-- INDEX_BITS is the number of bits to select a cache line
155 #constant INDEX_BITS : natural := log2(NUM_LINES);
156 #-- SET_SIZE_BITS is the log base 2 of the set size
157 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
158 #-- TAG_BITS is the number of bits of the tag part of the address
159 #constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
160 #-- WAY_BITS is the number of bits to select a way
161 #constant WAY_BITS : natural := log2(NUM_WAYS);
162
163 #-- Example of layout for 32 lines of 64 bytes:
164 #--
165 #-- .. tag |index| line |
166 #-- .. | row | |
167 #-- .. | | | |00| zero (2)
168 #-- .. | | |-| | INSN_BITS (1)
169 #-- .. | |---| | ROW_LINEBITS (3)
170 #-- .. | |--- - --| LINE_OFF_BITS (6)
171 #-- .. | |- --| ROW_OFF_BITS (3)
172 #-- .. |----- ---| | ROW_BITS (8)
173 #-- .. |-----| | INDEX_BITS (5)
174 #-- .. --------| | TAG_BITS (53)
175 # Example of layout for 32 lines of 64 bytes:
176 #
177 # .. tag |index| line |
178 # .. | row | |
179 # .. | | | |00| zero (2)
180 # .. | | |-| | INSN_BITS (1)
181 # .. | |---| | ROW_LINEBITS (3)
182 # .. | |--- - --| LINE_OFF_BITS (6)
183 # .. | |- --| ROW_OFF_BITS (3)
184 # .. |----- ---| | ROW_BITS (8)
185 # .. |-----| | INDEX_BITS (5)
186 # .. --------| | TAG_BITS (53)
187
188 #subtype row_t is integer range 0 to BRAM_ROWS-1;
189 #subtype index_t is integer range 0 to NUM_LINES-1;
190 #subtype way_t is integer range 0 to NUM_WAYS-1;
191 #subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
192 #
193 #-- The cache data BRAM organized as described above for each way
194 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
195 #
196 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
197 #-- not handle a clean (commented) definition of the cache tags as a 3d
198 #-- memory. For now, work around it by putting all the tags
199 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
200 # type cache_tags_set_t is array(way_t) of cache_tag_t;
201 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
202 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
203 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
204 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    """Tag RAM: one TAG_RAM_WIDTH-wide row (all ways' tags) per line."""
    rows = [Signal(TAG_RAM_WIDTH, name="cachetag_%d" %x)
            for x in range(NUM_LINES)]
    return Array(rows)
208
209 #-- The cache valid bits
210 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
211 #type cache_valids_t is array(index_t) of cache_way_valids_t;
212 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    """Per-line valid bits: one NUM_WAYS-wide signal per cache line.

    Signal name corrected from the misspelled "cahcevalid_%d".
    """
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" %x) \
                 for x in range(NUM_LINES))
216
def RowPerLineValidArray():
    """One valid bit per row of the line currently being reloaded."""
    valids = []
    for x in range(ROW_PER_LINE):
        valids.append(Signal(name="rows_valid_%d" %x))
    return Array(valids)
220
221
222 #attribute ram_style : string;
223 #attribute ram_style of cache_tags : signal is "distributed";
224 # TODO to be passed to nigmen as ram attributes
225 # attribute ram_style : string;
226 # attribute ram_style of cache_tags : signal is "distributed";
227
228
229 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
230 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
231 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
232 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
233 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
234 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """One valid bit per entry of the direct-mapped iTLB."""
    bits = [Signal(name="tlbvalid_%d" %x) for x in range(TLB_SIZE)]
    return Array(bits)
238
def TLBTagArray():
    """EA-tag storage for the iTLB, one TLB_EA_TAG_BITS tag per entry."""
    tags = []
    for x in range(TLB_SIZE):
        tags.append(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" %x))
    return Array(tags)
242
def TLBPtesArray():
    """PTE storage for the iTLB, one TLB_PTE_BITS entry per slot."""
    ptes = [Signal(TLB_PTE_BITS, name="tlbptes_%d" %x)
            for x in range(TLB_SIZE)]
    return Array(ptes)
246
247
248 #-- Cache RAM interface
249 #type cache_ram_out_t is array(way_t) of cache_row_t;
250 # Cache RAM interface
def CacheRamOut():
    """Read-data buses from the cache RAMs: one full row per way."""
    outs = []
    for x in range(NUM_WAYS):
        outs.append(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x))
    return Array(outs)
254
255 #-- PLRU output interface
256 #type plru_out_t is array(index_t) of
257 # std_ulogic_vector(WAY_BITS-1 downto 0);
258 # PLRU output interface
def PLRUOut():
    """Victim-way outputs of the per-line PLRUs (WAY_BITS each)."""
    outs = [Signal(WAY_BITS, name="plru_out_%d" %x)
            for x in range(NUM_LINES)]
    return Array(outs)
262
263 # -- Return the cache line index (tag index) for an address
264 # function get_index(addr: std_ulogic_vector(63 downto 0))
265 # return index_t is
266 # begin
267 # return to_integer(unsigned(
268 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
269 # ));
270 # end;
271 # Return the cache line index (tag index) for an address
def get_index(addr):
    """Cache line (tag) index field of *addr*:
    bits [LINE_OFF_BITS, SET_SIZE_BITS)."""
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]
274
275 # -- Return the cache row index (data memory) for an address
276 # function get_row(addr: std_ulogic_vector(63 downto 0))
277 # return row_t is
278 # begin
279 # return to_integer(unsigned(
280 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
281 # ));
282 # end;
283 # Return the cache row index (data memory) for an address
def get_row(addr):
    """BRAM (data memory) row index of *addr*:
    bits [ROW_OFF_BITS, SET_SIZE_BITS)."""
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]
286
287 # -- Return the index of a row within a line
288 # function get_row_of_line(row: row_t) return row_in_line_t is
289 # variable row_v : unsigned(ROW_BITS-1 downto 0);
290 # begin
291 # row_v := to_unsigned(row, ROW_BITS);
292 # return row_v(ROW_LINEBITS-1 downto 0);
293 # end;
294 # Return the index of a row within a line
def get_row_of_line(row):
    """Index of *row* within its cache line (low ROW_LINE_BITS bits)."""
    return row[:ROW_LINE_BITS]
297
298 # -- Returns whether this is the last row of a line
299 # function is_last_row_addr(addr: wishbone_addr_type;
300 # last: row_in_line_t
301 # )
302 # return boolean is
303 # begin
304 # return unsigned(
305 # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
306 # ) = last;
307 # end;
308 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """True when *addr*'s row-within-line field equals *last*
    (the final row index of the current reload)."""
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
311
312 # -- Returns whether this is the last row of a line
313 # function is_last_row(row: row_t;
314 # last: row_in_line_t) return boolean is
315 # begin
316 # return get_row_of_line(row) = last;
317 # end;
318 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """True when *row* is the row index *last* within its line."""
    return get_row_of_line(row) == last
321
322 # -- Return the next row in the current cache line. We use a dedicated
323 # -- function in order to limit the size of the generated adder to be
324 # -- only the bits within a cache line (3 bits with default settings)
325 # function next_row(row: row_t) return row_t is
326 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
327 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
328 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
329 # begin
330 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
331 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
332 # row_v(ROW_LINEBITS-1 downto 0) :=
333 # std_ulogic_vector(unsigned(row_idx) + 1);
334 # return to_integer(unsigned(row_v));
335 # end;
336 # Return the next row in the current cache line. We use a dedicated
337 # function in order to limit the size of the generated adder to be
338 # only the bits within a cache line (3 bits with default settings)
def next_row(row):
    """Next row within the same cache line, wrapping modulo ROW_PER_LINE.

    Only the low ROW_LINE_BITS participate in the increment, keeping the
    generated adder limited to the bits within a line; the upper bits of
    *row* pass through unchanged.
    """
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
342 # -- Read the instruction word for the given address in the
343 # -- current cache row
344 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
345 # data: cache_row_t) return std_ulogic_vector is
346 # variable word: integer range 0 to INSN_PER_ROW-1;
347 # begin
348 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
349 # return data(31+word*32 downto word*32);
350 # end;
351 # Read the instruction word for the given address
352 # in the current cache row
def read_insn_word(addr, data):
    """Select the 32-bit instruction word for *addr* from cache row *data*.

    The word index is addr[2 : INSN_BITS+2] — the VHDL reads
    addr(INSN_BITS+2-1 downto 2).  The previous upper bound of
    INSN_BITS+3 took one bit too many, letting word_select() index
    past the last instruction in the row.
    """
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
356
357 # -- Get the tag value from the address
358 # function get_tag(
359 # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
360 # )
361 # return cache_tag_t is
362 # begin
363 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
364 # end;
365 # Get the tag value from the address
def get_tag(addr):
    """Tag field of real address *addr*:
    bits [SET_SIZE_BITS, REAL_ADDR_BITS)."""
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
368
369 # -- Read a tag from a tag memory row
370 # function read_tag(way: way_t; tagset: cache_tags_set_t)
371 # return cache_tag_t is
372 # begin
373 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
374 # end;
375 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Extract way *way*'s TAG_BITS-wide tag from a tag-RAM row."""
    return tagset[way * TAG_BITS:(way + 1) * TAG_BITS]
378
379 # -- Write a tag to tag memory row
380 # procedure write_tag(way: in way_t;
381 # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
382 # begin
383 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
384 # end;
385 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Return an assignment writing *tag* into way *way*'s slice of a
    tag-RAM row (caller adds it to a comb/sync domain)."""
    return tagset[way * TAG_BITS:(way + 1) * TAG_BITS].eq(tag)
388
389 # -- Simple hash for direct-mapped TLB index
390 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
391 # return tlb_index_t is
392 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
393 # begin
394 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
395 # xor addr(
396 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
397 # TLB_LG_PGSZ + TLB_BITS
398 # )
399 # xor addr(
400 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
401 # TLB_LG_PGSZ + 2 * TLB_BITS
402 # );
403 # return to_integer(unsigned(hash));
404 # end;
405 # Simple hash for direct-mapped TLB index
def hash_ea(addr):
    """XOR-fold three TLB_BITS-wide fields of the page number of *addr*
    into a direct-mapped iTLB index."""
    fld1 = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS]
    fld2 = addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS]
    fld3 = addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS]
    return fld1 ^ fld2 ^ fld3
413
414 # begin
415 #
416 # assert LINE_SIZE mod ROW_SIZE = 0;
417 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
418 # severity FAILURE;
419 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
420 # severity FAILURE;
421 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
422 # severity FAILURE;
423 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
424 # severity FAILURE;
425 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
426 # report "geometry bits don't add up" severity FAILURE;
427 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
428 # report "geometry bits don't add up" severity FAILURE;
429 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
430 # report "geometry bits don't add up" severity FAILURE;
431 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
432 # report "geometry bits don't add up" severity FAILURE;
433 #
434 # sim_debug: if SIM generate
435 # debug: process
436 # begin
437 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
438 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
439 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
440 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
441 # report "INSN_BITS = " & natural'image(INSN_BITS);
442 # report "ROW_BITS = " & natural'image(ROW_BITS);
443 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
444 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
445 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
446 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
447 # report "TAG_BITS = " & natural'image(TAG_BITS);
448 # report "WAY_BITS = " & natural'image(WAY_BITS);
449 # wait;
450 # end process;
451 # end generate;
452
453 # Cache reload state machine
@unique
class State(Enum):
    """Cache reload state machine states."""
    IDLE = 0      # no reload in progress
    CLR_TAG = 1   # choosing the replacement way (see replace_way mux)
    WAIT_ACK = 2  # wishbone reload in flight; rows_valid tracks progress
459
460 # type reg_internal_t is record
461 # -- Cache hit state (Latches for 1 cycle BRAM access)
462 # hit_way : way_t;
463 # hit_nia : std_ulogic_vector(63 downto 0);
464 # hit_smark : std_ulogic;
465 # hit_valid : std_ulogic;
466 #
467 # -- Cache miss state (reload state machine)
468 # state : state_t;
469 # wb : wishbone_master_out;
470 # store_way : way_t;
471 # store_index : index_t;
472 # store_row : row_t;
473 # store_tag : cache_tag_t;
474 # store_valid : std_ulogic;
475 # end_row_ix : row_in_line_t;
476 # rows_valid : row_per_line_valid_t;
477 #
478 # -- TLB miss state
479 # fetch_failed : std_ulogic;
480 # end record;
class RegInternal(RecordObject):
    """Synchronous internal state of the icache (VHDL reg_internal_t).

    Way/index/row fields hold *indices*, so they are sized with the
    corresponding log2 widths (WAY_BITS/INDEX_BITS/ROW_BITS) rather
    than the element counts used previously (store_row in particular
    was a 256-bit signal for an 8-bit row index).
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.wb = WBMasterOut("wb")
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()
503
504 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
505 #
506 # entity icache is
507 # generic (
508 # SIM : boolean := false;
509 # -- Line size in bytes
510 # LINE_SIZE : positive := 64;
511 # -- BRAM organisation: We never access more
512 # -- than wishbone_data_bits
513 # -- at a time so to save resources we make the
514 # -- array only that wide,
515 # -- and use consecutive indices for to make a cache "line"
516 # --
517 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
518 # -- so 64-bits)
519 # ROW_SIZE : positive := wishbone_data_bits / 8;
520 # -- Number of lines in a set
521 # NUM_LINES : positive := 32;
522 # -- Number of ways
523 # NUM_WAYS : positive := 4;
524 # -- L1 ITLB number of entries (direct mapped)
525 # TLB_SIZE : positive := 64;
526 # -- L1 ITLB log_2(page_size)
527 # TLB_LG_PGSZ : positive := 12;
528 # -- Number of real address bits that we store
529 # REAL_ADDR_BITS : positive := 56;
530 # -- Non-zero to enable log data collection
531 # LOG_LENGTH : natural := 0
532 # );
533 # port (
534 # clk : in std_ulogic;
535 # rst : in std_ulogic;
536 #
537 # i_in : in Fetch1ToIcacheType;
538 # i_out : out IcacheToDecode1Type;
539 #
540 # m_in : in MmuToIcacheType;
541 #
542 # stall_in : in std_ulogic;
543 # stall_out : out std_ulogic;
544 # flush_in : in std_ulogic;
545 # inval_in : in std_ulogic;
546 #
547 # wishbone_out : out wishbone_master_out;
548 # wishbone_in : in wishbone_slave_out;
549 #
550 # log_out : out std_ulogic_vector(53 downto 0)
551 # );
552 # end entity icache;
553 # 64 bit direct mapped icache. All instructions are 4B aligned.
554 class ICache(Elaboratable):
555 """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        # fetch request in / decoded instruction out
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        # MMU interface (TLB loads and invalidations)
        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # wishbone master used by the cache-miss reload machine
        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        # debug log stream; width matches the VHDL log_out(53 downto 0)
        self.log_out = Signal(54)
571
572
573 # -- Generate a cache RAM for each way
574 # rams: for i in 0 to NUM_WAYS-1 generate
575 # signal do_read : std_ulogic;
576 # signal do_write : std_ulogic;
577 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
578 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
579 # signal dout : cache_row_t;
580 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
581 # begin
582 # way: entity work.cache_ram
583 # generic map (
584 # ROW_BITS => ROW_BITS,
585 # WIDTH => ROW_SIZE_BITS
586 # )
587 # port map (
588 # clk => clk,
589 # rd_en => do_read,
590 # rd_addr => rd_addr,
591 # rd_data => dout,
592 # wr_sel => wr_sel,
593 # wr_addr => wr_addr,
594 # wr_data => wishbone_in.dat
595 # );
596 # process(all)
597 # begin
598 # do_read <= not (stall_in or use_previous);
599 # do_write <= '0';
600 # if wishbone_in.ack = '1' and replace_way = i then
601 # do_write <= '1';
602 # end if;
603 # cache_out(i) <= dout;
604 # rd_addr <=
605 # std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
606 # wr_addr <=
607 # std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
608 # for i in 0 to ROW_SIZE-1 loop
609 # wr_sel(i) <= do_write;
610 # end loop;
611 # end process;
612 # end generate;
613 def rams(self, m, r, cache_out, use_previous, replace_way, req_row):
614 comb = m.d.comb
615
616 wb_in, stall_in = self.wb_in, self.stall_in
617
618 do_read = Signal()
619 do_write = Signal()
620 rd_addr = Signal(ROW_BITS)
621 wr_addr = Signal(ROW_BITS)
622 _d_out = Signal(ROW_SIZE_BITS)
623 wr_sel = Signal(ROW_SIZE)
624
625 for i in range(NUM_WAYS):
626 way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
627 comb += way.rd_en.eq(do_read)
628 comb += way.rd_addr.eq(rd_addr)
629 comb += way.rd_data_o.eq(_d_out)
630 comb += way.wr_sel.eq(wr_sel)
631 comb += way.wr_addr.eq(wr_addr)
632 comb += way.wr_data.eq(wb_in.dat)
633
634 comb += do_read.eq(~(stall_in | use_previous))
635
636 with m.If(wb_in.ack & (replace_way == i)):
637 comb += do_write.eq(1)
638
639 comb += cache_out[i].eq(_d_out)
640 comb += rd_addr.eq(req_row)
641 comb += wr_addr.eq(r.store_row)
642 for j in range(ROW_SIZE):
643 comb += wr_sel[j].eq(do_write)
644
645 # -- Generate PLRUs
646 # maybe_plrus: if NUM_WAYS > 1 generate
647 # begin
648 # plrus: for i in 0 to NUM_LINES-1 generate
649 # -- PLRU interface
650 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
651 # signal plru_acc_en : std_ulogic;
652 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
653 #
654 # begin
655 # plru : entity work.plru
656 # generic map (
657 # BITS => WAY_BITS
658 # )
659 # port map (
660 # clk => clk,
661 # rst => rst,
662 # acc => plru_acc,
663 # acc_en => plru_acc_en,
664 # lru => plru_out
665 # );
666 #
667 # process(all)
668 # begin
669 # -- PLRU interface
670 # if get_index(r.hit_nia) = i then
671 # plru_acc_en <= r.hit_valid;
672 # else
673 # plru_acc_en <= '0';
674 # end if;
675 # plru_acc <=
676 # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
677 # plru_victim(i) <= plru_out;
678 # end process;
679 # end generate;
680 # end generate;
681 def maybe_plrus(self, m, r, plru_victim):
682 comb = m.d.comb
683
684 with m.If(NUM_WAYS > 1):
685 for i in range(NUM_LINES):
686 plru_acc_i = Signal(WAY_BITS)
687 plru_acc_en = Signal()
688 plru_out = Signal(WAY_BITS)
689 plru = PLRU(WAY_BITS)
690 comb += plru.acc_i.eq(plru_acc_i)
691 comb += plru.acc_en.eq(plru_acc_en)
692 comb += plru.lru_o.eq(plru_out)
693
694 # PLRU interface
695 with m.If(get_index(r.hit_nia) == i):
696 comb += plru.acc_en.eq(r.hit_valid)
697
698 comb += plru.acc_i.eq(r.hit_way)
699 comb += plru_victim[i].eq(plru.lru_o)
700
701 # -- TLB hit detection and real address generation
702 # itlb_lookup : process(all)
703 # variable pte : tlb_pte_t;
704 # variable ttag : tlb_tag_t;
705 # begin
706 # tlb_req_index <= hash_ea(i_in.nia);
707 # pte := itlb_ptes(tlb_req_index);
708 # ttag := itlb_tags(tlb_req_index);
709 # if i_in.virt_mode = '1' then
710 # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
711 # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
712 # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
713 # ra_valid <= itlb_valids(tlb_req_index);
714 # else
715 # ra_valid <= '0';
716 # end if;
717 # eaa_priv <= pte(3);
718 # else
719 # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
720 # ra_valid <= '1';
721 # eaa_priv <= '1';
722 # end if;
723 #
724 # -- no IAMR, so no KUEP support for now
725 # priv_fault <= eaa_priv and not i_in.priv_mode;
726 # access_ok <= ra_valid and not priv_fault;
727 # end process;
728 # TLB hit detection and real address generation
729 def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
730 real_addr, itlb_valid_bits, ra_valid, eaa_priv,
731 priv_fault, access_ok):
732 comb = m.d.comb
733
734 i_in = self.i_in
735
736 pte = Signal(TLB_PTE_BITS)
737 ttag = Signal(TLB_EA_TAG_BITS)
738
739 comb += tlb_req_index.eq(hash_ea(i_in.nia))
740 comb += pte.eq(itlb_ptes[tlb_req_index])
741 comb += ttag.eq(itlb_tags[tlb_req_index])
742
743 with m.If(i_in.virt_mode):
744 comb += real_addr.eq(Cat(
745 i_in.nia[:TLB_LG_PGSZ],
746 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
747 ))
748
749 with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
750 comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
751
752 with m.Else():
753 comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
754 comb += ra_valid.eq(1)
755 comb += eaa_priv.eq(1)
756
757 # No IAMR, so no KUEP support for now
758 comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
759 comb += access_ok.eq(ra_valid & ~priv_fault)
760
761 # -- iTLB update
762 # itlb_update: process(clk)
763 # variable wr_index : tlb_index_t;
764 # begin
765 # if rising_edge(clk) then
766 # wr_index := hash_ea(m_in.addr);
767 # if rst = '1' or
768 # (m_in.tlbie = '1' and m_in.doall = '1') then
769 # -- clear all valid bits
770 # for i in tlb_index_t loop
771 # itlb_valids(i) <= '0';
772 # end loop;
773 # elsif m_in.tlbie = '1' then
774 # -- clear entry regardless of hit or miss
775 # itlb_valids(wr_index) <= '0';
776 # elsif m_in.tlbld = '1' then
777 # itlb_tags(wr_index) <=
778 # m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
779 # itlb_ptes(wr_index) <= m_in.pte;
780 # itlb_valids(wr_index) <= '1';
781 # end if;
782 # end if;
783 # end process;
784 # iTLB update
785 def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
786 comb = m.d.comb
787 sync = m.d.sync
788
789 m_in = self.m_in
790
791 wr_index = Signal(TLB_SIZE)
792 comb += wr_index.eq(hash_ea(m_in.addr))
793
794 with m.If(m_in.tlbie & m_in.doall):
795 # Clear all valid bits
796 for i in range(TLB_SIZE):
797 sync += itlb_valid_bits[i].eq(0)
798
799 with m.Elif(m_in.tlbie):
800 # Clear entry regardless of hit or miss
801 sync += itlb_valid_bits[wr_index].eq(0)
802
803 with m.Elif(m_in.tlbld):
804 sync += itlb_tags[wr_index].eq(
805 m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
806 )
807 sync += itlb_ptes[wr_index].eq(m_in.pte)
808 sync += itlb_valid_bits[wr_index].eq(1)
809
810 # -- Cache hit detection, output to fetch2 and other misc logic
811 # icache_comb : process(all)
812 # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_tag, real_addr, req_laddr, cache_valid_bits,
                    cache_tags, access_ok, req_is_hit,
                    req_is_miss, replace_way, plru_victim, cache_out):
        """Cache hit detection, output to fetch2 and other misc logic.

        Purely combinational: decodes the request address, detects a hit
        in any way, selects the replacement way on a miss, muxes the
        instruction word out to decode1, and forwards r.wb to the
        wishbone outputs.  Direct transcription of the VHDL
        ``icache_comb`` process kept in the original statement order.
        """
        # variable is_hit : std_ulogic;
        # variable hit_way : way_t;
        comb = m.d.comb

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        # NOTE(review): hit_way holds a way *index*; Signal(NUM_WAYS) is
        # wider than WAY_BITS — confirm the intended width (VHDL way_t).
        hit_way = Signal(NUM_WAYS)
        # begin
        # -- i_in.sequential means that i_in.nia this cycle
        # -- is 4 more than last cycle. If we read more
        # -- than 32 bits at a time, had a cache hit last
        # -- cycle, and we don't want the first 32-bit chunk
        # -- then we can keep the data we read last cycle
        # -- and just use that.
        # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
        #     use_previous <= i_in.sequential and r.hit_valid;
        # else
        #     use_previous <= '0';
        # end if;
        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle. If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # -- Extract line, row and tag from request
        # req_index <= get_index(i_in.nia);
        # req_row <= get_row(i_in.nia);
        # req_tag <= get_tag(real_addr);
        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # -- Calculate address of beginning of cache row, will be
        # -- used for cache miss processing if needed
        # req_laddr <=
        #     (63 downto REAL_ADDR_BITS => '0') &
        #     real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
        #     (ROW_OFF_BITS-1 downto 0 => '0');
        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0b0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]
                ))

        # -- Test if pending request is a hit on any way
        # hit_way := 0;
        # is_hit := '0';
        # for i in way_t loop
        #     if i_in.req = '1' and
        #         (cache_valids(req_index)(i) = '1' or
        #          (r.state = WAIT_ACK and
        #           req_index = r.store_index and
        #           i = r.store_way and
        #           r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
        #         if read_tag(i, cache_tags(req_index)) = req_tag then
        #             hit_way := i;
        #             is_hit := '1';
        #         end if;
        #     end if;
        # end loop;
        # Test if pending request is a hit on any way
        # (a partially reloaded line counts as a hit for rows already
        # fetched: the WAIT_ACK / rows_valid term)
        for i in range(NUM_WAYS):
            with m.If(i_in.req &
                      (cache_valid_bits[req_index][i] |
                       ((r.state == State.WAIT_ACK)
                        & (req_index == r.store_index)
                        & (i == r.store_way)
                        & r.rows_valid[req_row % ROW_PER_LINE]))):
                with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # -- Generate the "hit" and "miss" signals
        # -- for the synchronous blocks
        # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
        #     and rst = '0' then
        #     req_is_hit <= is_hit;
        #     req_is_miss <= not is_hit;
        # else
        #     req_is_hit <= '0';
        #     req_is_miss <= '0';
        # end if;
        # req_hit_way <= hit_way;
        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        # NOTE(review): the VHDL also exports req_hit_way <= hit_way;
        # there is no req_hit_way parameter here — confirm the callers.
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        # -- The way to replace on a miss
        # if r.state = CLR_TAG then
        #     replace_way <=
        #         to_integer(unsigned(plru_victim(r.store_index)));
        # else
        #     replace_way <= r.store_way;
        # end if;
        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])

        with m.Else():
            comb += replace_way.eq(r.store_way)

        # -- Output instruction from current cache row
        # --
        # -- Note: This is a mild violation of our design principle of
        # -- having pipeline stages output from a clean latch. In this
        # -- case we output the result of a mux. The alternative would
        # -- be output an entire row which I prefer not to do just yet
        # -- as it would force fetch2 to know about some of the cache
        # -- geometry information.
        # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
        # i_out.valid <= r.hit_valid;
        # i_out.nia <= r.hit_nia;
        # i_out.stop_mark <= r.hit_smark;
        # i_out.fetch_failed <= r.fetch_failed;
        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out[r.hit_way]))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # -- Stall fetch1 if we have a miss on cache or TLB
        # -- or a protection fault
        # stall_out <= not (is_hit and access_ok);
        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # -- Wishbone requests output (from the cache miss reload machine)
        # wishbone_out <= r.wb;
        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)
968 # end process;
969
970 # -- Cache hit synchronous machine
971 # icache_hit : process(clk)
972 # Cache hit synchronous machine
973 def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
974 req_index, req_tag, real_addr):
975 sync = m.d.sync
976
977 i_in, stall_in = self.i_in, self.stall_in
978 flush_in = self.flush_in
979
980 # begin
981 # if rising_edge(clk) then
982 # -- keep outputs to fetch2 unchanged on a stall
983 # -- except that flush or reset sets valid to 0
984 # -- If use_previous, keep the same data as last
985 # -- cycle and use the second half
986 # if stall_in = '1' or use_previous = '1' then
987 # if rst = '1' or flush_in = '1' then
988 # r.hit_valid <= '0';
989 # end if;
990 # keep outputs to fetch2 unchanged on a stall
991 # except that flush or reset sets valid to 0
992 # If use_previous, keep the same data as last
993 # cycle and use the second half
994 with m.If(stall_in | use_previous):
995 with m.If(flush_in):
996 sync += r.hit_valid.eq(0)
997 # else
998 # -- On a hit, latch the request for the next cycle,
999 # -- when the BRAM data will be available on the
1000 # -- cache_out output of the corresponding way
1001 # r.hit_valid <= req_is_hit;
1002 # if req_is_hit = '1' then
1003 # r.hit_way <= req_hit_way;
1004 with m.Else():
1005 # On a hit, latch the request for the next cycle,
1006 # when the BRAM data will be available on the
1007 # cache_out output of the corresponding way
1008 sync += r.hit_valid.eq(req_is_hit)
1009
1010 with m.If(req_is_hit):
1011 sync += r.hit_way.eq(req_hit_way)
1012
1013 # report "cache hit nia:" & to_hstring(i_in.nia) &
1014 # " IR:" & std_ulogic'image(i_in.virt_mode) &
1015 # " SM:" & std_ulogic'image(i_in.stop_mark) &
1016 # " idx:" & integer'image(req_index) &
1017 # " tag:" & to_hstring(req_tag) &
1018 # " way:" & integer'image(req_hit_way) &
1019 # " RA:" & to_hstring(real_addr);
1020 # XXX NO do not use f"" use %d and %x. see dcache.py Display
1021 print(f"cache hit nia:{i_in.nia}, " \
1022 f"IR:{i_in.virt_mode}, " \
1023 f"SM:{i_in.stop_mark}, idx:{req_index}, " \
1024 f"tag:{req_tag}, way:{req_hit_way}, " \
1025 f"RA:{real_addr}")
1026 # end if;
1027 # end if;
1028 # if stall_in = '0' then
1029 # -- Send stop marks and NIA down regardless of validity
1030 # r.hit_smark <= i_in.stop_mark;
1031 # r.hit_nia <= i_in.nia;
1032 # end if;
1033 with m.If(~stall_in):
1034 # Send stop marks and NIA down regardless of validity
1035 sync += r.hit_smark.eq(i_in.stop_mark)
1036 sync += r.hit_nia.eq(i_in.nia)
1037 # end if;
1038 # end process;
1039
1040 # -- Cache miss/reload synchronous machine
1041 # icache_miss : process(clk)
1042 # Cache miss/reload synchronous machine
1043 def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
1044 req_index, req_laddr, req_tag, replace_way,
1045 cache_tags, access_ok, real_addr):
1046 comb = m.d.comb
1047 sync = m.d.sync
1048
1049 i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
1050 stall_in, flush_in = self.stall_in, self.flush_in
1051 inval_in = self.inval_in
1052
1053 # variable tagset : cache_tags_set_t;
1054 # variable stbs_done : boolean;
1055
1056 tagset = Signal(TAG_RAM_WIDTH)
1057 stbs_done = Signal()
1058
1059 # begin
1060 # if rising_edge(clk) then
1061 # -- On reset, clear all valid bits to force misses
1062 # if rst = '1' then
1063 # On reset, clear all valid bits to force misses
1064 # for i in index_t loop
1065 # cache_valids(i) <= (others => '0');
1066 # end loop;
1067 # r.state <= IDLE;
1068 # r.wb.cyc <= '0';
1069 # r.wb.stb <= '0';
1070 # -- We only ever do reads on wishbone
1071 # r.wb.dat <= (others => '0');
1072 # r.wb.sel <= "11111111";
1073 # r.wb.we <= '0';
1074
1075 # We only ever do reads on wishbone
1076 comb += r.wb.sel.eq(~0) # set to all 1s
1077
1078 # -- Not useful normally but helps avoiding
1079 # -- tons of sim warnings
1080 # r.wb.adr <= (others => '0');
1081
1082 # else
1083
1084 # -- Process cache invalidations
1085 # if inval_in = '1' then
1086 # for i in index_t loop
1087 # cache_valids(i) <= (others => '0');
1088 # end loop;
1089 # r.store_valid <= '0';
1090 # end if;
1091 # Process cache invalidations
1092 with m.If(inval_in):
1093 for i in range(NUM_LINES):
1094 sync += cache_valid_bits[i].eq(0)
1095 sync += r.store_valid.eq(0)
1096
1097 # -- Main state machine
1098 # case r.state is
1099 # Main state machine
1100 with m.Switch(r.state):
1101
1102 # when IDLE =>
1103 with m.Case(State.IDLE):
1104 # -- Reset per-row valid flags,
1105 # -- only used in WAIT_ACK
1106 # for i in 0 to ROW_PER_LINE - 1 loop
1107 # r.rows_valid(i) <= '0';
1108 # end loop;
1109 # Reset per-row valid flags,
1110 # only used in WAIT_ACK
1111 for i in range(ROW_PER_LINE):
1112 sync += r.rows_valid[i].eq(0)
1113
1114 # -- We need to read a cache line
1115 # if req_is_miss = '1' then
1116 # report "cache miss nia:" & to_hstring(i_in.nia) &
1117 # " IR:" & std_ulogic'image(i_in.virt_mode) &
1118 # " SM:" & std_ulogic'image(i_in.stop_mark) &
1119 # " idx:" & integer'image(req_index) &
1120 # " way:" & integer'image(replace_way) &
1121 # " tag:" & to_hstring(req_tag) &
1122 # " RA:" & to_hstring(real_addr);
1123 # We need to read a cache line
1124 with m.If(req_is_miss):
1125 # XXX no, do not use "f". use sync += Display
1126 # and use %d for integer, %x for hex.
1127 print(f"cache miss nia:{i_in.nia} " \
1128 f"IR:{i_in.virt_mode} " \
1129 f"SM:{i_in.stop_mark} " \
1130 F"idx:{req_index} " \
1131 f"way:{replace_way} tag:{req_tag} " \
1132 f"RA:{real_addr}")
1133
1134 # -- Keep track of our index and way for
1135 # -- subsequent stores
1136 # r.store_index <= req_index;
1137 # r.store_row <= get_row(req_laddr);
1138 # r.store_tag <= req_tag;
1139 # r.store_valid <= '1';
1140 # r.end_row_ix <=
1141 # get_row_of_line(get_row(req_laddr)) - 1;
1142 # Keep track of our index and way
1143 # for subsequent stores
1144 sync += r.store_index.eq(req_index)
1145 sync += r.store_row.eq(get_row(req_laddr))
1146 sync += r.store_tag.eq(req_tag)
1147 sync += r.store_valid.eq(1)
1148 sync += r.end_row_ix.eq(
1149 get_row_of_line(
1150 get_row(req_laddr)
1151 ) - 1
1152 )
1153
1154 # -- Prep for first wishbone read. We calculate the
1155 # -- address of the start of the cache line and
1156 # -- start the WB cycle.
1157 # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
1158 # r.wb.cyc <= '1';
1159 # r.wb.stb <= '1';
1160 # Prep for first wishbone read.
1161 # We calculate the
1162 # address of the start of the cache line and
1163 # start the WB cycle.
1164 sync += r.wb.adr.eq(req_laddr)
1165 sync += r.wb.cyc.eq(1)
1166 sync += r.wb.stb.eq(1)
1167
1168 # -- Track that we had one request sent
1169 # r.state <= CLR_TAG;
1170 # Track that we had one request sent
1171 sync += r.state.eq(State.CLR_TAG)
1172 # end if;
1173
1174 # when CLR_TAG | WAIT_ACK =>
1175 with m.Case(State.CLR_TAG, State.WAIT_ACK):
1176 # if r.state = CLR_TAG then
1177 with m.If(r.state == State.CLR_TAG):
1178 # -- Get victim way from plru
1179 # r.store_way <= replace_way;
1180 # Get victim way from plru
1181 sync += r.store_way.eq(replace_way)
1182 #
1183 # -- Force misses on that way while
1184 # -- reloading that line
1185 # cache_valids(req_index)(replace_way) <= '0';
1186 # Force misses on that way while
1187 # realoading that line
1188 cv = Signal(INDEX_BITS)
1189 comb += cv.eq(cache_valid_bits[req_index])
1190 comb += cv.bit_select(replace_way, 1).eq(0)
1191 sync += cache_valid_bits[req_index].eq(cv)
1192
1193 # -- Store new tag in selected way
1194 # for i in 0 to NUM_WAYS-1 loop
1195 # if i = replace_way then
1196 # tagset := cache_tags(r.store_index);
1197 # write_tag(i, tagset, r.store_tag);
1198 # cache_tags(r.store_index) <= tagset;
1199 # end if;
1200 # end loop;
1201 for i in range(NUM_WAYS):
1202 with m.If(i == replace_way):
1203 sync += tagset.eq(cache_tags[r.store_index])
1204 sync += write_tag(i, tagset, r.store_tag)
1205 sync += cache_tags[r.store_index].eq(tagset)
1206
1207 # r.state <= WAIT_ACK;
1208 sync += r.state.eq(State.WAIT_ACK)
1209 # end if;
1210
1211 # -- Requests are all sent if stb is 0
1212 # stbs_done := r.wb.stb = '0';
1213 # Requests are all sent if stb is 0
1214 comb += stbs_done.eq(r.wb.stb == 0)
1215
1216 # -- If we are still sending requests,
1217 # -- was one accepted ?
1218 # if wishbone_in.stall = '0' and not stbs_done then
1219 # If we are still sending requests,
1220 # was one accepted?
1221 with m.If(~wb_in.stall & ~stbs_done):
1222 # -- That was the last word ? We are done sending.
1223 # -- Clear stb and set stbs_done so we can handle
1224 # -- an eventual last ack on the same cycle.
1225 # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
1226 # r.wb.stb <= '0';
1227 # stbs_done := true;
1228 # end if;
1229 # That was the last word ?
1230 # We are done sending.
1231 # Clear stb and set stbs_done
1232 # so we can handle
1233 # an eventual last ack on
1234 # the same cycle.
1235 with m.If(is_last_row_addr(r.wb.adr, r.end_row_ix)):
1236 sync += r.wb.stb.eq(0)
1237 comb += stbs_done.eq(1)
1238
1239 # -- Calculate the next row address
1240 # r.wb.adr <= next_row_addr(r.wb.adr);
1241 # Calculate the next row address
1242 rarange = r.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS]
1243 sync += rarange.eq(rarange + 1)
1244 # end if;
1245
1246 # -- Incoming acks processing
1247 # if wishbone_in.ack = '1' then
1248 # Incoming acks processing
1249 with m.If(wb_in.ack):
1250 # r.rows_valid(r.store_row mod ROW_PER_LINE)
1251 # <= '1';
1252 sync += r.rows_valid[r.store_row & ROW_PER_LINE].eq(1)
1253
1254 # -- Check for completion
1255 # if stbs_done and
1256 # is_last_row(r.store_row, r.end_row_ix) then
1257 # Check for completion
1258 with m.If(stbs_done &
1259 is_last_row(r.store_row, r.end_row_ix)):
1260 # -- Complete wishbone cycle
1261 # r.wb.cyc <= '0';
1262 # Complete wishbone cycle
1263 sync += r.wb.cyc.eq(0)
1264
1265 # -- Cache line is now valid
1266 # cache_valids(r.store_index)(replace_way) <=
1267 # r.store_valid and not inval_in;
1268 # Cache line is now valid
1269 cv = Signal(INDEX_BITS)
1270 sync += cv.eq(cache_valid_bits[r.store_index])
1271 sync += cv.bit_select(replace_way, 1).eq(
1272 r.store_valid & ~inval_in)
1273
1274 # -- We are done
1275 # r.state <= IDLE;
1276 # We are done
1277 sync += r.state.eq(State.IDLE)
1278 # end if;
1279
1280 # -- Increment store row counter
1281 # r.store_row <= next_row(r.store_row);
1282 # Increment store row counter
1283 sync += r.store_row.eq(next_row(r.store_row))
1284 # end if;
1285 # end case;
1286 # end if;
1287 #
1288 # -- TLB miss and protection fault processing
1289 # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
1290 # r.fetch_failed <= '0';
1291 # elsif i_in.req = '1' and access_ok = '0' and
1292 # stall_in = '0' then
1293 # r.fetch_failed <= '1';
1294 # end if;
1295 # TLB miss and protection fault processing
1296 with m.If(flush_in | m_in.tlbld):
1297 sync += r.fetch_failed.eq(0)
1298 with m.Elif(i_in.req & ~access_ok & ~stall_in):
1299 sync += r.fetch_failed.eq(1)
1300 # end if;
1301 # end process;
1302
1303 # icache_log: if LOG_LENGTH > 0 generate
1304 def icache_log(self, m, req_hit_way, ra_valid, access_ok,
1305 req_is_miss, req_is_hit, lway, wstate, r):
1306 comb = m.d.comb
1307 sync = m.d.sync
1308
1309 wb_in, i_out = self.wb_in, self.i_out
1310 log_out, stall_out = self.log_out, self.stall_out
1311
1312 # -- Output data to logger
1313 # signal log_data : std_ulogic_vector(53 downto 0);
1314 # begin
1315 # data_log: process(clk)
1316 # variable lway: way_t;
1317 # variable wstate: std_ulogic;
1318 # Output data to logger
1319 for i in range(LOG_LENGTH):
1320 # Output data to logger
1321 log_data = Signal(54)
1322 lway = Signal(NUM_WAYS)
1323 wstate = Signal()
1324
1325 # begin
1326 # if rising_edge(clk) then
1327 # lway := req_hit_way;
1328 # wstate := '0';
1329 comb += lway.eq(req_hit_way)
1330 comb += wstate.eq(0)
1331
1332 # if r.state /= IDLE then
1333 # wstate := '1';
1334 # end if;
1335 with m.If(r.state != State.IDLE):
1336 sync += wstate.eq(1)
1337
1338 # log_data <= i_out.valid &
1339 # i_out.insn &
1340 # wishbone_in.ack &
1341 # r.wb.adr(5 downto 3) &
1342 # r.wb.stb & r.wb.cyc &
1343 # wishbone_in.stall &
1344 # stall_out &
1345 # r.fetch_failed &
1346 # r.hit_nia(5 downto 2) &
1347 # wstate &
1348 # std_ulogic_vector(to_unsigned(lway, 3)) &
1349 # req_is_hit & req_is_miss &
1350 # access_ok &
1351 # ra_valid;
1352 sync += log_data.eq(Cat(
1353 ra_valid, access_ok, req_is_miss, req_is_hit,
1354 lway, wstate, r.hit_nia[2:6],
1355 r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
1356 r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
1357 i_out.valid
1358 ))
1359 # end if;
1360 # end process;
1361 # log_out <= log_data;
1362 comb += log_out.eq(log_data)
1363 # end generate;
1364 # end;
1365
1366 def elaborate(self, platform):
1367
1368 m = Module()
1369 comb = m.d.comb
1370
1371 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1372 cache_tags = CacheTagArray()
1373 cache_valid_bits = CacheValidBitsArray()
1374
1375 # signal itlb_valids : tlb_valids_t;
1376 # signal itlb_tags : tlb_tags_t;
1377 # signal itlb_ptes : tlb_ptes_t;
1378 # attribute ram_style of itlb_tags : signal is "distributed";
1379 # attribute ram_style of itlb_ptes : signal is "distributed";
1380 itlb_valid_bits = TLBValidBitsArray()
1381 itlb_tags = TLBTagArray()
1382 itlb_ptes = TLBPtesArray()
1383 # TODO to be passed to nmigen as ram attributes
1384 # attribute ram_style of itlb_tags : signal is "distributed";
1385 # attribute ram_style of itlb_ptes : signal is "distributed";
1386
1387 # -- Privilege bit from PTE EAA field
1388 # signal eaa_priv : std_ulogic;
1389 # Privilege bit from PTE EAA field
1390 eaa_priv = Signal()
1391
1392 # signal r : reg_internal_t;
1393 r = RegInternal()
1394
1395 # -- Async signals on incoming request
1396 # signal req_index : index_t;
1397 # signal req_row : row_t;
1398 # signal req_hit_way : way_t;
1399 # signal req_tag : cache_tag_t;
1400 # signal req_is_hit : std_ulogic;
1401 # signal req_is_miss : std_ulogic;
1402 # signal req_laddr : std_ulogic_vector(63 downto 0);
1403 # Async signal on incoming request
1404 req_index = Signal(NUM_LINES)
1405 req_row = Signal(BRAM_ROWS)
1406 req_hit_way = Signal(NUM_WAYS)
1407 req_tag = Signal(TAG_BITS)
1408 req_is_hit = Signal()
1409 req_is_miss = Signal()
1410 req_laddr = Signal(64)
1411
1412 # signal tlb_req_index : tlb_index_t;
1413 # signal real_addr : std_ulogic_vector(
1414 # REAL_ADDR_BITS - 1 downto 0
1415 # );
1416 # signal ra_valid : std_ulogic;
1417 # signal priv_fault : std_ulogic;
1418 # signal access_ok : std_ulogic;
1419 # signal use_previous : std_ulogic;
1420 tlb_req_index = Signal(TLB_SIZE)
1421 real_addr = Signal(REAL_ADDR_BITS)
1422 ra_valid = Signal()
1423 priv_fault = Signal()
1424 access_ok = Signal()
1425 use_previous = Signal()
1426
1427 # signal cache_out : cache_ram_out_t;
1428 cache_out = CacheRamOut()
1429
1430 # signal plru_victim : plru_out_t;
1431 # signal replace_way : way_t;
1432 plru_victim = PLRUOut()
1433 replace_way = Signal(NUM_WAYS)
1434
1435 # call sub-functions putting everything together, using shared
1436 # signals established above
1437 self.rams(m, r, cache_out, use_previous, replace_way, req_row)
1438 self.maybe_plrus(m, r, plru_victim)
1439 self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
1440 real_addr, itlb_valid_bits, ra_valid, eaa_priv,
1441 priv_fault, access_ok)
1442 self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
1443 self.icache_comb(m, use_previous, r, req_index, req_row,
1444 req_tag, real_addr, req_laddr, cache_valid_bits,
1445 cache_tags, access_ok, req_is_hit, req_is_miss,
1446 replace_way, plru_victim, cache_out)
1447 self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
1448 req_index, req_tag, real_addr)
1449 self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
1450 req_laddr, req_tag, replace_way, cache_tags,
1451 access_ok, real_addr)
1452 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
1453 # req_is_miss, req_is_hit, lway, wstate, r)
1454
1455 return m
1456
1457
1458 # icache_tb.vhdl
1459 #
1460 # library ieee;
1461 # use ieee.std_logic_1164.all;
1462 #
1463 # library work;
1464 # use work.common.all;
1465 # use work.wishbone_types.all;
1466 #
1467 # entity icache_tb is
1468 # end icache_tb;
1469 #
1470 # architecture behave of icache_tb is
1471 # signal clk : std_ulogic;
1472 # signal rst : std_ulogic;
1473 #
1474 # signal i_out : Fetch1ToIcacheType;
1475 # signal i_in : IcacheToDecode1Type;
1476 #
1477 # signal m_out : MmuToIcacheType;
1478 #
1479 # signal wb_bram_in : wishbone_master_out;
1480 # signal wb_bram_out : wishbone_slave_out;
1481 #
1482 # constant clk_period : time := 10 ns;
1483 # begin
1484 # icache0: entity work.icache
1485 # generic map(
1486 # LINE_SIZE => 64,
1487 # NUM_LINES => 4
1488 # )
1489 # port map(
1490 # clk => clk,
1491 # rst => rst,
1492 # i_in => i_out,
1493 # i_out => i_in,
1494 # m_in => m_out,
1495 # stall_in => '0',
1496 # flush_in => '0',
1497 # inval_in => '0',
1498 # wishbone_out => wb_bram_in,
1499 # wishbone_in => wb_bram_out
1500 # );
1501 #
1502 # -- BRAM Memory slave
1503 # bram0: entity work.wishbone_bram_wrapper
1504 # generic map(
1505 # MEMORY_SIZE => 1024,
1506 # RAM_INIT_FILE => "icache_test.bin"
1507 # )
1508 # port map(
1509 # clk => clk,
1510 # rst => rst,
1511 # wishbone_in => wb_bram_in,
1512 # wishbone_out => wb_bram_out
1513 # );
1514 #
1515 # clk_process: process
1516 # begin
1517 # clk <= '0';
1518 # wait for clk_period/2;
1519 # clk <= '1';
1520 # wait for clk_period/2;
1521 # end process;
1522 #
1523 # rst_process: process
1524 # begin
1525 # rst <= '1';
1526 # wait for 2*clk_period;
1527 # rst <= '0';
1528 # wait;
1529 # end process;
1530 #
1531 # stim: process
1532 # begin
1533 # i_out.req <= '0';
1534 # i_out.nia <= (others => '0');
1535 # i_out.stop_mark <= '0';
1536 #
1537 # m_out.tlbld <= '0';
1538 # m_out.tlbie <= '0';
1539 # m_out.addr <= (others => '0');
1540 # m_out.pte <= (others => '0');
1541 #
1542 # wait until rising_edge(clk);
1543 # wait until rising_edge(clk);
1544 # wait until rising_edge(clk);
1545 # wait until rising_edge(clk);
1546 #
1547 # i_out.req <= '1';
1548 # i_out.nia <= x"0000000000000004";
1549 #
1550 # wait for 30*clk_period;
1551 # wait until rising_edge(clk);
1552 #
1553 # assert i_in.valid = '1' severity failure;
1554 # assert i_in.insn = x"00000001"
1555 # report "insn @" & to_hstring(i_out.nia) &
1556 # "=" & to_hstring(i_in.insn) &
1557 # " expected 00000001"
1558 # severity failure;
1559 #
1560 # i_out.req <= '0';
1561 #
1562 # wait until rising_edge(clk);
1563 #
1564 # -- hit
1565 # i_out.req <= '1';
1566 # i_out.nia <= x"0000000000000008";
1567 # wait until rising_edge(clk);
1568 # wait until rising_edge(clk);
1569 # assert i_in.valid = '1' severity failure;
1570 # assert i_in.insn = x"00000002"
1571 # report "insn @" & to_hstring(i_out.nia) &
1572 # "=" & to_hstring(i_in.insn) &
1573 # " expected 00000002"
1574 # severity failure;
1575 # wait until rising_edge(clk);
1576 #
1577 # -- another miss
1578 # i_out.req <= '1';
1579 # i_out.nia <= x"0000000000000040";
1580 #
1581 # wait for 30*clk_period;
1582 # wait until rising_edge(clk);
1583 #
1584 # assert i_in.valid = '1' severity failure;
1585 # assert i_in.insn = x"00000010"
1586 # report "insn @" & to_hstring(i_out.nia) &
1587 # "=" & to_hstring(i_in.insn) &
1588 # " expected 00000010"
1589 # severity failure;
1590 #
1591 # -- test something that aliases
1592 # i_out.req <= '1';
1593 # i_out.nia <= x"0000000000000100";
1594 # wait until rising_edge(clk);
1595 # wait until rising_edge(clk);
1596 # assert i_in.valid = '0' severity failure;
1597 # wait until rising_edge(clk);
1598 #
1599 # wait for 30*clk_period;
1600 # wait until rising_edge(clk);
1601 #
1602 # assert i_in.valid = '1' severity failure;
1603 # assert i_in.insn = x"00000040"
1604 # report "insn @" & to_hstring(i_out.nia) &
1605 # "=" & to_hstring(i_in.insn) &
1606 # " expected 00000040"
1607 # severity failure;
1608 #
1609 # i_out.req <= '0';
1610 #
1611 # std.env.finish;
1612 # end process;
1613 # end;
def icache_sim(dut):
    """Testbench stimulus, ported from the VHDL icache_tb (commented
    above): a miss at 0x4, a hit at 0x8, another miss at 0x40, then
    an aliasing fetch at 0x100."""
    i_out = dut.i_in     # fetch1 -> icache request port
    i_in = dut.i_out     # icache -> decode1 result port (DUT-driven)
    m_out = dut.m_in     # MMU -> icache port

    # NOTE: i_in (dut.i_out) is an output driven by the design; the
    # testbench must only *read* it.  The original
    # "yield i_in.valid.eq(0)" fought the DUT for that signal.
    yield i_out.req.eq(0)
    yield i_out.nia.eq(~1)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(~1)
    yield m_out.pte.eq(~1)
    yield
    yield
    yield
    yield

    # miss: fetch 0x4, allow 30 cycles for the reload machine
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    print(f"valid? {valid}")
    # TODO(review): re-enable once the reload machine works
    #assert valid
    #assert insn == 0x00000001, \
    #    ("insn @%x=%x expected 00000001" % (0x4, insn))
    yield i_out.req.eq(0)
    yield

    # hit
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    #assert valid
    #assert insn == 0x00000002, \
    #    ("insn @%x=%x expected 00000002" % (0x8, insn))
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    #assert valid
    #assert insn == 0x00000010, \
    #    ("insn @%x=%x expected 00000010" % (0x40, insn))

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    #assert i_in.valid == Const(1, 1)
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    #assert valid
    #assert insn == 0x00000040, \
    #    ("insn @%x=%x expected 00000040" % (0x100, insn))
    yield i_out.req.eq(0)
1684
1685
def test_icache():
    """Wrap the ICache in a top-level Module, attach the icache_sim
    stimulus process, and run the simulation with a VCD trace."""
    top = Module()
    top.submodules.icache = dut = ICache()

    # nmigen Simulation
    sim = Simulator(top)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(icache_sim(dut)))

    with sim.write_vcd('test_icache.vcd'):
        sim.run()
1699
if __name__ == '__main__':
    # emit RTLIL for inspection, then run the simulation testbench
    dut = ICache()
    ilang = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(ilang)

    test_icache()