icache.py commit progress, about a third through the process
[soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique

from nmigen import Array, Cat, Elaboratable, Module, Signal
from nmigen.cli import main
from nmigen.cli import rtlil
from nmigen.util import log2_int
from nmutil.byterev import byte_reverse
from nmutil.iocontrol import RecordObject
from nmutil.mask import Mask

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

# NOTE(review): the code below instantiates "WBMasterOut", but this list
# imports "WbMasterOut" -- confirm which spelling wb_types exports.
from experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBAddrType, WBDataType, WBSelType,
                                 WbMasterOut, WBSlaveOut,
                                 WBMasterOutVector, WBSlaveOutVector,
                                 WBIOMasterOut, WBIOSlaveOut)
41
42
43 # Cache reload state machine
@unique
class State(Enum):
    """States of the cache reload state machine (VHDL ``state_t``).

    Every member needs an explicit value: a bare name in an Enum body is
    evaluated as an expression and raises NameError.  The values follow
    the declaration order, so IDLE is 0.
    """
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2
49
50 # type reg_internal_t is record
51 # -- Cache hit state (Latches for 1 cycle BRAM access)
52 # hit_way : way_t;
53 # hit_nia : std_ulogic_vector(63 downto 0);
54 # hit_smark : std_ulogic;
55 # hit_valid : std_ulogic;
56 #
57 # -- Cache miss state (reload state machine)
58 # state : state_t;
59 # wb : wishbone_master_out;
60 # store_way : way_t;
61 # store_index : index_t;
62 # store_row : row_t;
63 # store_tag : cache_tag_t;
64 # store_valid : std_ulogic;
65 # end_row_ix : row_in_line_t;
66 # rows_valid : row_per_line_valid_t;
67 #
68 # -- TLB miss state
69 # fetch_failed : std_ulogic;
70 # end record;
class RegInternal(RecordObject):
    """Internal register record of the icache (VHDL ``reg_internal_t``).

    Groups the cache-hit latches, the cache-miss (reload state machine)
    registers and the TLB-miss flag into one record.

    NOTE(review): NUM_WAYS, NUM_LINES, BRAM_ROWS, TAG_BITS, ROW_LINE_BITS
    and RowPerLineValidArray are only defined inside ICache in this file
    (as instance attributes or elaborate() locals), so they are not in
    scope at module level yet -- this still needs wiring up.
    NOTE(review): the VHDL way_t/index_t/row_t types are integer ranges;
    sizing these signals by element count looks wider than needed --
    confirm the intended widths.
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(NUM_WAYS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        # An Enum class cannot be instantiated without a value, so the
        # state register is declared as an enum-shaped Signal instead.
        self.state = Signal(State)
        self.wb = WBMasterOut()
        self.store_way = Signal(NUM_WAYS)
        self.store_index = Signal(NUM_LINES)
        self.store_row = Signal(BRAM_ROWS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()
93
94 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
95 #
96 # entity icache is
97 # generic (
98 # SIM : boolean := false;
99 # -- Line size in bytes
100 # LINE_SIZE : positive := 64;
101 # -- BRAM organisation: We never access more than wishbone_data_bits
102 # -- at a time so to save resources we make the array only that wide,
103 # -- and use consecutive indices for to make a cache "line"
104 # --
105 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
106 # -- so 64-bits)
107 # ROW_SIZE : positive := wishbone_data_bits / 8;
108 # -- Number of lines in a set
109 # NUM_LINES : positive := 32;
110 # -- Number of ways
111 # NUM_WAYS : positive := 4;
112 # -- L1 ITLB number of entries (direct mapped)
113 # TLB_SIZE : positive := 64;
114 # -- L1 ITLB log_2(page_size)
115 # TLB_LG_PGSZ : positive := 12;
116 # -- Number of real address bits that we store
117 # REAL_ADDR_BITS : positive := 56;
118 # -- Non-zero to enable log data collection
119 # LOG_LENGTH : natural := 0
120 # );
121 # port (
122 # clk : in std_ulogic;
123 # rst : in std_ulogic;
124 #
125 # i_in : in Fetch1ToIcacheType;
126 # i_out : out IcacheToDecode1Type;
127 #
128 # m_in : in MmuToIcacheType;
129 #
130 # stall_in : in std_ulogic;
131 # stall_out : out std_ulogic;
132 # flush_in : in std_ulogic;
133 # inval_in : in std_ulogic;
134 #
135 # wishbone_out : out wishbone_master_out;
136 # wishbone_in : in wishbone_slave_out;
137 #
138 # log_out : out std_ulogic_vector(53 downto 0)
139 # );
140 # end entity icache;
141 # 64 bit direct mapped icache. All instructions are 4B aligned.
class ICache(Elaboratable):
    """64 bit icache. All instructions are 4B aligned.

    The VHDL entity comment calls this "direct mapped", but the default
    geometry below uses NUM_WAYS = 4 and the module docstring describes a
    set associative cache.

    The VHDL generics are stored as upper-case instance attributes and the
    VHDL ports as the remaining attributes (see __init__).
    """
    def __init__(self):
        self.SIM = 0
        self.LINE_SIZE = 64
        # BRAM organisation: We never access more than wishbone_data_bits
        # at a time so to save resources we make the array only that wide,
        # and use consecutive indices to make a cache "line"
        #
        # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
        # Integer division: ROW_SIZE feeds range() and bit-width arithmetic
        # in elaborate(), which need an int, not the float "/" produces.
        self.ROW_SIZE = WB_DATA_BITS // 8
        # Number of lines in a set
        self.NUM_LINES = 32
        # Number of ways
        self.NUM_WAYS = 4
        # L1 ITLB number of entries (direct mapped)
        self.TLB_SIZE = 64
        # L1 ITLB log_2(page_size)
        self.TLB_LG_PGSZ = 12
        # Number of real address bits that we store
        self.REAL_ADDR_BITS = 56
        # Non-zero to enable log data collection
        self.LOG_LENGTH = 0

        self.i_in = Fetch1ToICacheType()
        self.i_out = ICacheToDecode1Type()

        self.m_in = MMUToICacheType()

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(54)

    def elaborate(self, platform):
        """Declare the cache geometry constants, storage arrays and signals.

        Translation in progress: only the declarative part of the VHDL
        architecture has been converted so far.  No Module is built or
        returned yet; the remaining VHDL is kept as comments after this
        method.
        """
        # The generics live on the instance (see __init__); bind them to
        # locals so the arithmetic below can use the same names as the VHDL.
        LINE_SIZE = self.LINE_SIZE
        ROW_SIZE = self.ROW_SIZE
        NUM_LINES = self.NUM_LINES
        NUM_WAYS = self.NUM_WAYS
        TLB_SIZE = self.TLB_SIZE
        TLB_LG_PGSZ = self.TLB_LG_PGSZ
        REAL_ADDR_BITS = self.REAL_ADDR_BITS

        # architecture rtl of icache is
        # constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
        # -- ROW_PER_LINE is the number of row (wishbone transactions) in a line
        # constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
        # -- BRAM_ROWS is the number of rows in BRAM needed to represent the full
        # -- icache
        # constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
        # -- INSN_PER_ROW is the number of 32bit instructions per BRAM row
        # constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
        # -- Bit fields counts in the address
        #
        # -- INSN_BITS is the number of bits to select an instruction in a row
        # constant INSN_BITS : natural := log2(INSN_PER_ROW);
        # -- ROW_BITS is the number of bits to select a row
        # constant ROW_BITS : natural := log2(BRAM_ROWS);
        # -- ROW_LINEBITS is the number of bits to select a row within a line
        # constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
        # -- LINE_OFF_BITS is the number of bits for the offset in a cache line
        # constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
        # -- ROW_OFF_BITS is the number of bits for the offset in a row
        # constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
        # -- INDEX_BITS is the number of bits to select a cache line
        # constant INDEX_BITS : natural := log2(NUM_LINES);
        # -- SET_SIZE_BITS is the log base 2 of the set size
        # constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
        # -- TAG_BITS is the number of bits of the tag part of the address
        # constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
        # -- WAY_BITS is the number of bits to select a way
        # constant WAY_BITS : natural := log2(NUM_WAYS);

        ROW_SIZE_BITS = ROW_SIZE * 8
        # ROW_PER_LINE is the number of row
        # (wishbone) transactions in a line
        # (VHDL "/" on naturals is integer division, hence "//" here)
        ROW_PER_LINE = LINE_SIZE // ROW_SIZE
        # BRAM_ROWS is the number of rows in
        # BRAM needed to represent the full icache
        BRAM_ROWS = NUM_LINES * ROW_PER_LINE
        # INSN_PER_ROW is the number of 32bit
        # instructions per BRAM row
        INSN_PER_ROW = ROW_SIZE_BITS // 32

        # Bit fields counts in the address
        #
        # INSN_BITS is the number of bits to
        # select an instruction in a row
        INSN_BITS = log2_int(INSN_PER_ROW)
        # ROW_BITS is the number of bits to
        # select a row
        ROW_BITS = log2_int(BRAM_ROWS)
        # ROW_LINEBITS is the number of bits to
        # select a row within a line
        ROW_LINE_BITS = log2_int(ROW_PER_LINE)
        # LINE_OFF_BITS is the number of bits for
        # the offset in a cache line
        LINE_OFF_BITS = log2_int(LINE_SIZE)
        # ROW_OFF_BITS is the number of bits for
        # the offset in a row
        ROW_OFF_BITS = log2_int(ROW_SIZE)
        # INDEX_BITS is the number of bits to
        # select a cache line
        INDEX_BITS = log2_int(NUM_LINES)
        # SET_SIZE_BITS is the log base 2 of
        # the set size
        SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
        # TAG_BITS is the number of bits of
        # the tag part of the address
        TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
        # WAY_BITS is the number of bits to
        # select a way
        WAY_BITS = log2_int(NUM_WAYS)
        TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

        # -- Example of layout for 32 lines of 64 bytes:
        # --
        # -- .. tag |index| line |
        # -- .. | row | |
        # -- .. | | | |00| zero (2)
        # -- .. | | |-| | INSN_BITS (1)
        # -- .. | |---| | ROW_LINEBITS (3)
        # -- .. | |--- - --| LINE_OFF_BITS (6)
        # -- .. | |- --| ROW_OFF_BITS (3)
        # -- .. |----- ---| | ROW_BITS (8)
        # -- .. |-----| | INDEX_BITS (5)
        # -- .. --------| | TAG_BITS (53)
        # Example of layout for 32 lines of 64 bytes:
        #
        # .. tag |index| line |
        # .. | row | |
        # .. | | | |00| zero (2)
        # .. | | |-| | INSN_BITS (1)
        # .. | |---| | ROW_LINEBITS (3)
        # .. | |--- - --| LINE_OFF_BITS (6)
        # .. | |- --| ROW_OFF_BITS (3)
        # .. |----- ---| | ROW_BITS (8)
        # .. |-----| | INDEX_BITS (5)
        # .. --------| | TAG_BITS (53)

        # subtype row_t is integer range 0 to BRAM_ROWS-1;
        # subtype index_t is integer range 0 to NUM_LINES-1;
        # subtype way_t is integer range 0 to NUM_WAYS-1;
        # subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
        #
        # -- The cache data BRAM organized as described above for each way
        # subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
        #
        # -- The cache tags LUTRAM has a row per set. Vivado is a pain and will
        # -- not handle a clean (commented) definition of the cache tags as a 3d
        # -- memory. For now, work around it by putting all the tags
        # subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
        # -- type cache_tags_set_t is array(way_t) of cache_tag_t;
        # -- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
        # constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
        # subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
        # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
        def CacheTagArray():
            return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

        # -- The cache valid bits
        # subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
        # type cache_valids_t is array(index_t) of cache_way_valids_t;
        # type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
        def CacheValidBitsArray():
            # cache_valids_t: one NUM_WAYS-bit valid vector per cache line
            return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

        def RowPerLineValidArray():
            # row_per_line_valid_t: one valid bit per row of a line
            return Array(Signal() for x in range(ROW_PER_LINE))

        # -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        # signal cache_tags : cache_tags_array_t;
        # signal cache_valids : cache_valids_t;
        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        # attribute ram_style : string;
        # attribute ram_style of cache_tags : signal is "distributed";
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style : string;
        # attribute ram_style of cache_tags : signal is "distributed";

        # -- L1 ITLB.
        # constant TLB_BITS : natural := log2(TLB_SIZE);
        # constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
        # constant TLB_PTE_BITS : natural := 64;
        TLB_BITS = log2_int(TLB_SIZE)
        TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
        TLB_PTE_BITS = 64

        # subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
        # type tlb_valids_t is array(tlb_index_t) of std_ulogic;
        # subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
        # type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
        # subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
        # type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
        def TLBValidBitsArray():
            return Array(Signal() for x in range(TLB_SIZE))

        def TLBTagArray():
            return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))

        def TLBPTEArray():
            return Array(Signal(TLB_PTE_BITS) for x in range(TLB_SIZE))

        # signal itlb_valids : tlb_valids_t;
        # signal itlb_tags : tlb_tags_t;
        # signal itlb_ptes : tlb_ptes_t;
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";
        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPTEArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # -- Privilege bit from PTE EAA field
        # signal eaa_priv : std_ulogic;
        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        # signal r : reg_internal_t;
        r = RegInternal()

        # -- Async signals on incoming request
        # signal req_index : index_t;
        # signal req_row : row_t;
        # signal req_hit_way : way_t;
        # signal req_tag : cache_tag_t;
        # signal req_is_hit : std_ulogic;
        # signal req_is_miss : std_ulogic;
        # signal req_laddr : std_ulogic_vector(63 downto 0);
        # Async signal on incoming request
        # index_t/row_t/way_t are integer ranges 0..N-1, so the signals
        # are sized with the matching *_BITS constant, not the count N.
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        # signal tlb_req_index : tlb_index_t;
        # signal real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
        # signal ra_valid : std_ulogic;
        # signal priv_fault : std_ulogic;
        # signal access_ok : std_ulogic;
        # signal use_previous : std_ulogic;
        # tlb_index_t is an integer range 0..TLB_SIZE-1: TLB_BITS wide
        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        # -- Cache RAM interface
        # type cache_ram_out_t is array(way_t) of cache_row_t;
        # signal cache_out : cache_ram_out_t;
        # Cache RAM interface
        def CacheRamOut():
            return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))

        cache_out = CacheRamOut()

        # -- PLRU output interface
        # type plru_out_t is array(index_t) of
        # std_ulogic_vector(WAY_BITS-1 downto 0);
        # signal plru_victim : plru_out_t;
        # signal replace_way : way_t;
        # PLRU output interface
        def PLRUOut():
            return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # TODO: the rest of the VHDL architecture (helper functions and
        # processes, kept as comments below) is not translated yet, so
        # this method does not build or return a Module.
416
417 # -- Return the cache line index (tag index) for an address
418 # function get_index(addr: std_ulogic_vector(63 downto 0))
419 # return index_t is
420 # begin
421 # return to_integer(unsigned(
422 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
423 # ));
424 # end;
425 #
426 # -- Return the cache row index (data memory) for an address
427 # function get_row(addr: std_ulogic_vector(63 downto 0)) return row_t is
428 # begin
429 # return to_integer(unsigned(
430 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
431 # ));
432 # end;
433 #
434 # -- Return the index of a row within a line
435 # function get_row_of_line(row: row_t) return row_in_line_t is
436 # variable row_v : unsigned(ROW_BITS-1 downto 0);
437 # begin
438 # row_v := to_unsigned(row, ROW_BITS);
439 # return row_v(ROW_LINEBITS-1 downto 0);
440 # end;
441 #
442 # -- Returns whether this is the last row of a line
443 # function is_last_row_addr(addr: wishbone_addr_type; last: row_in_line_t)
444 # return boolean is
445 # begin
446 # return unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
447 # end;
448 #
449 # -- Returns whether this is the last row of a line
450 # function is_last_row(row: row_t; last: row_in_line_t) return boolean is
451 # begin
452 # return get_row_of_line(row) = last;
453 # end;
454 #
455 # -- Return the address of the next row in the current cache line
456 # function next_row_addr(addr: wishbone_addr_type)
457 # return std_ulogic_vector is
458 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
459 # variable result : wishbone_addr_type;
460 # begin
461 # -- Is there no simpler way in VHDL to generate that 3 bits adder ?
462 # row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
463 # row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
464 # result := addr;
465 # result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
466 # return result;
467 # end;
468 #
469 # -- Return the next row in the current cache line. We use a dedicated
470 # -- function in order to limit the size of the generated adder to be
471 # -- only the bits within a cache line (3 bits with default settings)
472 # --
473 # function next_row(row: row_t) return row_t is
474 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
475 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
476 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
477 # begin
478 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
479 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
480 # row_v(ROW_LINEBITS-1 downto 0) :=
481 # std_ulogic_vector(unsigned(row_idx) + 1);
482 # return to_integer(unsigned(row_v));
483 # end;
484 #
485 # -- Read the instruction word for the given address in the
486 # -- current cache row
487 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
488 # data: cache_row_t) return std_ulogic_vector is
489 # variable word: integer range 0 to INSN_PER_ROW-1;
490 # begin
491 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
492 # return data(31+word*32 downto word*32);
493 # end;
494 #
495 # -- Get the tag value from the address
496 # function get_tag(addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0))
497 # return cache_tag_t is
498 # begin
499 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
500 # end;
501 #
502 # -- Read a tag from a tag memory row
503 # function read_tag(way: way_t; tagset: cache_tags_set_t)
504 # return cache_tag_t is
505 # begin
506 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
507 # end;
508 #
509 # -- Write a tag to tag memory row
510 # procedure write_tag(way: in way_t; tagset: inout cache_tags_set_t;
511 # tag: cache_tag_t) is
512 # begin
513 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
514 # end;
515 #
516 # -- Simple hash for direct-mapped TLB index
517 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
518 # return tlb_index_t is
519 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
520 # begin
521 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
522 # xor addr(
523 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
524 # TLB_LG_PGSZ + TLB_BITS
525 # )
526 # xor addr(
527 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
528 # TLB_LG_PGSZ + 2 * TLB_BITS
529 # );
530 # return to_integer(unsigned(hash));
531 # end;
532 # begin
533 #
534 # assert LINE_SIZE mod ROW_SIZE = 0;
535 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
536 # severity FAILURE;
537 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
538 # severity FAILURE;
539 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
540 # severity FAILURE;
541 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
542 # severity FAILURE;
543 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
544 # report "geometry bits don't add up" severity FAILURE;
545 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
546 # report "geometry bits don't add up" severity FAILURE;
547 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
548 # report "geometry bits don't add up" severity FAILURE;
549 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
550 # report "geometry bits don't add up" severity FAILURE;
551 #
552 # sim_debug: if SIM generate
553 # debug: process
554 # begin
555 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
556 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
557 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
558 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
559 # report "INSN_BITS = " & natural'image(INSN_BITS);
560 # report "ROW_BITS = " & natural'image(ROW_BITS);
561 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
562 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
563 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
564 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
565 # report "TAG_BITS = " & natural'image(TAG_BITS);
566 # report "WAY_BITS = " & natural'image(WAY_BITS);
567 # wait;
568 # end process;
569 # end generate;
570 #
571 # -- Generate a cache RAM for each way
572 # rams: for i in 0 to NUM_WAYS-1 generate
573 # signal do_read : std_ulogic;
574 # signal do_write : std_ulogic;
575 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
576 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
577 # signal dout : cache_row_t;
578 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
579 # begin
580 # way: entity work.cache_ram
581 # generic map (
582 # ROW_BITS => ROW_BITS,
583 # WIDTH => ROW_SIZE_BITS
584 # )
585 # port map (
586 # clk => clk,
587 # rd_en => do_read,
588 # rd_addr => rd_addr,
589 # rd_data => dout,
590 # wr_sel => wr_sel,
591 # wr_addr => wr_addr,
592 # wr_data => wishbone_in.dat
593 # );
594 # process(all)
595 # begin
596 # do_read <= not (stall_in or use_previous);
597 # do_write <= '0';
598 # if wishbone_in.ack = '1' and replace_way = i then
599 # do_write <= '1';
600 # end if;
601 # cache_out(i) <= dout;
602 # rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
603 # wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
604 # for i in 0 to ROW_SIZE-1 loop
605 # wr_sel(i) <= do_write;
606 # end loop;
607 # end process;
608 # end generate;
609 #
610 # -- Generate PLRUs
611 # maybe_plrus: if NUM_WAYS > 1 generate
612 # begin
613 # plrus: for i in 0 to NUM_LINES-1 generate
614 # -- PLRU interface
615 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
616 # signal plru_acc_en : std_ulogic;
617 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
618 #
619 # begin
620 # plru : entity work.plru
621 # generic map (
622 # BITS => WAY_BITS
623 # )
624 # port map (
625 # clk => clk,
626 # rst => rst,
627 # acc => plru_acc,
628 # acc_en => plru_acc_en,
629 # lru => plru_out
630 # );
631 #
632 # process(all)
633 # begin
634 # -- PLRU interface
635 # if get_index(r.hit_nia) = i then
636 # plru_acc_en <= r.hit_valid;
637 # else
638 # plru_acc_en <= '0';
639 # end if;
640 # plru_acc <=
641 # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
642 # plru_victim(i) <= plru_out;
643 # end process;
644 # end generate;
645 # end generate;
646 #
647 # -- TLB hit detection and real address generation
648 # itlb_lookup : process(all)
649 # variable pte : tlb_pte_t;
650 # variable ttag : tlb_tag_t;
651 # begin
652 # tlb_req_index <= hash_ea(i_in.nia);
653 # pte := itlb_ptes(tlb_req_index);
654 # ttag := itlb_tags(tlb_req_index);
655 # if i_in.virt_mode = '1' then
656 # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
657 # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
658 # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
659 # ra_valid <= itlb_valids(tlb_req_index);
660 # else
661 # ra_valid <= '0';
662 # end if;
663 # eaa_priv <= pte(3);
664 # else
665 # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
666 # ra_valid <= '1';
667 # eaa_priv <= '1';
668 # end if;
669 #
670 # -- no IAMR, so no KUEP support for now
671 # priv_fault <= eaa_priv and not i_in.priv_mode;
672 # access_ok <= ra_valid and not priv_fault;
673 # end process;
674 #
675 # -- iTLB update
676 # itlb_update: process(clk)
677 # variable wr_index : tlb_index_t;
678 # begin
679 # if rising_edge(clk) then
680 # wr_index := hash_ea(m_in.addr);
681 # if rst = '1' or (m_in.tlbie = '1' and m_in.doall = '1') then
682 # -- clear all valid bits
683 # for i in tlb_index_t loop
684 # itlb_valids(i) <= '0';
685 # end loop;
686 # elsif m_in.tlbie = '1' then
687 # -- clear entry regardless of hit or miss
688 # itlb_valids(wr_index) <= '0';
689 # elsif m_in.tlbld = '1' then
690 # itlb_tags(wr_index) <= m_in.addr(
691 # 63 downto TLB_LG_PGSZ + TLB_BITS
692 # );
693 # itlb_ptes(wr_index) <= m_in.pte;
694 # itlb_valids(wr_index) <= '1';
695 # end if;
696 # end if;
697 # end process;
698 #
699 # -- Cache hit detection, output to fetch2 and other misc logic
700 # icache_comb : process(all)
701 # variable is_hit : std_ulogic;
702 # variable hit_way : way_t;
703 # begin
704 # -- i_in.sequential means that i_in.nia this cycle is 4 more than
705 # -- last cycle. If we read more than 32 bits at a time, had a
706 # -- cache hit last cycle, and we don't want the first 32-bit chunk
707 # -- then we can keep the data we read last cycle and just use that.
708 # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
709 # use_previous <= i_in.sequential and r.hit_valid;
710 # else
711 # use_previous <= '0';
712 # end if;
713 #
714 # -- Extract line, row and tag from request
715 # req_index <= get_index(i_in.nia);
716 # req_row <= get_row(i_in.nia);
717 # req_tag <= get_tag(real_addr);
718 #
719 # -- Calculate address of beginning of cache row, will be
720 # -- used for cache miss processing if needed
721 # --
722 # req_laddr <= (63 downto REAL_ADDR_BITS => '0') &
723 # real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
724 # (ROW_OFF_BITS-1 downto 0 => '0');
725 #
726 # -- Test if pending request is a hit on any way
727 # hit_way := 0;
728 # is_hit := '0';
729 # for i in way_t loop
730 # if i_in.req = '1' and
731 # (cache_valids(req_index)(i) = '1' or
732 # (r.state = WAIT_ACK and
733 # req_index = r.store_index and
734 # i = r.store_way and
735 # r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
736 # if read_tag(i, cache_tags(req_index)) = req_tag then
737 # hit_way := i;
738 # is_hit := '1';
739 # end if;
740 # end if;
741 # end loop;
742 #
743 # -- Generate the "hit" and "miss" signals for the synchronous blocks
744 # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
745 # and rst = '0' then
746 # req_is_hit <= is_hit;
747 # req_is_miss <= not is_hit;
748 # else
749 # req_is_hit <= '0';
750 # req_is_miss <= '0';
751 # end if;
752 # req_hit_way <= hit_way;
753 #
754 # -- The way to replace on a miss
755 # if r.state = CLR_TAG then
756 # replace_way <= to_integer(unsigned(plru_victim(r.store_index)));
757 # else
758 # replace_way <= r.store_way;
759 # end if;
760 #
761 # -- Output instruction from current cache row
762 # --
763 # -- Note: This is a mild violation of our design principle of
764 # -- having pipeline stages output from a clean latch. In this
765 # -- case we output the result of a mux. The alternative would
766 # -- be output an entire row which I prefer not to do just yet
767 # -- as it would force fetch2 to know about some of the cache
768 # -- geometry information.
769 # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
770 # i_out.valid <= r.hit_valid;
771 # i_out.nia <= r.hit_nia;
772 # i_out.stop_mark <= r.hit_smark;
773 # i_out.fetch_failed <= r.fetch_failed;
774 #
775 # -- Stall fetch1 if we have a miss on cache or TLB or a protection fault
776 # stall_out <= not (is_hit and access_ok);
777 #
778 # -- Wishbone requests output (from the cache miss reload machine)
779 # wishbone_out <= r.wb;
780 # end process;
781 #
782 # -- Cache hit synchronous machine
783 # icache_hit : process(clk)
784 # begin
785 # if rising_edge(clk) then
786 # -- keep outputs to fetch2 unchanged on a stall
787 # -- except that flush or reset sets valid to 0
788 # -- If use_previous, keep the same data as last
789 # -- cycle and use the second half
790 # if stall_in = '1' or use_previous = '1' then
791 # if rst = '1' or flush_in = '1' then
792 # r.hit_valid <= '0';
793 # end if;
794 # else
795 # -- On a hit, latch the request for the next cycle,
796 # -- when the BRAM data will be available on the
797 # -- cache_out output of the corresponding way
798 # r.hit_valid <= req_is_hit;
799 # if req_is_hit = '1' then
800 # r.hit_way <= req_hit_way;
801 #
802 # report "cache hit nia:" & to_hstring(i_in.nia) &
803 # " IR:" & std_ulogic'image(i_in.virt_mode) &
804 # " SM:" & std_ulogic'image(i_in.stop_mark) &
805 # " idx:" & integer'image(req_index) &
806 # " tag:" & to_hstring(req_tag) &
807 # " way:" & integer'image(req_hit_way) &
808 # " RA:" & to_hstring(real_addr);
809 # end if;
810 # end if;
811 # if stall_in = '0' then
812 # -- Send stop marks and NIA down regardless of validity
813 # r.hit_smark <= i_in.stop_mark;
814 # r.hit_nia <= i_in.nia;
815 # end if;
816 # end if;
817 # end process;
818 #
819 # -- Cache miss/reload synchronous machine
820 # icache_miss : process(clk)
821 # variable tagset : cache_tags_set_t;
822 # variable stbs_done : boolean;
823 # begin
824 # if rising_edge(clk) then
825 # -- On reset, clear all valid bits to force misses
826 # if rst = '1' then
827 # for i in index_t loop
828 # cache_valids(i) <= (others => '0');
829 # end loop;
830 # r.state <= IDLE;
831 # r.wb.cyc <= '0';
832 # r.wb.stb <= '0';
833 #
834 # -- We only ever do reads on wishbone
835 # r.wb.dat <= (others => '0');
836 # r.wb.sel <= "11111111";
837 # r.wb.we <= '0';
838 #
839 # -- Not useful normally but helps avoiding tons of sim warnings
840 # r.wb.adr <= (others => '0');
841 # else
842 # -- Process cache invalidations
843 # if inval_in = '1' then
844 # for i in index_t loop
845 # cache_valids(i) <= (others => '0');
846 # end loop;
847 # r.store_valid <= '0';
848 # end if;
849 #
850 # -- Main state machine
851 # case r.state is
852 # when IDLE =>
853 # -- Reset per-row valid flags, only used in WAIT_ACK
854 # for i in 0 to ROW_PER_LINE - 1 loop
855 # r.rows_valid(i) <= '0';
856 # end loop;
857 #
858 # -- We need to read a cache line
859 # if req_is_miss = '1' then
860 # report "cache miss nia:" & to_hstring(i_in.nia) &
861 # " IR:" & std_ulogic'image(i_in.virt_mode) &
862 # " SM:" & std_ulogic'image(i_in.stop_mark) &
863 # " idx:" & integer'image(req_index) &
864 # " way:" & integer'image(replace_way) &
865 # " tag:" & to_hstring(req_tag) &
866 # " RA:" & to_hstring(real_addr);
867 #
868 # -- Keep track of our index and way for
869 # -- subsequent stores
870 # r.store_index <= req_index;
871 # r.store_row <= get_row(req_laddr);
872 # r.store_tag <= req_tag;
873 # r.store_valid <= '1';
874 # r.end_row_ix <=
875 # get_row_of_line(get_row(req_laddr)) - 1;
876 #
877 # -- Prep for first wishbone read. We calculate the
878 # -- address of the start of the cache line and
879 # -- start the WB cycle.
880 # r.wb.adr <= req_laddr(r.wb.adr'left downto 0);
881 # r.wb.cyc <= '1';
882 # r.wb.stb <= '1';
883 #
884 # -- Track that we had one request sent
885 # r.state <= CLR_TAG;
886 # end if;
887 #
888 # when CLR_TAG | WAIT_ACK =>
889 # if r.state = CLR_TAG then
890 # -- Get victim way from plru
891 # r.store_way <= replace_way;
892 #
893 # -- Force misses on that way while reloading that line
894 # cache_valids(req_index)(replace_way) <= '0';
895 #
896 # -- Store new tag in selected way
897 # for i in 0 to NUM_WAYS-1 loop
898 # if i = replace_way then
899 # tagset := cache_tags(r.store_index);
900 # write_tag(i, tagset, r.store_tag);
901 # cache_tags(r.store_index) <= tagset;
902 # end if;
903 # end loop;
904 #
905 # r.state <= WAIT_ACK;
906 # end if;
907 # -- Requests are all sent if stb is 0
908 # stbs_done := r.wb.stb = '0';
909 #
910 # -- If we are still sending requests, was one accepted ?
911 # if wishbone_in.stall = '0' and not stbs_done then
912 # -- That was the last word ? We are done sending.
913 # -- Clear stb and set stbs_done so we can handle
914 # -- an eventual last ack on the same cycle.
915 # if is_last_row_addr(r.wb.adr, r.end_row_ix) then
916 # r.wb.stb <= '0';
917 # stbs_done := true;
918 # end if;
919 #
920 # -- Calculate the next row address
921 # r.wb.adr <= next_row_addr(r.wb.adr);
922 # end if;
923 #
924 # -- Incoming acks processing
925 # if wishbone_in.ack = '1' then
926 # r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1';
927 # -- Check for completion
928 # if stbs_done and
929 # is_last_row(r.store_row, r.end_row_ix) then
930 # -- Complete wishbone cycle
931 # r.wb.cyc <= '0';
932 #
933 # -- Cache line is now valid
934 # cache_valids(r.store_index)(replace_way) <=
935 # r.store_valid and not inval_in;
936 #
937 # -- We are done
938 # r.state <= IDLE;
939 # end if;
940 #
941 # -- Increment store row counter
942 # r.store_row <= next_row(r.store_row);
943 # end if;
944 # end case;
945 # end if;
946 #
947 # -- TLB miss and protection fault processing
948 # if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
949 # r.fetch_failed <= '0';
950 # elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
951 # r.fetch_failed <= '1';
952 # end if;
953 # end if;
954 # end process;
955 #
956 # icache_log: if LOG_LENGTH > 0 generate
957 # -- Output data to logger
958 # signal log_data : std_ulogic_vector(53 downto 0);
959 # begin
960 # data_log: process(clk)
961 # variable lway: way_t;
962 # variable wstate: std_ulogic;
963 # begin
964 # if rising_edge(clk) then
965 # lway := req_hit_way;
966 # wstate := '0';
967 # if r.state /= IDLE then
968 # wstate := '1';
969 # end if;
970 # log_data <= i_out.valid &
971 # i_out.insn &
972 # wishbone_in.ack &
973 # r.wb.adr(5 downto 3) &
974 # r.wb.stb & r.wb.cyc &
975 # wishbone_in.stall &
976 # stall_out &
977 # r.fetch_failed &
978 # r.hit_nia(5 downto 2) &
979 # wstate &
980 # std_ulogic_vector(to_unsigned(lway, 3)) &
981 # req_is_hit & req_is_miss &
982 # access_ok &
983 # ra_valid;
984 # end if;
985 # end process;
986 # log_out <= log_data;
987 # end generate;
988 # end;
989