efb06c9d4da7d499ab8dfb0e8f3aa810c3b1b866
[soc.git] / src / soc / experiment / icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
22 from enum import Enum, unique
23 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const)
24 from nmigen.cli import main, rtlil
25 from nmutil.iocontrol import RecordObject
26 from nmigen.utils import log2_int
27 from nmutil.util import Display
28
29 #from nmutil.plru import PLRU
30 from soc.experiment.cache_ram import CacheRam
31 from soc.experiment.plru import PLRU
32
33 from soc.experiment.mem_types import (Fetch1ToICacheType,
34 ICacheToDecode1Type,
35 MMUToICacheType)
36
37 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
38 WB_SEL_BITS, WBAddrType, WBDataType,
39 WBSelType, WBMasterOut, WBSlaveOut,
40 WBMasterOutVector, WBSlaveOutVector,
41 WBIOMasterOut, WBIOSlaveOut)
42
43 # for test
44 from nmigen_soc.wishbone.sram import SRAM
45 from nmigen import Memory
46 from nmutil.util import wrap
47 from nmigen.cli import main, rtlil
# Simulator-backend selection: "if True" is a manual build-time switch.
# Flip to False to use the (faster) cxxsim backend instead of pysim.
if True:
    from nmigen.back.pysim import Simulator, Delay, Settle
else:
    from nmigen.sim.cxxsim import Simulator, Delay, Settle
52
53
# enable simulation-only behaviour (mirrors the VHDL SIM generic)
SIM = 0
# cache line size in bytes
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices for to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 32
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

# width of one BRAM row in bits
ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row
# (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in
# BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit
# instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# debug dump of the derived geometry at import time
print("ROW_SIZE", ROW_SIZE)
print("ROW_SIZE_BITS", ROW_SIZE_BITS)
print("ROW_PER_LINE", ROW_PER_LINE)
print("BRAM_ROWS", BRAM_ROWS)
print("INSN_PER_ROW", INSN_PER_ROW)

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to
# select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to
# select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to
# select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of
# the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is TAG_BITS rounded up to the next multiple of 8
# NOTE(review): TAG_RAM_WIDTH below packs tags at TAG_BITS stride,
# not TAG_WIDTH — any tag-RAM accessor must use TAG_BITS as stride.
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to
# select a way
WAY_BITS = log2_int(NUM_WAYS)
# one tag-RAM row holds the tags of all ways, concatenated
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB (direct mapped)
# constant TLB_BITS : natural := log2(TLB_SIZE);
# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
# constant TLB_PTE_BITS : natural := 64;
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64


# debug dump of the derived bit-field widths at import time
print("INSN_BITS", INSN_BITS)
print("ROW_BITS", ROW_BITS)
print("ROW_LINE_BITS", ROW_LINE_BITS)
print("LINE_OFF_BITS", LINE_OFF_BITS)
print("ROW_OFF_BITS", ROW_OFF_BITS)
print("INDEX_BITS", INDEX_BITS)
print("SET_SIZE_BITS", SET_SIZE_BITS)
print("TAG_BITS", TAG_BITS)
print("WAY_BITS", WAY_BITS)
print("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print("TLB_BITS", TLB_BITS)
print("TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
print("TLB_PTE_BITS", TLB_PTE_BITS)
148
149
150
151
152 # architecture rtl of icache is
153 #constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
154 #-- ROW_PER_LINE is the number of row (wishbone
155 #-- transactions) in a line
156 #constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
157 #-- BRAM_ROWS is the number of rows in BRAM
158 #-- needed to represent the full
159 #-- icache
160 #constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
161 #-- INSN_PER_ROW is the number of 32bit instructions per BRAM row
162 #constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
163 #-- Bit fields counts in the address
164 #
165 #-- INSN_BITS is the number of bits to select
166 #-- an instruction in a row
167 #constant INSN_BITS : natural := log2(INSN_PER_ROW);
168 #-- ROW_BITS is the number of bits to select a row
169 #constant ROW_BITS : natural := log2(BRAM_ROWS);
170 #-- ROW_LINEBITS is the number of bits to
171 #-- select a row within a line
172 #constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
173 #-- LINE_OFF_BITS is the number of bits for the offset
174 #-- in a cache line
175 #constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
176 #-- ROW_OFF_BITS is the number of bits for the offset in a row
177 #constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
178 #-- INDEX_BITS is the number of bits to select a cache line
179 #constant INDEX_BITS : natural := log2(NUM_LINES);
180 #-- SET_SIZE_BITS is the log base 2 of the set size
181 #constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
182 #-- TAG_BITS is the number of bits of the tag part of the address
183 #constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
184 #-- WAY_BITS is the number of bits to select a way
185 #constant WAY_BITS : natural := log2(NUM_WAYS);
186
187 #-- Example of layout for 32 lines of 64 bytes:
188 #--
189 #-- .. tag |index| line |
190 #-- .. | row | |
191 #-- .. | | | |00| zero (2)
192 #-- .. | | |-| | INSN_BITS (1)
193 #-- .. | |---| | ROW_LINEBITS (3)
194 #-- .. | |--- - --| LINE_OFF_BITS (6)
195 #-- .. | |- --| ROW_OFF_BITS (3)
196 #-- .. |----- ---| | ROW_BITS (8)
197 #-- .. |-----| | INDEX_BITS (5)
198 #-- .. --------| | TAG_BITS (53)
199 # Example of layout for 32 lines of 64 bytes:
200 #
201 # .. tag |index| line |
202 # .. | row | |
203 # .. | | | |00| zero (2)
204 # .. | | |-| | INSN_BITS (1)
205 # .. | |---| | ROW_LINEBITS (3)
206 # .. | |--- - --| LINE_OFF_BITS (6)
207 # .. | |- --| ROW_OFF_BITS (3)
208 # .. |----- ---| | ROW_BITS (8)
209 # .. |-----| | INDEX_BITS (5)
210 # .. --------| | TAG_BITS (53)
211
212 #subtype row_t is integer range 0 to BRAM_ROWS-1;
213 #subtype index_t is integer range 0 to NUM_LINES-1;
214 #subtype way_t is integer range 0 to NUM_WAYS-1;
215 #subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
216 #
217 #-- The cache data BRAM organized as described above for each way
218 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
219 #
220 #-- The cache tags LUTRAM has a row per set. Vivado is a pain and will
221 #-- not handle a clean (commented) definition of the cache tags as a 3d
222 #-- memory. For now, work around it by putting all the tags
223 #subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
224 # type cache_tags_set_t is array(way_t) of cache_tag_t;
225 # type cache_tags_array_t is array(index_t) of cache_tags_set_t;
226 #constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
227 #subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
228 #type cache_tags_array_t is array(index_t) of cache_tags_set_t;
def CacheTagArray():
    """One tag-RAM row per cache line, all NUM_WAYS tags concatenated."""
    rows = [Signal(TAG_RAM_WIDTH, name="cachetag_%d" % idx)
            for idx in range(NUM_LINES)]
    return Array(rows)
232
233 #-- The cache valid bits
234 #subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
235 #type cache_valids_t is array(index_t) of cache_way_valids_t;
236 #type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
def CacheValidBitsArray():
    """Per-line valid bits: one bit for each way of the line."""
    valids = [Signal(NUM_WAYS, name="cachevalid_%d" % idx)
              for idx in range(NUM_LINES)]
    return Array(valids)
240
def RowPerLineValidArray():
    """One valid bit per row of the line currently being reloaded."""
    valids = [Signal(name="rows_valid_%d" % idx)
              for idx in range(ROW_PER_LINE)]
    return Array(valids)
244
245
246 #attribute ram_style : string;
247 #attribute ram_style of cache_tags : signal is "distributed";
248 # TODO to be passed to nigmen as ram attributes
249 # attribute ram_style : string;
250 # attribute ram_style of cache_tags : signal is "distributed";
251
252
253 #subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
254 #type tlb_valids_t is array(tlb_index_t) of std_ulogic;
255 #subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
256 #type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
257 #subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
258 #type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    """Valid bit for each of the TLB_SIZE direct-mapped ITLB entries."""
    valids = [Signal(name="tlbvalid_%d" % idx)
              for idx in range(TLB_SIZE)]
    return Array(valids)
262
def TLBTagArray():
    """Effective-address tag for each ITLB entry."""
    tags = [Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" % idx)
            for idx in range(TLB_SIZE)]
    return Array(tags)
266
def TLBPtesArray():
    """Page-table entry storage for each ITLB entry."""
    ptes = [Signal(TLB_PTE_BITS, name="tlbptes_%d" % idx)
            for idx in range(TLB_SIZE)]
    return Array(ptes)
270
271
272 #-- Cache RAM interface
273 #type cache_ram_out_t is array(way_t) of cache_row_t;
274 # Cache RAM interface
def CacheRamOut():
    """One full-row read-data bus per way (cache RAM interface)."""
    outs = [Signal(ROW_SIZE_BITS, name="cache_out_%d" % idx)
            for idx in range(NUM_WAYS)]
    return Array(outs)
278
279 #-- PLRU output interface
280 #type plru_out_t is array(index_t) of
281 # std_ulogic_vector(WAY_BITS-1 downto 0);
282 # PLRU output interface
def PLRUOut():
    """Victim-way output of the PLRU, one per cache line."""
    outs = [Signal(WAY_BITS, name="plru_out_%d" % idx)
            for idx in range(NUM_LINES)]
    return Array(outs)
286
287 # -- Return the cache line index (tag index) for an address
288 # function get_index(addr: std_ulogic_vector(63 downto 0))
289 # return index_t is
290 # begin
291 # return to_integer(unsigned(
292 # addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS)
293 # ));
294 # end;
295 # Return the cache line index (tag index) for an address
def get_index(addr):
    """Return the cache line (tag) index field of *addr*."""
    index = addr[LINE_OFF_BITS:SET_SIZE_BITS]
    return index
298
299 # -- Return the cache row index (data memory) for an address
300 # function get_row(addr: std_ulogic_vector(63 downto 0))
301 # return row_t is
302 # begin
303 # return to_integer(unsigned(
304 # addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS)
305 # ));
306 # end;
307 # Return the cache row index (data memory) for an address
def get_row(addr):
    """Return the cache row index (data memory address) of *addr*."""
    row = addr[ROW_OFF_BITS:SET_SIZE_BITS]
    return row
310
311 # -- Return the index of a row within a line
312 # function get_row_of_line(row: row_t) return row_in_line_t is
313 # variable row_v : unsigned(ROW_BITS-1 downto 0);
314 # begin
315 # row_v := to_unsigned(row, ROW_BITS);
316 # return row_v(ROW_LINEBITS-1 downto 0);
317 # end;
318 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of *row* within its cache line."""
    return row[0:ROW_LINE_BITS]
321
322 # -- Returns whether this is the last row of a line
323 # function is_last_row_addr(addr: wishbone_addr_type;
324 # last: row_in_line_t
325 # )
326 # return boolean is
327 # begin
328 # return unsigned(
329 # addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)
330 # ) = last;
331 # end;
332 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """True when *addr* falls in the final row (*last*) of its line."""
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
335
336 # -- Returns whether this is the last row of a line
337 # function is_last_row(row: row_t;
338 # last: row_in_line_t) return boolean is
339 # begin
340 # return get_row_of_line(row) = last;
341 # end;
342 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """True when *row* is the final row (*last*) of its cache line."""
    current = get_row_of_line(row)
    return current == last
345
346 # -- Return the next row in the current cache line. We use a dedicated
347 # -- function in order to limit the size of the generated adder to be
348 # -- only the bits within a cache line (3 bits with default settings)
349 # function next_row(row: row_t) return row_t is
350 # variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
351 # variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
352 # variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
353 # begin
354 # row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
355 # row_idx := row_v(ROW_LINEBITS-1 downto 0);
356 # row_v(ROW_LINEBITS-1 downto 0) :=
357 # std_ulogic_vector(unsigned(row_idx) + 1);
358 # return to_integer(unsigned(row_v));
359 # end;
360 # Return the next row in the current cache line. We use a dedicated
361 # function in order to limit the size of the generated adder to be
362 # only the bits within a cache line (3 bits with default settings)
def next_row(row):
    """Return the next row within the current cache line.

    Only the low ROW_LINE_BITS are incremented (wrapping within the
    line), keeping the generated adder as small as the line geometry
    allows; the upper row bits pass through unchanged.
    """
    incremented = row[0:ROW_LINE_BITS] + 1
    return Cat(incremented[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
366 # -- Read the instruction word for the given address in the
367 # -- current cache row
368 # function read_insn_word(addr: std_ulogic_vector(63 downto 0);
369 # data: cache_row_t) return std_ulogic_vector is
370 # variable word: integer range 0 to INSN_PER_ROW-1;
371 # begin
372 # word := to_integer(unsigned(addr(INSN_BITS+2-1 downto 2)));
373 # return data(31+word*32 downto word*32);
374 # end;
375 # Read the instruction word for the given address
376 # in the current cache row
def read_insn_word(addr, data):
    """Select the 32-bit instruction addressed by *addr* from row *data*."""
    word_idx = addr[2:INSN_BITS+2]
    return data.word_select(word_idx, 32)
380
381 # -- Get the tag value from the address
382 # function get_tag(
383 # addr: std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0)
384 # )
385 # return cache_tag_t is
386 # begin
387 # return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
388 # end;
389 # Get the tag value from the address
def get_tag(addr):
    """Return the tag field of real address *addr*."""
    tag = addr[SET_SIZE_BITS:REAL_ADDR_BITS]
    return tag
392
393 # -- Read a tag from a tag memory row
394 # function read_tag(way: way_t; tagset: cache_tags_set_t)
395 # return cache_tag_t is
396 # begin
397 # return tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS);
398 # end;
399 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Read one way's tag from a concatenated tag-RAM row.

    Tags are packed TAG_BITS apart (TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS,
    and write_tag() writes at TAG_BITS stride), so the select stride here
    must be TAG_BITS as well.  The previous code strode by the
    byte-rounded TAG_WIDTH, reading misaligned bits for way > 0 and past
    the end of the row for the last way.
    """
    return tagset.word_select(way, TAG_BITS)
402
403 # -- Write a tag to tag memory row
404 # procedure write_tag(way: in way_t;
405 # tagset: inout cache_tags_set_t; tag: cache_tag_t) is
406 # begin
407 # tagset((way+1) * TAG_BITS - 1 downto way * TAG_BITS) := tag;
408 # end;
409 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Assign *tag* into *way*'s TAG_BITS-wide slot of a tag-RAM row."""
    lo = way * TAG_BITS
    return tagset[lo:lo + TAG_BITS].eq(tag)
412
413 # -- Simple hash for direct-mapped TLB index
414 # function hash_ea(addr: std_ulogic_vector(63 downto 0))
415 # return tlb_index_t is
416 # variable hash : std_ulogic_vector(TLB_BITS - 1 downto 0);
417 # begin
418 # hash := addr(TLB_LG_PGSZ + TLB_BITS - 1 downto TLB_LG_PGSZ)
419 # xor addr(
420 # TLB_LG_PGSZ + 2 * TLB_BITS - 1 downto
421 # TLB_LG_PGSZ + TLB_BITS
422 # )
423 # xor addr(
424 # TLB_LG_PGSZ + 3 * TLB_BITS - 1 downto
425 # TLB_LG_PGSZ + 2 * TLB_BITS
426 # );
427 # return to_integer(unsigned(hash));
428 # end;
429 # Simple hash for direct-mapped TLB index
def hash_ea(addr):
    """Simple XOR-fold hash of *addr* for the direct-mapped ITLB index.

    Three consecutive TLB_BITS-wide fields above the page offset are
    XORed together to spread nearby pages across the TLB.
    """
    fld0 = addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS]
    fld1 = addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS]
    fld2 = addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS]
    return fld0 ^ fld1 ^ fld2
437
438 # begin
439 #
440 # assert LINE_SIZE mod ROW_SIZE = 0;
441 # assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
442 # severity FAILURE;
443 # assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
444 # severity FAILURE;
445 # assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
446 # severity FAILURE;
447 # assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
448 # severity FAILURE;
449 # assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
450 # report "geometry bits don't add up" severity FAILURE;
451 # assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
452 # report "geometry bits don't add up" severity FAILURE;
453 # assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
454 # report "geometry bits don't add up" severity FAILURE;
455 # assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
456 # report "geometry bits don't add up" severity FAILURE;
457 #
458 # sim_debug: if SIM generate
459 # debug: process
460 # begin
461 # report "ROW_SIZE = " & natural'image(ROW_SIZE);
462 # report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
463 # report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
464 # report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
465 # report "INSN_BITS = " & natural'image(INSN_BITS);
466 # report "ROW_BITS = " & natural'image(ROW_BITS);
467 # report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
468 # report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
469 # report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
470 # report "INDEX_BITS = " & natural'image(INDEX_BITS);
471 # report "TAG_BITS = " & natural'image(TAG_BITS);
472 # report "WAY_BITS = " & natural'image(WAY_BITS);
473 # wait;
474 # end process;
475 # end generate;
476
477 # Cache reload state machine
@unique
class State(Enum):
    """Cache-line reload state machine states (VHDL state_t)."""
    IDLE = 0      # no reload in progress
    CLR_TAG = 1   # victim way being selected / tag set up for refill
    WAIT_ACK = 2  # refill in flight; waiting for wishbone acks
483
484 # type reg_internal_t is record
485 # -- Cache hit state (Latches for 1 cycle BRAM access)
486 # hit_way : way_t;
487 # hit_nia : std_ulogic_vector(63 downto 0);
488 # hit_smark : std_ulogic;
489 # hit_valid : std_ulogic;
490 #
491 # -- Cache miss state (reload state machine)
492 # state : state_t;
493 # wb : wishbone_master_out;
494 # store_way : way_t;
495 # store_index : index_t;
496 # store_row : row_t;
497 # store_tag : cache_tag_t;
498 # store_valid : std_ulogic;
499 # end_row_ix : row_in_line_t;
500 # rows_valid : row_per_line_valid_t;
501 #
502 # -- TLB miss state
503 # fetch_failed : std_ulogic;
504 # end record;
class RegInternal(RecordObject):
    """Internal register state of the icache (VHDL reg_internal_t).

    Groups the one-cycle cache-hit latches, the reload state machine
    registers and the TLB-miss flag into one record.

    Widths follow the VHDL subtypes: index-like fields hold a *number*
    (way_t, index_t, row_t), so they are log2-sized.  The previous code
    used the element counts themselves as bit widths (e.g. a 256-bit
    store_row for an 8-bit row number), wasting register bits.
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)        # way number that hit
        self.hit_nia = Signal(64)              # fetch address of the hit
        self.hit_smark = Signal()              # stop-mark of the hit
        self.hit_valid = Signal()              # hit data valid

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")            # wishbone request in flight
        self.store_way = Signal(WAY_BITS)      # way being refilled
        self.store_index = Signal(INDEX_BITS)  # line being refilled
        self.store_row = Signal(ROW_BITS)      # row currently being written
        self.store_tag = Signal(TAG_BITS)      # tag of the line refilled
        self.store_valid = Signal()            # refill request still valid
        self.end_row_ix = Signal(ROW_LINE_BITS)  # last row index to fetch
        self.rows_valid = RowPerLineValidArray()  # per-row refill progress

        # TLB miss state
        self.fetch_failed = Signal()
527
528 # -- 64 bit direct mapped icache. All instructions are 4B aligned.
529 #
530 # entity icache is
531 # generic (
532 # SIM : boolean := false;
533 # -- Line size in bytes
534 # LINE_SIZE : positive := 64;
535 # -- BRAM organisation: We never access more
536 # -- than wishbone_data_bits
537 # -- at a time so to save resources we make the
538 # -- array only that wide,
539 # -- and use consecutive indices for to make a cache "line"
540 # --
541 # -- ROW_SIZE is the width in bytes of the BRAM (based on WB,
542 # -- so 64-bits)
543 # ROW_SIZE : positive := wishbone_data_bits / 8;
544 # -- Number of lines in a set
545 # NUM_LINES : positive := 32;
546 # -- Number of ways
547 # NUM_WAYS : positive := 4;
548 # -- L1 ITLB number of entries (direct mapped)
549 # TLB_SIZE : positive := 64;
550 # -- L1 ITLB log_2(page_size)
551 # TLB_LG_PGSZ : positive := 12;
552 # -- Number of real address bits that we store
553 # REAL_ADDR_BITS : positive := 56;
554 # -- Non-zero to enable log data collection
555 # LOG_LENGTH : natural := 0
556 # );
557 # port (
558 # clk : in std_ulogic;
559 # rst : in std_ulogic;
560 #
561 # i_in : in Fetch1ToIcacheType;
562 # i_out : out IcacheToDecode1Type;
563 #
564 # m_in : in MmuToIcacheType;
565 #
566 # stall_in : in std_ulogic;
567 # stall_out : out std_ulogic;
568 # flush_in : in std_ulogic;
569 # inval_in : in std_ulogic;
570 #
571 # wishbone_out : out wishbone_master_out;
572 # wishbone_in : in wishbone_slave_out;
573 #
574 # log_out : out std_ulogic_vector(53 downto 0)
575 # );
576 # end entity icache;
577 # 64 bit direct mapped icache. All instructions are 4B aligned.
578 class ICache(Elaboratable):
579 """64 bit direct mapped icache. All instructions are 4B aligned."""
    def __init__(self):
        """Declare the icache's external ports (see VHDL entity icache)."""
        # fetch1 request in, decoded-instruction stream out to decode1
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        # MMU interface: TLB loads and invalidations
        self.m_in = MMUToICacheType(name="m_in")

        # pipeline control
        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # wishbone master interface used for cache refills
        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        # debug log output (54 bits, mirrors VHDL log_out)
        self.log_out = Signal(54)
595
596
597 # -- Generate a cache RAM for each way
598 # rams: for i in 0 to NUM_WAYS-1 generate
599 # signal do_read : std_ulogic;
600 # signal do_write : std_ulogic;
601 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
602 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
603 # signal dout : cache_row_t;
604 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
605 # begin
606 # way: entity work.cache_ram
607 # generic map (
608 # ROW_BITS => ROW_BITS,
609 # WIDTH => ROW_SIZE_BITS
610 # )
611 # port map (
612 # clk => clk,
613 # rd_en => do_read,
614 # rd_addr => rd_addr,
615 # rd_data => dout,
616 # wr_sel => wr_sel,
617 # wr_addr => wr_addr,
618 # wr_data => wishbone_in.dat
619 # );
620 # process(all)
621 # begin
622 # do_read <= not (stall_in or use_previous);
623 # do_write <= '0';
624 # if wishbone_in.ack = '1' and replace_way = i then
625 # do_write <= '1';
626 # end if;
627 # cache_out(i) <= dout;
628 # rd_addr <=
629 # std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
630 # wr_addr <=
631 # std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
632 # for i in 0 to ROW_SIZE-1 loop
633 # wr_sel(i) <= do_write;
634 # end loop;
635 # end process;
636 # end generate;
637 def rams(self, m, r, cache_out, use_previous, replace_way, req_row):
638 comb = m.d.comb
639
640 wb_in, stall_in = self.wb_in, self.stall_in
641
642
643 for i in range(NUM_WAYS):
644 do_read = Signal(name="do_rd_%d" % i)
645 do_write = Signal(name="do_wr_%d" % i)
646 rd_addr = Signal(ROW_BITS)
647 wr_addr = Signal(ROW_BITS)
648 d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
649 wr_sel = Signal(ROW_SIZE)
650
651 way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
652 setattr(m.submodules, "cacheram_%d" % i, way)
653
654 comb += way.rd_en.eq(do_read)
655 comb += way.rd_addr.eq(rd_addr)
656 comb += d_out.eq(way.rd_data_o)
657 comb += way.wr_sel.eq(wr_sel)
658 comb += way.wr_addr.eq(wr_addr)
659 comb += way.wr_data.eq(wb_in.dat)
660
661 comb += do_read.eq(~(stall_in | use_previous))
662
663 with m.If(wb_in.ack & (replace_way == i)):
664 comb += do_write.eq(1)
665
666 comb += cache_out[i].eq(d_out)
667 comb += rd_addr.eq(req_row)
668 comb += wr_addr.eq(r.store_row)
669 for j in range(ROW_SIZE):
670 comb += wr_sel[j].eq(do_write)
671
672 # -- Generate PLRUs
673 # maybe_plrus: if NUM_WAYS > 1 generate
674 # begin
675 # plrus: for i in 0 to NUM_LINES-1 generate
676 # -- PLRU interface
677 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
678 # signal plru_acc_en : std_ulogic;
679 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
680 #
681 # begin
682 # plru : entity work.plru
683 # generic map (
684 # BITS => WAY_BITS
685 # )
686 # port map (
687 # clk => clk,
688 # rst => rst,
689 # acc => plru_acc,
690 # acc_en => plru_acc_en,
691 # lru => plru_out
692 # );
693 #
694 # process(all)
695 # begin
696 # -- PLRU interface
697 # if get_index(r.hit_nia) = i then
698 # plru_acc_en <= r.hit_valid;
699 # else
700 # plru_acc_en <= '0';
701 # end if;
702 # plru_acc <=
703 # std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
704 # plru_victim(i) <= plru_out;
705 # end process;
706 # end generate;
707 # end generate;
708 def maybe_plrus(self, m, r, plru_victim):
709 comb = m.d.comb
710
711 with m.If(NUM_WAYS > 1):
712 for i in range(NUM_LINES):
713 plru_acc_i = Signal(WAY_BITS)
714 plru_acc_en = Signal()
715 plru_out = Signal(WAY_BITS)
716 plru = PLRU(WAY_BITS)
717 comb += plru.acc_i.eq(plru_acc_i)
718 comb += plru.acc_en.eq(plru_acc_en)
719 comb += plru.lru_o.eq(plru_out)
720
721 # PLRU interface
722 with m.If(get_index(r.hit_nia) == i):
723 comb += plru.acc_en.eq(r.hit_valid)
724
725 comb += plru.acc_i.eq(r.hit_way)
726 comb += plru_victim[i].eq(plru.lru_o)
727
728 # -- TLB hit detection and real address generation
729 # itlb_lookup : process(all)
730 # variable pte : tlb_pte_t;
731 # variable ttag : tlb_tag_t;
732 # begin
733 # tlb_req_index <= hash_ea(i_in.nia);
734 # pte := itlb_ptes(tlb_req_index);
735 # ttag := itlb_tags(tlb_req_index);
736 # if i_in.virt_mode = '1' then
737 # real_addr <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
738 # i_in.nia(TLB_LG_PGSZ - 1 downto 0);
739 # if ttag = i_in.nia(63 downto TLB_LG_PGSZ + TLB_BITS) then
740 # ra_valid <= itlb_valids(tlb_req_index);
741 # else
742 # ra_valid <= '0';
743 # end if;
744 # eaa_priv <= pte(3);
745 # else
746 # real_addr <= i_in.nia(REAL_ADDR_BITS - 1 downto 0);
747 # ra_valid <= '1';
748 # eaa_priv <= '1';
749 # end if;
750 #
751 # -- no IAMR, so no KUEP support for now
752 # priv_fault <= eaa_priv and not i_in.priv_mode;
753 # access_ok <= ra_valid and not priv_fault;
754 # end process;
755 # TLB hit detection and real address generation
756 def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
757 real_addr, itlb_valid_bits, ra_valid, eaa_priv,
758 priv_fault, access_ok):
759 comb = m.d.comb
760
761 i_in = self.i_in
762
763 pte = Signal(TLB_PTE_BITS)
764 ttag = Signal(TLB_EA_TAG_BITS)
765
766 comb += tlb_req_index.eq(hash_ea(i_in.nia))
767 comb += pte.eq(itlb_ptes[tlb_req_index])
768 comb += ttag.eq(itlb_tags[tlb_req_index])
769
770 with m.If(i_in.virt_mode):
771 comb += real_addr.eq(Cat(
772 i_in.nia[:TLB_LG_PGSZ],
773 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
774 ))
775
776 with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
777 comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])
778
779 comb += eaa_priv.eq(pte[3])
780
781 with m.Else():
782 comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
783 comb += ra_valid.eq(1)
784 comb += eaa_priv.eq(1)
785
786 # No IAMR, so no KUEP support for now
787 comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
788 comb += access_ok.eq(ra_valid & ~priv_fault)
789
790 # -- iTLB update
791 # itlb_update: process(clk)
792 # variable wr_index : tlb_index_t;
793 # begin
794 # if rising_edge(clk) then
795 # wr_index := hash_ea(m_in.addr);
796 # if rst = '1' or
797 # (m_in.tlbie = '1' and m_in.doall = '1') then
798 # -- clear all valid bits
799 # for i in tlb_index_t loop
800 # itlb_valids(i) <= '0';
801 # end loop;
802 # elsif m_in.tlbie = '1' then
803 # -- clear entry regardless of hit or miss
804 # itlb_valids(wr_index) <= '0';
805 # elsif m_in.tlbld = '1' then
806 # itlb_tags(wr_index) <=
807 # m_in.addr(63 downto TLB_LG_PGSZ + TLB_BITS);
808 # itlb_ptes(wr_index) <= m_in.pte;
809 # itlb_valids(wr_index) <= '1';
810 # end if;
811 # end if;
812 # end process;
813 # iTLB update
814 def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
815 comb = m.d.comb
816 sync = m.d.sync
817
818 m_in = self.m_in
819
820 wr_index = Signal(TLB_SIZE)
821 sync += wr_index.eq(hash_ea(m_in.addr))
822
823 with m.If(m_in.tlbie & m_in.doall):
824 # Clear all valid bits
825 for i in range(TLB_SIZE):
826 sync += itlb_valid_bits[i].eq(0)
827
828 with m.Elif(m_in.tlbie):
829 # Clear entry regardless of hit or miss
830 sync += itlb_valid_bits[wr_index].eq(0)
831
832 with m.Elif(m_in.tlbld):
833 sync += itlb_tags[wr_index].eq(
834 m_in.addr[TLB_LG_PGSZ + TLB_BITS:64]
835 )
836 sync += itlb_ptes[wr_index].eq(m_in.pte)
837 sync += itlb_valid_bits[wr_index].eq(1)
838
839 # -- Cache hit detection, output to fetch2 and other misc logic
840 # icache_comb : process(all)
841 # Cache hit detection, output to fetch2 and other misc logic
842 def icache_comb(self, m, use_previous, r, req_index, req_row,
843 req_tag, real_addr, req_laddr, cache_valid_bits,
844 cache_tags, access_ok, req_is_hit,
845 req_is_miss, replace_way, plru_victim, cache_out):
846 # variable is_hit : std_ulogic;
847 # variable hit_way : way_t;
848 comb = m.d.comb
849
850 #comb += Display("ENTER icache_comb - use_previous:%x req_index:%x " \
851 # "req_row:%x req_tag:%x real_addr:%x req_laddr:%x " \
852 # "access_ok:%x req_is_hit:%x req_is_miss:%x " \
853 # "replace_way:%x", use_previous, req_index, req_row, \
854 # req_tag, real_addr, req_laddr, access_ok, \
855 # req_is_hit, req_is_miss, replace_way)
856
857 i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
858 flush_in, stall_out = self.flush_in, self.stall_out
859
860 is_hit = Signal()
861 hit_way = Signal(NUM_WAYS)
862 # begin
863 # -- i_in.sequential means that i_in.nia this cycle
864 # -- is 4 more than last cycle. If we read more
865 # -- than 32 bits at a time, had a cache hit last
866 # -- cycle, and we don't want the first 32-bit chunk
867 # -- then we can keep the data we read last cycle
868 # -- and just use that.
869 # if unsigned(i_in.nia(INSN_BITS+2-1 downto 2)) /= 0 then
870 # use_previous <= i_in.sequential and r.hit_valid;
871 # else
872 # use_previous <= '0';
873 # end if;
874 # i_in.sequential means that i_in.nia this cycle is 4 more than
875 # last cycle. If we read more than 32 bits at a time, had a
876 # cache hit last cycle, and we don't want the first 32-bit chunk
877 # then we can keep the data we read last cycle and just use that.
878 with m.If(i_in.nia[2:INSN_BITS+2] != 0):
879 comb += use_previous.eq(i_in.sequential & r.hit_valid)
880
881 # -- Extract line, row and tag from request
882 # req_index <= get_index(i_in.nia);
883 # req_row <= get_row(i_in.nia);
884 # req_tag <= get_tag(real_addr);
885 # Extract line, row and tag from request
886 comb += req_index.eq(get_index(i_in.nia))
887 comb += req_row.eq(get_row(i_in.nia))
888 comb += req_tag.eq(get_tag(real_addr))
889
890 # -- Calculate address of beginning of cache row, will be
891 # -- used for cache miss processing if needed
892 # req_laddr <=
893 # (63 downto REAL_ADDR_BITS => '0') &
894 # real_addr(REAL_ADDR_BITS - 1 downto ROW_OFF_BITS) &
895 # (ROW_OFF_BITS-1 downto 0 => '0');
896 # Calculate address of beginning of cache row, will be
897 # used for cache miss processing if needed
898 comb += req_laddr.eq(Cat(
899 Const(0b0, ROW_OFF_BITS),
900 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
901 Const(0b0, 8)
902 ))
903
904 # -- Test if pending request is a hit on any way
905 # hit_way := 0;
906 # is_hit := '0';
907 # for i in way_t loop
908 # if i_in.req = '1' and
909 # (cache_valids(req_index)(i) = '1' or
910 # (r.state = WAIT_ACK and
911 # req_index = r.store_index and
912 # i = r.store_way and
913 # r.rows_valid(req_row mod ROW_PER_LINE) = '1')) then
914 # if read_tag(i, cache_tags(req_index)) = req_tag then
915 # hit_way := i;
916 # is_hit := '1';
917 # end if;
918 # end if;
919 # end loop;
920 # Test if pending request is a hit on any way
921 for i in range(NUM_WAYS):
922 with m.If(i_in.req &
923 (cache_valid_bits[req_index][i] |
924 ((r.state == State.WAIT_ACK)
925 & (req_index == r.store_index)
926 & (i == r.store_way)
927 & r.rows_valid[req_row % ROW_PER_LINE]))):
928 with m.If(read_tag(i, cache_tags[req_index]) == req_tag):
929 comb += hit_way.eq(i)
930 comb += is_hit.eq(1)
931
932 # -- Generate the "hit" and "miss" signals
933 # -- for the synchronous blocks
934 # if i_in.req = '1' and access_ok = '1' and flush_in = '0'
935 # and rst = '0' then
936 # req_is_hit <= is_hit;
937 # req_is_miss <= not is_hit;
938 # else
939 # req_is_hit <= '0';
940 # req_is_miss <= '0';
941 # end if;
942 # req_hit_way <= hit_way;
943 # Generate the "hit" and "miss" signals
944 # for the synchronous blocks
945 with m.If(i_in.req & access_ok & ~flush_in):
946 comb += req_is_hit.eq(is_hit)
947 comb += req_is_miss.eq(~is_hit)
948
949 with m.Else():
950 comb += req_is_hit.eq(0)
951 comb += req_is_miss.eq(0)
952
953 # -- The way to replace on a miss
954 # if r.state = CLR_TAG then
955 # replace_way <=
956 # to_integer(unsigned(plru_victim(r.store_index)));
957 # else
958 # replace_way <= r.store_way;
959 # end if;
960 # The way to replace on a miss
961 with m.If(r.state == State.CLR_TAG):
962 comb += replace_way.eq(plru_victim[r.store_index])
963
964 with m.Else():
965 comb += replace_way.eq(r.store_way)
966
967 # -- Output instruction from current cache row
968 # --
969 # -- Note: This is a mild violation of our design principle of
970 # -- having pipeline stages output from a clean latch. In this
971 # -- case we output the result of a mux. The alternative would
972 # -- be output an entire row which I prefer not to do just yet
973 # -- as it would force fetch2 to know about some of the cache
974 # -- geometry information.
975 # i_out.insn <= read_insn_word(r.hit_nia, cache_out(r.hit_way));
976 # i_out.valid <= r.hit_valid;
977 # i_out.nia <= r.hit_nia;
978 # i_out.stop_mark <= r.hit_smark;
979 # i_out.fetch_failed <= r.fetch_failed;
980 # Output instruction from current cache row
981 #
982 # Note: This is a mild violation of our design principle of
983 # having pipeline stages output from a clean latch. In this
984 # case we output the result of a mux. The alternative would
985 # be output an entire row which I prefer not to do just yet
986 # as it would force fetch2 to know about some of the cache
987 # geometry information.
988 #comb += Display("BEFORE read_insn_word - r.hit_nia:%x " \
989 # "r.hit_way:%x, cache_out[r.hit_way]:%x", r.hit_nia, \
990 # r.hit_way, cache_out[r.hit_way])
991 comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out[r.hit_way]))
992 comb += i_out.valid.eq(r.hit_valid)
993 comb += i_out.nia.eq(r.hit_nia)
994 comb += i_out.stop_mark.eq(r.hit_smark)
995 comb += i_out.fetch_failed.eq(r.fetch_failed)
996
997 # -- Stall fetch1 if we have a miss on cache or TLB
998 # -- or a protection fault
999 # stall_out <= not (is_hit and access_ok);
1000 # Stall fetch1 if we have a miss on cache or TLB
1001 # or a protection fault
1002 comb += stall_out.eq(~(is_hit & access_ok))
1003
1004 # -- Wishbone requests output (from the cache miss reload machine)
1005 # wishbone_out <= r.wb;
1006 # Wishbone requests output (from the cache miss reload machine)
1007 comb += wb_out.eq(r.wb)
1008 # end process;
1009
1010 # -- Cache hit synchronous machine
1011 # icache_hit : process(clk)
1012 # Cache hit synchronous machine
    # Cache hit synchronous machine (transliterated from the
    # "icache_hit" process in microwatt icache.vhdl; reset handling is
    # implicit via nmigen's synchronous reset).
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        """Latch the cache-hit state on the clock edge.

        While stalled (or when use_previous says last cycle's BRAM row
        is being reused) the outputs to fetch2 are held unchanged,
        except that flush_in still clears r.hit_valid.  Otherwise the
        hit/way of the current request is latched so that the BRAM
        data is available on cache_out of that way next cycle.

        Parameters are the shared combinatorial signals established in
        elaborate(); all writes here go to the sync domain.
        """
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall,
        # except that flush (or reset) clears valid;
        # if use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)

                sync += Display("cache hit nia:%x, IR:%x, SM:%x, idx:%x, " \
                                "tag:%x, way:%x, RA:%x", i_in.nia, \
                                i_in.virt_mode, i_in.stop_mark, req_index, \
                                req_tag, req_hit_way, real_addr)

        # Send stop marks and NIA down regardless of validity
        with m.If(~stall_in):
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)
1080
1081 # -- Cache miss/reload synchronous machine
1082 # icache_miss : process(clk)
1083 # Cache miss/reload synchronous machine
    # Cache miss/reload synchronous machine (transliterated from the
    # "icache_miss" process in microwatt icache.vhdl; reset handling
    # is implicit via nmigen's synchronous reset).
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        """Line-reload state machine.

        IDLE: on a miss, latch index/row/tag for the reload and start
        a wishbone read burst covering the whole cache line.
        CLR_TAG: pick the victim way, clear its valid bit (forcing
        misses while the line reloads) and write the new tag.
        WAIT_ACK: stream acks into the per-row valid flags until the
        last row completes, then mark the line valid and go IDLE.

        Also latches r.fetch_failed on a TLB miss / protection fault.
        """
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)  # scratch for one set's tag write
        stbs_done = Signal()            # all wishbone requests sent

        # Process cache invalidations: drop every line's valid bits
        # and invalidate any reload currently in flight
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                # Reset per-row valid flags,
                # only used in WAIT_ACK
                for i in range(ROW_PER_LINE):
                    sync += r.rows_valid[i].eq(0)

                # We need to read a cache line
                with m.If(req_is_miss):
                    sync += Display(
                        "cache miss nia:%x IR:%x SM:%x idx:%x way:%x " \
                        "tag:%x RA:%x", i_in.nia, i_in.virt_mode, \
                        i_in.stop_mark, req_index, replace_way, \
                        req_tag, real_addr)

                    # Keep track of our index and way
                    # for subsequent stores
                    sync += r.store_index.eq(req_index)
                    sync += r.store_row.eq(get_row(req_laddr))
                    sync += r.store_tag.eq(req_tag)
                    sync += r.store_valid.eq(1)
                    sync += r.end_row_ix.eq(
                        get_row_of_line(
                            get_row(req_laddr)
                        ) - 1
                    )

                    # Prep for first wishbone read: address of the
                    # start of the cache line, and start the WB cycle
                    sync += r.wb.adr.eq(req_laddr)
                    sync += r.wb.cyc.eq(1)
                    sync += r.wb.stb.eq(1)

                    # Track that we had one request sent
                    sync += r.state.eq(State.CLR_TAG)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    # Get victim way from plru
                    sync += r.store_way.eq(replace_way)

                    # Force misses on that way while
                    # reloading that line
                    # NOTE(review): cv holds one valid bit per *way*,
                    # so a width of NUM_WAYS looks intended here
                    # rather than INDEX_BITS -- confirm
                    cv = Signal(INDEX_BITS)
                    comb += cv.eq(cache_valid_bits[req_index])
                    comb += cv.bit_select(replace_way, 1).eq(0)
                    sync += cache_valid_bits[req_index].eq(cv)

                    # Store new tag in selected way
                    for i in range(NUM_WAYS):
                        with m.If(i == replace_way):
                            comb += tagset.eq(cache_tags[r.store_index])
                            comb += write_tag(i, tagset, r.store_tag)
                            sync += cache_tags[r.store_index].eq(tagset)

                    sync += r.state.eq(State.WAIT_ACK)

                # Requests are all sent if stb is 0
                stbs_zero = Signal()
                comb += stbs_zero.eq(r.wb.stb == 0)
                comb += stbs_done.eq(stbs_zero)

                # If we are still sending requests, was one accepted?
                # NOTE(review): the VHDL original gates this on
                # "wishbone_in.stall = '0' and not stbs_done"; here
                # stbs_zero is (stb == 0), i.e. the *inverse* of
                # "still sending" -- confirm this inversion is
                # intentional
                with m.If(~wb_in.stall & stbs_zero):
                    # That was the last word?  We are done sending.
                    # Clear stb and set stbs_done so we can handle
                    # an eventual last ack on the same cycle.
                    with m.If(is_last_row_addr(r.wb.adr, r.end_row_ix)):
                        sync += r.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                    # Calculate the next row address
                    # NOTE(review): this writes the incremented row
                    # slice back over the *whole* wb.adr, whereas the
                    # VHDL next_row_addr only replaces bits
                    # ROW_OFF_BITS..LINE_OFF_BITS-1 in place, keeping
                    # the upper address bits -- confirm
                    rarange = Signal(64)
                    comb += rarange.eq(
                        r.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
                    )
                    sync += r.wb.adr.eq(rarange)
                    sync += Display("r.wb.adr:%x", rarange)

                # Incoming acks processing
                with m.If(wb_in.ack):
                    # mark the just-arrived row valid so partial-line
                    # hits can be serviced by WAIT_ACK lookups
                    sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

                    # Check for completion
                    with m.If(stbs_done &
                              is_last_row(r.store_row, r.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r.wb.cyc.eq(0)

                        # Cache line is now valid (unless it was
                        # invalidated mid-reload)
                        # NOTE(review): same width concern as above --
                        # cv is per-way valid bits
                        cv = Signal(INDEX_BITS)
                        comb += cv.eq(cache_valid_bits[r.store_index])
                        comb += cv.bit_select(replace_way, 1).eq(
                            r.store_valid & ~inval_in
                        )
                        sync += cache_valid_bits[r.store_index].eq(cv)

                        # We are done
                        sync += r.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r.store_row.eq(next_row(r.store_row))

        # TLB miss and protection fault processing (reset clearing of
        # fetch_failed is handled by nmigen's synchronous reset)
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)

        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)
1346
1347 # icache_log: if LOG_LENGTH > 0 generate
    # Logger (the VHDL wraps this in "if LOG_LENGTH > 0 generate")
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        """Pack icache probe signals into log_out each cycle for the
        external logger; only generated when LOG_LENGTH > 0."""
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            # NOTE: these locals deliberately shadow the lway/wstate
            # parameters, mirroring the VHDL process variables
            log_data = Signal(54)    # one cycle's worth of probes
            lway = Signal(NUM_WAYS)  # latched hit way
            wstate = Signal()        # 1 while the miss FSM is not IDLE

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            # nmigen Cat() is LSB-first, so the fields appear in the
            # reverse order of the VHDL "&" concatenation.
            # NOTE(review): the VHDL packs lway as exactly 3 bits;
            # lway here is NUM_WAYS bits wide, which shifts the field
            # layout whenever NUM_WAYS != 3 -- confirm
            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6],
                r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
                r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
                i_out.valid
            ))
            comb += log_out.eq(log_data)
1407 # end generate;
1408 # end;
1409
1410 def elaborate(self, platform):
1411
1412 m = Module()
1413 comb = m.d.comb
1414
1415 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1416 cache_tags = CacheTagArray()
1417 cache_valid_bits = CacheValidBitsArray()
1418
1419 # signal itlb_valids : tlb_valids_t;
1420 # signal itlb_tags : tlb_tags_t;
1421 # signal itlb_ptes : tlb_ptes_t;
1422 # attribute ram_style of itlb_tags : signal is "distributed";
1423 # attribute ram_style of itlb_ptes : signal is "distributed";
1424 itlb_valid_bits = TLBValidBitsArray()
1425 itlb_tags = TLBTagArray()
1426 itlb_ptes = TLBPtesArray()
1427 # TODO to be passed to nmigen as ram attributes
1428 # attribute ram_style of itlb_tags : signal is "distributed";
1429 # attribute ram_style of itlb_ptes : signal is "distributed";
1430
1431 # -- Privilege bit from PTE EAA field
1432 # signal eaa_priv : std_ulogic;
1433 # Privilege bit from PTE EAA field
1434 eaa_priv = Signal()
1435
1436 # signal r : reg_internal_t;
1437 r = RegInternal()
1438
1439 # -- Async signals on incoming request
1440 # signal req_index : index_t;
1441 # signal req_row : row_t;
1442 # signal req_hit_way : way_t;
1443 # signal req_tag : cache_tag_t;
1444 # signal req_is_hit : std_ulogic;
1445 # signal req_is_miss : std_ulogic;
1446 # signal req_laddr : std_ulogic_vector(63 downto 0);
1447 # Async signal on incoming request
1448 req_index = Signal(NUM_LINES)
1449 req_row = Signal(BRAM_ROWS)
1450 req_hit_way = Signal(NUM_WAYS)
1451 req_tag = Signal(TAG_BITS)
1452 req_is_hit = Signal()
1453 req_is_miss = Signal()
1454 req_laddr = Signal(64)
1455
1456 # signal tlb_req_index : tlb_index_t;
1457 # signal real_addr : std_ulogic_vector(
1458 # REAL_ADDR_BITS - 1 downto 0
1459 # );
1460 # signal ra_valid : std_ulogic;
1461 # signal priv_fault : std_ulogic;
1462 # signal access_ok : std_ulogic;
1463 # signal use_previous : std_ulogic;
1464 tlb_req_index = Signal(TLB_SIZE)
1465 real_addr = Signal(REAL_ADDR_BITS)
1466 ra_valid = Signal()
1467 priv_fault = Signal()
1468 access_ok = Signal()
1469 use_previous = Signal()
1470
1471 # signal cache_out : cache_ram_out_t;
1472 cache_out = CacheRamOut()
1473
1474 # signal plru_victim : plru_out_t;
1475 # signal replace_way : way_t;
1476 plru_victim = PLRUOut()
1477 replace_way = Signal(NUM_WAYS)
1478
1479 # call sub-functions putting everything together, using shared
1480 # signals established above
1481 self.rams(m, r, cache_out, use_previous, replace_way, req_row)
1482 self.maybe_plrus(m, r, plru_victim)
1483 self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags,
1484 real_addr, itlb_valid_bits, ra_valid, eaa_priv,
1485 priv_fault, access_ok)
1486 self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
1487 self.icache_comb(m, use_previous, r, req_index, req_row,
1488 req_tag, real_addr, req_laddr, cache_valid_bits,
1489 cache_tags, access_ok, req_is_hit, req_is_miss,
1490 replace_way, plru_victim, cache_out)
1491 self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
1492 req_index, req_tag, real_addr)
1493 self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
1494 req_laddr, req_tag, replace_way, cache_tags,
1495 access_ok, real_addr)
1496 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
1497 # req_is_miss, req_is_hit, lway, wstate, r)
1498
1499 return m
1500
1501
1502 # icache_tb.vhdl
1503 #
1504 # library ieee;
1505 # use ieee.std_logic_1164.all;
1506 #
1507 # library work;
1508 # use work.common.all;
1509 # use work.wishbone_types.all;
1510 #
1511 # entity icache_tb is
1512 # end icache_tb;
1513 #
1514 # architecture behave of icache_tb is
1515 # signal clk : std_ulogic;
1516 # signal rst : std_ulogic;
1517 #
1518 # signal i_out : Fetch1ToIcacheType;
1519 # signal i_in : IcacheToDecode1Type;
1520 #
1521 # signal m_out : MmuToIcacheType;
1522 #
1523 # signal wb_bram_in : wishbone_master_out;
1524 # signal wb_bram_out : wishbone_slave_out;
1525 #
1526 # constant clk_period : time := 10 ns;
1527 # begin
1528 # icache0: entity work.icache
1529 # generic map(
1530 # LINE_SIZE => 64,
1531 # NUM_LINES => 4
1532 # )
1533 # port map(
1534 # clk => clk,
1535 # rst => rst,
1536 # i_in => i_out,
1537 # i_out => i_in,
1538 # m_in => m_out,
1539 # stall_in => '0',
1540 # flush_in => '0',
1541 # inval_in => '0',
1542 # wishbone_out => wb_bram_in,
1543 # wishbone_in => wb_bram_out
1544 # );
1545 #
1546 # -- BRAM Memory slave
1547 # bram0: entity work.wishbone_bram_wrapper
1548 # generic map(
1549 # MEMORY_SIZE => 1024,
1550 # RAM_INIT_FILE => "icache_test.bin"
1551 # )
1552 # port map(
1553 # clk => clk,
1554 # rst => rst,
1555 # wishbone_in => wb_bram_in,
1556 # wishbone_out => wb_bram_out
1557 # );
1558 #
1559 # clk_process: process
1560 # begin
1561 # clk <= '0';
1562 # wait for clk_period/2;
1563 # clk <= '1';
1564 # wait for clk_period/2;
1565 # end process;
1566 #
1567 # rst_process: process
1568 # begin
1569 # rst <= '1';
1570 # wait for 2*clk_period;
1571 # rst <= '0';
1572 # wait;
1573 # end process;
1574 #
1575 # stim: process
1576 # begin
1577 # i_out.req <= '0';
1578 # i_out.nia <= (others => '0');
1579 # i_out.stop_mark <= '0';
1580 #
1581 # m_out.tlbld <= '0';
1582 # m_out.tlbie <= '0';
1583 # m_out.addr <= (others => '0');
1584 # m_out.pte <= (others => '0');
1585 #
1586 # wait until rising_edge(clk);
1587 # wait until rising_edge(clk);
1588 # wait until rising_edge(clk);
1589 # wait until rising_edge(clk);
1590 #
1591 # i_out.req <= '1';
1592 # i_out.nia <= x"0000000000000004";
1593 #
1594 # wait for 30*clk_period;
1595 # wait until rising_edge(clk);
1596 #
1597 # assert i_in.valid = '1' severity failure;
1598 # assert i_in.insn = x"00000001"
1599 # report "insn @" & to_hstring(i_out.nia) &
1600 # "=" & to_hstring(i_in.insn) &
1601 # " expected 00000001"
1602 # severity failure;
1603 #
1604 # i_out.req <= '0';
1605 #
1606 # wait until rising_edge(clk);
1607 #
1608 # -- hit
1609 # i_out.req <= '1';
1610 # i_out.nia <= x"0000000000000008";
1611 # wait until rising_edge(clk);
1612 # wait until rising_edge(clk);
1613 # assert i_in.valid = '1' severity failure;
1614 # assert i_in.insn = x"00000002"
1615 # report "insn @" & to_hstring(i_out.nia) &
1616 # "=" & to_hstring(i_in.insn) &
1617 # " expected 00000002"
1618 # severity failure;
1619 # wait until rising_edge(clk);
1620 #
1621 # -- another miss
1622 # i_out.req <= '1';
1623 # i_out.nia <= x"0000000000000040";
1624 #
1625 # wait for 30*clk_period;
1626 # wait until rising_edge(clk);
1627 #
1628 # assert i_in.valid = '1' severity failure;
1629 # assert i_in.insn = x"00000010"
1630 # report "insn @" & to_hstring(i_out.nia) &
1631 # "=" & to_hstring(i_in.insn) &
1632 # " expected 00000010"
1633 # severity failure;
1634 #
1635 # -- test something that aliases
1636 # i_out.req <= '1';
1637 # i_out.nia <= x"0000000000000100";
1638 # wait until rising_edge(clk);
1639 # wait until rising_edge(clk);
1640 # assert i_in.valid = '0' severity failure;
1641 # wait until rising_edge(clk);
1642 #
1643 # wait for 30*clk_period;
1644 # wait until rising_edge(clk);
1645 #
1646 # assert i_in.valid = '1' severity failure;
1647 # assert i_in.insn = x"00000040"
1648 # report "insn @" & to_hstring(i_out.nia) &
1649 # "=" & to_hstring(i_in.insn) &
1650 # " expected 00000040"
1651 # severity failure;
1652 #
1653 # i_out.req <= '0';
1654 #
1655 # std.env.finish;
1656 # end process;
1657 # end;
def icache_sim(dut):
    """Simulation stimulus: a miss, a hit, another miss, then an access
    that aliases the first line — checking the instruction word returned
    each time (mirrors the "stim" process of the VHDL icache_tb).

    Note the deliberate crossover: i_out here is the *request* port
    (dut.i_in, fetch1 -> icache) and i_in is the *response* port
    (dut.i_out, icache -> decode1).
    """
    i_out = dut.i_in   # request interface (fetch1 -> icache)
    i_in = dut.i_out   # response interface (icache -> decode1)
    m_out = dut.m_in   # MMU -> icache interface

    # quiesce all inputs
    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # first access: a miss -- allow 30 cycles for the line reload
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    print(f"valid? {valid}")
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit: next word of the same line, available after 2 cycles
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_in.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases: must first present as a miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    # bug fix: was "assert ~valid" -- bitwise NOT of a Python int is
    # always truthy (~0 == -1), so that assertion could never fire;
    # a logical "not" is required to check valid is low here
    assert not valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia   # re-read: previous nia was stale
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)
1734
1735
1736
def test_icache(mem):
    """Instantiate an ICache wired to a wishbone SRAM model seeded
    with *mem*, then run the icache_sim stimulus against it."""
    dut = ICache()

    # backing store: 64-bit wide, 16 cache lines' worth of rows
    backing = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=backing, granularity=8)

    m = Module()
    m.submodules.icache = dut
    m.submodules.sram = sram

    # wishbone master (icache) -> SRAM slave; the SRAM is row-addressed,
    # hence the byte-address shift on adr
    m.d.comb += [
        sram.bus.cyc.eq(dut.wb_out.cyc),
        sram.bus.stb.eq(dut.wb_out.stb),
        sram.bus.we.eq(dut.wb_out.we),
        sram.bus.sel.eq(dut.wb_out.sel),
        sram.bus.adr.eq(dut.wb_out.adr[3:]),
        sram.bus.dat_w.eq(dut.wb_out.dat),
    ]

    # SRAM slave -> icache master response path
    m.d.comb += [
        dut.wb_in.ack.eq(sram.bus.ack),
        dut.wb_in.dat.eq(sram.bus.dat_r),
    ]

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()
1765
if __name__ == '__main__':
    # emit RTLIL for synthesis/inspection
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # test memory: each 64-bit row packs two consecutive 32-bit
    # "instruction" words, values 0, 1, 2, ...
    mem = [(i*2) | ((i*2+1) << 32) for i in range(512)]

    test_icache(mem)
1777