1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 """
from enum import Enum, unique
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

# for test
from nmigen_soc.wishbone.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

if True:
    from nmigen.back.pysim import Simulator, Delay, Settle
else:
    from nmigen.sim.cxxsim import Simulator, Delay, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 16
# Number of ways
NUM_WAYS = 4
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

113 print("BRAM_ROWS =", BRAM_ROWS)
114 print("INDEX_BITS =", INDEX_BITS)
115 print("INSN_BITS =", INSN_BITS)
116 print("INSN_PER_ROW =", INSN_PER_ROW)
117 print("LINE_SIZE =", LINE_SIZE)
118 print("LINE_OFF_BITS =", LINE_OFF_BITS)
119 print("LOG_LENGTH =", LOG_LENGTH)
120 print("NUM_LINES =", NUM_LINES)
121 print("NUM_WAYS =", NUM_WAYS)
122 print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
123 print("ROW_BITS =", ROW_BITS)
124 print("ROW_OFF_BITS =", ROW_OFF_BITS)
125 print("ROW_LINE_BITS =", ROW_LINE_BITS)
126 print("ROW_PER_LINE =", ROW_PER_LINE)
127 print("ROW_SIZE =", ROW_SIZE)
128 print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
129 print("SET_SIZE_BITS =", SET_SIZE_BITS)
130 print("SIM =", SIM)
131 print("TAG_BITS =", TAG_BITS)
132 print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
133 print("TAG_BITS =", TAG_BITS)
134 print("TLB_BITS =", TLB_BITS)
135 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
136 print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
137 print("TLB_PTE_BITS =", TLB_PTE_BITS)
138 print("TLB_SIZE =", TLB_SIZE)
139 print("WAY_BITS =", WAY_BITS)
140
# from microwatt/utils.vhdl
def ispow2(n):
    return ((n << 32) & ((n-1) << 32)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"


# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index| line  |
# ..         |   row   |   |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)

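# Illustrative example (added commentary, not from microwatt): with the
# *default* geometry above (LINE_SIZE=64, NUM_LINES=16, ROW_SIZE=8) a
# 56-bit real address splits as
#   [0:3]    ROW_OFF_BITS  - byte offset within an 8-byte BRAM row
#   [3:6]    ROW_LINE_BITS - row within the 64-byte line
#   [6:10]   INDEX_BITS    - which of the 16 lines (the set index)
#   [10:56]  TAG_BITS      - the 46-bit tag compared against the tag RAM
# so e.g. real address 0x12345678 has row offset 0x0, row-in-line 0x7,
# index 0x9 and tag 0x48d15.
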
# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags

def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x)
                 for x in range(NUM_LINES))

# The cache valid bits
def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x)
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" % x)
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";


#type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
def TLBValidBitsArray():
    return Array(Signal(name="tlbvalid_%d" % x)
                 for x in range(TLB_SIZE))

def TLBTagArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtag_%d" % x)
                 for x in range(TLB_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_BITS, name="tlbptes_%d" % x)
                 for x in range(TLB_SIZE))


# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" % x)
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" % x)
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Return whether the given address is in the last row of a cache line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Return whether the given row index is the last row of a cache line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
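# Illustrative example (added commentary): with ROW_LINE_BITS=3,
# next_row(0b10_111) increments only the low 3 bits, giving 0b10_000:
# the row wraps within its own cache line and the upper (line index)
# bits are left untouched, so only a 3-bit adder is generated.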

# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)
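# Illustrative example (added commentary): with INSN_BITS=1 each 64-bit
# row holds two instructions; address bit 2 selects data[0:32] (word 0)
# or data[32:64] (word 1).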

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_BITS)

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh
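# Illustrative example (added commentary): with TLB_LG_PGSZ=12 and
# TLB_BITS=6 the hash XORs address bits [12:18], [18:24] and [24:30],
# folding the effective page number down to a 6-bit TLB index.  For an
# EA whose page-number fields are 0b000001, 0b000010 and 0b000011 the
# resulting index is 1 ^ 2 ^ 3 = 0.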


# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(NUM_WAYS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(NUM_WAYS)
        self.store_index = Signal(NUM_LINES)
        self.store_row = Signal(BRAM_ROWS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(Elaboratable):
    """64 bit set-associative icache. All instructions are 4B aligned."""
    def __init__(self):
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        self.wb_out = WBMasterOut(name="wb_out")
        self.wb_in = WBSlaveOut(name="wb_in")

        self.log_out = Signal(54)

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        wb_in, stall_in = self.wb_in, self.stall_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd_%d" % i)
            do_write = Signal(name="do_wr_%d" % i)
            rd_addr = Signal(ROW_BITS)
            wr_addr = Signal(ROW_BITS)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wb_in.dat)

            comb += do_read.eq(~(stall_in | use_previous))
            comb += do_write.eq(wb_in.ack & (replace_way == i))

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(r.hit_way == i):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += rd_addr.eq(req_row)
            comb += wr_addr.eq(r.store_row)
            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        with m.If(NUM_WAYS > 1):
            for i in range(NUM_LINES):
                plru_acc_i = Signal(WAY_BITS)
                plru_acc_en = Signal()
                plru = PLRU(WAY_BITS)
                setattr(m.submodules, "plru_%d" % i, plru)

                comb += plru.acc_i.eq(plru_acc_i)
                comb += plru.acc_en.eq(plru_acc_en)

                # PLRU interface
                with m.If(get_index(r.hit_nia) == i):
                    comb += plru.acc_en.eq(r.hit_valid)

                comb += plru.acc_i.eq(r.hit_way)
                comb += plru_victim[i].eq(plru.lru_o)

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb_ptes, itlb_tags,
                    real_addr, itlb_valid_bits, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb_ptes[tlb_req_index])
        comb += ttag.eq(itlb_tags[tlb_req_index])

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(i_in.nia[:TLB_LG_PGSZ],
                                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb_valid_bits[tlb_req_index])

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb_valid_bits, itlb_tags, itlb_ptes):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        wr_index = Signal(TLB_SIZE)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb_valid_bits[i].eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb_valid_bits[wr_index].eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb_tags[wr_index].eq(
                        m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb_ptes[wr_index].eq(m_in.pte)
            sync += itlb_valid_bits[wr_index].eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_valid_bits, cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, wb_out = self.i_in, self.i_out, self.wb_out
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(NUM_WAYS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)
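        # Illustrative example (added commentary): with INSN_PER_ROW=2, a
        # fetch of nia=0x1004 that follows a hit at 0x1000 has
        # i_in.sequential set and bit 2 of the nia non-zero, so the row
        # latched last cycle already holds the wanted upper 32-bit word and
        # no new BRAM read is needed.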

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(Const(0, ROW_OFF_BITS),
                                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE])
        with m.If(i_in.req):
            cvb = Signal(NUM_WAYS)
            ctag = Signal(TAG_RAM_WIDTH)
            comb += ctag.eq(cache_tags[req_index])
            comb += cvb.eq(cache_valid_bits[req_index])
            for i in range(NUM_WAYS):
                tagi = Signal(TAG_BITS, name="tag_i%d" % i)
                comb += tagi.eq(read_tag(i, ctag))
                hit_test = Signal(name="hit_test%d" % i)
                comb += hit_test.eq(i == r.store_way)
                with m.If((cvb[i] | (hitcond & hit_test))
                          & (tagi == req_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        with m.Else():
            comb += req_is_hit.eq(0)
            comb += req_is_miss.eq(0)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim[r.store_index])
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += wb_out.eq(r.wb)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display(
                    "cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                    "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                    i_in.stop_mark, req_index, req_tag,
                    req_hit_way, real_addr
                )

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                "cache miss nia:%x IR:%x SM:%x idx:%x "
                "way:%x tag:%x RA:%x", i_in.nia,
                i_in.virt_mode, i_in.stop_mark, req_index,
                replace_way, req_tag, real_addr
            )

            # Keep track of our index and way for subsequent stores
            st_row = Signal(BRAM_ROWS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache line and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            cache_valid_bits, req_index,
                            tagset, cache_tags):

        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)
        # Force misses on that way while reloading that line
        cv = Signal(NUM_WAYS)
        comb += cv.eq(cache_valid_bits[req_index])
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_valid_bits[req_index].eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index])
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             stbs_done, cache_valid_bits):
        comb = m.d.comb
        sync = m.d.sync

        wb_in = self.wb_in

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~wb_in.stall & ~stbs_zero):
            # That was the last word?  We are done sending.  Clear stb
            # and set stbs_done so we can handle an eventual last ack
            # on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display(
                    "IS_LAST_ROW_ADDR r.wb.addr:%x "
                    "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                    "stbs_done:%x", r.wb.adr, r.end_row_ix,
                    r.wb.stb, stbs_zero, stbs_done
                )
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(wb_in.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            wb_in.dat, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done &
                      is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                sync += r.req_adr.eq(0)  # be nice, clear addr

                # Cache line is now valid
                cv = Signal(NUM_WAYS)
                comb += cv.eq(cache_valid_bits[r.store_index])
                comb += cv.bit_select(replace_way, 1).eq(
                            r.store_valid & ~inval_in)
                sync += cache_valid_bits[r.store_index].eq(cv)

                sync += r.state.eq(State.IDLE)

            # not completed, move on to next request in row
            with m.Else():
                # Increment store row counter
                sync += r.store_row.eq(next_row(r.store_row))


    # Cache miss/reload synchronous machine
    def icache_miss(self, m, cache_valid_bits, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, wb_in, m_in = self.i_in, self.wb_in, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])
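        # Note (added commentary): sel of all-ones requests the full 64-bit
        # row, and dropping the low 3 bits of req_adr converts the byte
        # address into the 8-byte-word address used on the wishbone bus in
        # this design.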

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_valid_bits[i].eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(
                    m, r, req_is_miss, req_laddr,
                    req_index, req_tag, replace_way,
                    real_addr
                )

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(
                        m, r, replace_way,
                        cache_valid_bits, req_index,
                        tagset, cache_tags
                    )

                self.icache_miss_wait_ack(
                    m, r, replace_way, inval_in,
                    stbs_done, cache_valid_bits
                )

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        wb_in, i_out = self.wb_in, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(NUM_WAYS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                ra_valid, access_ok, req_is_miss, req_is_hit,
                lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                stall_out, wb_in.stall, r.wb.cyc, r.wb.stb,
                r.real_addr[3:6], wb_in.ack, i_out.insn, i_out.valid
            ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_valid_bits = CacheValidBitsArray()

        itlb_valid_bits = TLBValidBitsArray()
        itlb_tags = TLBTagArray()
        itlb_ptes = TLBPtesArray()
        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signals on incoming request
        req_index = Signal(NUM_LINES)
        req_row = Signal(BRAM_ROWS)
        req_hit_way = Signal(NUM_WAYS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_SIZE)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(NUM_WAYS)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb_ptes, itlb_tags, real_addr,
                         itlb_valid_bits, ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb_valid_bits, itlb_tags, itlb_ptes)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr, cache_valid_bits,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, cache_valid_bits, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        return m


def icache_sim(dut):
    i_out = dut.i_in
    i_in = dut.i_out
    m_out = dut.m_in

    yield i_in.valid.eq(0)
    yield i_out.priv_mode.eq(1)
    yield i_out.req.eq(0)
    yield i_out.nia.eq(0)
    yield i_out.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000004, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    print(f"valid? {valid}")
    assert valid
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_out.req.eq(0)
    yield

    # hit
    yield
    yield
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000008, 64))
    yield
    yield
    valid = yield i_in.valid
    nia = yield i_in.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)
    yield

    # another miss
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000040, 64))
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    nia = yield i_out.nia
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases
    yield i_out.req.eq(1)
    yield i_out.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_in.valid
    assert ~valid
    for i in range(30):
        yield
    yield
    valid = yield i_in.valid
    insn = yield i_in.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_out.req.eq(0)


def test_icache(mem):
    dut = ICache()

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()

if __name__ == '__main__':
    dut = ICache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1) << 32))
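    # Note (added commentary): each 64-bit memory word i holds 2*i in its
    # low half and 2*i+1 in its high half, so the 32-bit "instruction" at
    # byte address a is simply a>>2.  This is what the asserts in
    # icache_sim check (e.g. 0x00000001 at nia 0x4, 0x00000010 at 0x40).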

    test_icache(mem)