# src/soc/experiment/icache.py
1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on fluch/redirect
15 * Check if playing with the geometry of the cache tags allow for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """

from enum import (Enum, unique)
from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
                    Record)
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.lib.coding import Decoder
from nmutil.util import Display

#from nmutil.plru import PLRU
from soc.experiment.plru import PLRU, PLRUs
from soc.experiment.cache_ram import CacheRam

from soc.experiment.mem_types import (Fetch1ToICacheType,
                                      ICacheToDecode1Type,
                                      MMUToICacheType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
                                     WB_SEL_BITS, WBAddrType, WBDataType,
                                     WBSelType, WBMasterOut, WBSlaveOut,
                                     )

from nmigen_soc.wishbone.bus import Interface
from soc.minerva.units.fetch import FetchUnitInterface


# for test
from soc.bus.sram import SRAM
from nmigen import Memory
from nmutil.util import wrap

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator, Settle


SIM = 0
LINE_SIZE = 64
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
# Number of lines in a set
NUM_LINES = 64
# Number of ways
NUM_WAYS = 2
# L1 ITLB number of entries (direct mapped)
TLB_SIZE = 64
# L1 ITLB log_2(page_size)
TLB_LG_PGSZ = 12
# Number of real address bits that we store
REAL_ADDR_BITS = 56
# Non-zero to enable log data collection
LOG_LENGTH = 0

ROW_SIZE_BITS = ROW_SIZE * 8
# ROW_PER_LINE is the number of row (wishbone) transactions in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
# BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
# INSN_PER_ROW is the number of 32bit instructions per BRAM row
INSN_PER_ROW = ROW_SIZE_BITS // 32

# Bit fields counts in the address
#
# INSN_BITS is the number of bits to select an instruction in a row
INSN_BITS = log2_int(INSN_PER_ROW)
# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)
# ROW_LINE_BITS is the number of bits to select a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)
# LINE_OFF_BITS is the number of bits for the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)
# ROW_OFF_BITS is the number of bits for the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)
# INDEX_BITS is the number of bits to select a cache line
INDEX_BITS = log2_int(NUM_LINES)
# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
# TAG_BITS is the number of bits of the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS

# L1 ITLB
TLB_BITS = log2_int(TLB_SIZE)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
TLB_PTE_BITS = 64

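# For reference, with the default geometry above (LINE_SIZE=64, NUM_LINES=64,
# NUM_WAYS=2, 64-bit wishbone data) the derived values work out to:
#   ROW_SIZE=8       ROW_SIZE_BITS=64   ROW_PER_LINE=8    BRAM_ROWS=512
#   INSN_PER_ROW=2   INSN_BITS=1        ROW_BITS=9        ROW_LINE_BITS=3
#   LINE_OFF_BITS=6  ROW_OFF_BITS=3     INDEX_BITS=6      SET_SIZE_BITS=12
#   TAG_BITS=44      TAG_WIDTH=48       TAG_RAM_WIDTH=88  WAY_BITS=1
#   TLB_BITS=6       TLB_EA_TAG_BITS=46
# (TAG_WIDTH simply rounds TAG_BITS up to the next multiple of 8.)
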
print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("SIM =", SIM)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)

# from microwatt/utils.vhdl
def ispow2(n):
    return n != 0 and (n & (n - 1)) == 0

assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |   | |00| zero          (2)
# ..         |     |   |-|  | INSN_BITS     (1)
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (53)
# The cache data BRAM organized as described above for each way
#subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
#
# The cache tags LUTRAM has a row per set. Vivado is a pain and will
# not handle a clean (commented) definition of the cache tags as a 3d
# memory. For now, work around it by putting all the tags
def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS),
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid_%d" %x) \
                 for x in range(ROW_PER_LINE))


# TODO to be passed to nmigen as ram attributes
# attribute ram_style : string;
# attribute ram_style of cache_tags : signal is "distributed";

def TLBArray():
    tlb_layout = [('valid', 1),
                  ('tag', TLB_EA_TAG_BITS),
                  ('pte', TLB_PTE_BITS)
                 ]
    return Array(Record(tlb_layout, name="tlb%d" % x) for x in range(TLB_SIZE))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
                 for x in range(NUM_LINES))

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether the address is in the last row of a cache line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a dedicated
# function in order to limit the size of the generated adder to be
# only the bits within a cache line (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

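# Example with the default ROW_LINE_BITS=3: if the low three bits of a row
# number are 0b111, next_row() wraps them back to 0b000 while leaving the
# upper (line index) bits unchanged, i.e. the walk stays within one line.
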
# Read the instruction word for the given address
# in the current cache row
def read_insn_word(addr, data):
    word = addr[2:INSN_BITS+2]
    return data.word_select(word, 32)

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Write a tag to tag memory row
def write_tag(way, tagset, tag):
    return read_tag(way, tagset).eq(tag)

# Simple hash for direct-mapped TLB index
def hash_ea(addr):
    hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
           addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS] ^
           addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
    return hsh

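# A minimal pure-Python sketch of the same folding hash, for documentation
# only: hash_ea() above operates on nmigen Signals, whereas this model takes
# a plain integer address. The name _hash_ea_model is not part of the
# original design and is unused by the hardware.
def _hash_ea_model(addr):
    mask = (1 << TLB_BITS) - 1
    # XOR three TLB_BITS-wide fields sitting just above the page offset
    return (((addr >> TLB_LG_PGSZ) & mask) ^
            ((addr >> (TLB_LG_PGSZ + TLB_BITS)) & mask) ^
            ((addr >> (TLB_LG_PGSZ + 2 * TLB_BITS)) & mask))
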

# Cache reload state machine
@unique
class State(Enum):
    IDLE = 0
    CLR_TAG = 1
    WAIT_ACK = 2


class RegInternal(RecordObject):
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way = Signal(WAY_BITS)
        self.hit_nia = Signal(64)
        self.hit_smark = Signal()
        self.hit_valid = Signal()

        # Cache miss state (reload state machine)
        self.state = Signal(State, reset=State.IDLE)
        self.wb = WBMasterOut("wb")
        self.req_adr = Signal(64)
        self.store_way = Signal(WAY_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_tag = Signal(TAG_BITS)
        self.store_valid = Signal()
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()

        # TLB miss state
        self.fetch_failed = Signal()


class ICache(FetchUnitInterface, Elaboratable):
    """64 bit set associative icache. All instructions are 4B aligned."""
    def __init__(self, pspec):
        FetchUnitInterface.__init__(self, pspec)
        self.i_in = Fetch1ToICacheType(name="i_in")
        self.i_out = ICacheToDecode1Type(name="i_out")

        self.m_in = MMUToICacheType(name="m_in")

        self.stall_in = Signal()
        self.stall_out = Signal()
        self.flush_in = Signal()
        self.inval_in = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="icache_wb")

        self.log_out = Signal(54)

        # use FetchUnitInterface, helps keep some unit tests running
        self.use_fetch_iface = False

    def use_fetch_interface(self):
        self.use_fetch_iface = True

    # Generate a cache RAM for each way
    def rams(self, m, r, cache_out_row, use_previous,
             replace_way, req_row):

        comb = m.d.comb
        sync = m.d.sync

        bus, stall_in = self.bus, self.stall_in

        # read condition (for every cache ram)
        do_read = Signal()
        comb += do_read.eq(~(stall_in | use_previous))

        rd_addr = Signal(ROW_BITS)
        wr_addr = Signal(ROW_BITS)
        comb += rd_addr.eq(req_row)
        comb += wr_addr.eq(r.store_row)

        # binary-to-unary converters: replace-way enabled by bus.ack,
        # hit-way left permanently enabled
        m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
        m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
        comb += re.i.eq(replace_way)
        comb += re.n.eq(~bus.ack)
        comb += he.i.eq(r.hit_way)

        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr_%d" % i)
            d_out = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
            wr_sel = Signal(ROW_SIZE, name="wr_sel_%d" % i)

            way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(bus.dat_r)

            comb += do_write.eq(re.o[i])

            with m.If(do_write):
                sync += Display("cache write adr: %x data: %lx",
                                wr_addr, way.wr_data)

            with m.If(he.o[i]):
                comb += cache_out_row.eq(d_out)
                with m.If(do_read):
                    sync += Display("cache read adr: %x data: %x",
                                    req_row, d_out)

            comb += wr_sel.eq(Repl(do_write, ROW_SIZE))

    # Generate PLRUs
    def maybe_plrus(self, m, r, plru_victim):
        comb = m.d.comb

        if NUM_WAYS == 0:
            return


        m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
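        # one pseudo-LRU tracker per cache line: the hookups below feed in
        # each hit (way/index/valid) to update LRU state, and read out the
        # victim way for the line currently being reloaded (r.store_index)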
        comb += plru.way.eq(r.hit_way)
        comb += plru.valid.eq(r.hit_valid)
        comb += plru.index.eq(get_index(r.hit_nia))
        comb += plru.isel.eq(r.store_index) # select victim
        comb += plru_victim.eq(plru.o_index) # selected victim

    # TLB hit detection and real address generation
    def itlb_lookup(self, m, tlb_req_index, itlb,
                    real_addr, ra_valid, eaa_priv,
                    priv_fault, access_ok):

        comb = m.d.comb

        i_in = self.i_in

        pte = Signal(TLB_PTE_BITS)
        ttag = Signal(TLB_EA_TAG_BITS)

        comb += tlb_req_index.eq(hash_ea(i_in.nia))
        comb += pte.eq(itlb[tlb_req_index].pte)
        comb += ttag.eq(itlb[tlb_req_index].tag)

        with m.If(i_in.virt_mode):
            comb += real_addr.eq(Cat(
                     i_in.nia[:TLB_LG_PGSZ],
                     pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
                    ))

            with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
                comb += ra_valid.eq(itlb[tlb_req_index].valid)

            comb += eaa_priv.eq(pte[3])

        with m.Else():
            comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
            comb += ra_valid.eq(1)
            comb += eaa_priv.eq(1)

        # No IAMR, so no KUEP support for now
        comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
        comb += access_ok.eq(ra_valid & ~priv_fault)

    # iTLB update
    def itlb_update(self, m, itlb):
        comb = m.d.comb
        sync = m.d.sync

        m_in = self.m_in

        # index produced by hash_ea(), hence TLB_BITS wide
        wr_index = Signal(TLB_BITS)
        comb += wr_index.eq(hash_ea(m_in.addr))

        with m.If(m_in.tlbie & m_in.doall):
            # Clear all valid bits
            for i in range(TLB_SIZE):
                sync += itlb[i].valid.eq(0)

        with m.Elif(m_in.tlbie):
            # Clear entry regardless of hit or miss
            sync += itlb[wr_index].valid.eq(0)

        with m.Elif(m_in.tlbld):
            sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
            sync += itlb[wr_index].pte.eq(m_in.pte)
            sync += itlb[wr_index].valid.eq(1)

    # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):

        comb = m.d.comb

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                ))

        # Test if pending request is a hit on any way
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                           & (req_index == r.store_index)
                           & r.rows_valid[req_row % ROW_PER_LINE]
                          )
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += ctag.eq(cache_tags[req_index].tag)
        comb += cvb.eq(cache_tags[req_index].valid)
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be to output an entire row, which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)

    # Cache hit synchronous machine
    def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
                   req_index, req_tag, real_addr):
        sync = m.d.sync

        i_in, stall_in = self.i_in, self.stall_in
        flush_in = self.flush_in

        # keep outputs to fetch2 unchanged on a stall
        # except that flush or reset sets valid to 0
        # If use_previous, keep the same data as last
        # cycle and use the second half
        with m.If(stall_in | use_previous):
            with m.If(flush_in):
                sync += r.hit_valid.eq(0)
        with m.Else():
            # On a hit, latch the request for the next cycle,
            # when the BRAM data will be available on the
            # cache_out output of the corresponding way
            sync += r.hit_valid.eq(req_is_hit)

            with m.If(req_is_hit):
                sync += r.hit_way.eq(req_hit_way)
                sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
                                "way:%x RA:%x", i_in.nia, i_in.virt_mode,
                                i_in.stop_mark, req_index, req_tag,
                                req_hit_way, real_addr)

        with m.If(~stall_in):
            # Send stop marks and NIA down regardless of validity
            sync += r.hit_smark.eq(i_in.stop_mark)
            sync += r.hit_nia.eq(i_in.nia)

    def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
                         req_index, req_tag, replace_way, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in = self.i_in

        # Reset per-row valid flags, only used in WAIT_ACK
        for i in range(ROW_PER_LINE):
            sync += r.rows_valid[i].eq(0)

        # We need to read a cache line
        with m.If(req_is_miss):
            sync += Display(
                     "cache miss nia:%x IR:%x SM:%x idx:%x "
                     " way:%x tag:%x RA:%x", i_in.nia,
                     i_in.virt_mode, i_in.stop_mark, req_index,
                     replace_way, req_tag, real_addr)

            # Keep track of our index and way for subsequent stores
            st_row = Signal(ROW_BITS)
            comb += st_row.eq(get_row(req_laddr))
            sync += r.store_index.eq(req_index)
            sync += r.store_row.eq(st_row)
            sync += r.store_tag.eq(req_tag)
            sync += r.store_valid.eq(1)
            sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
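            # reload starts at the row containing the requested address
            # and wraps around the line, so the last row to arrive is the
            # one just before the starting row (hence the "- 1" above)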

            # Prep for first wishbone read.  We calculate the address
            # of the start of the cache row and start the WB cycle.
            sync += r.req_adr.eq(req_laddr)
            sync += r.wb.cyc.eq(1)
            sync += r.wb.stb.eq(1)

            # Track that we had one request sent
            sync += r.state.eq(State.CLR_TAG)

    def icache_miss_clr_tag(self, m, r, replace_way,
                            req_index,
                            tagset, cache_tags):
        comb = m.d.comb
        sync = m.d.sync

        # Get victim way from plru
        sync += r.store_way.eq(replace_way)

        # Force misses on that way while reloading that line
        cv = Signal(NUM_WAYS)  # per-way valid bits for this line
        comb += cv.eq(cache_tags[req_index].valid)
        comb += cv.bit_select(replace_way, 1).eq(0)
        sync += cache_tags[req_index].valid.eq(cv)

        for i in range(NUM_WAYS):
            with m.If(i == replace_way):
                comb += tagset.eq(cache_tags[r.store_index].tag)
                comb += write_tag(i, tagset, r.store_tag)
                sync += cache_tags[r.store_index].tag.eq(tagset)

        sync += r.state.eq(State.WAIT_ACK)

    def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
                             cache_tags, stbs_done):
        comb = m.d.comb
        sync = m.d.sync

        bus = self.bus

        # Requests are all sent if stb is 0
        stbs_zero = Signal()
        comb += stbs_zero.eq(r.wb.stb == 0)
        comb += stbs_done.eq(stbs_zero)

        # If we are still sending requests, was one accepted?
        with m.If(~bus.stall & ~stbs_zero):
            # That was the last word?  We are done sending.
            # Clear stb and set stbs_done so we can handle
            # an eventual last ack on the same cycle.
            with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
                sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
                                "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
                                "stbs_done:%x", r.wb.adr, r.end_row_ix,
                                r.wb.stb, stbs_zero, stbs_done)
                sync += r.wb.stb.eq(0)
                comb += stbs_done.eq(1)

            # Calculate the next row address
            rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
            comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
            sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
            sync += Display("RARANGE r.req_adr:%x rarange:%x "
                            "stbs_zero:%x stbs_done:%x",
                            r.req_adr, rarange, stbs_zero, stbs_done)

        # Incoming acks processing
        with m.If(bus.ack):
            sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
                            "stbs_done:%x",
                            bus.dat_r, stbs_zero, stbs_done)

            sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)

            # Check for completion
            with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
                # Complete wishbone cycle
                sync += r.wb.cyc.eq(0)
                # be nice, clear addr
                sync += r.req_adr.eq(0)

                # Cache line is now valid
                cv = Signal(NUM_WAYS)  # per-way valid bits for this line
                comb += cv.eq(cache_tags[r.store_index].valid)
                comb += cv.bit_select(replace_way, 1).eq(
                         r.store_valid & ~inval_in)
                sync += cache_tags[r.store_index].valid.eq(cv)

                sync += r.state.eq(State.IDLE)

            # move on to next request in row
            # Increment store row counter
            sync += r.store_row.eq(next_row(r.store_row))

    # Cache miss/reload synchronous machine
    def icache_miss(self, m, r, req_is_miss,
                    req_index, req_laddr, req_tag, replace_way,
                    cache_tags, access_ok, real_addr):
        comb = m.d.comb
        sync = m.d.sync

        i_in, bus, m_in = self.i_in, self.bus, self.m_in
        stall_in, flush_in = self.stall_in, self.flush_in
        inval_in = self.inval_in

        tagset = Signal(TAG_RAM_WIDTH)
        stbs_done = Signal()

        comb += r.wb.sel.eq(-1)
        comb += r.wb.adr.eq(r.req_adr[3:])
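        # the bus data width is 64 bits with byte granularity, so bus.adr
        # is a 64-bit-word address: select all 8 byte lanes and drop the
        # low 3 (byte-offset) bits of the byte address in r.req_adr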

        # Process cache invalidations
        with m.If(inval_in):
            for i in range(NUM_LINES):
                sync += cache_tags[i].valid.eq(0)
            sync += r.store_valid.eq(0)

        # Main state machine
        with m.Switch(r.state):

            with m.Case(State.IDLE):
                self.icache_miss_idle(m, r, req_is_miss, req_laddr,
                                      req_index, req_tag, replace_way,
                                      real_addr)

            with m.Case(State.CLR_TAG, State.WAIT_ACK):
                with m.If(r.state == State.CLR_TAG):
                    self.icache_miss_clr_tag(m, r, replace_way,
                                             req_index, tagset, cache_tags)

                self.icache_miss_wait_ack(m, r, replace_way, inval_in,
                                          cache_tags, stbs_done)

        # TLB miss and protection fault processing
        with m.If(flush_in | m_in.tlbld):
            sync += r.fetch_failed.eq(0)
        with m.Elif(i_in.req & ~access_ok & ~stall_in):
            sync += r.fetch_failed.eq(1)

    # icache_log: if LOG_LENGTH > 0 generate
    def icache_log(self, m, req_hit_way, ra_valid, access_ok,
                   req_is_miss, req_is_hit, lway, wstate, r):
        comb = m.d.comb
        sync = m.d.sync

        bus, i_out = self.bus, self.i_out
        log_out, stall_out = self.log_out, self.stall_out

        # Output data to logger
        for i in range(LOG_LENGTH):
            log_data = Signal(54)
            lway = Signal(WAY_BITS)
            wstate = Signal()

            sync += lway.eq(req_hit_way)
            sync += wstate.eq(0)

            with m.If(r.state != State.IDLE):
                sync += wstate.eq(1)

            sync += log_data.eq(Cat(
                     ra_valid, access_ok, req_is_miss, req_is_hit,
                     lway, wstate, r.hit_nia[2:6], r.fetch_failed,
                     stall_out, bus.stall, r.wb.cyc, r.wb.stb,
                     r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
                    ))
            comb += log_out.eq(log_data)

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()

        # TLB Array
        itlb = TLBArray()

        # TODO to be passed to nmigen as ram attributes
        # attribute ram_style of itlb_tags : signal is "distributed";
        # attribute ram_style of itlb_ptes : signal is "distributed";

        # Privilege bit from PTE EAA field
        eaa_priv = Signal()

        r = RegInternal()

        # Async signal on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_is_hit = Signal()
        req_is_miss = Signal()
        req_laddr = Signal(64)

        tlb_req_index = Signal(TLB_BITS)
        real_addr = Signal(REAL_ADDR_BITS)
        ra_valid = Signal()
        priv_fault = Signal()
        access_ok = Signal()
        use_previous = Signal()

        cache_out_row = Signal(ROW_SIZE_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
        self.maybe_plrus(m, r, plru_victim)
        self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
                         ra_valid, eaa_priv, priv_fault,
                         access_ok)
        self.itlb_update(m, itlb)
        self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
                         req_tag, real_addr, req_laddr,
                         cache_tags, access_ok, req_is_hit, req_is_miss,
                         replace_way, plru_victim, cache_out_row)
        self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
                        req_index, req_tag, real_addr)
        self.icache_miss(m, r, req_is_miss, req_index,
                         req_laddr, req_tag, replace_way, cache_tags,
                         access_ok, real_addr)
        #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
        #                req_is_miss, req_is_hit, lway, wstate, r)

        # don't connect up to FetchUnitInterface so that some unit tests
        # can continue to operate
        if not self.use_fetch_iface:
            return m

        # connect to FetchUnitInterface. FetchUnitInterface is undocumented
        # so needs checking and iterative revising
        i_in, bus, i_out = self.i_in, self.bus, self.i_out
        comb += i_in.req.eq(self.a_i_valid)
        comb += i_in.nia.eq(self.a_pc_i)
        comb += self.stall_in.eq(self.a_stall_i)
        comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
        comb += self.f_badaddr_o.eq(i_out.nia)
        comb += self.f_instr_o.eq(i_out.insn)
        comb += self.f_busy_o.eq(~i_out.valid) # probably

        # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
        ibus = self.ibus
        comb += ibus.adr.eq(self.bus.adr)
        comb += ibus.dat_w.eq(self.bus.dat_w)
        comb += ibus.sel.eq(self.bus.sel)
        comb += ibus.cyc.eq(self.bus.cyc)
        comb += ibus.stb.eq(self.bus.stb)
        comb += ibus.we.eq(self.bus.we)

        comb += self.bus.dat_r.eq(ibus.dat_r)
        comb += self.bus.ack.eq(ibus.ack)
        if hasattr(ibus, "stall"):
            comb += self.bus.stall.eq(ibus.stall)
        else:
            # fake-up the wishbone stall signal to comply with pipeline mode
            # same thing is done in dcache.py
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        return m


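# Note on the test memory layout (see the mem[] construction in __main__
# below): each 64-bit word i holds the two 32-bit values 2*i (low half) and
# 2*i+1 (high half), so the 32-bit word fetched at byte address A is simply
# A//4. This is what the expected-instruction asserts below rely on.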
def icache_sim(dut):
    i_in = dut.i_in
    i_out = dut.i_out
    m_out = dut.m_in

    yield i_in.priv_mode.eq(1)
    yield i_in.req.eq(0)
    yield i_in.nia.eq(0)
    yield i_in.stop_mark.eq(0)
    yield m_out.tlbld.eq(0)
    yield m_out.tlbie.eq(0)
    yield m_out.addr.eq(0)
    yield m_out.pte.eq(0)
    yield
    yield
    yield
    yield

    # miss, stalls for a bit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000004, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    insn = yield i_out.insn
    nia = yield i_out.nia
    assert insn == 0x00000001, \
        "insn @%x=%x expected 00000001" % (nia, insn)
    yield i_in.req.eq(0)
    yield

    # hit
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000008, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_out.nia
    insn = yield i_out.insn
    yield
    assert insn == 0x00000002, \
        "insn @%x=%x expected 00000002" % (nia, insn)

    # another miss
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000040, 64))
    yield
    valid = yield i_out.valid
    while not valid:
        yield
        valid = yield i_out.valid
    yield i_in.req.eq(0)

    nia = yield i_in.nia
    insn = yield i_out.insn
    assert insn == 0x00000010, \
        "insn @%x=%x expected 00000010" % (nia, insn)

    # test something that aliases (this only works because
    # the unit test SRAM is a depth of 512)
    yield i_in.req.eq(1)
    yield i_in.nia.eq(Const(0x0000000000000100, 64))
    yield
    yield
    valid = yield i_out.valid
    assert not valid
    for i in range(30):
        yield
    yield
    insn = yield i_out.insn
    valid = yield i_out.valid
    insn = yield i_out.insn
    assert valid
    assert insn == 0x00000040, \
        "insn @%x=%x expected 00000040" % (nia, insn)
    yield i_in.req.eq(0)


def test_icache(mem):
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=32,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)

    memory = Memory(width=64, depth=512, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()

    m.submodules.icache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
    m.d.comb += sram.bus.stb.eq(dut.bus.stb)
    m.d.comb += sram.bus.we.eq(dut.bus.we)
    m.d.comb += sram.bus.sel.eq(dut.bus.sel)
    m.d.comb += sram.bus.adr.eq(dut.bus.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)

    m.d.comb += dut.bus.ack.eq(sram.bus.ack)
    m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(icache_sim(dut)))
    with sim.write_vcd('test_icache.vcd'):
        sim.run()


if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec
    pspec = TestMemPspec(addr_wid=64,
                         mask_wid=8,
                         reg_wid=64,
                         )
    dut = ICache(pspec)
    vl = rtlil.convert(dut, ports=[])
    with open("test_icache.il", "w") as f:
        f.write(vl)

    # set up memory every 32-bits with incrementing values 0 1 2 ...
    mem = []
    for i in range(512):
        mem.append((i*2) | ((i*2+1)<<32))

    test_icache(mem)