1 """ICache
2
3 based on Anton Blanchard microwatt icache.vhdl
4
5 Set associative icache
6
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allows for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
20
21 Links:
22
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
26
27 """
28
29 from enum import (Enum, unique)
30 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
31 Record)
32 from nmigen.cli import main, rtlil
33 from nmutil.iocontrol import RecordObject
34 from nmigen.utils import log2_int
35 from nmigen.lib.coding import Decoder
36 from nmutil.util import Display
37 from nmutil.latch import SRLatch
38
39 #from nmutil.plru import PLRU
40 from soc.experiment.plru import PLRU, PLRUs
41 from soc.experiment.cache_ram import CacheRam
42
43 from soc.experiment.mem_types import (Fetch1ToICacheType,
44 ICacheToDecode1Type,
45 MMUToICacheType)
46
47 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
48 WB_SEL_BITS, WBAddrType, WBDataType,
49 WBSelType, WBMasterOut, WBSlaveOut,
50 )
51
52 from nmigen_soc.wishbone.bus import Interface
53 from soc.minerva.units.fetch import FetchUnitInterface
54
55
56 # for test
57 from soc.bus.sram import SRAM
58 from nmigen import Memory
59 from nmutil.util import wrap
60 from nmigen.cli import main, rtlil
61
62 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
63 # Also, check out the cxxsim nmigen branch, and latest yosys from git
64 from nmutil.sim_tmp_alternative import Simulator, Settle
65
66 # from microwatt/utils.vhdl
67 def ispow2(n):
68 return n != 0 and (n & (n - 1)) == 0
69
70 SIM = 0
71 # Non-zero to enable log data collection
72 LOG_LENGTH = 0
73
74 class ICacheConfig:
75 def __init__(self, LINE_SIZE = 64,
76 NUM_LINES = 16, # Number of lines in a set
77 NUM_WAYS = 1, # Number of ways
78 TLB_SIZE = 64, # L1 ITLB number of entries
79 TLB_LG_PGSZ = 12): # L1 ITLB log_2(page_size)
80 self.LINE_SIZE = LINE_SIZE
81 self.NUM_LINES = NUM_LINES
82 self.NUM_WAYS = NUM_WAYS
83 self.TLB_SIZE = TLB_SIZE
84 self.TLB_LG_PGSZ = TLB_LG_PGSZ
85
86 # BRAM organisation: We never access more than wishbone_data_bits
87 # at a time so to save resources we make the array only that wide,
88 # and use consecutive indices to make a cache "line"
89 #
90 # self.ROW_SIZE is the width in bytes of the BRAM
91 # (based on WB, so 64-bits)
92 self.ROW_SIZE = WB_DATA_BITS // 8
93 # Number of real address bits that we store
94 self.REAL_ADDR_BITS = 56
95
96 self.ROW_SIZE_BITS = self.ROW_SIZE * 8
97 # ROW_PER_LINE is the number of rows (wishbone transactions) in a line
98 self.ROW_PER_LINE = self.LINE_SIZE // self.ROW_SIZE
99 # BRAM_ROWS is the number of rows in BRAM
100 # needed to represent the full icache
101 self.BRAM_ROWS = self.NUM_LINES * self.ROW_PER_LINE
102 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
103 self.INSN_PER_ROW = self.ROW_SIZE_BITS // 32
104
105 # Bit fields counts in the address
106 #
107 # INSN_BITS is the number of bits to select an instruction in a row
108 self.INSN_BITS = log2_int(self.INSN_PER_ROW)
109 # ROW_BITS is the number of bits to select a row
110 self.ROW_BITS = log2_int(self.BRAM_ROWS)
111 # ROW_LINE_BITS is the number of bits to select a row within a line
112 self.ROW_LINE_BITS = log2_int(self.ROW_PER_LINE)
113 # LINE_OFF_BITS is the number of bits for the offset in a cache line
114 self.LINE_OFF_BITS = log2_int(self.LINE_SIZE)
115 # ROW_OFF_BITS is the number of bits for the offset in a row
116 self.ROW_OFF_BITS = log2_int(self.ROW_SIZE)
117 # INDEX_BITS is the number of bits to select a cache line
118 self.INDEX_BITS = log2_int(self.NUM_LINES)
119 # SET_SIZE_BITS is the log base 2 of the set size
120 self.SET_SIZE_BITS = self.LINE_OFF_BITS + self.INDEX_BITS
121 # TAG_BITS is the number of bits of the tag part of the address
122 self.TAG_BITS = self.REAL_ADDR_BITS - self.SET_SIZE_BITS
123 # TAG_WIDTH is the width in bits of each way of the tag RAM
124 self.TAG_WIDTH = self.TAG_BITS + 7 - ((self.TAG_BITS + 7) % 8)
125
126 # WAY_BITS is the number of bits to select a way
127 self.WAY_BITS = log2_int(self.NUM_WAYS)
128 self.TAG_RAM_WIDTH = self.TAG_BITS * self.NUM_WAYS
129
130 # L1 ITLB
131 self.TL_BITS = log2_int(self.TLB_SIZE)
132 self.TLB_EA_TAG_BITS = 64 - (self.TLB_LG_PGSZ + self.TL_BITS)
133 self.TLB_PTE_BITS = 64
134
135 print("self.BRAM_ROWS =", self.BRAM_ROWS)
136 print("self.INDEX_BITS =", self.INDEX_BITS)
137 print("self.INSN_BITS =", self.INSN_BITS)
138 print("self.INSN_PER_ROW =", self.INSN_PER_ROW)
139 print("self.LINE_SIZE =", self.LINE_SIZE)
140 print("self.LINE_OFF_BITS =", self.LINE_OFF_BITS)
141 print("LOG_LENGTH =", LOG_LENGTH)
142 print("self.NUM_LINES =", self.NUM_LINES)
143 print("self.NUM_WAYS =", self.NUM_WAYS)
144 print("self.REAL_ADDR_BITS =", self.REAL_ADDR_BITS)
145 print("self.ROW_BITS =", self.ROW_BITS)
146 print("self.ROW_OFF_BITS =", self.ROW_OFF_BITS)
147 print("self.ROW_LINE_BITS =", self.ROW_LINE_BITS)
148 print("self.ROW_PER_LINE =", self.ROW_PER_LINE)
149 print("self.ROW_SIZE =", self.ROW_SIZE)
150 print("self.ROW_SIZE_BITS =", self.ROW_SIZE_BITS)
151 print("self.SET_SIZE_BITS =", self.SET_SIZE_BITS)
152 print("SIM =", SIM)
153 print("self.TAG_BITS =", self.TAG_BITS)
154 print("self.TAG_RAM_WIDTH =", self.TAG_RAM_WIDTH)
155 print("self.TAG_BITS =", self.TAG_BITS)
156 print("self.TL_BITS =", self.TL_BITS)
157 print("self.TLB_EA_TAG_BITS =", self.TLB_EA_TAG_BITS)
158 print("self.TLB_LG_PGSZ =", self.TLB_LG_PGSZ)
159 print("self.TLB_PTE_BITS =", self.TLB_PTE_BITS)
160 print("self.TLB_SIZE =", self.TLB_SIZE)
161 print("self.WAY_BITS =", self.WAY_BITS)
162
163 assert self.LINE_SIZE % self.ROW_SIZE == 0
164 assert ispow2(self.LINE_SIZE), "self.LINE_SIZE not power of 2"
165 assert ispow2(self.NUM_LINES), "self.NUM_LINES not power of 2"
166 assert ispow2(self.ROW_PER_LINE), "self.ROW_PER_LINE not power of 2"
167 assert ispow2(self.INSN_PER_ROW), "self.INSN_PER_ROW not power of 2"
168 assert (self.ROW_BITS == (self.INDEX_BITS + self.ROW_LINE_BITS)), \
169 "geometry bits don't add up"
170 assert (self.LINE_OFF_BITS ==
171 (self.ROW_OFF_BITS + self.ROW_LINE_BITS)), \
172 "geometry bits don't add up"
173 assert (self.REAL_ADDR_BITS ==
174 (self.TAG_BITS + self.INDEX_BITS + self.LINE_OFF_BITS)), \
175 "geometry bits don't add up"
176 assert (self.REAL_ADDR_BITS ==
177 (self.TAG_BITS + self.ROW_BITS + self.ROW_OFF_BITS)), \
178 "geometry bits don't add up"
179
180 # Example of layout for 32 lines of 64 bytes:
181 #
182 # .. tag |index| line |
183 # .. | row | |
184 # .. | | | |00| zero (2)
185 # .. | | |-| | self.INSN_BITS (1)
186 # .. | |---| | self.ROW_LINE_BITS (3)
187 # .. | |--- - --| self.LINE_OFF_BITS (6)
188 # .. | |- --| self.ROW_OFF_BITS (3)
189 # .. |----- ---| | self.ROW_BITS (8)
190 # .. |-----| | self.INDEX_BITS (5)
191 # .. --------| | self.TAG_BITS (53)
192
193 # The cache data BRAM organized as described above for each way
194 #subtype cache_row_t is std_ulogic_vector(self.ROW_SIZE_BITS-1 downto 0);
195 #
196 def RowPerLineValidArray(self):
197 return Array(Signal(name="rows_valid_%d" %x) \
198 for x in range(self.ROW_PER_LINE))
199
200
201 # TODO to be passed to nmigen as ram attributes
202 # attribute ram_style : string;
203 # attribute ram_style of cache_tags : signal is "distributed";
204
205 def TLBRecord(self, name):
206 tlb_layout = [ ('tag', self.TLB_EA_TAG_BITS),
207 ('pte', self.TLB_PTE_BITS)
208 ]
209 return Record(tlb_layout, name=name)
210
211 def TLBArray(self):
212 return Array(self.TLBRecord("tlb%d" % x) for x in range(self.TLB_SIZE))
213
214 # PLRU output interface
215 def PLRUOut(self):
216 return Array(Signal(self.WAY_BITS, name="plru_out_%d" %x) \
217 for x in range(self.NUM_LINES))
218
219 # Return the cache line index (tag index) for an address
220 def get_index(self, addr):
221 return addr[self.LINE_OFF_BITS:self.SET_SIZE_BITS]
222
223 # Return the cache row index (data memory) for an address
224 def get_row(self, addr):
225 return addr[self.ROW_OFF_BITS:self.SET_SIZE_BITS]
226
227 # Return the index of a row within a line
228 def get_row_of_line(self, row):
229 return row[:self.ROW_BITS][:self.ROW_LINE_BITS]
230
231 # Returns whether the given address is in the last row of a line
232 def is_last_row_addr(self, addr, last):
233 return addr[self.ROW_OFF_BITS:self.LINE_OFF_BITS] == last
234
235 # Returns whether this is the last row of a line
236 def is_last_row(self, row, last):
237 return self.get_row_of_line(row) == last
238
239 # Return the next row in the current cache line. We use a dedicated
240 # function in order to limit the size of the generated adder to be
241 # only the bits within a cache line (3 bits with default settings)
242 def next_row(self, row):
243 row_v = row[0:self.ROW_LINE_BITS] + 1
244 return Cat(row_v[:self.ROW_LINE_BITS], row[self.ROW_LINE_BITS:])
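
    # Software model of the wrap-around increment above, added here for
    # illustration only (not used by the generated hardware): only the low
    # ROW_LINE_BITS of the row number increment, the upper bits pass through.
    def next_row_model(self, row):
        mask = (1 << self.ROW_LINE_BITS) - 1
        return (row & ~mask) | ((row + 1) & mask)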
245
246 # Read the instruction word for the given address
247 # in the current cache row
248 def read_insn_word(self, addr, data):
249 word = addr[2:self.INSN_BITS+2]
250 return data.word_select(word, 32)
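
    # Software model of the word selection above, added for illustration
    # only: with the default INSN_PER_ROW=2, address bit 2 picks which
    # 32-bit half of the 64-bit row holds the requested instruction.
    def read_insn_word_model(self, addr, data):
        word = (addr >> 2) & ((1 << self.INSN_BITS) - 1)
        return (data >> (word * 32)) & 0xffffffff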
251
252 # Get the tag value from the address
253 def get_tag(self, addr):
254 return addr[self.SET_SIZE_BITS:self.REAL_ADDR_BITS]
255
256 # Read a tag from a tag memory row
257 def read_tag(self, way, tagset):
258 return tagset.word_select(way, self.TAG_BITS)
259
260 # Write a tag to tag memory row
261 def write_tag(self, way, tagset, tag):
262 return self.read_tag(way, tagset).eq(tag)
263
264 # Simple hash for direct-mapped TLB index
265 def hash_ea(self, addr):
266 hsh = (addr[self.TLB_LG_PGSZ:self.TLB_LG_PGSZ + self.TL_BITS] ^
267 addr[self.TLB_LG_PGSZ + self.TL_BITS:
268 self.TLB_LG_PGSZ + 2 * self.TL_BITS ] ^
269 addr[self.TLB_LG_PGSZ + 2 * self.TL_BITS:
270 self.TLB_LG_PGSZ + 3 * self.TL_BITS])
271 return hsh
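
    # Plain-integer model of the fold above, added for illustration (not
    # used by the generated hardware): the three TL_BITS-wide fields
    # immediately above the page offset are XORed together to give the
    # direct-mapped TLB index.
    def hash_ea_model(self, addr):
        mask = (1 << self.TL_BITS) - 1
        return (((addr >> self.TLB_LG_PGSZ) ^
                 (addr >> (self.TLB_LG_PGSZ + self.TL_BITS)) ^
                 (addr >> (self.TLB_LG_PGSZ + 2 * self.TL_BITS))) & mask)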
272
273
274 # Cache reload state machine
275 @unique
276 class State(Enum):
277 IDLE = 0
278 CLR_TAG = 1
279 WAIT_ACK = 2
280
281
282 class RegInternal(RecordObject):
283 def __init__(self, cfg):
284 super().__init__()
285 # Cache hit state (Latches for 1 cycle BRAM access)
286 self.hit_way = Signal(cfg.WAY_BITS)
287 self.hit_nia = Signal(64)
288 self.hit_smark = Signal()
289 self.hit_valid = Signal()
290
291 # Cache miss state (reload state machine)
292 self.state = Signal(State, reset=State.IDLE)
293 self.wb = WBMasterOut("wb")
294 self.req_adr = Signal(64)
295 self.store_way = Signal(cfg.WAY_BITS)
296 self.store_index = Signal(cfg.INDEX_BITS)
297 self.store_row = Signal(cfg.ROW_BITS)
298 self.store_tag = Signal(cfg.TAG_BITS)
299 self.store_valid = Signal()
300 self.end_row_ix = Signal(cfg.ROW_LINE_BITS)
301 self.rows_valid = cfg.RowPerLineValidArray()
302
303 # TLB miss state
304 self.fetch_failed = Signal()
305
306
307 class ICache(FetchUnitInterface, Elaboratable, ICacheConfig):
308 """64 bit direct mapped icache. All instructions are 4B aligned."""
309 def __init__(self, pspec):
310 FetchUnitInterface.__init__(self, pspec)
311 ICacheConfig.__init__(self)
312 self.i_in = Fetch1ToICacheType(name="i_in")
313 self.i_out = ICacheToDecode1Type(name="i_out")
314
315 self.m_in = MMUToICacheType(name="m_in")
316
317 self.stall_in = Signal()
318 self.stall_out = Signal()
319 self.flush_in = Signal()
320 self.inval_in = Signal()
321
322 # standard naming (wired to non-standard for compatibility)
323 self.bus = Interface(addr_width=32,
324 data_width=64,
325 granularity=8,
326 features={'stall'},
327 #alignment=0,
328 name="icache_wb")
329
330 self.log_out = Signal(54)
331
332 # use FetchUnitInterface, helps keep some unit tests running
333 self.use_fetch_iface = False
334
335 def use_fetch_interface(self):
336 self.use_fetch_iface = True
337
338 # Generate a cache RAM for each way
339 def rams(self, m, r, cache_out_row, use_previous,
340 replace_way, req_row):
341
342 comb = m.d.comb
343 sync = m.d.sync
344
345 bus, stall_in = self.bus, self.stall_in
346
347 # read condition (for every cache ram)
348 do_read = Signal()
349 comb += do_read.eq(~(stall_in | use_previous))
350
351 rd_addr = Signal(self.ROW_BITS)
352 wr_addr = Signal(self.ROW_BITS)
353 comb += rd_addr.eq(req_row)
354 comb += wr_addr.eq(r.store_row)
355
356 # binary-to-unary converters: replace-way enabled by bus.ack,
357 # hit-way left permanently enabled
358 m.submodules.replace_way_e = re = Decoder(self.NUM_WAYS)
359 m.submodules.hit_way_e = he = Decoder(self.NUM_WAYS)
360 comb += re.i.eq(replace_way)
361 comb += re.n.eq(~bus.ack)
362 comb += he.i.eq(r.hit_way)
363
364 for i in range(self.NUM_WAYS):
365 do_write = Signal(name="do_wr_%d" % i)
366 d_out = Signal(self.ROW_SIZE_BITS, name="d_out_%d" % i)
367 wr_sel = Signal(self.ROW_SIZE, name="wr_sel_%d" % i)
368
369 way = CacheRam(self.ROW_BITS, self.ROW_SIZE_BITS,
370 TRACE=True, ram_num=i)
371 m.submodules["cacheram_%d" % i] = way
372
373 comb += way.rd_en.eq(do_read)
374 comb += way.rd_addr.eq(rd_addr)
375 comb += d_out.eq(way.rd_data_o)
376 comb += way.wr_sel.eq(wr_sel)
377 comb += way.wr_addr.eq(wr_addr)
378 comb += way.wr_data.eq(bus.dat_r)
379
380 comb += do_write.eq(re.o[i])
381
382 with m.If(do_write):
383 sync += Display("cache write adr: %x data: %lx",
384 wr_addr, way.wr_data)
385
386 with m.If(he.o[i]):
387 comb += cache_out_row.eq(d_out)
388 with m.If(do_read):
389 sync += Display("cache read adr: %x data: %x",
390 req_row, d_out)
391
392 comb += wr_sel.eq(Repl(do_write, self.ROW_SIZE))
393
394 # Generate PLRUs
395 def maybe_plrus(self, m, r, plru_victim):
396 comb = m.d.comb
397
398 if self.NUM_WAYS == 0:
399 return
400
401
402 m.submodules.plrus = plru = PLRUs(self.NUM_LINES, self.WAY_BITS)
403 comb += plru.way.eq(r.hit_way)
404 comb += plru.valid.eq(r.hit_valid)
405 comb += plru.index.eq(self.get_index(r.hit_nia))
406 comb += plru.isel.eq(r.store_index) # select victim
407 comb += plru_victim.eq(plru.o_index) # selected victim
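
    # One PLRU per cache line is kept inside the PLRUs module: it is
    # updated with the hit way whenever a valid hit occurs on that line,
    # and read back (via isel = store_index) to choose the victim way that
    # the reload state machine will overwrite on a miss.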
408
409 # TLB hit detection and real address generation
410 def itlb_lookup(self, m, tlb_req_index, itlb, itlb_valid,
411 real_addr, ra_valid, eaa_priv,
412 priv_fault, access_ok):
413
414 comb = m.d.comb
415
416 i_in = self.i_in
417
418 # use an *asynchronous* Memory read port here (combinatorial)
419 m.submodules.rd_tlb = rd_tlb = self.tlbmem.read_port(domain="comb")
420 tlb = self.TLBRecord("tlb_rdport")
421 pte, ttag = tlb.pte, tlb.tag
422
423 comb += tlb_req_index.eq(self.hash_ea(i_in.nia))
424 comb += rd_tlb.addr.eq(tlb_req_index)
425 comb += tlb.eq(rd_tlb.data)
426
427 with m.If(i_in.virt_mode):
428 comb += real_addr.eq(Cat(i_in.nia[:self.TLB_LG_PGSZ],
429 pte[self.TLB_LG_PGSZ:self.REAL_ADDR_BITS]))
430
431 with m.If(ttag == i_in.nia[self.TLB_LG_PGSZ + self.TL_BITS:64]):
432 comb += ra_valid.eq(itlb_valid.q.bit_select(tlb_req_index, 1))
433
434 comb += eaa_priv.eq(pte[3])
435
436 with m.Else():
437 comb += real_addr.eq(i_in.nia[:self.REAL_ADDR_BITS])
438 comb += ra_valid.eq(1)
439 comb += eaa_priv.eq(1)
440
441 # No IAMR, so no KUEP support for now
442 comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
443 comb += access_ok.eq(ra_valid & ~priv_fault)
444
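    # Plain-integer model of the virtual-mode translation above, added for
    # illustration (not part of the original code): the low TLB_LG_PGSZ
    # bits come straight from the effective address and the remaining
    # real-address bits come from the cached PTE.
    def real_addr_model(self, nia, pte):
        page_mask = (1 << self.TLB_LG_PGSZ) - 1
        ra_mask = (1 << self.REAL_ADDR_BITS) - 1
        return (pte & ra_mask & ~page_mask) | (nia & page_mask)
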
445 # iTLB update
446 def itlb_update(self, m, itlb, itlb_valid):
447 comb = m.d.comb
448 sync = m.d.sync
449
450 m_in = self.m_in
451
452 wr_index = Signal(self.TL_BITS)
453 wr_unary = Signal(self.TLB_SIZE)
454 comb += wr_index.eq(self.hash_ea(m_in.addr))
455 comb += wr_unary.eq(1<<wr_index)
456
457 m.submodules.wr_tlb = wr_tlb = self.tlbmem.write_port()
458 sync += itlb_valid.s.eq(0)
459 sync += itlb_valid.r.eq(0)
460
461 with m.If(m_in.tlbie & m_in.doall):
462 # Clear all valid bits
463 sync += itlb_valid.r.eq(-1)
464
465 with m.Elif(m_in.tlbie):
466 # Clear entry regardless of hit or miss
467 sync += itlb_valid.r.eq(wr_unary)
468
469 with m.Elif(m_in.tlbld):
470 tlb = self.TLBRecord("tlb_wrport")
471 comb += tlb.tag.eq(m_in.addr[self.TLB_LG_PGSZ + self.TL_BITS:64])
472 comb += tlb.pte.eq(m_in.pte)
473 comb += wr_tlb.en.eq(1)
474 comb += wr_tlb.addr.eq(wr_index)
475 comb += wr_tlb.data.eq(tlb)
476 sync += itlb_valid.s.eq(wr_unary)
477
478 # Cache hit detection, output to fetch2 and other misc logic
479 def icache_comb(self, m, use_previous, r, req_index, req_row,
480 req_hit_way, req_tag, real_addr, req_laddr,
481 cache_valids, access_ok,
482 req_is_hit, req_is_miss, replace_way,
483 plru_victim, cache_out_row):
484
485 comb = m.d.comb
486 m.submodules.rd_tag = rd_tag = self.tagmem.read_port(domain="comb")
487
488 i_in, i_out, bus = self.i_in, self.i_out, self.bus
489 flush_in, stall_out = self.flush_in, self.stall_out
490
491 is_hit = Signal()
492 hit_way = Signal(self.WAY_BITS)
493
494 # i_in.sequential means that i_in.nia this cycle is 4 more than
495 # last cycle. If we read more than 32 bits at a time, had a
496 # cache hit last cycle, and we don't want the first 32-bit chunk
497 # then we can keep the data we read last cycle and just use that.
498 with m.If(i_in.nia[2:self.INSN_BITS+2] != 0):
499 comb += use_previous.eq(i_in.sequential & r.hit_valid)
500
501 # Extract line, row and tag from request
502 comb += req_index.eq(self.get_index(i_in.nia))
503 comb += req_row.eq(self.get_row(i_in.nia))
504 comb += req_tag.eq(self.get_tag(real_addr))
505
506 # Calculate address of beginning of cache row, will be
507 # used for cache miss processing if needed
508 comb += req_laddr.eq(Cat(
509 Const(0, self.ROW_OFF_BITS),
510 real_addr[self.ROW_OFF_BITS:self.REAL_ADDR_BITS],
511 ))
512
513 # Test if pending request is a hit on any way
514 hitcond = Signal()
515 comb += hitcond.eq((r.state == State.WAIT_ACK)
516 & (req_index == r.store_index)
517 & r.rows_valid[req_row % self.ROW_PER_LINE]
518 )
519 # i_in.req asserts Decoder active
520 cvb = Signal(self.NUM_WAYS)
521 ctag = Signal(self.TAG_RAM_WIDTH)
522 comb += rd_tag.addr.eq(req_index)
523 comb += ctag.eq(rd_tag.data)
524 comb += cvb.eq(cache_valids.q.word_select(req_index, self.NUM_WAYS))
525 m.submodules.store_way_e = se = Decoder(self.NUM_WAYS)
526 comb += se.i.eq(r.store_way)
527 comb += se.n.eq(~i_in.req)
528 for i in range(self.NUM_WAYS):
529 tagi = Signal(self.TAG_BITS, name="tag_i%d" % i)
530 hit_test = Signal(name="hit_test%d" % i)
531 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
532 comb += tagi.eq(self.read_tag(i, ctag))
533 comb += hit_test.eq(se.o[i])
534 comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
535 (tagi == req_tag))
536 with m.If(is_tag_hit):
537 comb += hit_way.eq(i)
538 comb += is_hit.eq(1)
539
540 # Generate the "hit" and "miss" signals
541 # for the synchronous blocks
542 with m.If(i_in.req & access_ok & ~flush_in):
543 comb += req_is_hit.eq(is_hit)
544 comb += req_is_miss.eq(~is_hit)
545
546 comb += req_hit_way.eq(hit_way)
547
548 # The way to replace on a miss
549 with m.If(r.state == State.CLR_TAG):
550 comb += replace_way.eq(plru_victim)
551 with m.Else():
552 comb += replace_way.eq(r.store_way)
553
554 # Output instruction from current cache row
555 #
556 # Note: This is a mild violation of our design principle of
557 # having pipeline stages output from a clean latch. In this
558 # case we output the result of a mux. The alternative would
559 # be output an entire row which I prefer not to do just yet
560 # as it would force fetch2 to know about some of the cache
561 # geometry information.
562 comb += i_out.insn.eq(self.read_insn_word(r.hit_nia, cache_out_row))
563 comb += i_out.valid.eq(r.hit_valid)
564 comb += i_out.nia.eq(r.hit_nia)
565 comb += i_out.stop_mark.eq(r.hit_smark)
566 comb += i_out.fetch_failed.eq(r.fetch_failed)
567
568 # Stall fetch1 if we have a miss on cache or TLB
569 # or a protection fault
570 comb += stall_out.eq(~(is_hit & access_ok))
571
572 # Wishbone requests output (from the cache miss reload machine)
573 comb += bus.we.eq(r.wb.we)
574 comb += bus.adr.eq(r.wb.adr)
575 comb += bus.sel.eq(r.wb.sel)
576 comb += bus.stb.eq(r.wb.stb)
577 comb += bus.dat_w.eq(r.wb.dat)
578 comb += bus.cyc.eq(r.wb.cyc)
579
580 # Cache hit synchronous machine
581 def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
582 req_index, req_tag, real_addr):
583 sync = m.d.sync
584
585 i_in, stall_in = self.i_in, self.stall_in
586 flush_in = self.flush_in
587
588 # keep outputs to fetch2 unchanged on a stall
589 # except that flush or reset sets valid to 0
590 # If use_previous, keep the same data as last
591 # cycle and use the second half
592 with m.If(stall_in | use_previous):
593 with m.If(flush_in):
594 sync += r.hit_valid.eq(0)
595 with m.Else():
596 # On a hit, latch the request for the next cycle,
597 # when the BRAM data will be available on the
598 # cache_out output of the corresponding way
599 sync += r.hit_valid.eq(req_is_hit)
600
601 with m.If(req_is_hit):
602 sync += r.hit_way.eq(req_hit_way)
603 sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
604 "way:%x RA:%x", i_in.nia, i_in.virt_mode,
605 i_in.stop_mark, req_index, req_tag,
606 req_hit_way, real_addr)
607
608 with m.If(~stall_in):
609 # Send stop marks and NIA down regardless of validity
610 sync += r.hit_smark.eq(i_in.stop_mark)
611 sync += r.hit_nia.eq(i_in.nia)
612
613 def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
614 req_index, req_tag, replace_way, real_addr):
615 comb = m.d.comb
616 sync = m.d.sync
617
618 i_in = self.i_in
619
620 # Reset per-row valid flags, only used in WAIT_ACK
621 for i in range(self.ROW_PER_LINE):
622 sync += r.rows_valid[i].eq(0)
623
624 # We need to read a cache line
625 with m.If(req_is_miss):
626 sync += Display(
627 "cache miss nia:%x IR:%x SM:%x idx:%x "
628 " way:%x tag:%x RA:%x", i_in.nia,
629 i_in.virt_mode, i_in.stop_mark, req_index,
630 replace_way, req_tag, real_addr)
631
632 # Keep track of our index and way for subsequent stores
633 st_row = Signal(self.ROW_BITS)
634 comb += st_row.eq(self.get_row(req_laddr))
635 sync += r.store_index.eq(req_index)
636 sync += r.store_row.eq(st_row)
637 sync += r.store_tag.eq(req_tag)
638 sync += r.store_valid.eq(1)
639 sync += r.end_row_ix.eq(self.get_row_of_line(st_row) - 1)
640
641 # Prep for first wishbone read. We calculate the address
642 # of the start of the cache line and start the WB cycle.
643 sync += r.req_adr.eq(req_laddr)
644 sync += r.wb.cyc.eq(1)
645 sync += r.wb.stb.eq(1)
646
647 # Track that we had one request sent
648 sync += r.state.eq(State.CLR_TAG)
649
650 def icache_miss_clr_tag(self, m, r, replace_way,
651 req_index,
652 cache_valids):
653 comb = m.d.comb
654 sync = m.d.sync
655 m.submodules.wr_tag = wr_tag = self.tagmem.write_port(
656 granularity=self.TAG_BITS)
657
658 # Get victim way from plru
659 sync += r.store_way.eq(replace_way)
660
661 # Force misses on that way while reloading that line
662 idx = req_index*self.NUM_WAYS + replace_way # 2D index, 1st dim: self.NUM_WAYS
663 comb += cache_valids.r.eq(1<<idx)
664
665 # use write-port "granularity" to select the tag to write to
666 # TODO: the Memory should be multiplied up (by NUM_TAGS)
667 tagset = Signal(self.TAG_RAM_WIDTH)
668 comb += tagset.eq(r.store_tag << (replace_way*self.TAG_BITS))
669 comb += wr_tag.en.eq(1<<replace_way)
670 comb += wr_tag.addr.eq(r.store_index)
671 comb += wr_tag.data.eq(tagset)
672
673 sync += r.state.eq(State.WAIT_ACK)
674
675 def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
676 cache_valids, stbs_done):
677 comb = m.d.comb
678 sync = m.d.sync
679
680 bus = self.bus
681
682 # Requests are all sent if stb is 0
683 stbs_zero = Signal()
684 comb += stbs_zero.eq(r.wb.stb == 0)
685 comb += stbs_done.eq(stbs_zero)
686
687 # If we are still sending requests, was one accepted?
688 with m.If(~bus.stall & ~stbs_zero):
689 # That was the last word? We are done sending.
690 # Clear stb and set stbs_done so we can handle
691 # an eventual last ack on the same cycle.
692 with m.If(self.is_last_row_addr(r.req_adr, r.end_row_ix)):
693 sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
694 "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
695 "stbs_done:%x", r.wb.adr, r.end_row_ix,
696 r.wb.stb, stbs_zero, stbs_done)
697 sync += r.wb.stb.eq(0)
698 comb += stbs_done.eq(1)
699
700 # Calculate the next row address
701 rarange = Signal(self.LINE_OFF_BITS - self.ROW_OFF_BITS)
702 comb += rarange.eq(r.req_adr[self.ROW_OFF_BITS:
703 self.LINE_OFF_BITS] + 1)
704 sync += r.req_adr[self.ROW_OFF_BITS:self.LINE_OFF_BITS].eq(rarange)
705 sync += Display("RARANGE r.req_adr:%x rarange:%x "
706 "stbs_zero:%x stbs_done:%x",
707 r.req_adr, rarange, stbs_zero, stbs_done)
708
709 # Incoming acks processing
710 with m.If(bus.ack):
711 sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
712 "stbs_done:%x",
713 bus.dat_r, stbs_zero, stbs_done)
714
715 sync += r.rows_valid[r.store_row % self.ROW_PER_LINE].eq(1)
716
717 # Check for completion
718 with m.If(stbs_done & self.is_last_row(r.store_row, r.end_row_ix)):
719 # Complete wishbone cycle
720 sync += r.wb.cyc.eq(0)
721 # be nice, clear addr
722 sync += r.req_adr.eq(0)
723
724 # Cache line is now valid
725 idx = r.store_index*self.NUM_WAYS + replace_way # 2D index again
726 valid = r.store_valid & ~inval_in
727 comb += cache_valids.s.eq(1<<idx)
728 sync += r.state.eq(State.IDLE)
729
730 # move on to next request in row
731 # Increment store row counter
732 sync += r.store_row.eq(self.next_row(r.store_row))
733
734 # Cache miss/reload synchronous machine
735 def icache_miss(self, m, r, req_is_miss,
736 req_index, req_laddr, req_tag, replace_way,
737 cache_valids, access_ok, real_addr):
738 comb = m.d.comb
739 sync = m.d.sync
740
741 i_in, bus, m_in = self.i_in, self.bus, self.m_in
742 stall_in, flush_in = self.stall_in, self.flush_in
743 inval_in = self.inval_in
744
745 stbs_done = Signal()
746
747 comb += r.wb.sel.eq(-1)
748 comb += r.wb.adr.eq(r.req_adr[3:])
749
750 # Process cache invalidations
751 with m.If(inval_in):
752 comb += cache_valids.r.eq(-1)
753 sync += r.store_valid.eq(0)
754
755 # Main state machine
756 with m.Switch(r.state):
757
758 with m.Case(State.IDLE):
759 self.icache_miss_idle(m, r, req_is_miss, req_laddr,
760 req_index, req_tag, replace_way,
761 real_addr)
762
763 with m.Case(State.CLR_TAG, State.WAIT_ACK):
764 with m.If(r.state == State.CLR_TAG):
765 self.icache_miss_clr_tag(m, r, replace_way,
766 req_index,
767 cache_valids)
768
769 self.icache_miss_wait_ack(m, r, replace_way, inval_in,
770 cache_valids, stbs_done)
771
772 # TLB miss and protection fault processing
773 with m.If(flush_in | m_in.tlbld):
774 sync += r.fetch_failed.eq(0)
775 with m.Elif(i_in.req & ~access_ok & ~stall_in):
776 sync += r.fetch_failed.eq(1)
777
778 # icache_log: if LOG_LENGTH > 0 generate
779 def icache_log(self, m, req_hit_way, ra_valid, access_ok,
780 req_is_miss, req_is_hit, lway, wstate, r):
781 comb = m.d.comb
782 sync = m.d.sync
783
784 bus, i_out = self.bus, self.i_out
785 log_out, stall_out = self.log_out, self.stall_out
786
787 # Output data to logger
788 for i in range(LOG_LENGTH):
789 log_data = Signal(54)
790 lway = Signal(self.WAY_BITS)
791 wstate = Signal()
792
793 sync += lway.eq(req_hit_way)
794 sync += wstate.eq(0)
795
796 with m.If(r.state != State.IDLE):
797 sync += wstate.eq(1)
798
799 sync += log_data.eq(Cat(
800 ra_valid, access_ok, req_is_miss, req_is_hit,
801 lway, wstate, r.hit_nia[2:6], r.fetch_failed,
802 stall_out, bus.stall, r.wb.cyc, r.wb.stb,
803 r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
804 ))
805 comb += log_out.eq(log_data)
806
807 def elaborate(self, platform):
808
809 m = Module()
810 comb = m.d.comb
811
812 # Cache-Ways "valid" indicators. this is a 2D Signal, by the
813 # number of ways and the number of lines.
814 vec = SRLatch(sync=True, llen=self.NUM_WAYS*self.NUM_LINES,
815 name="cachevalids")
816 m.submodules.cache_valids = cache_valids = vec
817
818 # TLB Array
819 itlb = self.TLBArray()
820 vec = SRLatch(sync=False, llen=self.TLB_SIZE, name="tlbvalids")
821 m.submodules.itlb_valids = itlb_valid = vec
822
823 # TODO to be passed to nmigen as ram attributes
824 # attribute ram_style of itlb_tags : signal is "distributed";
825 # attribute ram_style of itlb_ptes : signal is "distributed";
826
827 # Privilege bit from PTE EAA field
828 eaa_priv = Signal()
829
830 r = RegInternal(self)
831
832 # Async signal on incoming request
833 req_index = Signal(self.INDEX_BITS)
834 req_row = Signal(self.ROW_BITS)
835 req_hit_way = Signal(self.WAY_BITS)
836 req_tag = Signal(self.TAG_BITS)
837 req_is_hit = Signal()
838 req_is_miss = Signal()
839 req_laddr = Signal(64)
840
841 tlb_req_index = Signal(self.TL_BITS)
842 real_addr = Signal(self.REAL_ADDR_BITS)
843 ra_valid = Signal()
844 priv_fault = Signal()
845 access_ok = Signal()
846 use_previous = Signal()
847
848 cache_out_row = Signal(self.ROW_SIZE_BITS)
849
850 plru_victim = Signal(self.WAY_BITS)
851 replace_way = Signal(self.WAY_BITS)
852
853 self.tlbmem = Memory(depth=self.TLB_SIZE,
854 width=self.TLB_EA_TAG_BITS+self.TLB_PTE_BITS)
855 self.tagmem = Memory(depth=self.NUM_LINES,
856 width=self.TAG_RAM_WIDTH)
857
858 # call sub-functions putting everything together,
859 # using shared signals established above
860 self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
861 self.maybe_plrus(m, r, plru_victim)
862 self.itlb_lookup(m, tlb_req_index, itlb, itlb_valid, real_addr,
863 ra_valid, eaa_priv, priv_fault,
864 access_ok)
865 self.itlb_update(m, itlb, itlb_valid)
866 self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
867 req_tag, real_addr, req_laddr,
868 cache_valids,
869 access_ok, req_is_hit, req_is_miss,
870 replace_way, plru_victim, cache_out_row)
871 self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
872 req_index, req_tag, real_addr)
873 self.icache_miss(m, r, req_is_miss, req_index,
874 req_laddr, req_tag, replace_way,
875 cache_valids,
876 access_ok, real_addr)
877 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
878 # req_is_miss, req_is_hit, lway, wstate, r)
879
880 # don't connect up to FetchUnitInterface so that some unit tests
881 # can continue to operate
882 if not self.use_fetch_iface:
883 return m
884
885 # connect to FetchUnitInterface. FetchUnitInterface is undocumented
886 # so needs checking and iterative revising
887 i_in, bus, i_out = self.i_in, self.bus, self.i_out
888 comb += i_in.req.eq(self.a_i_valid)
889 comb += i_in.nia.eq(self.a_pc_i)
890 comb += self.stall_in.eq(self.a_stall_i)
891 comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
892 comb += self.f_badaddr_o.eq(i_out.nia)
893 comb += self.f_instr_o.eq(i_out.insn)
894 comb += self.f_busy_o.eq(~i_out.valid) # probably
895
896 # TODO, connect icache wb_in/wb_out to "standard" nmigen Wishbone bus
897 ibus = self.ibus
898 comb += ibus.adr.eq(self.bus.adr)
899 comb += ibus.dat_w.eq(self.bus.dat_w)
900 comb += ibus.sel.eq(self.bus.sel)
901 comb += ibus.cyc.eq(self.bus.cyc)
902 comb += ibus.stb.eq(self.bus.stb)
903 comb += ibus.we.eq(self.bus.we)
904
905 comb += self.bus.dat_r.eq(ibus.dat_r)
906 comb += self.bus.ack.eq(ibus.ack)
907 if hasattr(ibus, "stall"):
908 comb += self.bus.stall.eq(ibus.stall)
909 else:
910 # fake-up the wishbone stall signal to comply with pipeline mode
911 # same thing is done in dcache.py
912 comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
913
914 return m
915
916
917 def icache_sim(dut):
918 i_in = dut.i_in
919 i_out = dut.i_out
920 m_out = dut.m_in
921
922 yield i_in.priv_mode.eq(1)
923 yield i_in.req.eq(0)
924 yield i_in.nia.eq(0)
925 yield i_in.stop_mark.eq(0)
926 yield m_out.tlbld.eq(0)
927 yield m_out.tlbie.eq(0)
928 yield m_out.addr.eq(0)
929 yield m_out.pte.eq(0)
930 yield
931 yield
932 yield
933 yield
934
935 # miss, stalls for a bit
936 yield i_in.req.eq(1)
937 yield i_in.nia.eq(Const(0x0000000000000004, 64))
938 yield
939 valid = yield i_out.valid
940 while not valid:
941 yield
942 valid = yield i_out.valid
943 yield i_in.req.eq(0)
944
945 insn = yield i_out.insn
946 nia = yield i_out.nia
947 assert insn == 0x00000001, \
948 "insn @%x=%x expected 00000001" % (nia, insn)
949 yield i_in.req.eq(0)
950 yield
951
952 # hit
953 yield i_in.req.eq(1)
954 yield i_in.nia.eq(Const(0x0000000000000008, 64))
955 yield
956 valid = yield i_out.valid
957 while not valid:
958 yield
959 valid = yield i_out.valid
960 yield i_in.req.eq(0)
961
962 nia = yield i_out.nia
963 insn = yield i_out.insn
964 yield
965 assert insn == 0x00000002, \
966 "insn @%x=%x expected 00000002" % (nia, insn)
967
968 # another miss
969 yield i_in.req.eq(1)
970 yield i_in.nia.eq(Const(0x0000000000000040, 64))
971 yield
972 valid = yield i_out.valid
973 while not valid:
974 yield
975 valid = yield i_out.valid
976 yield i_in.req.eq(0)
977
978 nia = yield i_in.nia
979 insn = yield i_out.insn
980 assert insn == 0x00000010, \
981 "insn @%x=%x expected 00000010" % (nia, insn)
982
983 # test something that aliases (this only works because
984 # the unit test SRAM has a depth of 512)
985 yield i_in.req.eq(1)
986 yield i_in.nia.eq(Const(0x0000000000000100, 64))
987 yield
988 yield
989 valid = yield i_out.valid
990 assert not valid
991 for i in range(30):
992 yield
993 yield
994 insn = yield i_out.insn
995 valid = yield i_out.valid
996 insn = yield i_out.insn
997 assert valid
998 assert insn == 0x00000040, \
999 "insn @%x=%x expected 00000040" % (nia, insn)
1000 yield i_in.req.eq(0)
1001
1002
1003 def test_icache(mem):
1004 from soc.config.test.test_loadstore import TestMemPspec
1005 pspec = TestMemPspec(addr_wid=32,
1006 mask_wid=8,
1007 reg_wid=64,
1008 )
1009 dut = ICache(pspec)
1010
1011 memory = Memory(width=64, depth=512, init=mem)
1012 sram = SRAM(memory=memory, granularity=8)
1013
1014 m = Module()
1015
1016 m.submodules.icache = dut
1017 m.submodules.sram = sram
1018
1019 m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
1020 m.d.comb += sram.bus.stb.eq(dut.bus.stb)
1021 m.d.comb += sram.bus.we.eq(dut.bus.we)
1022 m.d.comb += sram.bus.sel.eq(dut.bus.sel)
1023 m.d.comb += sram.bus.adr.eq(dut.bus.adr)
1024 m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
1025
1026 m.d.comb += dut.bus.ack.eq(sram.bus.ack)
1027 m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
1028
1029 # nmigen Simulation
1030 sim = Simulator(m)
1031 sim.add_clock(1e-6)
1032
1033 sim.add_sync_process(wrap(icache_sim(dut)))
1034 with sim.write_vcd('test_icache.vcd'):
1035 sim.run()
1036
1037
1038 if __name__ == '__main__':
1039 from soc.config.test.test_loadstore import TestMemPspec
1040 pspec = TestMemPspec(addr_wid=64,
1041 mask_wid=8,
1042 reg_wid=64,
1043 )
1044 dut = ICache(pspec)
1045 vl = rtlil.convert(dut, ports=[])
1046 with open("test_icache.il", "w") as f:
1047 f.write(vl)
1048
1049 # set up memory: every 32 bits holds an incrementing value 0 1 2 ...
1050 mem = []
1051 for i in range(512):
1052 mem.append((i*2) | ((i*2+1)<<32))
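
# each 64-bit memory word i therefore holds instruction value 2*i in bits
# 0-31 and 2*i+1 in bits 32-63, which matches the values that icache_sim()
# asserts (e.g. the instruction fetched from address 0x4 reads back as 1)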
1053
1054 test_icache(mem)