3 based on Anton Blanchard microwatt icache.vhdl
7 TODO (in no specific order):
8 * Add debug interface to inspect cache content
9 * Add snoop/invalidate path
10 * Add multi-hit error detection
11 * Pipelined bus interface (wb or axi)
12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
13 * Add optimization: service hits on partially loaded lines
14 * Add optimization: (maybe) interrupt reload on flush/redirect
15 * Check if playing with the geometry of the cache tags allows for more
16 efficient use of distributed RAM and less logic/muxes. Currently we
17 write TAG_BITS width which may not match full ram blocks and might
18 cause muxes to be inferred for "partial writes".
19 * Check if making the read size of PLRU a ROM helps utilization
23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
25 (discussion about brams for ECP5)
29 from enum
import (Enum
, unique
)
30 from nmigen
import (Module
, Signal
, Elaboratable
, Cat
, Array
, Const
, Repl
,
32 from nmigen
.cli
import main
, rtlil
33 from nmutil
.iocontrol
import RecordObject
34 from nmigen
.utils
import log2_int
35 from nmigen
.lib
.coding
import Decoder
36 from nmutil
.util
import Display
38 #from nmutil.plru import PLRU
39 from soc
.experiment
.plru
import PLRU
, PLRUs
40 from soc
.experiment
.cache_ram
import CacheRam
42 from soc
.experiment
.mem_types
import (Fetch1ToICacheType
,
46 from soc
.experiment
.wb_types
import (WB_ADDR_BITS
, WB_DATA_BITS
,
47 WB_SEL_BITS
, WBAddrType
, WBDataType
,
48 WBSelType
, WBMasterOut
, WBSlaveOut
,
51 from nmigen_soc
.wishbone
.bus
import Interface
52 from soc
.minerva
.units
.fetch
import FetchUnitInterface
56 from soc
.bus
.sram
import SRAM
57 from nmigen
import Memory
58 from nmutil
.util
import wrap
59 from nmigen
.cli
import main
, rtlil
61 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
62 # Also, check out the cxxsim nmigen branch, and latest yosys from git
63 from nmutil
.sim_tmp_alternative
import Simulator
, Settle
68 # BRAM organisation: We never access more than wishbone_data_bits
69 # at a time so to save resources we make the array only that wide,
70 # and use consecutive indices to make a cache "line"
72 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
73 ROW_SIZE
= WB_DATA_BITS
// 8
74 # Number of lines in a set
78 # L1 ITLB number of entries (direct mapped)
80 # L1 ITLB log_2(page_size)
82 # Number of real address bits that we store
84 # Non-zero to enable log data collection
87 ROW_SIZE_BITS
= ROW_SIZE
* 8
88 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
89 ROW_PER_LINE
= LINE_SIZE
// ROW_SIZE
90 # BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
91 BRAM_ROWS
= NUM_LINES
* ROW_PER_LINE
92 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
93 INSN_PER_ROW
= ROW_SIZE_BITS
// 32
95 # Bit fields counts in the address
97 # INSN_BITS is the number of bits to select an instruction in a row
98 INSN_BITS
= log2_int(INSN_PER_ROW
)
99 # ROW_BITS is the number of bits to select a row
100 ROW_BITS
= log2_int(BRAM_ROWS
)
101 # ROW_LINE_BITS is the number of bits to select a row within a line
102 ROW_LINE_BITS
= log2_int(ROW_PER_LINE
)
103 # LINE_OFF_BITS is the number of bits for the offset in a cache line
104 LINE_OFF_BITS
= log2_int(LINE_SIZE
)
105 # ROW_OFF_BITS is the number of bits for the offset in a row
106 ROW_OFF_BITS
= log2_int(ROW_SIZE
)
107 # INDEX_BITS is the number of bits to select a cache line
108 INDEX_BITS
= log2_int(NUM_LINES
)
109 # SET_SIZE_BITS is the log base 2 of the set size
110 SET_SIZE_BITS
= LINE_OFF_BITS
+ INDEX_BITS
111 # TAG_BITS is the number of bits of the tag part of the address
112 TAG_BITS
= REAL_ADDR_BITS
- SET_SIZE_BITS
113 # TAG_WIDTH is the width in bits of each way of the tag RAM
114 TAG_WIDTH
= TAG_BITS
+ 7 - ((TAG_BITS
+ 7) % 8)
116 # WAY_BITS is the number of bits to select a way
117 WAY_BITS
= log2_int(NUM_WAYS
)
118 TAG_RAM_WIDTH
= TAG_BITS
* NUM_WAYS
121 TLB_BITS
= log2_int(TLB_SIZE
)
122 TLB_EA_TAG_BITS
= 64 - (TLB_LG_PGSZ
+ TLB_BITS
)
# Print the derived cache geometry at import time, for debug/inspection.
print("BRAM_ROWS =", BRAM_ROWS)
print("INDEX_BITS =", INDEX_BITS)
print("INSN_BITS =", INSN_BITS)
print("INSN_PER_ROW =", INSN_PER_ROW)
print("LINE_SIZE =", LINE_SIZE)
print("LINE_OFF_BITS =", LINE_OFF_BITS)
print("LOG_LENGTH =", LOG_LENGTH)
print("NUM_LINES =", NUM_LINES)
print("NUM_WAYS =", NUM_WAYS)
print("REAL_ADDR_BITS =", REAL_ADDR_BITS)
print("ROW_BITS =", ROW_BITS)
print("ROW_OFF_BITS =", ROW_OFF_BITS)
print("ROW_LINE_BITS =", ROW_LINE_BITS)
print("ROW_PER_LINE =", ROW_PER_LINE)
print("ROW_SIZE =", ROW_SIZE)
print("ROW_SIZE_BITS =", ROW_SIZE_BITS)
print("SET_SIZE_BITS =", SET_SIZE_BITS)
print("TAG_BITS =", TAG_BITS)
print("TAG_RAM_WIDTH =", TAG_RAM_WIDTH)
# NOTE(review): TAG_BITS is printed twice; the second occurrence may have
# been intended to be TAG_WIDTH -- confirm against upstream.
print("TAG_BITS =", TAG_BITS)
print("TLB_BITS =", TLB_BITS)
print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
print("TLB_LG_PGSZ =", TLB_LG_PGSZ)
print("TLB_PTE_BITS =", TLB_PTE_BITS)
print("TLB_SIZE =", TLB_SIZE)
print("WAY_BITS =", WAY_BITS)
153 # from microwatt/utils.vhdl
155 return n
!= 0 and (n
& (n
- 1)) == 0
# Sanity-check the cache geometry derived above: sizes must be powers of
# two and the bit-field widths must partition the real address exactly.
assert LINE_SIZE % ROW_SIZE == 0
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
    "geometry bits don't add up"
assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
    "geometry bits don't add up"
171 # Example of layout for 32 lines of 64 bytes:
173 # .. tag |index| line |
175 # .. | | | |00| zero (2)
176 # .. | | |-| | INSN_BITS (1)
177 # .. | |---| | ROW_LINE_BITS (3)
178 # .. | |--- - --| LINE_OFF_BITS (6)
179 # .. | |- --| ROW_OFF_BITS (3)
180 # .. |----- ---| | ROW_BITS (8)
181 # .. |-----| | INDEX_BITS (5)
182 # .. --------| | TAG_BITS (53)
184 # The cache data BRAM organized as described above for each way
185 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
187 # The cache tags LUTRAM has a row per set. Vivado is a pain and will
188 # not handle a clean (commented) definition of the cache tags as a 3d
189 # memory. For now, work around it by putting all the tags
191 tag_layout
= [('valid', NUM_WAYS
),
192 ('tag', TAG_RAM_WIDTH
),
194 return Array(Record(tag_layout
, name
="tag%d" % x
) for x
in range(NUM_LINES
))
def RowPerLineValidArray():
    """Return an Array of per-row valid bits for a line being reloaded.

    One Signal per wishbone row of a cache line (ROW_PER_LINE entries).
    """
    sigs = [Signal(name="rows_valid_%d" % x)
            for x in range(ROW_PER_LINE)]
    return Array(sigs)
201 # TODO to be passed to nmigen as ram attributes
202 # attribute ram_style : string;
203 # attribute ram_style of cache_tags : signal is "distributed";
206 return Array(Signal(name
="tlb_valid%d" % x
)
207 for x
in range(TLB_SIZE
))
210 tlb_layout
= [ ('tag', TLB_EA_TAG_BITS
),
211 ('pte', TLB_PTE_BITS
)
213 return Record(tlb_layout
, name
=name
)
216 return Array(TLBRecord("tlb%d" % x
) for x
in range(TLB_SIZE
))
218 # PLRU output interface
220 return Array(Signal(WAY_BITS
, name
="plru_out_%d" %x) \
221 for x
in range(NUM_LINES
))
223 # Return the cache line index (tag index) for an address
225 return addr
[LINE_OFF_BITS
:SET_SIZE_BITS
]
227 # Return the cache row index (data memory) for an address
229 return addr
[ROW_OFF_BITS
:SET_SIZE_BITS
]
231 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of a row within its cache line."""
    # First restrict to the ROW_BITS-wide row index, then keep only the
    # low ROW_LINE_BITS, which select the row inside the line.
    row_idx = row[:ROW_BITS]
    return row_idx[:ROW_LINE_BITS]
235 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """Return whether addr points at the last row of its cache line.

    Compares the row-within-line field of the address against `last`.
    """
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
239 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """Return whether `row` is the final row of its cache line."""
    return last == get_row_of_line(row)
243 # Return the next row in the current cache line. We use a dedicated
244 # function in order to limit the size of the generated adder to be
245 # only the bits within a cache line (3 bits with default settings)
247 row_v
= row
[0:ROW_LINE_BITS
] + 1
248 return Cat(row_v
[:ROW_LINE_BITS
], row
[ROW_LINE_BITS
:])
250 # Read the instruction word for the given address
251 # in the current cache row
def read_insn_word(addr, data):
    """Select the 32-bit instruction word for `addr` from a cache row.

    Address bits [2 : INSN_BITS+2] pick the word within the row; bits
    [0:2] are the byte offset inside a (4-byte-aligned) instruction.
    """
    word_idx = addr[2:INSN_BITS+2]
    return data.word_select(word_idx, 32)
256 # Get the tag value from the address
258 return addr
[SET_SIZE_BITS
:REAL_ADDR_BITS
]
260 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Extract the tag for the given way from a tag-RAM row.

    Each way occupies a TAG_WIDTH-bit field within the row; only the
    low TAG_BITS of that field hold the tag itself.
    NOTE(review): TAG_RAM_WIDTH is defined as TAG_BITS * NUM_WAYS while
    this indexes in TAG_WIDTH-sized steps -- confirm the row width is
    sufficient for the highest way.
    """
    field = tagset.word_select(way, TAG_WIDTH)
    return field[:TAG_BITS]
264 # Write a tag to tag memory row
def write_tag(way, tagset, tag):
    """Return an assignment that stores `tag` into the given way's slot
    of a tag-RAM row (reuses read_tag to locate the slot)."""
    slot = read_tag(way, tagset)
    return slot.eq(tag)
268 # Simple hash for direct-mapped TLB index
270 hsh
= (addr
[TLB_LG_PGSZ
:TLB_LG_PGSZ
+ TLB_BITS
] ^
271 addr
[TLB_LG_PGSZ
+ TLB_BITS
:TLB_LG_PGSZ
+ 2 * TLB_BITS
] ^
272 addr
[TLB_LG_PGSZ
+ 2 * TLB_BITS
:TLB_LG_PGSZ
+ 3 * TLB_BITS
])
276 # Cache reload state machine
284 class RegInternal(RecordObject
):
287 # Cache hit state (Latches for 1 cycle BRAM access)
288 self
.hit_way
= Signal(WAY_BITS
)
289 self
.hit_nia
= Signal(64)
290 self
.hit_smark
= Signal()
291 self
.hit_valid
= Signal()
293 # Cache miss state (reload state machine)
294 self
.state
= Signal(State
, reset
=State
.IDLE
)
295 self
.wb
= WBMasterOut("wb")
296 self
.req_adr
= Signal(64)
297 self
.store_way
= Signal(WAY_BITS
)
298 self
.store_index
= Signal(INDEX_BITS
)
299 self
.store_row
= Signal(ROW_BITS
)
300 self
.store_tag
= Signal(TAG_BITS
)
301 self
.store_valid
= Signal()
302 self
.end_row_ix
= Signal(ROW_LINE_BITS
)
303 self
.rows_valid
= RowPerLineValidArray()
306 self
.fetch_failed
= Signal()
309 class ICache(FetchUnitInterface
, Elaboratable
):
310 """64 bit direct mapped icache. All instructions are 4B aligned."""
311 def __init__(self
, pspec
):
312 FetchUnitInterface
.__init
__(self
, pspec
)
313 self
.i_in
= Fetch1ToICacheType(name
="i_in")
314 self
.i_out
= ICacheToDecode1Type(name
="i_out")
316 self
.m_in
= MMUToICacheType(name
="m_in")
318 self
.stall_in
= Signal()
319 self
.stall_out
= Signal()
320 self
.flush_in
= Signal()
321 self
.inval_in
= Signal()
323 # standard naming (wired to non-standard for compatibility)
324 self
.bus
= Interface(addr_width
=32,
331 self
.log_out
= Signal(54)
333 # use FetchUnitInterface, helps keep some unit tests running
334 self
.use_fetch_iface
= False
    def use_fetch_interface(self):
        # Opt in to the standard FetchUnitInterface wiring: elaborate()
        # checks this flag and connects a_i_valid/a_pc_i/f_* signals
        # instead of the legacy i_in/i_out-only hookup.
        self.use_fetch_iface = True
339 # Generate a cache RAM for each way
340 def rams(self
, m
, r
, cache_out_row
, use_previous
,
341 replace_way
, req_row
):
346 bus
, stall_in
= self
.bus
, self
.stall_in
348 # read condition (for every cache ram)
350 comb
+= do_read
.eq(~
(stall_in | use_previous
))
352 rd_addr
= Signal(ROW_BITS
)
353 wr_addr
= Signal(ROW_BITS
)
354 comb
+= rd_addr
.eq(req_row
)
355 comb
+= wr_addr
.eq(r
.store_row
)
357 # binary-to-unary converters: replace-way enabled by bus.ack,
358 # hit-way left permanently enabled
359 m
.submodules
.replace_way_e
= re
= Decoder(NUM_WAYS
)
360 m
.submodules
.hit_way_e
= he
= Decoder(NUM_WAYS
)
361 comb
+= re
.i
.eq(replace_way
)
362 comb
+= re
.n
.eq(~bus
.ack
)
363 comb
+= he
.i
.eq(r
.hit_way
)
365 for i
in range(NUM_WAYS
):
366 do_write
= Signal(name
="do_wr_%d" % i
)
367 d_out
= Signal(ROW_SIZE_BITS
, name
="d_out_%d" % i
)
368 wr_sel
= Signal(ROW_SIZE
, name
="wr_sel_%d" % i
)
370 way
= CacheRam(ROW_BITS
, ROW_SIZE_BITS
, TRACE
=True, ram_num
=i
)
371 m
.submodules
["cacheram_%d" % i
] = way
373 comb
+= way
.rd_en
.eq(do_read
)
374 comb
+= way
.rd_addr
.eq(rd_addr
)
375 comb
+= d_out
.eq(way
.rd_data_o
)
376 comb
+= way
.wr_sel
.eq(wr_sel
)
377 comb
+= way
.wr_addr
.eq(wr_addr
)
378 comb
+= way
.wr_data
.eq(bus
.dat_r
)
380 comb
+= do_write
.eq(re
.o
[i
])
383 sync
+= Display("cache write adr: %x data: %lx",
384 wr_addr
, way
.wr_data
)
387 comb
+= cache_out_row
.eq(d_out
)
389 sync
+= Display("cache read adr: %x data: %x",
392 comb
+= wr_sel
.eq(Repl(do_write
, ROW_SIZE
))
395 def maybe_plrus(self
, m
, r
, plru_victim
):
402 m
.submodules
.plrus
= plru
= PLRUs(NUM_LINES
, WAY_BITS
)
403 comb
+= plru
.way
.eq(r
.hit_way
)
404 comb
+= plru
.valid
.eq(r
.hit_valid
)
405 comb
+= plru
.index
.eq(get_index(r
.hit_nia
))
406 comb
+= plru
.isel
.eq(r
.store_index
) # select victim
407 comb
+= plru_victim
.eq(plru
.o_index
) # selected victim
409 # TLB hit detection and real address generation
410 def itlb_lookup(self
, m
, tlb_req_index
, itlb
, itlb_valid
,
411 real_addr
, ra_valid
, eaa_priv
,
412 priv_fault
, access_ok
):
418 # use an *asynchronous* Memory read port here (combinatorial)
419 m
.submodules
.rd_tlb
= rd_tlb
= self
.tlbmem
.read_port(domain
="comb")
420 tlb
= TLBRecord("tlb_rdport")
421 pte
, ttag
= tlb
.pte
, tlb
.tag
423 comb
+= tlb_req_index
.eq(hash_ea(i_in
.nia
))
424 comb
+= rd_tlb
.addr
.eq(tlb_req_index
)
425 comb
+= tlb
.eq(rd_tlb
.data
)
427 with m
.If(i_in
.virt_mode
):
428 comb
+= real_addr
.eq(Cat(i_in
.nia
[:TLB_LG_PGSZ
],
429 pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]))
431 with m
.If(ttag
== i_in
.nia
[TLB_LG_PGSZ
+ TLB_BITS
:64]):
432 comb
+= ra_valid
.eq(itlb_valid
[tlb_req_index
])
434 comb
+= eaa_priv
.eq(pte
[3])
437 comb
+= real_addr
.eq(i_in
.nia
[:REAL_ADDR_BITS
])
438 comb
+= ra_valid
.eq(1)
439 comb
+= eaa_priv
.eq(1)
441 # No IAMR, so no KUEP support for now
442 comb
+= priv_fault
.eq(eaa_priv
& ~i_in
.priv_mode
)
443 comb
+= access_ok
.eq(ra_valid
& ~priv_fault
)
446 def itlb_update(self
, m
, itlb
, itlb_valid
):
452 wr_index
= Signal(TLB_SIZE
)
453 comb
+= wr_index
.eq(hash_ea(m_in
.addr
))
455 m
.submodules
.wr_tlb
= wr_tlb
= self
.tlbmem
.write_port()
457 with m
.If(m_in
.tlbie
& m_in
.doall
):
458 # Clear all valid bits
459 for i
in range(TLB_SIZE
):
460 sync
+= itlb_valid
[i
].eq(0)
462 with m
.Elif(m_in
.tlbie
):
463 # Clear entry regardless of hit or miss
464 sync
+= itlb_valid
[wr_index
].eq(0)
466 with m
.Elif(m_in
.tlbld
):
467 tlb
= TLBRecord("tlb_wrport")
468 comb
+= tlb
.tag
.eq(m_in
.addr
[TLB_LG_PGSZ
+ TLB_BITS
:64])
469 comb
+= tlb
.pte
.eq(m_in
.pte
)
470 comb
+= wr_tlb
.en
.eq(1)
471 comb
+= wr_tlb
.addr
.eq(wr_index
)
472 comb
+= wr_tlb
.data
.eq(tlb
)
473 sync
+= itlb_valid
[wr_index
].eq(1)
475 # Cache hit detection, output to fetch2 and other misc logic
476 def icache_comb(self
, m
, use_previous
, r
, req_index
, req_row
,
477 req_hit_way
, req_tag
, real_addr
, req_laddr
,
478 cache_tags
, access_ok
,
479 req_is_hit
, req_is_miss
, replace_way
,
480 plru_victim
, cache_out_row
):
484 i_in
, i_out
, bus
= self
.i_in
, self
.i_out
, self
.bus
485 flush_in
, stall_out
= self
.flush_in
, self
.stall_out
488 hit_way
= Signal(WAY_BITS
)
490 # i_in.sequential means that i_in.nia this cycle is 4 more than
491 # last cycle. If we read more than 32 bits at a time, had a
492 # cache hit last cycle, and we don't want the first 32-bit chunk
493 # then we can keep the data we read last cycle and just use that.
494 with m
.If(i_in
.nia
[2:INSN_BITS
+2] != 0):
495 comb
+= use_previous
.eq(i_in
.sequential
& r
.hit_valid
)
497 # Extract line, row and tag from request
498 comb
+= req_index
.eq(get_index(i_in
.nia
))
499 comb
+= req_row
.eq(get_row(i_in
.nia
))
500 comb
+= req_tag
.eq(get_tag(real_addr
))
502 # Calculate address of beginning of cache row, will be
503 # used for cache miss processing if needed
504 comb
+= req_laddr
.eq(Cat(
505 Const(0, ROW_OFF_BITS
),
506 real_addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
],
509 # Test if pending request is a hit on any way
511 comb
+= hitcond
.eq((r
.state
== State
.WAIT_ACK
)
512 & (req_index
== r
.store_index
)
513 & r
.rows_valid
[req_row
% ROW_PER_LINE
]
515 # i_in.req asserts Decoder active
516 cvb
= Signal(NUM_WAYS
)
517 ctag
= Signal(TAG_RAM_WIDTH
)
518 comb
+= ctag
.eq(cache_tags
[req_index
].tag
)
519 comb
+= cvb
.eq(cache_tags
[req_index
].valid
)
520 m
.submodules
.store_way_e
= se
= Decoder(NUM_WAYS
)
521 comb
+= se
.i
.eq(r
.store_way
)
522 comb
+= se
.n
.eq(~i_in
.req
)
523 for i
in range(NUM_WAYS
):
524 tagi
= Signal(TAG_BITS
, name
="tag_i%d" % i
)
525 hit_test
= Signal(name
="hit_test%d" % i
)
526 is_tag_hit
= Signal(name
="is_tag_hit_%d" % i
)
527 comb
+= tagi
.eq(read_tag(i
, ctag
))
528 comb
+= hit_test
.eq(se
.o
[i
])
529 comb
+= is_tag_hit
.eq((cvb
[i
] |
(hitcond
& hit_test
)) &
531 with m
.If(is_tag_hit
):
532 comb
+= hit_way
.eq(i
)
535 # Generate the "hit" and "miss" signals
536 # for the synchronous blocks
537 with m
.If(i_in
.req
& access_ok
& ~flush_in
):
538 comb
+= req_is_hit
.eq(is_hit
)
539 comb
+= req_is_miss
.eq(~is_hit
)
541 comb
+= req_hit_way
.eq(hit_way
)
543 # The way to replace on a miss
544 with m
.If(r
.state
== State
.CLR_TAG
):
545 comb
+= replace_way
.eq(plru_victim
)
547 comb
+= replace_way
.eq(r
.store_way
)
549 # Output instruction from current cache row
551 # Note: This is a mild violation of our design principle of
552 # having pipeline stages output from a clean latch. In this
553 # case we output the result of a mux. The alternative would
554 # be output an entire row which I prefer not to do just yet
555 # as it would force fetch2 to know about some of the cache
556 # geometry information.
557 comb
+= i_out
.insn
.eq(read_insn_word(r
.hit_nia
, cache_out_row
))
558 comb
+= i_out
.valid
.eq(r
.hit_valid
)
559 comb
+= i_out
.nia
.eq(r
.hit_nia
)
560 comb
+= i_out
.stop_mark
.eq(r
.hit_smark
)
561 comb
+= i_out
.fetch_failed
.eq(r
.fetch_failed
)
563 # Stall fetch1 if we have a miss on cache or TLB
564 # or a protection fault
565 comb
+= stall_out
.eq(~
(is_hit
& access_ok
))
567 # Wishbone requests output (from the cache miss reload machine)
568 comb
+= bus
.we
.eq(r
.wb
.we
)
569 comb
+= bus
.adr
.eq(r
.wb
.adr
)
570 comb
+= bus
.sel
.eq(r
.wb
.sel
)
571 comb
+= bus
.stb
.eq(r
.wb
.stb
)
572 comb
+= bus
.dat_w
.eq(r
.wb
.dat
)
573 comb
+= bus
.cyc
.eq(r
.wb
.cyc
)
575 # Cache hit synchronous machine
576 def icache_hit(self
, m
, use_previous
, r
, req_is_hit
, req_hit_way
,
577 req_index
, req_tag
, real_addr
):
580 i_in
, stall_in
= self
.i_in
, self
.stall_in
581 flush_in
= self
.flush_in
583 # keep outputs to fetch2 unchanged on a stall
584 # except that flush or reset sets valid to 0
585 # If use_previous, keep the same data as last
586 # cycle and use the second half
587 with m
.If(stall_in | use_previous
):
589 sync
+= r
.hit_valid
.eq(0)
591 # On a hit, latch the request for the next cycle,
592 # when the BRAM data will be available on the
593 # cache_out output of the corresponding way
594 sync
+= r
.hit_valid
.eq(req_is_hit
)
596 with m
.If(req_is_hit
):
597 sync
+= r
.hit_way
.eq(req_hit_way
)
598 sync
+= Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
599 "way:%x RA:%x", i_in
.nia
, i_in
.virt_mode
,
600 i_in
.stop_mark
, req_index
, req_tag
,
601 req_hit_way
, real_addr
)
603 with m
.If(~stall_in
):
604 # Send stop marks and NIA down regardless of validity
605 sync
+= r
.hit_smark
.eq(i_in
.stop_mark
)
606 sync
+= r
.hit_nia
.eq(i_in
.nia
)
608 def icache_miss_idle(self
, m
, r
, req_is_miss
, req_laddr
,
609 req_index
, req_tag
, replace_way
, real_addr
):
615 # Reset per-row valid flags, only used in WAIT_ACK
616 for i
in range(ROW_PER_LINE
):
617 sync
+= r
.rows_valid
[i
].eq(0)
619 # We need to read a cache line
620 with m
.If(req_is_miss
):
622 "cache miss nia:%x IR:%x SM:%x idx:%x "
623 " way:%x tag:%x RA:%x", i_in
.nia
,
624 i_in
.virt_mode
, i_in
.stop_mark
, req_index
,
625 replace_way
, req_tag
, real_addr
)
627 # Keep track of our index and way for subsequent stores
628 st_row
= Signal(ROW_BITS
)
629 comb
+= st_row
.eq(get_row(req_laddr
))
630 sync
+= r
.store_index
.eq(req_index
)
631 sync
+= r
.store_row
.eq(st_row
)
632 sync
+= r
.store_tag
.eq(req_tag
)
633 sync
+= r
.store_valid
.eq(1)
634 sync
+= r
.end_row_ix
.eq(get_row_of_line(st_row
) - 1)
636 # Prep for first wishbone read. We calculate the address
637 # of the start of the cache line and start the WB cycle.
638 sync
+= r
.req_adr
.eq(req_laddr
)
639 sync
+= r
.wb
.cyc
.eq(1)
640 sync
+= r
.wb
.stb
.eq(1)
642 # Track that we had one request sent
643 sync
+= r
.state
.eq(State
.CLR_TAG
)
645 def icache_miss_clr_tag(self
, m
, r
, replace_way
,
651 # Get victim way from plru
652 sync
+= r
.store_way
.eq(replace_way
)
654 # Force misses on that way while reloading that line
655 cv
= Signal(INDEX_BITS
)
656 comb
+= cv
.eq(cache_tags
[req_index
].valid
)
657 comb
+= cv
.bit_select(replace_way
, 1).eq(0)
658 sync
+= cache_tags
[req_index
].valid
.eq(cv
)
660 for i
in range(NUM_WAYS
):
661 with m
.If(i
== replace_way
):
662 comb
+= tagset
.eq(cache_tags
[r
.store_index
].tag
)
663 comb
+= write_tag(i
, tagset
, r
.store_tag
)
664 sync
+= cache_tags
[r
.store_index
].tag
.eq(tagset
)
666 sync
+= r
.state
.eq(State
.WAIT_ACK
)
668 def icache_miss_wait_ack(self
, m
, r
, replace_way
, inval_in
,
669 cache_tags
, stbs_done
):
675 # Requests are all sent if stb is 0
677 comb
+= stbs_zero
.eq(r
.wb
.stb
== 0)
678 comb
+= stbs_done
.eq(stbs_zero
)
680 # If we are still sending requests, was one accepted?
681 with m
.If(~bus
.stall
& ~stbs_zero
):
682 # That was the last word? We are done sending.
683 # Clear stb and set stbs_done so we can handle
684 # an eventual last ack on the same cycle.
685 with m
.If(is_last_row_addr(r
.req_adr
, r
.end_row_ix
)):
686 sync
+= Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
687 "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
688 "stbs_done:%x", r
.wb
.adr
, r
.end_row_ix
,
689 r
.wb
.stb
, stbs_zero
, stbs_done
)
690 sync
+= r
.wb
.stb
.eq(0)
691 comb
+= stbs_done
.eq(1)
693 # Calculate the next row address
694 rarange
= Signal(LINE_OFF_BITS
- ROW_OFF_BITS
)
695 comb
+= rarange
.eq(r
.req_adr
[ROW_OFF_BITS
:LINE_OFF_BITS
] + 1)
696 sync
+= r
.req_adr
[ROW_OFF_BITS
:LINE_OFF_BITS
].eq(rarange
)
697 sync
+= Display("RARANGE r.req_adr:%x rarange:%x "
698 "stbs_zero:%x stbs_done:%x",
699 r
.req_adr
, rarange
, stbs_zero
, stbs_done
)
701 # Incoming acks processing
703 sync
+= Display("WB_IN_ACK data:%x stbs_zero:%x "
705 bus
.dat_r
, stbs_zero
, stbs_done
)
707 sync
+= r
.rows_valid
[r
.store_row
% ROW_PER_LINE
].eq(1)
709 # Check for completion
710 with m
.If(stbs_done
& is_last_row(r
.store_row
, r
.end_row_ix
)):
711 # Complete wishbone cycle
712 sync
+= r
.wb
.cyc
.eq(0)
713 # be nice, clear addr
714 sync
+= r
.req_adr
.eq(0)
716 # Cache line is now valid
717 cv
= Signal(INDEX_BITS
)
718 comb
+= cv
.eq(cache_tags
[r
.store_index
].valid
)
719 comb
+= cv
.bit_select(replace_way
, 1).eq(
720 r
.store_valid
& ~inval_in
)
721 sync
+= cache_tags
[r
.store_index
].valid
.eq(cv
)
723 sync
+= r
.state
.eq(State
.IDLE
)
725 # move on to next request in row
726 # Increment store row counter
727 sync
+= r
.store_row
.eq(next_row(r
.store_row
))
729 # Cache miss/reload synchronous machine
730 def icache_miss(self
, m
, r
, req_is_miss
,
731 req_index
, req_laddr
, req_tag
, replace_way
,
732 cache_tags
, access_ok
, real_addr
):
736 i_in
, bus
, m_in
= self
.i_in
, self
.bus
, self
.m_in
737 stall_in
, flush_in
= self
.stall_in
, self
.flush_in
738 inval_in
= self
.inval_in
740 tagset
= Signal(TAG_RAM_WIDTH
)
743 comb
+= r
.wb
.sel
.eq(-1)
744 comb
+= r
.wb
.adr
.eq(r
.req_adr
[3:])
746 # Process cache invalidations
748 for i
in range(NUM_LINES
):
749 sync
+= cache_tags
[i
].valid
.eq(0)
750 sync
+= r
.store_valid
.eq(0)
753 with m
.Switch(r
.state
):
755 with m
.Case(State
.IDLE
):
756 self
.icache_miss_idle(m
, r
, req_is_miss
, req_laddr
,
757 req_index
, req_tag
, replace_way
,
760 with m
.Case(State
.CLR_TAG
, State
.WAIT_ACK
):
761 with m
.If(r
.state
== State
.CLR_TAG
):
762 self
.icache_miss_clr_tag(m
, r
, replace_way
,
763 req_index
, tagset
, cache_tags
)
765 self
.icache_miss_wait_ack(m
, r
, replace_way
, inval_in
,
766 cache_tags
, stbs_done
)
768 # TLB miss and protection fault processing
769 with m
.If(flush_in | m_in
.tlbld
):
770 sync
+= r
.fetch_failed
.eq(0)
771 with m
.Elif(i_in
.req
& ~access_ok
& ~stall_in
):
772 sync
+= r
.fetch_failed
.eq(1)
774 # icache_log: if LOG_LENGTH > 0 generate
775 def icache_log(self
, m
, req_hit_way
, ra_valid
, access_ok
,
776 req_is_miss
, req_is_hit
, lway
, wstate
, r
):
780 bus
, i_out
= self
.bus
, self
.i_out
781 log_out
, stall_out
= self
.log_out
, self
.stall_out
783 # Output data to logger
784 for i
in range(LOG_LENGTH
):
785 log_data
= Signal(54)
786 lway
= Signal(WAY_BITS
)
789 sync
+= lway
.eq(req_hit_way
)
792 with m
.If(r
.state
!= State
.IDLE
):
795 sync
+= log_data
.eq(Cat(
796 ra_valid
, access_ok
, req_is_miss
, req_is_hit
,
797 lway
, wstate
, r
.hit_nia
[2:6], r
.fetch_failed
,
798 stall_out
, bus
.stall
, r
.wb
.cyc
, r
.wb
.stb
,
799 r
.real_addr
[3:6], bus
.ack
, i_out
.insn
, i_out
.valid
801 comb
+= log_out
.eq(log_data
)
803 def elaborate(self
, platform
):
808 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
809 cache_tags
= CacheTagArray()
813 itlb_valid
= TLBValidArray()
815 # TODO to be passed to nmigen as ram attributes
816 # attribute ram_style of itlb_tags : signal is "distributed";
817 # attribute ram_style of itlb_ptes : signal is "distributed";
819 # Privilege bit from PTE EAA field
824 # Async signal on incoming request
825 req_index
= Signal(INDEX_BITS
)
826 req_row
= Signal(ROW_BITS
)
827 req_hit_way
= Signal(WAY_BITS
)
828 req_tag
= Signal(TAG_BITS
)
829 req_is_hit
= Signal()
830 req_is_miss
= Signal()
831 req_laddr
= Signal(64)
833 tlb_req_index
= Signal(TLB_BITS
)
834 real_addr
= Signal(REAL_ADDR_BITS
)
836 priv_fault
= Signal()
838 use_previous
= Signal()
840 cache_out_row
= Signal(ROW_SIZE_BITS
)
842 plru_victim
= Signal(WAY_BITS
)
843 replace_way
= Signal(WAY_BITS
)
845 self
.tlbmem
= Memory(depth
=TLB_SIZE
, width
=TLB_EA_TAG_BITS
+TLB_PTE_BITS
)
847 # call sub-functions putting everything together,
848 # using shared signals established above
849 self
.rams(m
, r
, cache_out_row
, use_previous
, replace_way
, req_row
)
850 self
.maybe_plrus(m
, r
, plru_victim
)
851 self
.itlb_lookup(m
, tlb_req_index
, itlb
, itlb_valid
, real_addr
,
852 ra_valid
, eaa_priv
, priv_fault
,
854 self
.itlb_update(m
, itlb
, itlb_valid
)
855 self
.icache_comb(m
, use_previous
, r
, req_index
, req_row
, req_hit_way
,
856 req_tag
, real_addr
, req_laddr
,
857 cache_tags
, access_ok
, req_is_hit
, req_is_miss
,
858 replace_way
, plru_victim
, cache_out_row
)
859 self
.icache_hit(m
, use_previous
, r
, req_is_hit
, req_hit_way
,
860 req_index
, req_tag
, real_addr
)
861 self
.icache_miss(m
, r
, req_is_miss
, req_index
,
862 req_laddr
, req_tag
, replace_way
, cache_tags
,
863 access_ok
, real_addr
)
864 #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
865 # req_is_miss, req_is_hit, lway, wstate, r)
867 # don't connect up to FetchUnitInterface so that some unit tests
868 # can continue to operate
869 if not self
.use_fetch_iface
:
872 # connect to FetchUnitInterface. FetchUnitInterface is undocumented
873 # so needs checking and iterative revising
874 i_in
, bus
, i_out
= self
.i_in
, self
.bus
, self
.i_out
875 comb
+= i_in
.req
.eq(self
.a_i_valid
)
876 comb
+= i_in
.nia
.eq(self
.a_pc_i
)
877 comb
+= self
.stall_in
.eq(self
.a_stall_i
)
878 comb
+= self
.f_fetch_err_o
.eq(i_out
.fetch_failed
)
879 comb
+= self
.f_badaddr_o
.eq(i_out
.nia
)
880 comb
+= self
.f_instr_o
.eq(i_out
.insn
)
881 comb
+= self
.f_busy_o
.eq(~i_out
.valid
) # probably
883 # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
885 comb
+= ibus
.adr
.eq(self
.bus
.adr
)
886 comb
+= ibus
.dat_w
.eq(self
.bus
.dat_w
)
887 comb
+= ibus
.sel
.eq(self
.bus
.sel
)
888 comb
+= ibus
.cyc
.eq(self
.bus
.cyc
)
889 comb
+= ibus
.stb
.eq(self
.bus
.stb
)
890 comb
+= ibus
.we
.eq(self
.bus
.we
)
892 comb
+= self
.bus
.dat_r
.eq(ibus
.dat_r
)
893 comb
+= self
.bus
.ack
.eq(ibus
.ack
)
894 if hasattr(ibus
, "stall"):
895 comb
+= self
.bus
.stall
.eq(ibus
.stall
)
897 # fake-up the wishbone stall signal to comply with pipeline mode
898 # same thing is done in dcache.py
899 comb
+= self
.bus
.stall
.eq(self
.bus
.cyc
& ~self
.bus
.ack
)
909 yield i_in
.priv_mode
.eq(1)
912 yield i_in
.stop_mark
.eq(0)
913 yield m_out
.tlbld
.eq(0)
914 yield m_out
.tlbie
.eq(0)
915 yield m_out
.addr
.eq(0)
916 yield m_out
.pte
.eq(0)
922 # miss, stalls for a bit
924 yield i_in
.nia
.eq(Const(0x0000000000000004, 64))
926 valid
= yield i_out
.valid
929 valid
= yield i_out
.valid
932 insn
= yield i_out
.insn
933 nia
= yield i_out
.nia
934 assert insn
== 0x00000001, \
935 "insn @%x=%x expected 00000001" % (nia
, insn
)
941 yield i_in
.nia
.eq(Const(0x0000000000000008, 64))
943 valid
= yield i_out
.valid
946 valid
= yield i_out
.valid
949 nia
= yield i_out
.nia
950 insn
= yield i_out
.insn
952 assert insn
== 0x00000002, \
953 "insn @%x=%x expected 00000002" % (nia
, insn
)
957 yield i_in
.nia
.eq(Const(0x0000000000000040, 64))
959 valid
= yield i_out
.valid
962 valid
= yield i_out
.valid
966 insn
= yield i_out
.insn
967 assert insn
== 0x00000010, \
968 "insn @%x=%x expected 00000010" % (nia
, insn
)
970 # test something that aliases (this only works because
971 # the unit test SRAM is a depth of 512)
973 yield i_in
.nia
.eq(Const(0x0000000000000100, 64))
976 valid
= yield i_out
.valid
981 insn
= yield i_out
.insn
982 valid
= yield i_out
.valid
983 insn
= yield i_out
.insn
985 assert insn
== 0x00000040, \
986 "insn @%x=%x expected 00000040" % (nia
, insn
)
990 def test_icache(mem
):
991 from soc
.config
.test
.test_loadstore
import TestMemPspec
992 pspec
= TestMemPspec(addr_wid
=32,
998 memory
= Memory(width
=64, depth
=512, init
=mem
)
999 sram
= SRAM(memory
=memory
, granularity
=8)
1003 m
.submodules
.icache
= dut
1004 m
.submodules
.sram
= sram
1006 m
.d
.comb
+= sram
.bus
.cyc
.eq(dut
.bus
.cyc
)
1007 m
.d
.comb
+= sram
.bus
.stb
.eq(dut
.bus
.stb
)
1008 m
.d
.comb
+= sram
.bus
.we
.eq(dut
.bus
.we
)
1009 m
.d
.comb
+= sram
.bus
.sel
.eq(dut
.bus
.sel
)
1010 m
.d
.comb
+= sram
.bus
.adr
.eq(dut
.bus
.adr
)
1011 m
.d
.comb
+= sram
.bus
.dat_w
.eq(dut
.bus
.dat_w
)
1013 m
.d
.comb
+= dut
.bus
.ack
.eq(sram
.bus
.ack
)
1014 m
.d
.comb
+= dut
.bus
.dat_r
.eq(sram
.bus
.dat_r
)
1020 sim
.add_sync_process(wrap(icache_sim(dut
)))
1021 with sim
.write_vcd('test_icache.vcd'):
1025 if __name__
== '__main__':
1026 from soc
.config
.test
.test_loadstore
import TestMemPspec
1027 pspec
= TestMemPspec(addr_wid
=64,
1032 vl
= rtlil
.convert(dut
, ports
=[])
1033 with
open("test_icache.il", "w") as f
:
1036 # set up memory every 32-bits with incrementing values 0 1 2 ...
1038 for i
in range(512):
1039 mem
.append((i
*2) |
((i
*2+1)<<32))