"""based on Anton Blanchard microwatt dcache.vhdl"""

from enum import Enum, unique

from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.cli import rtlil

from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 32    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
#     WB_DATA_BITS at a time so to save
#     resources we make the array only that wide, and
#     use consecutive indices to make a cache "line"
#
#     ROW_SIZE is the width in bytes of the BRAM
#     (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

# Bit fields counts in the address
#
# REAL_ADDR_BITS is the number of real address
# bits that we store (56, matching the microwatt original
# and the TAG_BITS(45) figure in the layout diagram below)
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |---|    | ROW_LINE_BITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (45)

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
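
# A worked example of the geometry above with the default settings
# (a sanity sketch only; the asserts further down check the same sums):
#
#   ROW_SIZE      = 64 // 8 = 8     bytes per BRAM row
#   ROW_PER_LINE  = 64 // 8 = 8     wishbone beats per cache line
#   BRAM_ROWS     = 32 * 8  = 256   BRAM rows for the whole dcache
#   ROW_BITS      = 8,  ROW_LINE_BITS = 3,  LINE_OFF_BITS = 6
#   ROW_OFF_BITS  = 3,  INDEX_BITS    = 5,  SET_SIZE_BITS = 11
#   TAG_BITS      = 56 - 11 = 45,   TAG_WIDTH = 48,  WAY_BITS = 2
#   TAG_RAM_WIDTH = 48 * 4  = 192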


def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

def CacheValidBitsArray():
    # one valid bit per way, for each cache line
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))


# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
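
# Worked values for the default TLB geometry (again just a sketch):
#   TLB_SET_BITS     = log2(64) = 6,  TLB_WAY_BITS = log2(2) = 1
#   TLB_EA_TAG_BITS  = 64 - (12 + 6)  = 46
#   TLB_TAG_WAY_BITS = 2 * 46 = 92,   TLB_PTE_WAY_BITS = 2 * 64 = 128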

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, "LINE_SIZE not power of 2"
assert (NUM_LINES & (NUM_LINES - 1)) == 0, "NUM_LINES not power of 2"
assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, \
    "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
    "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBValidBitsArray():
    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_NUM_WAYS))

def TLBTagsArray():
    return Array(Signal(TLB_TAG_WAY_BITS) for x in range(TLB_SET_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(NUM_WAYS) for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
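
# e.g. with ROW_LINE_BITS == 3, next_row(0b00010111) == 0b00010000:
# the low 3 bits (row-within-line) wrap from 7 back to 0 while the
# upper bits, which select the line, pass through unchanged.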

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
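
# e.g. read_tag(1, tagset) picks tagset[48:96] (way 1 of the packed tag
# RAM row, TAG_WIDTH == 48) and keeps its low TAG_BITS == 45 bits; the
# remaining 3 bits per way are padding that rounds each tag up to a byte.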

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

# Write a PTE to a TLB PTE memory row
def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
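
# A minimal plain-Python mirror of the address decode above, handy for
# checking the geometry by hand. These helpers are illustrative only
# (they are not used by the hardware description):

def _sim_get_index(addr):
    """cache line index: bits [LINE_OFF_BITS:SET_SIZE_BITS] of addr"""
    return (addr >> LINE_OFF_BITS) & ((1 << INDEX_BITS) - 1)

def _sim_get_row(addr):
    """BRAM row number: bits [ROW_OFF_BITS:SET_SIZE_BITS] of addr"""
    return (addr >> ROW_OFF_BITS) & ((1 << ROW_BITS) - 1)

def _sim_get_tag(addr):
    """tag: bits [SET_SIZE_BITS:REAL_ADDR_BITS] of addr"""
    return (addr >> SET_SIZE_BITS) & ((1 << TAG_BITS) - 1)

# example: address 0x88 is one row into line-index 2, so
# _sim_get_index(0x88) == 2, _sim_get_row(0x88) == 17, _sim_get_tag(0x88) == 0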


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self):
        super().__init__()
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    pa.reference = pte[8]
    # remaining bit positions follow the radix PTE layout used by
    # microwatt dcache.vhdl (reconstructed; missing from the original text)
    pa.changed = pte[7]
    pa.nocache = pte[5]
    pa.priv = pte[3]
    pa.rd_perm = pte[2]
    pa.wr_perm = pte[1]
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
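
# An illustrative timing sketch of the hit pipeline described above
# (assumption: one BRAM output-buffer stage, as stated):
#
#   cycle:    0                1                2
#   stage 0:  latch request    (next request)
#   stage 1:                   BRAM/tag read
#   output:                                    data + valid (load hit)
#
# A complex op stalls in stage 0 while a previous hit completes in
# stage 1, which is why the two stages can overlap.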


# Stage 0 register, basically contains just the latched request
class RegStage0(RecordObject):
    def __init__(self):
        super().__init__()
        self.req     = LoadStore1ToDCacheType()
        self.tlbie   = Signal()
        self.doall   = Signal()
        self.tlbld   = Signal()
        self.mmu_req = Signal() # indicates source of request


class MemAccessRequest(RecordObject):
    def __init__(self):
        super().__init__()
        self.op        = Signal(Op)
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)
        self.hit_way   = Signal(WAY_BITS)
        self.same_tag  = Signal()
        self.mmu_req   = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full             = Signal() # have uncompleted request
        self.mmu_req          = Signal() # request is from MMU
        self.req              = MemAccessRequest()

        # Cache hit state
        self.hit_way          = Signal(WAY_BITS)
        self.hit_load_valid   = Signal()
        self.hit_index        = Signal(INDEX_BITS) # index of the hit line
        self.cache_hit        = Signal()

        # TLB hit state
        self.tlb_hit          = Signal()
        self.tlb_hit_way      = Signal(TLB_WAY_BITS) # way number, not a mask
        self.tlb_hit_index    = Signal(TLB_SET_BITS) # TLB set index

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1    = Signal(64)
        self.forward_data2    = Signal(64)
        self.forward_sel1     = Signal(8)
        self.forward_valid1   = Signal()
        self.forward_way1     = Signal(WAY_BITS)
        self.forward_row1     = Signal(ROW_BITS)
        self.use_forward1     = Signal()
        self.forward_sel      = Signal(8)

        # Cache miss state (reload state machine)
        self.state            = Signal(State)
        self.dcbz             = Signal()
        self.write_bram       = Signal()
        self.write_tag        = Signal()
        self.slow_valid       = Signal()
        self.wb               = WBMasterOut()
        self.reload_tag       = Signal(TAG_BITS)
        self.store_way        = Signal(WAY_BITS)
        self.store_row        = Signal(ROW_BITS)
        self.store_index      = Signal(INDEX_BITS)
        self.end_row_ix       = Signal(ROW_LINE_BITS) # last row within line
        self.rows_valid       = RowPerLineValidArray()
        self.acks_pending     = Signal(3)
        self.inc_acks         = Signal()
        self.dec_acks         = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid         = Signal()
        self.ls_error         = Signal()
        self.mmu_done         = Signal()
        self.mmu_error        = Signal()
        self.cache_paradox    = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail        = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr  = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self, dtlb_valid_bits, dtlb_ptes):
        self.tlbie         = Signal()
        self.tlbwe         = Signal()
        self.doall         = Signal()
        self.tlb_hit       = Signal()
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.dtlb_valid_bits = dtlb_valid_bits
        self.dtlb_ptes       = dtlb_ptes

        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way    = Signal(TLB_WAY_BITS)
        self.eatag       = Signal(TLB_EA_TAG_BITS)
        self.pte_data    = Signal(TLB_PTE_BITS)

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        vb = Signal(TLB_NUM_WAYS)
        db = Signal(TLB_PTE_WAY_BITS)

        sync += vb.eq(self.dtlb_valid_bits[self.tlb_req_index])
        sync += db.eq(self.dtlb_ptes[self.tlb_req_index])

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += self.dtlb_valid_bits[i].eq(0)

        with m.Elif(self.tlbie):
            with m.If(self.tlb_hit):
                sync += vb.bit_select(self.tlb_hit_way, 1).eq(Const(0, 1))

        with m.Elif(self.tlbwe):
            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            # XXX db is overwritten by the pteset write below; the tag
            # write-back path looks incomplete (work-in-progress)
            sync += db.eq(tagset)

            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            sync += db.eq(pteset)

            sync += vb.bit_select(self.repl_way, 1).eq(1)

        return m
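
# Sketch of standalone use of DTLBUpdate (hypothetical; it mirrors the
# wiring actually done in DCache.tlb_update() below):
#
#     dtlb_valid = TLBValidBitsArray()
#     dtlb_ptes  = TLBPtesArray()
#     m.submodules.upd = upd = DTLBUpdate(dtlb_valid, dtlb_ptes)
#     comb += upd.tlbwe.eq(tlbwe)            # write-enable for a TLB load
#     comb += upd.repl_way.eq(victim_way)    # way chosen by the TLB PLRU
#     comb += upd.eatag.eq(ea_tag)           # effective-address tag
#     comb += upd.pte_data.eq(pte)           # PTE to store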


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in      = LoadStore1ToDCacheType()
        self.d_out     = DCacheToLoadStore1Type()

        self.m_in      = MMUToDCacheType()
        self.m_out     = DCacheToMMUType()

        self.stall_out = Signal()

        self.wb_out    = WBMasterOut()
        self.wb_in     = WBSlaveOut()

        self.log_out   = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0()

        # TODO, this goes in unit tests and formal proofs
        with m.If(~(d_in.valid & m_in.valid)):
            #sync += Display("request collision loadstore vs MMU")
            pass

        with m.If(m_in.valid):
            sync += r.req.valid.eq(1)
            sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            sync += r.req.dcbz.eq(0)
            sync += r.req.nc.eq(0)
            sync += r.req.reserve.eq(0)
            sync += r.req.virt_mode.eq(1)
            sync += r.req.priv_mode.eq(1)
            sync += r.req.addr.eq(m_in.addr)
            sync += r.req.data.eq(m_in.pte)
            sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            sync += r.tlbie.eq(m_in.tlbie)
            sync += r.doall.eq(m_in.doall)
            sync += r.tlbld.eq(m_in.tlbld)
            sync += r.mmu_req.eq(1)
        with m.Else():
            sync += r.req.eq(d_in)
            sync += r.tlbie.eq(0)
            sync += r.doall.eq(0)
            sync += r.tlbld.eq(0)
            sync += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)

    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB read.
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index    = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, acc, acc_en, lru):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        with m.If(TLB_NUM_WAYS > 1):
            for i in range(TLB_SET_SIZE):
                # TLB PLRU interface
                tlb_plru        = PLRU(TLB_WAY_BITS)
                setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
                tlb_plru_acc    = Signal(TLB_WAY_BITS)
                tlb_plru_acc_en = Signal()
                tlb_plru_out    = Signal(TLB_WAY_BITS)

                comb += tlb_plru.acc.eq(tlb_plru_acc)
                comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
                comb += tlb_plru.lru.eq(tlb_plru_out)

                # PLRU interface
                with m.If(r1.tlb_hit_index == i):
                    comb += tlb_plru.acc_en.eq(r1.tlb_hit)
                with m.Else():
                    comb += tlb_plru.acc_en.eq(0)
                comb += tlb_plru.acc.eq(r1.tlb_hit_way)

                comb += tlb_plru_victim[i].eq(tlb_plru.lru)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit    = Signal()
        eatag  = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal()
            # note: the comparison must be parenthesised, == binds
            # looser than & in python
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.eq(extract_perm_attr(pte))
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        m.submodules.tlb_update = d = DTLBUpdate(dtlb_valid_bits, dtlb_ptes)
        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_hit_way.eq(tlb_hit_way)
        comb += d.tlb_tag_way.eq(tlb_tag_way)
        comb += d.tlb_pte_way.eq(tlb_pte_way)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit):
            comb += d.repl_way.eq(tlb_hit_way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        for i in range(NUM_LINES):
            # PLRU interface
            plru        = PLRU(WAY_BITS) # tracks the NUM_WAYS cache ways
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc    = Signal(WAY_BITS)
            plru_acc_en = Signal()
            plru_out    = Signal(WAY_BITS)

            comb += plru.acc.eq(plru_acc)
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru_out.eq(plru.lru_o)

            with m.If(r1.hit_index == i):
                comb += plru_acc_en.eq(r1.cache_hit)

            comb += plru_acc.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru_out)

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set,
                       cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valid_bits, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit      = Signal()
        hit_way     = Signal(WAY_BITS)
        op          = Signal(Op)
        opsel       = Signal(3)
        go          = Signal()
        nc          = Signal()
        rel_match   = Signal()
        hit_set     = Array(Signal() for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()
        rel_matches = Array(Signal() for i in range(TLB_NUM_WAYS))

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(r0.req.virt_mode):
            for j in range(TLB_NUM_WAYS):
                s_tag = Signal(TAG_BITS)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra  = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS):
                    is_tag_hit = Signal()
                    comb += is_tag_hit.eq(go & cache_valid_bits[req_index][i]
                                          & (read_tag(i, cache_tag_set) ==
                                             s_tag)
                                          & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == r1.reload_tag):
                    comb += rel_matches[j].eq(1)
            comb += is_hit.eq(hit_set[tlb_hit_way])
            comb += hit_way.eq(hit_way_set[tlb_hit_way])
            comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(r0.req.addr))
            for i in range(NUM_WAYS):
                is_tag_hit = Signal()
                comb += is_tag_hit.eq(go & cache_valid_bits[req_index][i] &
                                      (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == r1.reload_tag):
                comb += rel_match.eq(1)
        comb += req_same_tag.eq(rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & rel_match):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            valid = r1.rows_valid[req_row % ROW_PER_LINE]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        comb += use_forward1_next.eq(0)
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        comb += use_forward2_next.eq(0)
        with m.If((r1.forward_row1 == req_row) &
                  (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
                           & (perm_attr.wr_perm
                              | (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                # opsel is Cat(is_hit, nc, load): read the case
                # values below as 0b<load><nc><hit>
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101):
                        comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100):
                        comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110):
                        comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001):
                        comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011):
                        comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111):
                        comb += op.eq(Op.OP_BAD)
                    with m.Default():
                        comb += op.eq(Op.OP_NONE)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & r0.req.reserve):

            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                with m.If(~reservation.valid |
                          (r0.req.addr[LINE_OFF_BITS:64] !=
                           reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out[r1.hit_way])

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1,\
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                #Display(f"completing load hit data={data_out}")
                pass

            # error cases complete without stalling
            with m.If(r1.ls_error):
                # Display("completing ld/st with error")
                pass

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                #Display(f"completing store or load miss data={data_out}")
                pass

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                # Display(f"completing load hit to MMU, data={m_out.data}")
                pass

            # error cases complete without stalling
            with m.If(r1.mmu_error):
                #Display("completing MMU ld with error")
                pass

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                #Display("completing MMU load miss, data={m_out.data}")
                pass

    def rams(self, m, r1, early_req_row, cache_out, replace_way):
        """RAM

        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        wb_in = self.wb_in

        for i in range(NUM_WAYS):
            do_read  = Signal()
            rd_addr  = Signal(ROW_BITS)
            do_write = Signal()
            wr_addr  = Signal(ROW_BITS)
            wr_data  = Signal(WB_DATA_BITS)
            wr_sel   = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out   = Signal(WB_DATA_BITS)

            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            comb += cache_out[i].eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wb_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
            #        f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
            pass

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #         f"rc_ok={rc_ok} perm_ok={perm_ok}"
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valid_bits, r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tag, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        wb_in = self.wb_in

        req         = MemAccessRequest()
        acks        = Signal(3)
        adjust_acks = Signal(3)
        stbs_done   = Signal()

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tag[r1.store_index])
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tag[r1.store_index].eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                # XXX check 'left downto. probably means len(r1.wb.adr)
                #     r1.wb.adr <= req.real_addr(
                #                   r1.wb.adr'left downto 0
                #                  );
                sync += r1.wb.adr.eq(req.real_addr)
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(get_index(req.real_addr))
                sync += r1.store_row.eq(get_row(req.real_addr))
                sync += r1.end_row_ix.eq(
                         get_row_of_line(get_row(req.real_addr))
                        )
                sync += r1.reload_tag.eq(get_tag(req.real_addr))
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        #Display(f"cache miss real addr:" \
                        #        f"{req_real_addr}" \
                        #        f" idx:{get_index(req_real_addr)}" \
                        #        f" tag:{get_tag(req.real_addr)}")

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                # Requests are all sent if stb is 0
                comb += stbs_done.eq(~r1.wb.stb)

                with m.If(~wb_in.stall & ~stbs_done):
                    # That was the last word?
                    # We are done sending.
                    # Clear stb and set stbs_done
                    # so we can handle an eventual
                    # last ack on the same cycle.
                    with m.If(is_last_row_addr(
                              r1.wb.adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(0)

                    # Calculate the next row address in the current cache line
                    rarange = r1.wb.adr[ROW_OFF_BITS : LINE_OFF_BITS]
                    sync += rarange.eq(rarange + 1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    # XXX needs an Array bit-accessor here
                    sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz &
                                (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row ==
                               get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(stbs_done & is_last_row(r1.store_row,
                                                      r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid: set the valid bit for
                        # this way and write it back (write-back added,
                        # the original staged cv but never stored it)
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_valid_bits[r1.store_index])
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_valid_bits[r1.store_index].eq(cv)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                comb += stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        ra = req.real_addr[0:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.Elif((adjust_acks < 7) & req.same_tag &
                                ((req.op == Op.OP_STORE_MISS)
                                 | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):

        sync = m.d.sync
        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out

        # XXX req_op is not passed in yet: this function is currently
        # unused (the call in elaborate is commented out).  The final
        # Cat element is reconstructed to fill log_out's 20 bits.
        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                               r1.wb.adr[3:6]))

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)
        cache_valid_bits = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags       = TLBTagsArray()
        dtlb_ptes       = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0      = RegStage0()
        r0_full = Signal()

        r1 = RegStage1()

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv     = Signal()
        clear_rsrv   = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out = CacheRamOut()

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals

        # TLB signals
        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = Signal()
        tlb_hit_way   = Signal(TLB_WAY_BITS)
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr()
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valid_bits, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_pte_way,
                            tlb_hit, tlb_hit_way, tlb_valid_way,
                            cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valid_bits, r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m


# Original VHDL testbench (dcache_tb) from the microwatt port, kept
# here for reference; elided sections are marked with "...":
#
# entity dcache_tb is
# end dcache_tb;
#
# architecture behave of dcache_tb is
#     signal clk          : std_ulogic;
#     signal rst          : std_ulogic;
#
#     signal d_in         : Loadstore1ToDcacheType;
#     signal d_out        : DcacheToLoadstore1Type;
#
#     signal m_in         : MmuToDcacheType;
#     signal m_out        : DcacheToMmuType;
#
#     signal wb_bram_in   : wishbone_master_out;
#     signal wb_bram_out  : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     dcache0: entity work.dcache
#         ...
#         wishbone_out => wb_bram_in,
#         wishbone_in  => wb_bram_out
#         ...
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE   => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#         ...
#         wishbone_in  => wb_bram_in,
#         wishbone_out => wb_bram_out
#         ...
#
#     clk_process: process
#         ...
#         wait for clk_period/2;
#         ...
#         wait for clk_period/2;
#         ...
#
#     rst_process: process
#         ...
#         wait for 2*clk_period;
#         ...
#
#     stim: process
#         ...
#         d_in.valid <= '0';
#         ...
#         d_in.addr <= (others => '0');
#         d_in.data <= (others => '0');
#         m_in.valid <= '0';
#         m_in.addr <= (others => '0');
#         m_in.pte <= (others => '0');
#
#         wait for 4*clk_period;
#         wait until rising_edge(clk);
#
#         -- Cacheable read of address 4
#         ...
#         d_in.addr <= x"0000000000000004";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000100000000"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000100000000"
#         ...
#         -- wait for clk_period;
#
#         -- Cacheable read of address 30
#         ...
#         d_in.addr <= x"0000000000000030";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000D0000000C"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000D0000000C"
#         ...
#
#         -- Non-cacheable read of address 100
#         ...
#         d_in.addr <= x"0000000000000100";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000004100000040"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000004100000040"
#         ...
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         ...


def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield
    # wait_until rising_edge(clk)
    yield

    # Cacheable read of address 4
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
    yield dut.d_in.valid.eq(1)
    # wait-until rising_edge(clk)
    yield
    yield dut.d_in.valid.eq(0)

    while not (yield dut.d_out.valid):
        yield
    # note: the output must be sampled with yield, not read directly
    data = yield dut.d_out.data
    assert data == 0x0000000100000000, \
        f"data @ {dut.d_in.addr}={data:x} expected 0000000100000000"

    # Cacheable read of address 30
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)

    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000000D0000000C, \
        f"data @ {dut.d_in.addr}={data:x} expected 0000000D0000000C"

    # Non-cacheable read of address 100
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(1)
    yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)

    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000004100000040, \
        f"data @ {dut.d_in.addr}={data:x} expected 0000004100000040"


def test_dcache():
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    #run_simulation(dut, dcache_sim(), vcd_name='test_dcache.vcd')


if __name__ == '__main__':
    test_dcache()