3 based on Anton Blanchard microwatt dcache.vhdl
7 from enum
import Enum
, unique
9 from nmigen
import Module
, Signal
, Elaboratable
,
11 from nmigen
.cli
import main
12 from nmigen
.iocontrol
import RecordObject
13 from nmigen
.util
import log2_int
15 from experiment
.mem_types
import LoadStore1ToDCacheType
,
16 DCacheToLoadStore1Type
,
20 from experiment
.wb_types
import WB_ADDR_BITS
, WB_DATA_BITS
, WB_SEL_BITS
,
21 WBAddrType
, WBDataType
, WBSelType
,
22 WbMasterOut
, WBSlaveOut
,
23 WBMasterOutVector
, WBSlaveOutVector
,
24 WBIOMasterOut
, WBIOSlaveOut
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission/attribute bits extracted from a radix PTE.

    One bit per flag; populated combinatorially by extract_perm_attr()
    (virtual mode) or forced to all-permissive defaults (real mode).
    """
    def __init__(self):
        super().__init__()
        self.reference = Signal()  # R bit: page has been referenced
        self.changed   = Signal()  # C bit: page has been modified
        self.nocache   = Signal()  # I bit: mapping is non-cacheable
        # NOTE(review): `perm_attr.priv` is driven elsewhere in this file
        # (tlb_search real-mode defaults), so the privileged-access bit
        # appears to belong here; its declaration was lost in the mangled
        # source.  TODO confirm against the original.
        self.priv      = Signal()  # page requires privileged access
        self.rd_perm   = Signal()  # read permitted
        self.wr_perm   = Signal()  # write permitted
39 def extract_perm_attr(pte
):
# Type of operation on a "valid" input
@unique
class Op(Enum):
    """Decoded operation for a latched (valid) dcache request."""
    OP_NONE       = 0  # no operation this cycle
    OP_BAD        = 1  # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2  # conditional store w/o reservation
    OP_LOAD_HIT   = 3  # Cache hit on load
    OP_LOAD_MISS  = 4  # Load missing cache
    OP_LOAD_NC    = 5  # Non-cachable load
    OP_STORE_HIT  = 6  # Store hitting cache
    OP_STORE_MISS = 7  # Store missing cache
# State of the slow-path (stage 1) state machine; held in r1.state.
@unique
class State(Enum):
    IDLE             = 0  # Normal load hit processing
    RELOAD_WAIT_ACK  = 1  # Cache reload wait ack
    STORE_WAIT_ACK   = 2  # Store wait ack
    NC_LOAD_WAIT_ACK = 3  # Non-cachable load wait ack
74 # In order to make timing, we use the BRAMs with
75 # an output buffer, which means that the BRAM
76 # output is delayed by an extra cycle.
78 # Thus, the dcache has a 2-stage internal pipeline
79 # for cache hits with no stalls.
81 # All other operations are handled via stalling
84 # The second stage can thus complete a hit at the same
85 # time as the first stage emits a stall for a complex op.
# Stage 0 register, basically contains just the latched request
class RegStage0(RecordObject):
    """Latched incoming request (from loadstore1 or the MMU)."""
    def __init__(self):
        super().__init__()
        self.req     = LoadStore1ToDCacheType()
        # NOTE(review): stage_0 also assigns r.tlbie / r.doall / r.tlbld
        # directly on this record; those field declarations were lost in
        # the mangled source.  TODO confirm whether they live here or on
        # the inner request.
        self.mmu_req = Signal()  # indicates source of request (1 = MMU)
class MemAccessRequest(RecordObject):
    """Request captured from stage 0 for the slow-path state machine.

    Holds everything dcache_slow needs to replay the access once the
    wishbone transaction(s) complete.
    """
    def __init__(self):
        super().__init__()
        # NOTE(review): dcache_slow drives `req.op` and branches on
        # `req.dcbz`; those two fields were lost in the mangled source
        # and are restored here.  TODO confirm against the original.
        self.op        = Signal(Op)  # decoded operation (Op enum)
        self.valid     = Signal()
        self.dcbz      = Signal()    # request is a dcbz (zero a line)
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)
        self.hit_way   = Signal(WAY_BITS)
        self.same_tag  = Signal()    # matches line currently being reloaded
        self.mmu_req   = Signal()    # request came from the MMU
# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full    = Signal()  # have uncompleted request
        self.mmu_req = Signal()  # request is from MMU
        self.req     = MemAccessRequest()

        # Cache hit state
        self.hit_way        = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index      = Signal(NUM_LINES)
        self.cache_hit      = Signal()

        # TLB hit state
        self.tlb_hit       = Signal()
        self.tlb_hit_way   = Signal(TLB_NUM_WAYS)
        self.tlb_hit_index = Signal(TLB_SET_SIZE)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1  = Signal(64)
        self.forward_data2  = Signal(64)
        self.forward_sel1   = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1   = Signal(WAY_BITS)
        self.forward_row1   = Signal(BRAM_ROWS)
        self.use_forward1   = Signal()
        self.forward_sel    = Signal(8)

        # Cache miss state (reload state machine)
        # NOTE(review): `r1.state` and `r1.dcbz` are read/written by
        # dcache_slow but their declarations were lost in the mangled
        # source -- restored here.  TODO confirm against the original.
        self.state        = Signal(State)
        self.dcbz         = Signal()
        self.write_bram   = Signal()
        self.write_tag    = Signal()
        self.slow_valid   = Signal()
        # was `WishboneMasterOut()`: that name is imported nowhere in
        # this file; the rest of the code uses WBMasterOut (see
        # DCache.wb_out), so use it here too.
        self.wb           = WBMasterOut()
        self.reload_tag   = Signal(TAG_BITS)
        self.store_way    = Signal(WAY_BITS)
        self.store_row    = Signal(BRAM_ROWS)
        self.store_index  = Signal(NUM_LINES)
        # was `ROW_LINE_BIT` (undefined); elaborate() computes
        # ROW_LINE_BITS, so that spelling is assumed here.
        self.end_row_ix   = Signal(ROW_LINE_BITS)
        self.rows_valid   = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks     = Signal()
        self.dec_acks     = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid      = Signal()
        self.ls_error      = Signal()
        self.mmu_done      = Signal()
        self.mmu_error     = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()
# Reservation information
class Reservation(RecordObject):
    """larx/stcx reservation: valid flag plus the reserved line address."""
    def __init__(self):
        super().__init__()
        # NOTE(review): reservation_reg reads/writes `reservation.valid`;
        # its declaration was lost in the mangled source.  TODO confirm.
        self.valid = Signal()
        # was the VHDL leftover `Signal(63 downto LINE_OFF_BITS)`:
        # i.e. the address with the line-offset bits stripped.
        # TODO LINE_OFF_BITS is 6 (and is currently only computed inside
        # DCache.elaborate -- needs to be module-level for this to run)
        self.addr  = Signal(64 - LINE_OFF_BITS)
180 # Set associative dcache write-through
182 # TODO (in no specific order):
184 # * See list in icache.vhdl
185 # * Complete load misses on the cycle when WB data comes instead of
186 # at the end of line (this requires dealing with requests coming in
188 class DCache(Elaboratable
):
190 # TODO: make these parameters of DCache at some point
191 self
.LINE_SIZE
= 64 # Line size in bytes
192 self
.NUM_LINES
= 32 # Number of lines in a set
193 self
.NUM_WAYS
= 4 # Number of ways
194 self
.TLB_SET_SIZE
= 64 # L1 DTLB entries per set
195 self
.TLB_NUM_WAYS
= 2 # L1 DTLB number of sets
196 self
.TLB_LG_PGSZ
= 12 # L1 DTLB log_2(page_size)
197 self
.LOG_LENGTH
= 0 # Non-zero to enable log data collection
199 self
.d_in
= LoadStore1ToDCacheType()
200 self
.d_out
= DCacheToLoadStore1Type()
202 self
.m_in
= MMUToDCacheType()
203 self
.m_out
= DCacheToMMUType()
205 self
.stall_out
= Signal()
207 self
.wb_out
= WBMasterOut()
208 self
.wb_in
= WBSlaveOut()
210 self
.log_out
= Signal(20)
212 # Latch the request in r0.req as long as we're not stalling
213 def stage_0(self
, m
, d_in
, m_in
):
219 # TODO, this goes in unit tests and formal proofs
220 # assert ~(d_in.valid & m_in.valid),
221 # "request collision loadstore vs MMU"
222 with m
.If(~
(d_in
.valid
& m_in
.valid
)):
223 #sync += Display("request collision loadstore vs MMU")
226 with m
.If(m_in
.valid
):
227 sync
+= r
.req
.valid
.eq(1)
228 sync
+= r
.req
.load
.eq(~
(m_in
.tlbie | m_in
.tlbld
))
229 sync
+= r
.req
.dcbz
.eq(0)
230 sync
+= r
.req
.nc
.eq(0)
231 sync
+= r
.req
.reserve
.eq(0)
232 sync
+= r
.req
.virt_mode
.eq(1)
233 sync
+= r
.req
.priv_mode
.eq(1)
234 sync
+= r
.req
.addr
.eq(m_in
.addr
)
235 sync
+= r
.req
.data
.eq(m_in
.pte
)
236 sync
+= r
.req
.byte_sel
.eq(-1) # Const -1 sets all to 0b111....
237 sync
+= r
.tlbie
.eq(m_in
.tlbie
)
238 sync
+= r
.doall
.eq(m_in
.doall
)
239 sync
+= r
.tlbld
.eq(m_in
.tlbld
)
240 sync
+= r
.mmu_req
.eq(1)
242 sync
+= r
.req
.eq(d_in
)
243 sync
+= r
.req
.tlbie
.eq(0)
244 sync
+= r
.req
.doall
.eq(0)
245 sync
+= r
.req
.tlbd
.eq(0)
246 sync
+= r
.req
.mmu_req
.eq(0)
247 with m
.If(~
(r1
.full
& r0_full
)):
249 sync
+= r0_full
.eq(r
.req
.valid
)
252 # Operates in the second cycle on the request latched in r0.req.
253 # TLB updates write the entry at the end of the second cycle.
254 def tlb_read(self
, m
, m_in
, d_in
, r0_stall
, tlb_valid_way
,
255 tlb_tag_way
, tlb_pte_way
, dtlb_valid_bits
,
256 dtlb_tags
, dtlb_ptes
):
261 index
= Signal(log2_int(TLB_SET_BITS
), False)
262 addrbits
= Signal(TLB_SET_BITS
)
265 amax
= TLB_LG_PGSZ
+ TLB_SET_BITS
267 with m
.If(m_in
.valid
):
268 comb
+= addrbits
.eq(m_in
.addr
[amin
: amax
])
270 comb
+= addrbits
.eq(d_in
.addr
[amin
: amax
])
271 comb
+= index
.eq(addrbits
)
273 # If we have any op and the previous op isn't finished,
274 # then keep the same output for next cycle.
275 with m
.If(~r0_stall
):
276 sync
+= tlb_valid_way
.eq(dtlb_valid_bits
[index
])
277 sync
+= tlb_tag_way
.eq(dtlb_tags
[index
])
278 sync
+= tlb_pte_way
.eq(dtlb_ptes
[index
])
281 def maybe_tlb_plrus(self
, m
, r1
, tlb_plru_victim
, acc
, acc_en
, lru
):
285 with m
.If(TLB_NUM_WAYS
> 1):
286 for i
in range(TLB_SET_SIZE
):
288 tlb_plru
= PLRU(TLB_WAY_BITS
)
289 tlb_plru_acc
= Signal(TLB_WAY_BITS
)
290 tlb_plru_acc_en
= Signal()
291 tlb_plru_out
= Signal(TLB_WAY_BITS
)
293 comb
+= tlb_plru
.acc
.eq(tlb_plru_acc
)
294 comb
+= tlb_plru
.acc_en
.eq(tlb_plru_acc_en
)
295 comb
+= tlb_plru
.lru
.eq(tlb_plru_out
)
298 with m
.If(r1
.tlb_hit_index
== i
):
299 comb
+= tlb_plru
.acc_en
.eq(r1
.tlb_hit
)
301 comb
+= tlb_plru
.acc_en
.eq(0)
302 comb
+= tlb_plru
.acc
.eq(r1
.tlb_hit_way
)
304 comb
+= tlb_plru_victim
[i
].eq(tlb_plru
.lru
)
306 def tlb_search(self
, tlb_req_index
, r0
, tlb_valid_way_ tlb_tag_way
,
307 tlb_pte_way
, pte
, tlb_hit
, valid_ra
, perm_attr
, ra
):
312 hitway
= Signal(TLB_WAY_BITS
)
314 eatag
= Signal(log2_int(TLB_EA_TAG_BITS
, False))
316 TLB_LG_END
= TLB_LG_PGSZ
+ TLB_SET_BITS
317 comb
+= tlb_req_index
.eq(r0
.req
.addr
[TLB_LG_PGSZ
: TLB_LG_END
])
318 comb
+= eatag
.eq(r0
.req
.addr
[TLB_LG_END
: 64 ])
320 for i
in range(TLB_NUM_WAYS
):
321 with m
.If(tlb_valid_way(i
)
322 & read_tlb_tag(i
, tlb_tag_way
) == eatag
):
326 comb
+= tlb_hit
.eq(hit
& r0_valid
)
327 comb
+= tlb_hit_way
.eq(hitway
)
330 comb
+= pte
.eq(read_tlb_pte(hitway
, tlb_pte_way
))
333 comb
+= valid_ra
.eq(tlb_hit | ~r0
.req
.virt_mode
)
334 with m
.If(r0
.req
.virt_mode
):
335 comb
+= ra
.eq(Cat(Const(0, ROW_OFF_BITS
),
336 r0
.req
.addr
[ROW_OFF_BITS
:TLB_LG_PGSZ
],
337 pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]))
338 comb
+= perm_attr
.eq(extract_perm_attr(pte
))
340 comb
+= ra
.eq(Cat(Const(0, ROW_OFF_BITS
),
341 r0
.rq
.addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
]))
343 comb
+= perm_attr
.reference
.eq(1)
344 comb
+= perm_attr
.changed
.eq(1)
345 comb
+= perm_attr
.priv
.eq(1)
346 comb
+= perm_attr
.nocache
.eq(0)
347 comb
+= perm_attr
.rd_perm
.eq(1)
348 comb
+= perm_attr
.wr_perm
.eq(1)
350 def tlb_update(self
, r0_valid
, r0
, dtlb_valid_bits
, tlb_req_index
,
351 tlb_hit_way
, tlb_hit
, tlb_plru_victim
, tlb_tag_way
,
352 dtlb_tags
, tlb_pte_way
, dtlb_ptes
, dtlb_valid_bits
):
357 # variable tlbie : std_ulogic;
358 # variable tlbwe : std_ulogic;
359 # variable repl_way : tlb_way_t;
360 # variable eatag : tlb_tag_t;
361 # variable tagset : tlb_way_tags_t;
362 # variable pteset : tlb_way_ptes_t;
363 #type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t;
364 # --> Array([Signal(log(way_tags length)) for i in range(number of tlbs)])
368 repl_way
= Signal(TLB_WAY_BITS
)
369 eatag
= Signal(log2_int(TLB_EA_TAG_BITS
, False))
370 tagset
= TLBWayTags()
371 pteset
= TLBWayPtes()
373 comb
+= tlbie
.eq(r0_valid
& r0
.tlbie
)
374 comb
+= tlbwe
.eq(r0_valid
& r0
.tlbldoi
)
376 with m
.If(tlbie
& r0
.doall
):
377 # clear all valid bits at once
378 for i
in range(TLB_SET_SIZE
):
379 sync
+= dtlb_valid_bits
[i
].eq(0)
383 sync
+= dtlb_valid_bits
[tlb_req_index
][tlb_hit_way
].eq(0)
386 comb
+= repl_way
.eq(tlb_hit_way
)
388 comb
+= repl_way
.eq(tlb_plru_victim
[tlb_req_index
])
389 comb
+= eatag
.eq(r0
.req
.addr
[TLB_LG_PGSZ
+ TLB_SET_BITS
:64])
390 comb
+= tagset
.eq(tlb_tag_way
)
391 sync
+= write_tlb_tag(repl_way
, tagset
, eatag
)
392 sync
+= dtlb_tags
[tlb_req_index
].eq(tagset
)
393 comb
+= pteset
.eq(tlb_pte_way
)
394 sync
+= write_tlb_pte(repl_way
, pteset
, r0
.req
.data
)
395 sync
+= dtlb_ptes
[tlb_req_index
].eq(pteset
)
396 sync
+= dtlb_valid_bits
[tlb_req_index
][repl_way
].eq(1)
399 def maybe_plrus(self
, r1
):
404 for i
in range(NUM_LINES
):
406 plru
= PLRU(TLB_WAY_BITS
)
407 setattr(m
.submodules
, "plru%d" % i
, plru
)
408 plru_acc
= Signal(TLB_WAY_BITS
)
409 plru_acc_en
= Signal()
410 plru_out
= Signal(TLB_WAY_BITS
)
412 comb
+= plru
.acc
.eq(plru_acc
)
413 comb
+= plru
.acc_en
.eq(plru_acc_en
)
414 comb
+= plru
.lru
.eq(plru_out
)
416 with m
.If(r1
.hit_index
== i
):
417 comb
+= plru_acc_en
.eq(r1
.cache_hit
)
419 comb
+= plru_acc
.eq(r1
.hit_way
)
420 comb
+= plru_victim
[i
].eq(plru_out
)
422 # Cache tag RAM read port
423 def cache_tag_read(self
, r0_stall
, req_index
, m_in
, d_in
,
424 cache_tag_set
, cache_tags
):
429 index
= Signal(INDEX_BITS
)
432 comb
+= index
.eq(req_index
)
433 with m
.Elif(m_in
.valid
):
434 comb
+= index
.eq(get_index(m_in
.addr
))
436 comb
+= index
.eq(get_index(d_in
.addr
))
437 sync
+= cache_tag_set
.eq(cache_tags
[index
])
439 # Cache request parsing and hit detection
440 def dcache_request(self
, r0
, ra
, req_index
, req_row
, req_tag
,
441 r0_valid
, r1
, cache_valid_bits
, replace_way
,
442 use_forward1_next
, use_forward2_next
,
443 req_hit_way
, plru_victim
, rc_ok
, perm_attr
,
444 valid_ra
, perm_ok
, access_ok
, req_op
, req_ok
,
445 r0_stall
, m_in
, early_req_row
, d_in
):
451 hit_way
= Signal(WAY_BITS
)
457 s_tag
= Signal(TAG_BITS
)
458 s_pte
= Signal(TLB_PTE_BITS
)
459 s_ra
= Signal(REAL_ADDR_BITS
)
460 hit_set
= Signal(TLB_NUM_WAYS
)
461 hit_way_set
= HitWaySet()
462 rel_matches
= Signal(TLB_NUM_WAYS
)
465 # Extract line, row and tag from request
466 comb
+= req_index
.eq(get_index(r0
.req
.addr
))
467 comb
+= req_row
.eq(get_row(r0
.req
.addr
))
468 comb
+= req_tag
.eq(get_tag(ra
))
470 comb
+= go
.eq(r0_valid
& ~
(r0
.tlbie | r0
.tlbld
) & ~r1
.ls_error
)
472 # Test if pending request is a hit on any way
473 # In order to make timing in virtual mode,
474 # when we are using the TLB, we compare each
475 # way with each of the real addresses from each way of
476 # the TLB, and then decide later which match to use.
478 with m
.If(r0
.req
.virt_mode
):
479 comb
+= rel_matches
.eq(0)
480 for j
in range(TLB_NUM_WAYS
):
481 comb
+= s_pte
.eq(read_tlb_pte(j
, tlb_pte_way
))
482 comb
+= s_ra
.eq(Cat(r0
.req
.addr
[0:TLB_LG_PGSZ
],
483 s_pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]))
484 comb
+= s_tag
.eq(get_tag(s_ra
))
486 for i
in range(NUM_WAYS
):
487 with m
.If(go
& cache_valid_bits
[req_index
][i
] &
488 read_tag(i
, cache_tag_set
) == s_tag
490 comb
+= hit_way_set
[j
].eq(i
)
492 comb
+= hit_set
[j
].eq(s_hit
)
493 with m
.If(s_tag
== r1
.reload_tag
):
494 comb
+= rel_matches
[j
].eq(1)
496 comb
+= is_hit
.eq(hit_set
[tlb_hit_way
])
497 comb
+= hit_way
.eq(hit_way_set
[tlb_hit_way
])
498 comb
+= rel_match
.eq(rel_matches
[tlb_hit_way
])
500 comb
+= s_tag
.eq(get_tag(r0
.req
.addr
))
501 for i
in range(NUM_WAYS
):
502 with m
.If(go
& cache_valid_bits
[req_index
][i
] &
503 read_tag(i
, cache_tag_set
) == s_tag
):
504 comb
+= hit_way
.eq(i
)
506 with m
.If(s_tag
== r1
.reload_tag
):
507 comb
+= rel_match
.eq(1)
508 comb
+= req_same_tag
.eq(rel_match
)
510 # See if the request matches the line currently being reloaded
511 with m
.If((r1
.state
== State
.RELOAD_WAIT_ACK
) &
512 (req_index
== r1
.store_index
) & rel_match
):
513 # For a store, consider this a hit even if the row isn't
514 # valid since it will be by the time we perform the store.
515 # For a load, check the appropriate row valid bit.
516 valid
= r1
.rows_valid
[req_row
% ROW_PER_LINE
]
517 comb
+= is_hit
.eq(~r0
.req
.load | valid
)
518 comb
+= hit_way
.eq(replace_way
)
520 # Whether to use forwarded data for a load or not
521 comb
+= use_forward1_next
.eq(0)
522 with m
.If((get_row(r1
.req
.real_addr
) == req_row
)
523 & (r1
.req
.hit_way
== hit_way
))
524 # Only need to consider r1.write_bram here, since if we
525 # are writing refill data here, then we don't have a
526 # cache hit this cycle on the line being refilled.
527 # (There is the possibility that the load following the
528 # load miss that started the refill could be to the old
529 # contents of the victim line, since it is a couple of
530 # cycles after the refill starts before we see the updated
531 # cache tag. In that case we don't use the bypass.)
532 comb
+= use_forward1_next
.eq(r1
.write_bram
)
533 comb
+= use_forward2_next
.eq(0)
534 with m
.If((r1
.forward_row1
== req_row
) & (r1
.forward_way1
== hit_way
)):
535 comb
+= use_forward2_next
.eq(r1
.forward_valid1
)
537 # The way that matched on a hit
538 comb
+= req_hit_way
.eq(hit_way
)
540 # The way to replace on a miss
541 with m
.If(r1
.write_tag
):
542 replace_way
.eq(plru_victim
[r1
.store_index
])
544 comb
+= replace_way
.eq(r1
.store_way
)
546 # work out whether we have permission for this access
547 # NB we don't yet implement AMR, thus no KUAP
548 comb
+= rc_ok
.eq( perm_attr
.reference
549 & (r0
.req
.load | perm_attr
.changed
)
551 comb
+= perm_ok
.eq((r0
.req
.prive_mode | ~perm_attr
.priv
)
553 |
(r0
.req
.load
& perm_attr
.rd_perm
)
555 comb
+= access_ok
.eq(valid_ra
& perm_ok
& rc_ok
)
556 # Combine the request and cache hit status to decide what
557 # operation needs to be done
558 comb
+= nc
.eq(r0
.req
.nc | perm_attr
.nocache
)
559 comb
+= op
.eq(Op
.OP_NONE
)
561 with m
.If(~access_ok
):
562 comb
+= op
.eq(Op
.OP_BAD
)
563 with m
.Elif(cancel_store
):
564 comb
+= op
.eq(Op
.OP_STCX_FAIL
)
566 comb
+= opsel
.eq(Cat(is_hit
, nc
, r0
.req
.load
))
567 with m
.Switch(opsel
):
568 with m
.Case(Const(0b101, 3)):
569 comb
+= op
.eq(Op
.OP_LOAD_HIT
)
570 with m
.Case(Cosnt(0b100, 3)):
571 comb
+= op
.eq(Op
.OP_LOAD_MISS
)
572 with m
.Case(Const(0b110, 3)):
573 comb
+= op
.eq(Op
.OP_LOAD_NC
)
574 with m
.Case(Const(0b001, 3)):
575 comb
+= op
.eq(Op
.OP_STORE_HIT
)
576 with m
.Case(Const(0b000, 3)):
577 comb
+= op
.eq(Op
.OP_STORE_MISS
)
578 with m
.Case(Const(0b010, 3)):
579 comb
+= op
.eq(Op
.OP_STORE_MISS
)
580 with m
.Case(Const(0b011, 3)):
581 comb
+= op
.eq(Op
.OP_BAD
)
582 with m
.Case(Const(0b111, 3)):
583 comb
+= op
.eq(Op
.OP_BAD
)
585 comb
+= op
.eq(Op
.OP_NONE
)
586 comb
+= req_op
.eq(op
)
587 comb
+= req_go
.eq(go
)
589 # Version of the row number that is valid one cycle earlier
590 # in the cases where we need to read the cache data BRAM.
591 # If we're stalling then we need to keep reading the last
593 with m
.If(~r0_stall
):
594 with m
.If(m_in
.valid
):
595 comb
+= early_req_row
.eq(get_row(m_in
.addr
))
597 comb
+= early_req_row
.eq(get_row(d_in
.addr
))
599 comb
+= early_req_row
.eq(req_row
)
601 # Handle load-with-reservation and store-conditional instructions
602 def reservation_comb(self
, cancel_store
, set_rsrv
, clear_rsrv
,
603 r0_valid
, r0
, reservation
):
608 with m
.If(r0_valid
& r0
.req
.reserve
):
610 # XXX generate alignment interrupt if address
611 # is not aligned XXX or if r0.req.nc = '1'
612 with m
.If(r0
.req
.load
):
613 comb
+= set_rsrv(1) # load with reservation
615 comb
+= clear_rsrv
.eq(1) # store conditional
616 with m
.If(~reservation
.valid | r0
.req
.addr
[LINE_OFF_BITS
:64]):
617 comb
+= cancel_store
.eq(1)
619 def reservation_reg(self
, r0_valid
, access_ok
, clear_rsrv
,
625 with m
.If(r0_valid
& access_ok
):
626 with m
.If(clear_rsrv
):
627 sync
+= reservation
.valid
.eq(0)
628 with m
.Elif(set_rsrv
):
629 sync
+= reservation
.valid
.eq(1)
630 sync
+= reservation
.addr
.eq(r0
.req
.addr
[LINE_OFF_BITS
:64])
632 # Return data for loads & completion control logic
633 def writeback_control(self
, r1
, cache_out
, d_out
, m_out
):
638 data_out
= Signal(64)
639 data_fwd
= Signal(64)
642 # Use the bypass if are reading the row that was
643 # written 1 or 2 cycles ago, including for the
644 # slow_valid = 1 case (i.e. completing a load
645 # miss or a non-cacheable load).
646 with m
.If(r1
.use_forward1
):
647 comb
+= data_fwd
.eq(r1
.forward_data1
)
649 comb
+= data_fwd
.eq(r1
.forward_data2
)
651 comb
+= data_out
.eq(cache_out
[r1
.hit_way
])
654 with m
.If(r1
.forward_sel
[i
]):
655 dsel
= data_fwd
.word_select(i
, 8)
656 comb
+= data_out
.word_select(i
, 8).eq(dsel
)
658 comb
+= d_out
.valid
.eq(r1
.ls_valid
)
659 comb
+= d_out
.data
.eq(data_out
)
660 comb
+= d_out
.store_done
.eq(~r1
.stcx_fail
)
661 comb
+= d_out
.error
.eq(r1
.ls_error
)
662 comb
+= d_out
.cache_paradox
.eq(r1
.cache_paradox
)
665 comb
+= m_out
.done
.eq(r1
.mmu_done
)
666 comb
+= m_out
.err
.eq(r1
.mmu_error
)
667 comb
+= m_out
.data
.eq(data_out
)
669 # We have a valid load or store hit or we just completed
670 # a slow op such as a load miss, a NC load or a store
672 # Note: the load hit is delayed by one cycle. However it
673 # can still not collide with r.slow_valid (well unless I
674 # miscalculated) because slow_valid can only be set on a
675 # subsequent request and not on its first cycle (the state
676 # machine must have advanced), which makes slow_valid
677 # at least 2 cycles from the previous hit_load_valid.
679 # Sanity: Only one of these must be set in any given cycle
681 if False: # TODO: need Display to get this to work
682 assert (r1
.slow_valid
& r1
.stcx_fail
) != 1 "unexpected" \
683 "slow_valid collision with stcx_fail -!- severity FAILURE"
685 assert ((r1
.slow_valid | r1
.stcx_fail
) | r1
.hit_load_valid
) != 1
686 "unexpected hit_load_delayed collision with slow_valid -!-" \
689 with m
.If(~r1
._mmu_req
):
690 # Request came from loadstore1...
691 # Load hit case is the standard path
692 with m
.If(r1
.hit_load_valid
):
693 #Display(f"completing load hit data={data_out}")
696 # error cases complete without stalling
697 with m
.If(r1
.ls_error
):
698 # Display("completing ld/st with error")
701 # Slow ops (load miss, NC, stores)
702 with m
.If(r1
.slow_valid
):
703 #Display(f"completing store or load miss data={data_out}")
707 # Request came from MMU
708 with m
.If(r1
.hit_load_valid
):
709 # Display(f"completing load hit to MMU, data={m_out.data}")
711 # error cases complete without stalling
712 with m
.If(r1
.mmu_error
):
713 #Display("combpleting MMU ld with error")
716 # Slow ops (i.e. load miss)
717 with m
.If(r1
.slow_valid
):
718 #Display("completing MMU load miss, data={m_out.data}")
721 # Generate a cache RAM for each way. This handles the normal
722 # reads, writes from reloads and the special store-hit update
725 # Note: the BRAMs have an extra read buffer, meaning the output
726 # is pipelined an extra cycle. This differs from the
727 # icache. The writeback logic needs to take that into
728 # account by using 1-cycle delayed signals for load hits.
730 for i
in range(NUM_WAYS
):
732 rd_addr
= Signal(ROW_BITS
)
734 wr_addr
= Signal(ROW_BITS
)
735 wr_data
= Signal(WB_DATA_BITS
)
736 wr_sel
= Signal(ROW_SIZE
)
737 wr_sel_m
= Signal(ROW_SIZE
)
738 _d_out
= Signal(WB_DATA_BITS
)
741 # way: entity work.cache_ram
743 # ROW_BITS => ROW_BITS,
744 # WIDTH => wishbone_data_bits,
750 # rd_addr => rd_addr,
752 # wr_sel => wr_sel_m,
753 # wr_addr => wr_addr,
757 way
= CacheRam(ROW_BITS
, WB_DATA_BITS
, True)
758 comb
+= way
.rd_en
.eq(do_read
)
759 comb
+= way
.rd_addr
.eq(rd_addr
)
760 comb
+= _d_out
.eq(way
.rd_data
)
761 comb
+= way
.wr_sel
.eq(wr_sel_m
)
762 comb
+= way
.wr_addr
.eq(wr_addr
)
763 comb
+= way
.wr_data
.eq(wr_data
)
766 comb
+= do_read
.eq(1)
767 comb
+= rd_addr
.eq(early_req_row
)
768 comb
+= cache_out
[i
].eq(_d_out
)
772 # Defaults to wishbone read responses (cache refill)
774 # For timing, the mux on wr_data/sel/addr is not
775 # dependent on anything other than the current state.
777 with m
.If(r1
.write_bram
):
778 # Write store data to BRAM. This happens one
779 # cycle after the store is in r0.
780 comb
+= wr_data
.eq(r1
.req
.data
)
781 comb
+= wr_sel
.eq(r1
.req
.byte_sel
)
782 comb
+= wr_addr
.eq(get_row(r1
.req
.real_addr
))
784 with m
.If(i
== r1
.req
.hit_way
):
785 comb
+= do_write
.eq(1)
787 # Otherwise, we might be doing a reload or a DCBZ
789 comb
+= wr_data
.eq(0)
791 comb
+= wr_data
.eq(wishbone_in
.dat
)
792 comb
+= wr_addr
.eq(r1
.store_row
)
793 comb
+= wr_sel
.eq(~
0) # all 1s
795 with m
.If((r1
.state
== State
.RELOAD_WAIT_ACK
)
796 & wishbone_in
.ack
& (relpace_way
== i
)):
797 comb
+= do_write
.eq(1)
799 # Mask write selects with do_write since BRAM
800 # doesn't have a global write-enable
802 comb
+= wr_sel_m
.eq(wr_sel
)
804 # Cache hit synchronous machine for the easy case.
805 # This handles load hits.
806 # It also handles error cases (TLB miss, cache paradox)
807 def dcache_fast_hit(self
, req_op
, r0_valid
, r1
, ):
812 with m
.If(req_op
!= Op
.OP_NONE
):
813 #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
814 # f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
819 sync
+= r1
.mmu_req
.eq(r0
.mmu_req
)
821 # Fast path for load/store hits.
822 # Set signals for the writeback controls.
823 sync
+= r1
.hit_way
.eq(req_hit_way
)
824 sync
+= r1
.hit_index
.eq(req_index
)
826 with m
.If(req_op
== Op
.OP_LOAD_HIT
):
827 sync
+= r1
.hit_load_valid
.eq(1)
829 sync
+= r1
.hit_load_valid
.eq(0)
831 with m
.If((req_op
== Op
.OP_LOAD_HIT
) |
(req_op
== Op
.OP_STORE_HIT
)):
832 sync
+= r1
.cache_hit
.eq(1)
834 sync
+= r1
.cache_hit
.eq(0)
836 with m
.If(req_op
== Op
.OP_BAD
):
837 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
838 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
839 sync
+= r1
.ls_error
.eq(~r0
.mmu_req
)
840 sync
+= r1
.mmu_error
.eq(r0
.mmu_req
)
841 sync
+= r1
.cache_paradox
.eq(access_ok
)
844 sync
+= r1
.ls_error
.eq(0)
845 sync
+= r1
.mmu_error
.eq(0)
846 sync
+= r1
.cache_paradox
.eq(0)
848 with m
.If(req_op
== Op
.OP_STCX_FAIL
):
851 sync
+= r1
.stcx_fail
.eq(0)
853 # Record TLB hit information for updating TLB PLRU
854 sync
+= r1
.tlb_hit
.eq(tlb_hit
)
855 sync
+= r1
.tlb_hit_way
.eq(tlb_hit_way
)
856 sync
+= r1
.tlb_hit_index
.eq(tlb_req_index
)
858 # Memory accesses are handled by this state machine:
860 # * Cache load miss/reload (in conjunction with "rams")
861 # * Load hits for non-cachable forms
862 # * Stores (the collision case is handled in "rams")
864 # All wishbone requests generation is done here.
865 # This machine operates at stage 1.
866 def dcache_slow(self
, r1
, use_forward1_next
, cache_valid_bits
, r0
,
867 r0_valid
, req_op
, cache_tag
, req_go
, ra
, wb_in
):
872 req
= MemAccessRequest()
874 adjust_acks
= Signal(3)
876 sync
+= r1
.use_forward1
.eq(use_forward1_next
)
877 sync
+= r1
.forward_sel
.eq(0)
879 with m
.If(use_forward1_next
):
880 sync
+= r1
.forward_sel
.eq(r1
.req
.byte_sel
)
881 with m
.Elif(use_forward2_next
):
882 sync
+= r1
.forward_sel
.eq(r1
.forward_sel1
)
884 sync
+= r1
.forward_data2
.eq(r1
.forward_data1
)
885 with m
.If(r1
.write_bram
):
886 sync
+= r1
.forward_data1
.eq(r1
.req
.data
)
887 sync
+= r1
.forward_sel1
.eq(r1
.req
.byte_sel
)
888 sync
+= r1
.forward_way1
.eq(r1
.req
.hit_way
)
889 sync
+= r1
.forward_row1
.eq(get_row(r1
.req
.real_addr
))
890 sync
+= r1
.forward_valid1
.eq(1)
893 sync
+= r1
.forward_data1
.eq(0)
895 sync
+= r1
.forward_data1
.eq(wb_in
.dat
)
896 sync
+= r1
.forward_sel1
.eq(~
0) # all 1s
897 sync
+= r1
.forward_way1
.eq(replace_way
)
898 sync
+= r1
.forward_row1
.eq(r1
.store_row
)
899 sync
+= r1
.forward_valid1
.eq(0)
901 # One cycle pulses reset
902 sync
+= r1
.slow_valid
.eq(0)
903 sync
+= r1
.write_bram
.eq(0)
904 sync
+= r1
.inc_acks
.eq(0)
905 sync
+= r1
.dec_acks
.eq(0)
907 sync
+= r1
.ls_valid
.eq(0)
908 # complete tlbies and TLB loads in the third cycle
909 sync
+= r1
.mmu_done
.eq(r0_valid
& (r0
.tlbie | r0
.tlbld
))
911 with m
.If((req_op
== Op
.OP_LOAD_HIT
)
912 |
(req_op
== Op
.OP_STCX_FAIL
)):
913 with m
.If(~r0
.mmu_req
):
914 sync
+= r1
.ls_valid
.eq(1)
916 sync
+= r1
.mmu_done
.eq(1)
918 with m
.If(r1
.write_tag
):
919 # Store new tag in selected way
920 for i
in range(NUM_WAYS
):
921 with m
.If(i
== replace_way
):
923 trange
= range(i
* TAG_WIDTH
, (i
+1) * TAG_WIDTH
)
924 sync
+= cache_tag
[idx
][trange
].eq(r1
.reload_tag
)
925 sync
+= r1
.store_way
.eq(replace_way
)
926 sync
+= r1
.write_tag
.eq(0)
928 # Take request from r1.req if there is one there,
929 # else from req_op, ra, etc.
931 comb
+= req
.eq(r1
.req
)
933 comb
+= req
.op
.eq(req_op
)
934 comb
+= req
.valid
.eq(req_go
)
935 comb
+= req
.mmu_req
.eq(r0
.mmu_req
)
936 comb
+= req
.dcbz
.eq(r0
.req
.dcbz
)
937 comb
+= req
.real_addr
.eq(ra
)
939 with m
.If(~r0
.req
.dcbz
):
940 comb
+= req
.data
.eq(r0
.req
.data
)
942 comb
+= req
.data
.eq(0)
944 # Select all bytes for dcbz
945 # and for cacheable loads
946 with m
.If(r0
.req
.dcbz |
(r0
.req
.load
& ~r0
.req
.nc
):
947 comb
+= req
.byte_sel
.eq(~
0) # all 1s
949 comb
+= req
.byte_sel
.eq(r0
.req
.byte_sel
)
950 comb
+= req
.hit_way
.eq(req_hit_way
)
951 comb
+= req
.same_tag
.eq(req_same_tag
)
953 # Store the incoming request from r0,
954 # if it is a slow request
955 # Note that r1.full = 1 implies req_op = OP_NONE
956 with m
.If((req_op
== Op
.OP_LOAD_MISS
)
957 |
(req_op
== Op
.OP_LOAD_NC
)
958 |
(req_op
== Op
.OP_STORE_MISS
)
959 |
(req_op
== Op
.OP_STORE_HIT
)):
961 sync
+= r1
.full
.eq(1)
964 with m
.Switch(r1
.state
):
966 with m
.Case(State
.IDLE
)
967 # XXX check 'left downto. probably means len(r1.wb.adr)
968 # r1.wb.adr <= req.real_addr(
969 # r1.wb.adr'left downto 0
971 sync
+= r1
.wb
.adr
.eq(req
.real_addr
[0:r1
.wb
.adr
])
972 sync
+= r1
.wb
.sel
.eq(req
.byte_sel
)
973 sync
+= r1
.wb
.dat
.eq(req
.data
)
974 sync
+= r1
.dcbz
.eq(req
.dcbz
)
976 # Keep track of our index and way
977 # for subsequent stores.
978 sync
+= r1
.store_index
.eq(get_index(req
.real_addr
))
979 sync
+= r1
.store_row
.eq(get_row(req
.real_addr
))
980 sync
+= r1
.end_row_ix
.eq(
981 get_row_of_line(get_row(req
.real_addr
))
983 sync
+= r1
.reload_tag
.eq(get_tag(req
.real_addr
))
984 sync
+= r1
.req
.same_tag
.eq(1)
986 with m
.If(req
.op
== Op
.OP_STORE_HIT
):
987 sync
+= r1
.store_way
.eq(req
.hit_way
)
989 # Reset per-row valid bits,
990 # ready for handling OP_LOAD_MISS
991 for i
in range(ROW_PER_LINE
):
992 sync
+= r1
.rows_valid
[i
].eq(0)
994 with m
.Switch(req
.op
):
995 with m
.Case(Op
.OP_LOAD_HIT
):
999 with m
.Case(Op
.OP_LOAD_MISS
):
1000 #Display(f"cache miss real addr:" \
1001 # f"{req_real_addr}" \
1002 # f" idx:{get_index(req_real_addr)}" \
1003 # f" tag:{get_tag(req.real_addr)}")
1006 # Start the wishbone cycle
1007 sync
+= r1
.wb
.we
.eq(0)
1008 sync
+= r1
.wb
.cyc
.eq(1)
1009 sync
+= r1
.wb
.stb
.eq(1)
1011 # Track that we had one request sent
1012 sync
+= r1
.state
.eq(State
.RELOAD_WAIT_ACK
)
1013 sync
+= r1
.write_tag
.eq(1)
1015 with m
.Case(Op
.OP_LOAD_NC
):
1016 sync
+= r1
.wb
.cyc
.eq(1)
1017 sync
+= r1
.wb
.stb
.eq(1)
1018 sync
+= r1
.wb
.we
.eq(0)
1019 sync
+= r1
.state
.eq(State
.NC_LOAD_WAIT_ACK
)
1021 with m
.Case(Op
.OP_STORE_HIT
, Op
.OP_STORE_MISS
):
1022 with m
.If(~req
.bcbz
):
1023 sync
+= r1
.state
.eq(State
.STORE_WAIT_ACK
)
1024 sync
+= r1
.acks_pending
.eq(1)
1025 sync
+= r1
.full
.eq(0)
1026 sync
+= r1
.slow_valid
.eq(1)
1028 with m
.If(~req
.mmu_req
):
1029 sync
+= r1
.ls_valid
.eq(1)
1031 sync
+= r1
.mmu_done
.eq(1)
1033 with m
.If(req
.op
== Op
.OP_STORE_HIT
):
1034 sync
+= r1
.write_bram
.eq(1)
1036 sync
+= r1
.state
.eq(Op
.RELOAD_WAIT_ACK
)
1038 with m
.If(req
.op
== Op
.OP_STORE_MISS
):
1039 sync
+= r1
.write_tag
.eq(1)
1041 sync
+= r1
.wb
.we
.eq(1)
1042 sync
+= r1
.wb
.cyc
.eq(1)
1043 sync
+= r1
.wb
.stb
.eq(1)
1045 # OP_NONE and OP_BAD do nothing
1046 # OP_BAD & OP_STCX_FAIL were
1047 # handled above already
1048 with m
.Case(Op
.OP_NONE
):
1050 with m
.Case(OP_BAD
):
1052 with m
.Case(OP_STCX_FAIL
):
1055 with m
.Case(State
.RELOAD_WAIT_ACK
):
1056 # Requests are all sent if stb is 0
1057 comb
+= stbs_done
.eq(~r1
.wb
.stb
)
1059 with m
.If(~wb_in
.stall
& ~stbs_done
):
1060 # That was the last word?
1061 # We are done sending.
1062 # Clear stb and set stbs_done
1063 # so we can handle an eventual
1064 # last ack on the same cycle.
1065 with m
.If(is_last_row_addr(
1066 r1
.wb
.adr
, r1
.end_row_ix
)):
1067 sync
+= r1
.wb
.stb
.eq(0)
1068 comb
+= stbs_done
.eq(0)
1070 # Calculate the next row address
1071 sync
+= r1
.wb
.adr
.eq(next_row_addr(r1
.wb
.adr
))
1073 # Incoming acks processing
1074 sync
+= r1
.forward_valid1
.eq(wb_in
.ack
)
1075 with m
.If(wb_in
.ack
):
1076 # XXX needs an Array bit-accessor here
1077 sync
+= r1
.rows_valid
[r1
.store_row
% ROW_PER_LINE
].eq(1)
1079 # If this is the data we were looking for,
1080 # we can complete the request next cycle.
1081 # Compare the whole address in case the
1082 # request in r1.req is not the one that
1083 # started this refill.
1084 with m
.If(r1
.full
& r1
.req
.same_tag
&
1085 ((r1
.dcbz
& r1
.req
.dcbz
) |
1086 (~r1
.dcbz
& (r1
.req
.op
== Op
.OP_LOAD_MISS
))) &
1087 (r1
.store_row
== get_row(r1
.req
.real_addr
))):
1088 sync
+= r1
.full
.eq(0)
1089 sync
+= r1
.slow_valid
.eq(1)
1090 with m
.If(~r1
.mmu_req
):
1091 sync
+= r1
.ls_valid
.eq(1)
1093 sync
+= r1
.mmu_done
.eq(1)
1094 sync
+= r1
.forward_sel
.eq(~
0) # all 1s
1095 sync
+= r1
.use_forward1
.eq(1)
1097 # Check for completion
1098 with m
.If(stbs_done
& is_last_row(r1
.store_row
,
1100 # Complete wishbone cycle
1101 sync
+= r1
.wb
.cyc
.eq(0)
1103 # Cache line is now valid
1104 cv
= cache_valid_bits
[r1
.store_index
]
1105 sync
+= cv
[r1
.store_way
].eq(1)
1106 sync
+= r1
.state
.eq(State
.IDLE
)
1108 # Increment store row counter
1109 sync
+= r1
.store_row
.eq(next_row(r1
.store_row
))
1111 with m
.Case(State
.STORE_WAIT_ACK
):
1112 comb
+= stbs_done
.eq(~r1
.wb
.stb
)
1113 comb
+= acks
.eq(r1
.acks_pending
)
1115 with m
.If(r1
.inc_acks
!= r1
.dec_acks
):
1116 with m
.If(r1
.inc_acks
):
1117 comb
+= adjust_acks
.eq(acks
+ 1)
1119 comb
+= adjust_acks
.eq(acks
- 1)
1121 comb
+= adjust_acks
.eq(acks
)
1123 sync
+= r1
.acks_pending
.eq(adjust_acks
)
1125 # Clear stb when slave accepted request
1126 with m
.If(~wb_in
.stall
):
1127 # See if there is another store waiting
1128 # to be done which is in the same real page.
1129 with m
.If(req
.valid
):
1130 ra
= req
.real_addr
[0:SET_SIZE_BITS
]
1131 sync
+= r1
.wb
.adr
[0:SET_SIZE_BITS
].eq(ra
)
1132 sync
+= r1
.wb
.dat
.eq(req
.data
)
1133 sync
+= r1
.wb
.sel
.eq(req
.byte_sel
)
1135 with m
.Elif((adjust_acks
< 7) & req
.same_tag
&
1136 ((req
.op
== Op
.Op_STORE_MISS
)
1137 |
(req
.op
== Op
.OP_SOTRE_HIT
))):
1138 sync
+= r1
.wb
.stb
.eq(1)
1139 comb
+= stbs_done
.eq(0)
1141 with m
.If(req
.op
== Op
.OP_STORE_HIT
):
1142 sync
+= r1
.write_bram
.eq(1)
1143 sync
+= r1
.full
.eq(0)
1144 sync
+= r1
.slow_valid
.eq(1)
1146 # Store requests never come from the MMU
1147 sync
+= r1
.ls_valid
.eq(1)
1148 comb
+= stbs_done
.eq(0)
1149 sync
+= r1
.inc_acks
.eq(1)
1151 sync
+= r1
.wb
.stb
.eq(0)
1152 comb
+= stbs_done
.eq(1)
1154 # Got ack ? See if complete.
1155 with m
.If(wb_in
.ack
):
1156 with m
.If(stbs_done
& (adjust_acks
== 1))
1157 sync
+= r1
.state
.eq(State
.IDLE
)
1158 sync
+= r1
.wb
.cyc
.eq(0)
1159 sync
+= r1
.wb
.stb
.eq(0)
1160 sync
+= r1
.dec_acks
.eq(1)
1162 with m
.Case(State
.NC_LOAD_WAIT_ACK
):
1163 # Clear stb when slave accepted request
1164 with m
.If(~wb_in
.stall
):
1165 sync
+= r1
.wb
.stb
.eq(0)
1167 # Got ack ? complete.
1168 with m
.If(wb_in
.ack
):
1169 sync
+= r1
.state
.eq(State
.IDLE
)
1170 sync
+= r1
.full
.eq(0)
1171 sync
+= r1
.slow_valid
.eq(1)
1173 with m
.If(~r1
.mmu_req
):
1174 sync
+= r1
.ls_valid
.eq(1)
1176 sync
+= r1
.mmu_done
.eq(1)
1178 sync
+= r1
.forward_sel
.eq(~
0) # all 1s
1179 sync
+= r1
.use_forward1
.eq(1)
1180 sync
+= r1
.wb
.cyc
.eq(0)
1181 sync
+= r1
.wb
.stb
.eq(0)
1183 # dc_log: if LOG_LENGTH > 0 generate
1184 # TODO learn how to tranlate vhdl generate into nmigen
1185 def dcache_log(self
, r1
, valid_ra
, tlb_hit_way
, stall_out
,
1186 d_out
, wb_in
, log_out
):
1191 # signal log_data : std_ulogic_vector(19 downto 0);
1192 log_data
= Signal(20)
1197 # dcache_log: process(clk)
1199 # if rising_edge(clk) then
1200 # log_data <= r1.wb.adr(5 downto 3) &
1201 # wishbone_in.stall &
1203 # r1.wb.stb & r1.wb.cyc &
1206 # std_ulogic_vector(
1207 # to_unsigned(op_t'pos(req_op), 3)) &
1209 # std_ulogic_vector(
1210 # to_unsigned(tlb_hit_way, 3)) &
1212 # std_ulogic_vector(
1213 # to_unsigned(state_t'pos(r1.state), 3));
1214 sync
+= log_data
.eq(Cat(
1215 Const(r1
.state
, 3), valid_ra
, Const(tlb_hit_way
, 3),
1216 stall_out
, Const(req_op
, 3), d_out
.valid
, d_out
.error
,
1217 r1
.wb
.cyc
, r1
.wb
.stb
, wb_in
.ack
, wb_in
.stall
,
1222 # log_out <= log_data;
1223 # TODO ??? I am very confused need help
1224 comb
+= log_out
.eq(log_data
)
# Build the dcache module.  Sequence: (1) derive all cache-geometry
# constants from the constructor parameters, (2) declare storage
# arrays and interconnect signals, (3) define address-decode helper
# functions, (4) sanity-check the geometry with asserts, and
# (5) wire up the always-combinatorial top-level connections.
# NOTE(review): this extraction is lossy -- several original lines
# (helper-function `def` headers, `return Array(` wrappers, the
# closing `"""` of a docstring, and the end of this method including
# its `return m`) are missing, and the visible code contains a number
# of syntax-level defects flagged inline below.
1228 def elaborate(self
, platform
):
# Local aliases for the geometry parameters stored on the instance.
1229 LINE_SIZE
= self
.LINE_SIZE
1230 NUM_LINES
= self
.NUM_LINES
1231 NUM_WAYS
= self
.NUM_WAYS
1232 TLB_SET_SIZE
= self
.TLB_SET_SIZE
1233 TLB_NUM_WAYS
= self
.TLB_NUM_WAYS
1234 TLB_LG_PGSZ
= self
.TLB_LG_PGSZ
1235 LOG_LENGTH
= self
.LOG_LENGTH
1237 # BRAM organisation: We never access more than
1238 # -- wishbone_data_bits at a time so to save
1239 # -- resources we make the array only that wide, and
1240 # -- use consecutive indices for to make a cache "line"
1242 # -- ROW_SIZE is the width in bytes of the BRAM
1243 # -- (based on WB, so 64-bits)
# NOTE(review): "/" is true (float) division in Python 3; ROW_SIZE is
# used as an operand of "//" and log2_int below, so this should be
# "WB_DATA_BITS // 8".  The trailing ";" is a leftover from VHDL.
1244 ROW_SIZE
= WB_DATA_BITS
/ 8;
1246 # ROW_PER_LINE is the number of row (wishbone
1247 # transactions) in a line
1248 ROW_PER_LINE
= LINE_SIZE
// ROW_SIZE
1250 # BRAM_ROWS is the number of rows in BRAM needed
1251 # to represent the full dcache
1252 BRAM_ROWS
= NUM_LINES
* ROW_PER_LINE
1255 # Bit fields counts in the address
1257 # REAL_ADDR_BITS is the number of real address
1258 # bits that we store
# NOTE(review): REAL_ADDR_BITS is used below (TAG_BITS, ra, get_tag)
# but its definition is not visible here -- presumably it was on one
# of the original lines lost in extraction (around line 1259).
1261 # ROW_BITS is the number of bits to select a row
1262 ROW_BITS
= log2_int(BRAM_ROWS
)
1264 # ROW_LINE_BITS is the number of bits to select
1265 # a row within a line
1266 ROW_LINE_BITS
= log2_int(ROW_PER_LINE
)
1268 # LINE_OFF_BITS is the number of bits for
1269 # the offset in a cache line
1270 LINE_OFF_BITS
= log2_int(LINE_SIZE
)
1272 # ROW_OFF_BITS is the number of bits for
1273 # the offset in a row
1274 ROW_OFF_BITS
= log2_int(ROW_SIZE
)
1276 # INDEX_BITS is the number if bits to
1277 # select a cache line
1278 INDEX_BITS
= log2_int(NUM_LINES
)
1280 # SET_SIZE_BITS is the log base 2 of the set size
1281 SET_SIZE_BITS
= LINE_OFF_BITS
+ INDEX_BITS
1283 # TAG_BITS is the number of bits of
1284 # the tag part of the address
1285 TAG_BITS
= REAL_ADDR_BITS
- SET_SIZE_BITS
1287 # TAG_WIDTH is the width in bits of each way of the tag RAM
# Round TAG_BITS up to the next multiple of 8 for RAM packing.
1288 TAG_WIDTH
= TAG_BITS
+ 7 - ((TAG_BITS
+ 7) % 8)
1290 # WAY_BITS is the number of bits to select a way
1291 WAY_BITS
= log2_int(NUM_WAYS
)
1293 # Example of layout for 32 lines of 64 bytes:
1295 # .. tag |index| line |
1297 # .. | |---| | ROW_LINE_BITS (3)
1298 # .. | |--- - --| LINE_OFF_BITS (6)
1299 # .. | |- --| ROW_OFF_BITS (3)
1300 # .. |----- ---| | ROW_BITS (8)
1301 # .. |-----| | INDEX_BITS (5)
1302 # .. --------| | TAG_BITS (45)
# Total width of one tag-RAM word: one padded tag per way.
1304 TAG_RAM_WIDTH
= TAG_WIDTH
* NUM_WAYS
# One CacheTagSet per cache line.
1306 def CacheTagArray():
1307 return Array(CacheTagSet() for x
in range(NUM_LINES
))
# Per-line valid bits, one CacheWayValidBits record per line.
1309 def CacheValidBitsArray():
1310 return Array(CacheWayValidBits() for x
in range(NUM_LINES
))
# One valid bit per row within a line (used during reload).
1312 def RowPerLineValidArray():
1313 return Array(Signal() for x
in range(ROW_PER_LINE
))
1315 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1316 cache_tags
= CacheTagArray()
1317 cache_tag_set
= Signal(TAG_RAM_WIDTH
)
1318 cache_valid_bits
= CacheValidBitsArray()
1320 # TODO attribute ram_style : string;
1321 # TODO attribute ram_style of cache_tags : signal is "distributed";
# TLB geometry, derived the same way as the cache geometry above.
1324 TLB_SET_BITS
= log2_int(TLB_SET_SIZE
)
1325 TLB_WAY_BITS
= log2_int(TLB_NUM_WAYS
)
# EA tag = effective-address bits above the page offset + set index.
1326 TLB_EA_TAG_BITS
= 64 - (TLB_LG_PGSZ
+ TLB_SET_BITS
)
1327 TLB_TAG_WAY_BITS
= TLB_NUM_WAYS
* TLB_EA_TAG_BITS
# NOTE(review): TLB_PTE_BITS is used here and below but is not
# defined anywhere visible -- presumably lost in extraction (original
# line 1328).  The stray ";" is a VHDL leftover.
1329 TLB_PTE_WAY_BITS
= TLB_NUM_WAYS
* TLB_PTE_BITS
;
# NOTE(review): the bodies of TLBValidBitsArray / TLBTagsArray /
# TLBPtesArray below are truncated -- the `return Array(` wrappers
# (and the `def` headers of the 2nd and 3rd) are missing from this
# extraction, leaving bare generator fragments.
1331 def TLBValidBitsArray():
1333 Signal(TLB_NUM_WAYS
) for x
in range(TLB_SET_SIZE
)
1338 Signal(TLB_TAG_WAY_BITS
) for x
in range (TLB_SET_SIZE
)
1343 Signal(TLB_PTE_WAY_BITS
) for x
in range(TLB_SET_SIZE
)
# TLB PLRU state: one NUM_WAYS-wide signal per TLB way.
# NOTE(review): the enclosing `def` header for this return is missing.
1347 return Array(Signal(NUM_WAYS
) for x
in range(TLB_NUM_WAYS
))
# NOTE(review): this triple-quoted string is unterminated in the
# visible text -- the closing \"\"\" (original line 1351) is missing.
1349 """note: these are passed to nmigen.hdl.Memory as "attributes".
1350 don't know how, just that they are.
1352 dtlb_valid_bits
= TLBValidBitsArray()
1353 dtlb_tags
= TLBTagsArray()
1354 dtlb_ptes
= TLBPtesArray()
1355 # TODO attribute ram_style of
1356 # dtlb_tags : signal is "distributed";
1357 # TODO attribute ram_style of
1358 # dtlb_ptes : signal is "distributed";
# Load-reserve / store-conditional reservation state.
1365 reservation
= Reservation()
1367 # Async signals on incoming request
# NOTE(review): Signal(N) declares an N-bit signal, not a signal that
# can hold values up to N.  req_index should probably be
# Signal(INDEX_BITS), req_row / early_req_row Signal(ROW_BITS), and
# tlb_req_index Signal(TLB_SET_BITS) -- TODO confirm against the
# microwatt original.
1368 req_index
= Signal(NUM_LINES
)
1369 req_row
= Signal(BRAM_ROWS
)
1370 req_hit_way
= Signal(WAY_BITS
)
1371 req_tag
= Signal(TAG_BITS
)
1373 req_data
= Signal(64)
1374 req_same_tag
= Signal()
1377 early_req_row
= Signal(BRAM_ROWS
)
# Asserted to cancel a store (stcx. losing its reservation).
1379 cancel_store
= Signal()
1381 clear_rsrv
= Signal()
# Data-forwarding selects between the two pipeline stages.
1386 use_forward1_next
= Signal()
1387 use_forward2_next
= Signal()
1389 # Cache RAM interface
# NOTE(review): the `def CacheRamOut():` header is missing from this
# extraction (original line 1390).
1391 return Array(Signal(WB_DATA_BITS
) for x
in range(NUM_WAYS
))
1393 cache_out
= CacheRamOut()
1395 # PLRU output interface
# NOTE(review): the `def PLRUOut():` header is missing (original line
# 1396), and `range(Index())` looks wrong -- the PLRU array should
# presumably be one entry per cache line, i.e. range(NUM_LINES).
1397 return Array(Signal(WAY_BITS
) for x
in range(Index()))
1399 plru_victim
= PLRUOut()
1400 replace_way
= Signal(WAY_BITS
)
1402 # Wishbone read/write/cache write formatting signals
# TLB lookup intermediates: the tag/pte/valid words read out of the
# dtlb arrays for the indexed set.
1406 tlb_tag_way
= Signal(TLB_TAG_WAY_BITS
)
1407 tlb_pte_way
= Signal(TLB_PTE_WAY_BITS
)
1408 tlb_valid_way
= Signal(TLB_NUM_WAYS
)
1409 tlb_req_index
= Signal(TLB_SET_SIZE
)
1411 tlb_hit_way
= Signal(TLB_NUM_WAYS
)
1412 pte
= Signal(TLB_PTE_BITS
)
# Translated real address for the current request.
1413 ra
= Signal(REAL_ADDR_BITS
)
# Permission/attribute bits extracted from the matched PTE.
1415 perm_attr
= PermAttr()
1418 access_ok
= Signal()
1420 # TLB PLRU output interface
# NOTE(review): the `def TLBPLRUOut():` header and `return Array(`
# wrapper are missing from this extraction.
1423 Signal(TLB_WAY_BITS
) for x
in range(TLB_SET_SIZE
)
1426 tlb_plru_victim
= TLBPLRUOut()
1428 # Helper functions to decode incoming requests
1430 # Return the cache line index (tag index) for an address
1431 def get_index(addr
):
1432 return addr
[LINE_OFF_BITS
:SET_SIZE_BITS
]
1434 # Return the cache row index (data memory) for an address
# NOTE(review): the `def get_row(addr):` header (original line 1435)
# is missing from this extraction.
1436 return addr
[ROW_OFF_BITS
:SET_SIZE_BITS
]
1438 # Return the index of a row within a line
1439 def get_row_of_line(row
):
# NOTE(review): the assignment connecting `row` to `row_v` (original
# line 1441) is missing; as visible, row_v is returned undriven.
1440 row_v
= Signal(ROW_BITS
)
1442 return row_v
[0:ROW_LINE_BITS
]
1444 # Returns whether this is the last row of a line
1445 def is_last_row_addr(addr
, last
):
1446 return addr
[ROW_OFF_BITS
:LINE_OFF_BITS
] == last
1448 # Returns whether this is the last row of a line
1449 def is_last_row(row
, last
):
1450 return get_row_of_line(row
) == last
1452 # Return the address of the next row in the current cache line
1453 def next_row_addr(addr
):
1454 row_idx
= Signal(ROW_LINE_BITS
)
1455 result
= WBAddrType()
1456 # Is there no simpler way in VHDL to
1457 # generate that 3 bits adder ?
1458 row_idx
= addr
[ROW_OFF_BITS
:LINE_OFF_BITS
]
# NOTE(review): Signal(row_idx + 1) is wrong -- Signal() takes a
# shape, not a value; the intent is the nmigen expression
# `row_idx + 1` assigned via .eq() / m.d.comb.  Likewise the plain
# Python item-assignment into `result` below does not generate
# hardware; and the function never returns `result` (the `return`
# line appears lost in extraction).
1459 row_idx
= Signal(row_idx
+ 1)
1461 result
[ROW_OFF_BITS
:LINE_OFF_BITS
] = row_idx
1464 # Return the next row in the current cache line. We use a
1465 # dedicated function in order to limit the size of the
1466 # generated adder to be only the bits within a cache line
1467 # (3 bits with default settings)
# NOTE(review): the `def next_row(row):` header (original line 1468)
# is missing; same Signal()-as-adder problem as next_row_addr above,
# and no visible return of `result`.
1469 row_v
= Signal(ROW_BITS
)
1470 row_idx
= Signal(ROW_LINE_BITS
)
1471 result
= Signal(ROW_BITS
)
1474 row_idx
= row_v
[ROW_LINE_BITS
]
1475 row_v
[0:ROW_LINE_BITS
] = Signal(row_idx
+ 1)
1478 # Get the tag value from the address
# NOTE(review): the `def get_tag(addr):` header (original line 1479)
# is missing from this extraction.
1480 return addr
[SET_SIZE_BITS
:REAL_ADDR_BITS
]
1482 # Read a tag from a tag memory row
# Each way occupies TAG_WIDTH bits; only the low TAG_BITS are tag.
1483 def read_tag(way
, tagset
):
1484 return tagset
[way
*TAG_WIDTH
:way
* TAG_WIDTH
+ TAG_BITS
]
1486 # Read a TLB tag from a TLB tag memory row
1487 def read_tlb_tag(way
, tags
):
1490 j
= way
* TLB_EA_TAG_BITS
1491 return tags
[j
:j
+ TLB_EA_TAG_BITS
]
1493 # Write a TLB tag to a TLB tag memory row
# NOTE(review): syntax error -- the parameter list is closed after
# `tags` and then continues with `, tag):`; should be
# `def write_tlb_tag(way, tags, tag):`.  The bare slice-assignment
# also needs to be an .eq() under a clock domain to generate hardware.
1494 def write_tlb_tag(way
, tags
), tag
):
1497 j
= way
* TLB_EA_TAG_BITS
1498 tags
[j
:j
+ TLB_EA_TAG_BITS
] = tag
1500 # Read a PTE from a TLB PTE memory row
1501 def read_tlb_pte(way
, ptes
):
1504 j
= way
* TLB_PTE_BITS
1505 return ptes
[j
:j
+ TLB_PTE_BITS
]
# Write a PTE into a TLB PTE memory row.
# NOTE(review): `return ptes[...] = newpte` is a syntax error --
# assignment is not an expression in Python; drop the `return`.
1507 def write_tlb_pte(way
, ptes
,newpte
):
1510 j
= way
* TLB_PTE_BITS
1511 return ptes
[j
:j
+ TLB_PTE_BITS
] = newpte
# Geometry sanity checks.
# NOTE(review): every assert below is missing the comma between the
# condition and its message, which is a syntax error in Python
# (`assert cond, "msg"`).  Also: the `% 2 == 0` tests check evenness,
# not power-of-two (the VHDL used ispow2); line 1526 uses `=` where
# `==` is meant and misspells ROW_LINE_BITS as ROW_LINEBITS; line
# 1535 references undefined `wishbone_data_bits` (should be
# WB_DATA_BITS); and the message continuations of the 1520 and 1538
# asserts are missing from this extraction.
1513 assert (LINE_SIZE
% ROW_SIZE
) == 0 "LINE_SIZE not " \
1514 "multiple of ROW_SIZE"
1516 assert (LINE_SIZE
% 2) == 0 "LINE_SIZE not power of 2"
1518 assert (NUM_LINES
% 2) == 0 "NUM_LINES not power of 2"
1520 assert (ROW_PER_LINE
% 2) == 0 "ROW_PER_LINE not" \
1523 assert ROW_BITS
== (INDEX_BITS
+ ROW_LINE_BITS
) \
1524 "geometry bits don't add up"
1526 assert (LINE_OFF_BITS
= ROW_OFF_BITS
+ ROW_LINEBITS
) \
1527 "geometry bits don't add up"
1529 assert REAL_ADDR_BITS
== (TAG_BITS
+ INDEX_BITS \
1530 + LINE_OFF_BITS
) "geometry bits don't add up"
1532 assert REAL_ADDR_BITS
== (TAG_BITS
+ ROW_BITS
+ ROW_OFF_BITS
) \
1533 "geometry bits don't add up"
1535 assert 64 == wishbone_data_bits
"Can't yet handle a" \
1536 "wishbone width that isn't 64-bits"
1538 assert SET_SIZE_BITS
<= TLB_LG_PGSZ
"Set indexed by" \
1541 # we don't yet handle collisions between loadstore1 requests
# NOTE(review): `comb`, `m_out`, `r0_stall`, `r0_full`, `r0_valid`,
# `r1`, `stall_out` and `wishbone_out` are not declared in the
# visible portion -- presumably set up on lines lost in extraction
# (comb = m.d.comb, module ports, stage registers).
1543 comb
+= m_out
.stall
.eq(0)
1545 # Hold off the request in r0 when r1 has an uncompleted request
1546 comb
+= r0_stall
.eq(r0_full
& r1
.full
)
# r0 contents are presented downstream only when r1 can accept them.
1547 comb
+= r0_valid
.eq(r0_full
& ~r1
.full
)
1548 comb
+= stall_out
.eq(r0_stall
)
1550 # Wire up wishbone request latch out of stage 1
1551 comb
+= wishbone_out
.eq(r1
.wb
)
1557 # entity dcache_tb is
1560 # architecture behave of dcache_tb is
1561 # signal clk : std_ulogic;
1562 # signal rst : std_ulogic;
1564 # signal d_in : Loadstore1ToDcacheType;
1565 # signal d_out : DcacheToLoadstore1Type;
1567 # signal m_in : MmuToDcacheType;
1568 # signal m_out : DcacheToMmuType;
1570 # signal wb_bram_in : wishbone_master_out;
1571 # signal wb_bram_out : wishbone_slave_out;
1573 # constant clk_period : time := 10 ns;
1575 # dcache0: entity work.dcache
1588 # wishbone_out => wb_bram_in,
1589 # wishbone_in => wb_bram_out
1592 # -- BRAM Memory slave
1593 # bram0: entity work.wishbone_bram_wrapper
1595 # MEMORY_SIZE => 1024,
1596 # RAM_INIT_FILE => "icache_test.bin"
1601 # wishbone_in => wb_bram_in,
1602 # wishbone_out => wb_bram_out
1605 # clk_process: process
1608 # wait for clk_period/2;
1610 # wait for clk_period/2;
1613 # rst_process: process
1616 # wait for 2*clk_period;
1624 # d_in.valid <= '0';
1627 # d_in.addr <= (others => '0');
1628 # d_in.data <= (others => '0');
1629 # m_in.valid <= '0';
1630 # m_in.addr <= (others => '0');
1631 # m_in.pte <= (others => '0');
1633 # wait for 4*clk_period;
1634 # wait until rising_edge(clk);
1636 # -- Cacheable read of address 4
1639 # d_in.addr <= x"0000000000000004";
1640 # d_in.valid <= '1';
1641 # wait until rising_edge(clk);
1642 # d_in.valid <= '0';
1644 # wait until rising_edge(clk) and d_out.valid = '1';
1645 # assert d_out.data = x"0000000100000000"
1646 # report "data @" & to_hstring(d_in.addr) &
1647 # "=" & to_hstring(d_out.data) &
1648 # " expected 0000000100000000"
1650 # -- wait for clk_period;
1652 # -- Cacheable read of address 30
1655 # d_in.addr <= x"0000000000000030";
1656 # d_in.valid <= '1';
1657 # wait until rising_edge(clk);
1658 # d_in.valid <= '0';
1660 # wait until rising_edge(clk) and d_out.valid = '1';
1661 # assert d_out.data = x"0000000D0000000C"
1662 # report "data @" & to_hstring(d_in.addr) &
1663 # "=" & to_hstring(d_out.data) &
1664 # " expected 0000000D0000000C"
1667 # -- Non-cacheable read of address 100
1670 # d_in.addr <= x"0000000000000100";
1671 # d_in.valid <= '1';
1672 # wait until rising_edge(clk);
1673 # d_in.valid <= '0';
1674 # wait until rising_edge(clk) and d_out.valid = '1';
1675 # assert d_out.data = x"0000004100000040"
1676 # report "data @" & to_hstring(d_in.addr) &
1677 # "=" & to_hstring(d_out.data) &
1678 # " expected 0000004100000040"
1681 # wait until rising_edge(clk);
1682 # wait until rising_edge(clk);
1683 # wait until rising_edge(clk);
1684 # wait until rising_edge(clk);
# Simulation stimulus for the dcache: a nmigen simulator generator
# that (1) clears all inputs, (2) issues a cacheable load of address
# 0x04, (3) a cacheable load of 0x30, (4) a non-cacheable load of
# 0x100, checking the returned data each time.  Ported from the VHDL
# testbench quoted in the comment block above this function.
# NOTE(review): this port is unfinished and the extraction is lossy:
#   * `.adrr` below is a typo for `.addr`.
#   * the plain `yield` delay statements (the "wait for clk_period"
#     equivalents) and the bodies of the `while` polling loops are
#     missing -- as visible, each `while` has no body at all.
#   * the asserts compare the Signal object `dut.d_out.data` to a
#     Const, which never samples the simulated value; they should
#     compare `(yield dut.d_out.data)` to a plain integer.
#   * the `f` f-string prefixes are split from their string literals
#     (`f "..."`), which is a syntax error.
1689 def dcache_sim(dut
):
# Drive all request inputs to a known-idle state first.
1691 yield dut
.d_in
.valid
.eq(0)
1692 yield dut
.d_in
.load
.eq(0)
1693 yield dut
.d_in
.nc
.eq(0)
1694 yield dut
.d_in
.adrr
.eq(0)
1695 yield dut
.d_in
.data
.eq(0)
1696 yield dut
.m_in
.valid
.eq(0)
1697 yield dut
.m_in
.addr
.eq(0)
1698 yield dut
.m_in
.pte
.eq(0)
1699 # wait 4 * clk_period
1704 # wait_until rising_edge(clk)
1706 # Cacheable read of address 4
# Present a one-cycle load request: load=1, nc=0, addr=4, valid=1.
1707 yield dut
.d_in
.load
.eq(1)
1708 yield dut
.d_in
.nc
.eq(0)
1709 yield dut
.d_in
.addr
.eq(Const(0x0000000000000004, 64))
1710 yield dut
.d_in
.valid
.eq(1)
1711 # wait-until rising_edge(clk)
1713 yield dut
.d_in
.valid
.eq(0)
# Poll until the dcache signals valid response data.
1715 while not (yield dut
.d_out
.valid
):
# Expected pattern per the BRAM init file: word i holds i at each
# 32-bit half, so address 4 reads 0x0000000100000000.
1717 assert dut
.d_out
.data
== Const(0x0000000100000000, 64) f
"data @" \
1718 f
"{dut.d_in.addr}={dut.d_in.data} expected 0000000100000000" \
1719 " -!- severity failure"
1722 # Cacheable read of address 30
# Same sequence for address 0x30; expects 0x0000000D0000000C.
1723 yield dut
.d_in
.load
.eq(1)
1724 yield dut
.d_in
.nc
.eq(0)
1725 yield dut
.d_in
.addr
.eq(Const(0x0000000000000030, 64))
1726 yield dut
.d_in
.valid
.eq(1)
1728 yield dut
.d_in
.valid
.eq(0)
1730 while not (yield dut
.d_out
.valid
):
1732 assert dut
.d_out
.data
== Const(0x0000000D0000000C, 64) f
"data @" \
1733 f
"{dut.d_in.addr}={dut.d_out.data} expected 0000000D0000000C" \
1734 f
"-!- severity failure"
1736 # Non-cacheable read of address 100
# nc=1 bypasses the cache; expects 0x0000004100000040.
1737 yield dut
.d_in
.load
.eq(1)
1738 yield dut
.d_in
.nc
.eq(1)
1739 yield dut
.d_in
.addr
.eq(Const(0x0000000000000100, 64))
1740 yield dut
.d_in
.valid
.eq(1)
1742 yield dut
.d_in
.valid
.eq(0)
1744 while not (yield dut
.d_out
.valid
):
1746 assert dut
.d_out
.data
== Const(0x0000004100000040, 64) f
"data @" \
1747 f
"{dut.d_in.addr}={dut.d_out.data} expected 0000004100000040" \
1748 f
"-!- severity failure"
1758 vl
= rtlil
.convert(dut
, ports
=[])
1759 with
open("test_dcache.il", "w") as f
:
1762 run_simulation(dut
, dcache_sim(), vcd_name
='test_dcache.vcd')
1764 if __name__
== '__main__':