# based on Anton Blanchard microwatt dcache.vhdl
from enum import Enum, unique

# Cat and Const are used throughout the module (e.g. address
# concatenation in tlb_search) and were lost from this import list
# during extraction — restored.
from nmigen import Module, Signal, Elaboratable, Cat, Const
from nmigen.cli import main
from nmigen.iocontrol import RecordObject
from nmigen.util import log2_int

# MMUToDCacheType / DCacheToMMUType are instantiated in DCache.__init__
# (self.m_in / self.m_out) and were lost from this list — restored.
from experiment.mem_types import (LoadStore1ToDCacheType,
                                  DCacheToLoadStore1Type,
                                  MMUToDCacheType,
                                  DCacheToMMUType)

# NOTE(review): the rest of this module consistently calls
# WBMasterOut() (e.g. DCache.__init__), so the import is spelled
# WBMasterOut here rather than WbMasterOut — confirm against wb_types.
from experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBAddrType, WBDataType, WBSelType,
                                 WBMasterOut, WBSlaveOut,
                                 WBMasterOutVector, WBSlaveOutVector,
                                 WBIOMasterOut, WBIOSlaveOut)
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission/attribute bits extracted from a page-table entry.

    One flag per PTE property; all are single-bit Signals.
    """
    def __init__(self):
        super().__init__()
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        # priv was lost in extraction, but it is read elsewhere in this
        # module (perm_attr.priv / perm_attr.priv.eq(1)) — restored;
        # confirm field order against upstream.
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()
39 def extract_perm_attr(pte
):
# Type of operation on a "valid" input
@unique
class Op(Enum):
    """Decoded request operation, dispatched on in stage 1.

    The class header and OP_NONE member were lost in extraction;
    OP_NONE is referenced later (comb += op.eq(Op.OP_NONE)) — restored.
    """
    OP_NONE       = 0  # no operation / default
    OP_BAD        = 1  # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2  # conditional store w/o reservation
    OP_LOAD_HIT   = 3  # Cache hit on load
    OP_LOAD_MISS  = 4  # Load missing cache
    OP_LOAD_NC    = 5  # Non-cachable load
    OP_STORE_HIT  = 6  # Store hitting cache
    OP_STORE_MISS = 7  # Store missing cache
# Stage-1 state machine states (see dcache_slow). The class header was
# lost in extraction; State.IDLE / State.RELOAD_WAIT_ACK are referenced
# later in this module — restored.
@unique
class State(Enum):
    IDLE             = 0  # Normal load hit processing
    RELOAD_WAIT_ACK  = 1  # Cache reload wait ack
    STORE_WAIT_ACK   = 2  # Store wait ack
    NC_LOAD_WAIT_ACK = 3  # Non-cachable load wait ack
74 # In order to make timing, we use the BRAMs with
75 # an output buffer, which means that the BRAM
76 # output is delayed by an extra cycle.
78 # Thus, the dcache has a 2-stage internal pipeline
79 # for cache hits with no stalls.
81 # All other operations are handled via stalling
84 # The second stage can thus complete a hit at the same
85 # time as the first stage emits a stall for a complex op.
# Stage 0 register, basically contains just the latched request
class RegStage0(RecordObject):
    """Latched incoming request (from loadstore1 or the MMU)."""
    def __init__(self):
        super().__init__()
        self.req     = LoadStore1ToDCacheType()
        # tlbie/doall/tlbld were lost in extraction but are written by
        # stage_0 (r.tlbie, r.doall, r.tlbld) and read by tlb_update —
        # restored; confirm against upstream.
        self.tlbie   = Signal()
        self.doall   = Signal()
        self.tlbld   = Signal()
        self.mmu_req = Signal()  # indicates source of request
# Request parameters captured for the slow (stage-1) path.
class MemAccessRequest(RecordObject):
    def __init__(self):
        super().__init__()
        # op/dcbz were lost in extraction but are written in
        # dcache_slow (req.op.eq(req_op), req.dcbz.eq(r0.req.dcbz)) —
        # restored; confirm shape of `op` against upstream.
        self.op        = Signal(Op)
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)
        self.hit_way   = Signal(WAY_BITS)
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full           = Signal()  # have uncompleted request
        self.mmu_req        = Signal()  # request is from MMU
        self.req            = MemAccessRequest()

        # Cache hit state
        self.hit_way        = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index      = Signal(NUM_LINES)
        self.cache_hit      = Signal()

        # TLB hit state
        self.tlb_hit        = Signal()
        self.tlb_hit_way    = Signal(TLB_NUM_WAYS)
        self.tlb_hit_index  = Signal(TLB_SET_SIZE)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1  = Signal(64)
        self.forward_data2  = Signal(64)
        self.forward_sel1   = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1   = Signal(WAY_BITS)
        self.forward_row1   = Signal(BRAM_ROWS)
        self.use_forward1   = Signal()
        self.forward_sel    = Signal(8)

        # Cache miss state (reload state machine)
        # state/dcbz were lost in extraction but are read/written by
        # dcache_slow (r1.state, State.*, r1.dcbz) — restored; confirm
        # against upstream.
        self.state          = Signal(State)
        self.dcbz           = Signal()
        self.write_bram     = Signal()
        self.write_tag      = Signal()
        self.slow_valid     = Signal()
        # was WishboneMasterOut(), a name defined nowhere in this
        # module; the rest of the file uses WBMasterOut (cf. DCache
        # wb_out) — made consistent.
        self.wb             = WBMasterOut()
        self.reload_tag     = Signal(TAG_BITS)
        self.store_way      = Signal(WAY_BITS)
        self.store_row      = Signal(BRAM_ROWS)
        self.store_index    = Signal(NUM_LINES)
        self.end_row_ix     = Signal(ROW_LINE_BIT)
        self.rows_valid     = RowPerLineValidArray()
        self.acks_pending   = Signal(3)
        self.inc_acks       = Signal()
        self.dec_acks       = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid       = Signal()
        self.ls_error       = Signal()
        self.mmu_done       = Signal()
        self.mmu_error      = Signal()
        self.cache_paradox  = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail      = Signal()
# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        # valid was lost in extraction but is read/written by
        # reservation_comb / reservation_reg — restored.
        self.valid = Signal()
        # TODO LINE_OFF_BITS is 6
        # was the VHDL leftover "Signal(63 downto LINE_OFF_BITS)",
        # which is not Python: a (63 downto LINE_OFF_BITS) vector is
        # 64 - LINE_OFF_BITS bits wide.
        self.addr  = Signal(64 - LINE_OFF_BITS)
# Set associative dcache write-through
#
# TODO (in no specific order):
#
# * See list in icache.vhdl
# * Complete load misses on the cycle when WB data comes instead of
#   at the end of line (this requires dealing with requests coming in
#   while not idle)
class DCache(Elaboratable):
    """Set-associative write-through L1 data cache (nmigen port of the
    microwatt dcache).

    Only the constructor is shown here; geometry constants below are
    per-instance attributes pending proper parameterisation.
    """
    def __init__(self):
        # TODO: make these parameters of DCache at some point
        self.LINE_SIZE    = 64  # Line size in bytes
        self.NUM_LINES    = 32  # Number of lines in a set
        self.NUM_WAYS     = 4   # Number of ways
        self.TLB_SET_SIZE = 64  # L1 DTLB entries per set
        self.TLB_NUM_WAYS = 2   # L1 DTLB number of sets
        self.TLB_LG_PGSZ  = 12  # L1 DTLB log_2(page_size)
        self.LOG_LENGTH   = 0   # Non-zero to enable log data collection

        # LoadStore1 interface
        self.d_in  = LoadStore1ToDCacheType()
        self.d_out = DCacheToLoadStore1Type()

        # MMU interface
        self.m_in  = MMUToDCacheType()
        self.m_out = DCacheToMMUType()

        self.stall_out = Signal()

        # Wishbone master interface
        self.wb_out = WBMasterOut()
        self.wb_in  = WBSlaveOut()

        self.log_out = Signal(20)
212 # Latch the request in r0.req as long as we're not stalling
213 def stage_0(self
, m
, d_in
, m_in
):
219 # TODO, this goes in unit tests and formal proofs
220 # assert ~(d_in.valid & m_in.valid),
221 # "request collision loadstore vs MMU"
222 with m
.If(~
(d_in
.valid
& m_in
.valid
)):
223 #sync += Display("request collision loadstore vs MMU")
226 with m
.If(m_in
.valid
):
227 sync
+= r
.req
.valid
.eq(1)
228 sync
+= r
.req
.load
.eq(~
(m_in
.tlbie | m_in
.tlbld
))
229 sync
+= r
.req
.dcbz
.eq(0)
230 sync
+= r
.req
.nc
.eq(0)
231 sync
+= r
.req
.reserve
.eq(0)
232 sync
+= r
.req
.virt_mode
.eq(1)
233 sync
+= r
.req
.priv_mode
.eq(1)
234 sync
+= r
.req
.addr
.eq(m_in
.addr
)
235 sync
+= r
.req
.data
.eq(m_in
.pte
)
236 sync
+= r
.req
.byte_sel
.eq(-1) # Const -1 sets all to 0b111....
237 sync
+= r
.tlbie
.eq(m_in
.tlbie
)
238 sync
+= r
.doall
.eq(m_in
.doall
)
239 sync
+= r
.tlbld
.eq(m_in
.tlbld
)
240 sync
+= r
.mmu_req
.eq(1)
242 sync
+= r
.req
.eq(d_in
)
243 sync
+= r
.req
.tlbie
.eq(0)
244 sync
+= r
.req
.doall
.eq(0)
245 sync
+= r
.req
.tlbd
.eq(0)
246 sync
+= r
.req
.mmu_req
.eq(0)
247 with m
.If(~
(r1
.full
& r0_full
)):
249 sync
+= r0_full
.eq(r
.req
.valid
)
252 # Operates in the second cycle on the request latched in r0.req.
253 # TLB updates write the entry at the end of the second cycle.
254 def tlb_read(self
, m
, m_in
, d_in
, r0_stall
, tlb_valid_way
,
255 tlb_tag_way
, tlb_pte_way
, dtlb_valid_bits
,
256 dtlb_tags
, dtlb_ptes
):
261 index
= Signal(log2_int(TLB_SET_BITS
), False)
262 addrbits
= Signal(TLB_SET_BITS
)
265 amax
= TLB_LG_PGSZ
+ TLB_SET_BITS
267 with m
.If(m_in
.valid
):
268 comb
+= addrbits
.eq(m_in
.addr
[amin
: amax
])
270 comb
+= addrbits
.eq(d_in
.addr
[amin
: amax
])
271 comb
+= index
.eq(addrbits
)
273 # If we have any op and the previous op isn't finished,
274 # then keep the same output for next cycle.
275 with m
.If(~r0_stall
):
276 sync
+= tlb_valid_way
.eq(dtlb_valid_bits
[index
])
277 sync
+= tlb_tag_way
.eq(dtlb_tags
[index
])
278 sync
+= tlb_pte_way
.eq(dtlb_ptes
[index
])
281 def maybe_tlb_plrus(self
, m
, r1
, tlb_plru_victim
, acc
, acc_en
, lru
):
285 with m
.If(TLB_NUM_WAYS
> 1):
286 for i
in range(TLB_SET_SIZE
):
288 tlb_plru
= PLRU(TLB_WAY_BITS
)
289 tlb_plru_acc
= Signal(TLB_WAY_BITS
)
290 tlb_plru_acc_en
= Signal()
291 tlb_plru_out
= Signal(TLB_WAY_BITS
)
293 comb
+= tlb_plru
.acc
.eq(tlb_plru_acc
)
294 comb
+= tlb_plru
.acc_en
.eq(tlb_plru_acc_en
)
295 comb
+= tlb_plru
.lru
.eq(tlb_plru_out
)
298 with m
.If(r1
.tlb_hit_index
== i
):
299 comb
+= tlb_plru
.acc_en
.eq(r1
.tlb_hit
)
301 comb
+= tlb_plru
.acc_en
.eq(0)
302 comb
+= tlb_plru
.acc
.eq(r1
.tlb_hit_way
)
304 comb
+= tlb_plru_victim
[i
].eq(tlb_plru
.lru
)
306 def tlb_search(self
, tlb_req_index
, r0
, tlb_valid_way_ tlb_tag_way
,
307 tlb_pte_way
, pte
, tlb_hit
, valid_ra
, perm_attr
, ra
):
312 hitway
= Signal(TLB_WAY_BITS
)
314 eatag
= Signal(log2_int(TLB_EA_TAG_BITS
, False))
316 TLB_LG_END
= TLB_LG_PGSZ
+ TLB_SET_BITS
317 comb
+= tlb_req_index
.eq(r0
.req
.addr
[TLB_LG_PGSZ
: TLB_LG_END
])
318 comb
+= eatag
.eq(r0
.req
.addr
[TLB_LG_END
: 64 ])
320 for i
in range(TLB_NUM_WAYS
):
321 with m
.If(tlb_valid_way(i
)
322 & read_tlb_tag(i
, tlb_tag_way
) == eatag
):
326 comb
+= tlb_hit
.eq(hit
& r0_valid
)
327 comb
+= tlb_hit_way
.eq(hitway
)
330 comb
+= pte
.eq(read_tlb_pte(hitway
, tlb_pte_way
))
333 comb
+= valid_ra
.eq(tlb_hit | ~r0
.req
.virt_mode
)
334 with m
.If(r0
.req
.virt_mode
):
335 comb
+= ra
.eq(Cat(Const(0, ROW_OFF_BITS
),
336 r0
.req
.addr
[ROW_OFF_BITS
:TLB_LG_PGSZ
],
337 pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]))
338 comb
+= perm_attr
.eq(extract_perm_attr(pte
))
340 comb
+= ra
.eq(Cat(Const(0, ROW_OFF_BITS
),
341 r0
.rq
.addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
]))
343 comb
+= perm_attr
.reference
.eq(1)
344 comb
+= perm_attr
.changed
.eq(1)
345 comb
+= perm_attr
.priv
.eq(1)
346 comb
+= perm_attr
.nocache
.eq(0)
347 comb
+= perm_attr
.rd_perm
.eq(1)
348 comb
+= perm_attr
.wr_perm
.eq(1)
350 def tlb_update(self
, r0_valid
, r0
, dtlb_valid_bits
, tlb_req_index
,
351 tlb_hit_way
, tlb_hit
, tlb_plru_victim
, tlb_tag_way
,
352 dtlb_tags
, tlb_pte_way
, dtlb_ptes
, dtlb_valid_bits
):
357 # variable tlbie : std_ulogic;
358 # variable tlbwe : std_ulogic;
359 # variable repl_way : tlb_way_t;
360 # variable eatag : tlb_tag_t;
361 # variable tagset : tlb_way_tags_t;
362 # variable pteset : tlb_way_ptes_t;
363 #type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t;
364 # --> Array([Signal(log(way_tags length)) for i in range(number of tlbs)])
368 repl_way
= Signal(TLB_WAY_BITS
)
369 eatag
= Signal(log2_int(TLB_EA_TAG_BITS
, False))
370 tagset
= TLBWayTags()
371 pteset
= TLBWayPtes()
373 comb
+= tlbie
.eq(r0_valid
& r0
.tlbie
)
374 comb
+= tlbwe
.eq(r0_valid
& r0
.tlbldoi
)
376 with m
.If(tlbie
& r0
.doall
):
377 # clear all valid bits at once
378 for i
in range(TLB_SET_SIZE
):
379 sync
+= dtlb_valid_bits
[i
].eq(0)
383 sync
+= dtlb_valid_bits
[tlb_req_index
][tlb_hit_way
].eq(0)
386 comb
+= repl_way
.eq(tlb_hit_way
)
388 comb
+= repl_way
.eq(tlb_plru_victim
[tlb_req_index
])
389 comb
+= eatag
.eq(r0
.req
.addr
[TLB_LG_PGSZ
+ TLB_SET_BITS
:64])
390 comb
+= tagset
.eq(tlb_tag_way
)
391 sync
+= write_tlb_tag(repl_way
, tagset
, eatag
)
392 sync
+= dtlb_tags
[tlb_req_index
].eq(tagset
)
393 comb
+= pteset
.eq(tlb_pte_way
)
394 sync
+= write_tlb_pte(repl_way
, pteset
, r0
.req
.data
)
395 sync
+= dtlb_ptes
[tlb_req_index
].eq(pteset
)
396 sync
+= dtlb_valid_bits
[tlb_req_index
][repl_way
].eq(1)
399 def maybe_plrus(self
, r1
):
404 for i
in range(NUM_LINES
):
406 plru
= PLRU(TLB_WAY_BITS
)
407 setattr(m
.submodules
, "plru%d" % i
, plru
)
408 plru_acc
= Signal(TLB_WAY_BITS
)
409 plru_acc_en
= Signal()
410 plru_out
= Signal(TLB_WAY_BITS
)
412 comb
+= plru
.acc
.eq(plru_acc
)
413 comb
+= plru
.acc_en
.eq(plru_acc_en
)
414 comb
+= plru
.lru
.eq(plru_out
)
416 with m
.If(r1
.hit_index
== i
):
417 comb
+= plru_acc_en
.eq(r1
.cache_hit
)
419 comb
+= plru_acc
.eq(r1
.hit_way
)
420 comb
+= plru_victim
[i
].eq(plru_out
)
422 # Cache tag RAM read port
423 def cache_tag_read(self
, r0_stall
, req_index
, m_in
, d_in
,
424 cache_tag_set
, cache_tags
):
429 index
= Signal(INDEX_BITS
)
432 comb
+= index
.eq(req_index
)
433 with m
.Elif(m_in
.valid
):
434 comb
+= index
.eq(get_index(m_in
.addr
))
436 comb
+= index
.eq(get_index(d_in
.addr
))
437 sync
+= cache_tag_set
.eq(cache_tags
[index
])
439 # Cache request parsing and hit detection
440 def dcache_request(self
, r0
, ra
, req_index
, req_row
, req_tag
,
441 r0_valid
, r1
, cache_valid_bits
, replace_way
,
442 use_forward1_next
, use_forward2_next
,
443 req_hit_way
, plru_victim
, rc_ok
, perm_attr
,
444 valid_ra
, perm_ok
, access_ok
, req_op
, req_ok
,
445 r0_stall
, m_in
, early_req_row
, d_in
):
451 hit_way
= Signal(WAY_BITS
)
457 s_tag
= Signal(TAG_BITS
)
458 s_pte
= Signal(TLB_PTE_BITS
)
459 s_ra
= Signal(REAL_ADDR_BITS
)
460 hit_set
= Signal(TLB_NUM_WAYS
)
461 hit_way_set
= HitWaySet()
462 rel_matches
= Signal(TLB_NUM_WAYS
)
465 # Extract line, row and tag from request
466 comb
+= req_index
.eq(get_index(r0
.req
.addr
))
467 comb
+= req_row
.eq(get_row(r0
.req
.addr
))
468 comb
+= req_tag
.eq(get_tag(ra
))
470 comb
+= go
.eq(r0_valid
& ~
(r0
.tlbie | r0
.tlbld
) & ~r1
.ls_error
)
472 # Test if pending request is a hit on any way
473 # In order to make timing in virtual mode,
474 # when we are using the TLB, we compare each
475 # way with each of the real addresses from each way of
476 # the TLB, and then decide later which match to use.
478 with m
.If(r0
.req
.virt_mode
):
479 comb
+= rel_matches
.eq(0)
480 for j
in range(TLB_NUM_WAYS
):
481 comb
+= s_pte
.eq(read_tlb_pte(j
, tlb_pte_way
))
482 comb
+= s_ra
.eq(Cat(r0
.req
.addr
[0:TLB_LG_PGSZ
],
483 s_pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]))
484 comb
+= s_tag
.eq(get_tag(s_ra
))
486 for i
in range(NUM_WAYS
):
487 with m
.If(go
& cache_valid_bits
[req_index
][i
] &
488 read_tag(i
, cache_tag_set
) == s_tag
490 comb
+= hit_way_set
[j
].eq(i
)
492 comb
+= hit_set
[j
].eq(s_hit
)
493 with m
.If(s_tag
== r1
.reload_tag
):
494 comb
+= rel_matches
[j
].eq(1)
496 comb
+= is_hit
.eq(hit_set
[tlb_hit_way
])
497 comb
+= hit_way
.eq(hit_way_set
[tlb_hit_way
])
498 comb
+= rel_match
.eq(rel_matches
[tlb_hit_way
])
500 comb
+= s_tag
.eq(get_tag(r0
.req
.addr
))
501 for i
in range(NUM_WAYS
):
502 with m
.If(go
& cache_valid_bits
[req_index
][i
] &
503 read_tag(i
, cache_tag_set
) == s_tag
):
504 comb
+= hit_way
.eq(i
)
506 with m
.If(s_tag
== r1
.reload_tag
):
507 comb
+= rel_match
.eq(1)
508 comb
+= req_same_tag
.eq(rel_match
)
510 # See if the request matches the line currently being reloaded
511 with m
.If((r1
.state
== State
.RELOAD_WAIT_ACK
) &
512 (req_index
== r1
.store_index
) & rel_match
):
513 # For a store, consider this a hit even if the row isn't
514 # valid since it will be by the time we perform the store.
515 # For a load, check the appropriate row valid bit.
516 valid
= r1
.rows_valid
[req_row
% ROW_PER_LINE
]
517 comb
+= is_hit
.eq(~r0
.req
.load | valid
)
518 comb
+= hit_way
.eq(replace_way
)
520 # Whether to use forwarded data for a load or not
521 comb
+= use_forward1_next
.eq(0)
522 with m
.If((get_row(r1
.req
.real_addr
) == req_row
)
523 & (r1
.req
.hit_way
== hit_way
))
524 # Only need to consider r1.write_bram here, since if we
525 # are writing refill data here, then we don't have a
526 # cache hit this cycle on the line being refilled.
527 # (There is the possibility that the load following the
528 # load miss that started the refill could be to the old
529 # contents of the victim line, since it is a couple of
530 # cycles after the refill starts before we see the updated
531 # cache tag. In that case we don't use the bypass.)
532 comb
+= use_forward1_next
.eq(r1
.write_bram
)
533 comb
+= use_forward2_next
.eq(0)
534 with m
.If((r1
.forward_row1
== req_row
) & (r1
.forward_way1
== hit_way
)):
535 comb
+= use_forward2_next
.eq(r1
.forward_valid1
)
537 # The way that matched on a hit
538 comb
+= req_hit_way
.eq(hit_way
)
540 # The way to replace on a miss
541 with m
.If(r1
.write_tag
):
542 replace_way
.eq(plru_victim
[r1
.store_index
])
544 comb
+= replace_way
.eq(r1
.store_way
)
546 # work out whether we have permission for this access
547 # NB we don't yet implement AMR, thus no KUAP
548 comb
+= rc_ok
.eq( perm_attr
.reference
549 & (r0
.req
.load | perm_attr
.changed
)
551 comb
+= perm_ok
.eq((r0
.req
.prive_mode | ~perm_attr
.priv
)
553 |
(r0
.req
.load
& perm_attr
.rd_perm
)
555 comb
+= access_ok
.eq(valid_ra
& perm_ok
& rc_ok
)
556 # Combine the request and cache hit status to decide what
557 # operation needs to be done
558 comb
+= nc
.eq(r0
.req
.nc | perm_attr
.nocache
)
559 comb
+= op
.eq(Op
.OP_NONE
)
561 with m
.If(~access_ok
):
562 comb
+= op
.eq(Op
.OP_BAD
)
563 with m
.Elif(cancel_store
):
564 comb
+= op
.eq(Op
.OP_STCX_FAIL
)
566 comb
+= opsel
.eq(Cat(is_hit
, nc
, r0
.req
.load
))
567 with m
.Switch(opsel
):
568 with m
.Case(Const(0b101, 3)):
569 comb
+= op
.eq(Op
.OP_LOAD_HIT
)
570 with m
.Case(Cosnt(0b100, 3)):
571 comb
+= op
.eq(Op
.OP_LOAD_MISS
)
572 with m
.Case(Const(0b110, 3)):
573 comb
+= op
.eq(Op
.OP_LOAD_NC
)
574 with m
.Case(Const(0b001, 3)):
575 comb
+= op
.eq(Op
.OP_STORE_HIT
)
576 with m
.Case(Const(0b000, 3)):
577 comb
+= op
.eq(Op
.OP_STORE_MISS
)
578 with m
.Case(Const(0b010, 3)):
579 comb
+= op
.eq(Op
.OP_STORE_MISS
)
580 with m
.Case(Const(0b011, 3)):
581 comb
+= op
.eq(Op
.OP_BAD
)
582 with m
.Case(Const(0b111, 3)):
583 comb
+= op
.eq(Op
.OP_BAD
)
585 comb
+= op
.eq(Op
.OP_NONE
)
586 comb
+= req_op
.eq(op
)
587 comb
+= req_go
.eq(go
)
589 # Version of the row number that is valid one cycle earlier
590 # in the cases where we need to read the cache data BRAM.
591 # If we're stalling then we need to keep reading the last
593 with m
.If(~r0_stall
):
594 with m
.If(m_in
.valid
):
595 comb
+= early_req_row
.eq(get_row(m_in
.addr
))
597 comb
+= early_req_row
.eq(get_row(d_in
.addr
))
599 comb
+= early_req_row
.eq(req_row
)
601 # Handle load-with-reservation and store-conditional instructions
602 def reservation_comb(self
, cancel_store
, set_rsrv
, clear_rsrv
,
603 r0_valid
, r0
, reservation
):
608 with m
.If(r0_valid
& r0
.req
.reserve
):
610 # XXX generate alignment interrupt if address
611 # is not aligned XXX or if r0.req.nc = '1'
612 with m
.If(r0
.req
.load
):
613 comb
+= set_rsrv(1) # load with reservation
615 comb
+= clear_rsrv
.eq(1) # store conditional
616 with m
.If(~reservation
.valid | r0
.req
.addr
[LINE_OFF_BITS
:64]):
617 comb
+= cancel_store
.eq(1)
619 def reservation_reg(self
, r0_valid
, access_ok
, clear_rsrv
,
625 with m
.If(r0_valid
& access_ok
):
626 with m
.If(clear_rsrv
):
627 sync
+= reservation
.valid
.eq(0)
628 with m
.Elif(set_rsrv
):
629 sync
+= reservation
.valid
.eq(1)
630 sync
+= reservation
.addr
.eq(r0
.req
.addr
[LINE_OFF_BITS
:64])
632 # Return data for loads & completion control logic
633 def writeback_control(self
, r1
, cache_out
, d_out
, m_out
):
638 data_out
= Signal(64)
639 data_fwd
= Signal(64)
642 # Use the bypass if are reading the row that was
643 # written 1 or 2 cycles ago, including for the
644 # slow_valid = 1 case (i.e. completing a load
645 # miss or a non-cacheable load).
646 with m
.If(r1
.use_forward1
):
647 comb
+= data_fwd
.eq(r1
.forward_data1
)
649 comb
+= data_fwd
.eq(r1
.forward_data2
)
651 comb
+= data_out
.eq(cache_out
[r1
.hit_way
])
654 with m
.If(r1
.forward_sel
[i
]):
655 dsel
= data_fwd
.word_select(i
, 8)
656 comb
+= data_out
.word_select(i
, 8).eq(dsel
)
658 comb
+= d_out
.valid
.eq(r1
.ls_valid
)
659 comb
+= d_out
.data
.eq(data_out
)
660 comb
+= d_out
.store_done
.eq(~r1
.stcx_fail
)
661 comb
+= d_out
.error
.eq(r1
.ls_error
)
662 comb
+= d_out
.cache_paradox
.eq(r1
.cache_paradox
)
665 comb
+= m_out
.done
.eq(r1
.mmu_done
)
666 comb
+= m_out
.err
.eq(r1
.mmu_error
)
667 comb
+= m_out
.data
.eq(data_out
)
669 # We have a valid load or store hit or we just completed
670 # a slow op such as a load miss, a NC load or a store
672 # Note: the load hit is delayed by one cycle. However it
673 # can still not collide with r.slow_valid (well unless I
674 # miscalculated) because slow_valid can only be set on a
675 # subsequent request and not on its first cycle (the state
676 # machine must have advanced), which makes slow_valid
677 # at least 2 cycles from the previous hit_load_valid.
679 # Sanity: Only one of these must be set in any given cycle
681 if False: # TODO: need Display to get this to work
682 assert (r1
.slow_valid
& r1
.stcx_fail
) != 1 "unexpected" \
683 "slow_valid collision with stcx_fail -!- severity FAILURE"
685 assert ((r1
.slow_valid | r1
.stcx_fail
) | r1
.hit_load_valid
) != 1
686 "unexpected hit_load_delayed collision with slow_valid -!-" \
689 with m
.If(~r1
._mmu_req
):
690 # Request came from loadstore1...
691 # Load hit case is the standard path
692 with m
.If(r1
.hit_load_valid
):
694 # "completing load hit data=" & to_hstring(data_out);
695 #Display(f"completing load hit data={data_out}")
698 # error cases complete without stalling
699 with m
.If(r1
.ls_error
):
700 # Display("completing ld/st with error")
703 # Slow ops (load miss, NC, stores)
704 with m
.If(r1
.slow_valid
):
705 #Display(f"completing store or load miss data={data_out}")
709 # Request came from MMU
710 with m
.If(r1
.hit_load_valid
):
711 # Display(f"completing load hit to MMU, data={m_out.data}")
713 # error cases complete without stalling
714 with m
.If(r1
.mmu_error
):
715 #Display("combpleting MMU ld with error")
718 # Slow ops (i.e. load miss)
719 with m
.If(r1
.slow_valid
):
720 #Display("completing MMU load miss, data={m_out.data}")
723 # Generate a cache RAM for each way. This handles the normal
724 # reads, writes from reloads and the special store-hit update
727 # Note: the BRAMs have an extra read buffer, meaning the output
728 # is pipelined an extra cycle. This differs from the
729 # icache. The writeback logic needs to take that into
730 # account by using 1-cycle delayed signals for load hits.
732 for i
in range(NUM_WAYS
):
733 # signal do_read : std_ulogic;
734 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
735 # signal do_write : std_ulogic;
736 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
738 # std_ulogic_vector(wishbone_data_bits-1 downto 0);
739 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
740 # signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
741 # signal dout : cache_row_t;
743 rd_addr
= Signal(ROW_BITS
)
745 wr_addr
= Signal(ROW_BITS
)
746 wr_data
= Signal(WB_DATA_BITS
)
747 wr_sel
= Signal(ROW_SIZE
)
748 wr_sel_m
= Signal(ROW_SIZE
)
749 _d_out
= Signal(WB_DATA_BITS
)
752 # way: entity work.cache_ram
754 # ROW_BITS => ROW_BITS,
755 # WIDTH => wishbone_data_bits,
761 # rd_addr => rd_addr,
763 # wr_sel => wr_sel_m,
764 # wr_addr => wr_addr,
768 way
= CacheRam(ROW_BITS
, WB_DATA_BITS
, True)
769 comb
+= way
.rd_en
.eq(do_read
)
770 comb
+= way
.rd_addr
.eq(rd_addr
)
771 comb
+= way
.rd_data
.eq(_d_out
)
772 comb
+= way
.wr_sel
.eq(wr_sel_m
)
773 comb
+= way
.wr_addr
.eq(wr_addr
)
774 comb
+= way
.wr_data
.eq(wr_data
)
780 # std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS));
781 # cache_out(i) <= dout;
783 comb
+= do_read
.eq(1)
784 comb
+= rd_addr
.eq(Signal(BRAM_ROWS
))
785 comb
+= cache_out
[i
].eq(dout
)
789 # -- Defaults to wishbone read responses (cache refill)
791 # -- For timing, the mux on wr_data/sel/addr is not
792 # -- dependent on anything other than the current state.
795 # Defaults to wishbone read responses (cache refill)
797 # For timing, the mux on wr_data/sel/addr is not
798 # dependent on anything other than the current state.
799 # wr_sel_m <= (others => '0');
800 comb
+= wr_sel_m
.eq(0)
803 comb
+= do_write
.eq(0)
804 # if r1.write_bram = '1' then
805 with m
.If(r1
.write_bram
):
806 # -- Write store data to BRAM. This happens one
807 # -- cycle after the store is in r0.
808 # Write store data to BRAM. This happens one
809 # cycle after the store is in r0.
810 # wr_data <= r1.req.data;
811 # wr_sel <= r1.req.byte_sel;
812 # wr_addr <= std_ulogic_vector(to_unsigned(
813 # get_row(r1.req.real_addr), ROW_BITS
815 comb
+= wr_data
.eq(r1
.req
.data
)
816 comb
+= wr_sel
.eq(r1
.req
.byte_sel
)
817 comb
+= wr_addr
.eq(Signal(get_row(r1
.req
.real_addr
)))
819 # if i = r1.req.hit_way then
820 with m
.If(i
== r1
.req
.hit_way
):
822 comb
+= do_write
.eq(1)
826 # -- Otherwise, we might be doing a reload or a DCBZ
827 # if r1.dcbz = '1' then
828 # Otherwise, we might be doing a reload or a DCBZ
830 # wr_data <= (others => '0');
831 comb
+= wr_data
.eq(0)
834 # wr_data <= wishbone_in.dat;
835 comb
+= wr_data
.eq(wishbone_in
.dat
)
838 # wr_addr <= std_ulogic_vector(to_unsigned(
839 # r1.store_row, ROW_BITS
841 # wr_sel <= (others => '1');
842 comb
+= wr_addr
.eq(Signal(r1
.store_row
))
845 # if r1.state = RELOAD_WAIT_ACK and
846 # wishbone_in.ack = '1' and replace_way = i then
847 with m
.If(r1
.state
== State
.RELOAD_WAIT_ACK
848 & wishbone_in
.ack
& relpace_way
== i
):
850 comb
+= do_write
.eq(1)
854 # -- Mask write selects with do_write since BRAM
855 # -- doesn't have a global write-enable
856 # if do_write = '1' then
857 # -- Mask write selects with do_write since BRAM
858 # -- doesn't have a global write-enable
860 # wr_sel_m <= wr_sel;
861 comb
+= wr_sel_m
.eq(wr_sel
)
866 # Cache hit synchronous machine for the easy case.
867 # This handles load hits.
868 # It also handles error cases (TLB miss, cache paradox)
869 def dcache_fast_hit(self
, req_op
, r0_valid
, r1
, ):
875 # if rising_edge(clk) then
876 # if req_op /= OP_NONE then
877 with m
.If(req_op
!= Op
.OP_NONE
):
878 # report "op:" & op_t'image(req_op) &
879 # " addr:" & to_hstring(r0.req.addr) &
880 # " nc:" & std_ulogic'image(r0.req.nc) &
881 # " idx:" & integer'image(req_index) &
882 # " tag:" & to_hstring(req_tag) &
883 # " way: " & integer'image(req_hit_way);
884 print(f
"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
885 f
"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
888 # if r0_valid = '1' then
890 # r1.mmu_req <= r0.mmu_req;
891 sync
+= r1
.mmu_req
.eq(r0
.mmu_req
)
894 # -- Fast path for load/store hits.
895 # -- Set signals for the writeback controls.
896 # r1.hit_way <= req_hit_way;
897 # r1.hit_index <= req_index;
898 # Fast path for load/store hits.
899 # Set signals for the writeback controls.
900 sync
+= r1
.hit_way
.eq(req_hit_way
)
901 sync
+= r1
.hit_index
.eq(req_index
)
903 # if req_op = OP_LOAD_HIT then
904 with m
.If(req_op
== Op
.OP_LOAD_HIT
):
905 # r1.hit_load_valid <= '1';
906 sync
+= r1
.hit_load_valid
.eq(1)
910 # r1.hit_load_valid <= '0';
911 sync
+= r1
.hit_load_valid
.eq(0)
914 # if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then
915 with m
.If(req_op
== Op
.OP_LOAD_HIT | req_op
== Op
.OP_STORE_HIT
):
916 # r1.cache_hit <= '1';
917 sync
+= r1
.cache_hit
.eq(1)
920 # r1.cache_hit <= '0';
921 sync
+= r1
.cache_hit
.eq(0)
924 # if req_op = OP_BAD then
925 with m
.If(req_op
== Op
.OP_BAD
):
926 # report "Signalling ld/st error valid_ra=" &
927 # std_ulogic'image(valid_ra) & " rc_ok=" &
928 # std_ulogic'image(rc_ok) & " perm_ok=" &
929 # std_ulogic'image(perm_ok);
930 print(f
"Signalling ld/st error valid_ra={valid_ra}"
931 f
"rc_ok={rc_ok} perm_ok={perm_ok}"
933 # r1.ls_error <= not r0.mmu_req;
934 # r1.mmu_error <= r0.mmu_req;
935 # r1.cache_paradox <= access_ok;
936 sync
+= r1
.ls_error
.eq(~r0
.mmu_req
)
937 sync
+= r1
.mmu_error
.eq(r0
.mmu_req
)
938 sync
+= r1
.cache_paradox
.eq(access_ok
)
942 # r1.ls_error <= '0';
943 # r1.mmu_error <= '0';
944 # r1.cache_paradox <= '0';
945 sync
+= r1
.ls_error
.eq(0)
946 sync
+= r1
.mmu_error
.eq(0)
947 sync
+= r1
.cache_paradox
.eq(0)
950 # if req_op = OP_STCX_FAIL then
951 with m
.If(req_op
== Op
.OP_STCX_FAIL
):
952 # r1.stcx_fail <= '1';
957 # r1.stcx_fail <= '0';
958 sync
+= r1
.stcx_fail
.eq(0)
961 # -- Record TLB hit information for updating TLB PLRU
962 # r1.tlb_hit <= tlb_hit;
963 # r1.tlb_hit_way <= tlb_hit_way;
964 # r1.tlb_hit_index <= tlb_req_index;
965 # Record TLB hit information for updating TLB PLRU
966 sync
+= r1
.tlb_hit
.eq(tlb_hit
)
967 sync
+= r1
.tlb_hit_way
.eq(tlb_hit_way
)
968 sync
+= r1
.tlb_hit_index
.eq(tlb_req_index
)
972 # Memory accesses are handled by this state machine:
974 # * Cache load miss/reload (in conjunction with "rams")
975 # * Load hits for non-cachable forms
976 # * Stores (the collision case is handled in "rams")
978 # All wishbone requests generation is done here.
979 # This machine operates at stage 1.
980 def dcache_slow(self
, r1
, use_forward1_next
, cache_valid_bits
, r0
,
981 r0_valid
, req_op
, cache_tag
, req_go
, ra
, wb_in
):
986 # variable stbs_done : boolean;
987 # variable req : mem_access_request_t;
988 # variable acks : unsigned(2 downto 0);
990 req
= MemAccessRequest()
998 # if rising_edge(clk) then
999 # r1.use_forward1 <= use_forward1_next;
1000 # r1.forward_sel <= (others => '0');
1001 sync
+= r1
.use_forward1
.eq(use_forward1_next
)
1002 sync
+= r1
.forward_sel
.eq(0)
1004 # if use_forward1_next = '1' then
1005 with m
.If(use_forward1_next
):
1006 # r1.forward_sel <= r1.req.byte_sel;
1007 sync
+= r1
.forward_sel
.eq(r1
.req
.byte_sel
)
1009 # elsif use_forward2_next = '1' then
1010 with m
.Elif(use_forward2_next
):
1011 # r1.forward_sel <= r1.forward_sel1;
1012 sync
+= r1
.forward_sel
.eq(r1
.forward_sel1
)
1015 # r1.forward_data2 <= r1.forward_data1;
1016 sync
+= r1
.forward_data2
.eq(r1
.forward_data1
)
1018 # if r1.write_bram = '1' then
1019 with m
.If(r1
.write_bram
):
1020 # r1.forward_data1 <= r1.req.data;
1021 # r1.forward_sel1 <= r1.req.byte_sel;
1022 # r1.forward_way1 <= r1.req.hit_way;
1023 # r1.forward_row1 <= get_row(r1.req.real_addr);
1024 # r1.forward_valid1 <= '1';
1025 sync
+= r1
.forward_data1
.eq(r1
.req
.data
)
1026 sync
+= r1
.forward_sel1
.eq(r1
.req
.byte_sel
)
1027 sync
+= r1
.forward_way1
.eq(r1
.req
.hit_way
)
1028 sync
+= r1
.forward_row1
.eq(get_row(r1
.req
.real_addr
))
1029 sync
+= r1
.forward_valid1
.eq(1)
1033 # if r1.dcbz = '1' then
1035 # r1.forward_data1 <= (others => '0');
1036 sync
+= r1
.forward_data1
.eq(0)
1040 # r1.forward_data1 <= wishbone_in.dat;
1041 sync
+= r1
.forward_data1
.eq(wb_in
.dat
)
1044 # r1.forward_sel1 <= (others => '1');
1045 # r1.forward_way1 <= replace_way;
1046 # r1.forward_row1 <= r1.store_row;
1047 # r1.forward_valid1 <= '0';
1048 sync
+= r1
.forward_sel1
.eq(1)
1049 sync
+= r1
.forward_way1
.eq(replace_way
)
1050 sync
+= r1
.forward_row1
.eq(r1
.store_row
)
1051 sync
+= r1
.forward_valid1
.eq(0)
1054 # -- On reset, clear all valid bits to force misses
1056 # On reset, clear all valid bits to force misses
1057 # TODO figure out how reset signal works in nmigeni
1058 with m
.If("""TODO RST???"""):
1059 # for i in index_t loop
1060 for i
in range(NUM_LINES
):
1061 # cache_valids(i) <= (others => '0');
1062 sync
+= cache_valid_bits
[i
].eq(0)
1067 # r1.slow_valid <= '0';
1070 # r1.ls_valid <= '0';
1071 # r1.mmu_done <= '0';
1072 sync
+= r1
.state
.eq(State
.IDLE
)
1073 sync
+= r1
.full
.eq(0)
1074 sync
+= r1
.slow_valid
.eq(0)
1075 sync
+= r1
.wb
.cyc
.eq(0)
1076 sync
+= r1
.wb
.stb
.eq(0)
1077 sync
+= r1
.ls_valid
.eq(0)
1078 sync
+= r1
.mmu_done
.eq(0)
1080 # -- Not useful normally but helps avoiding
1081 # -- tons of sim warnings
1082 # Not useful normally but helps avoiding
1083 # tons of sim warnings
1084 # r1.wb.adr <= (others => '0');
1085 sync
+= r1
.wb
.adr
.eq(0)
1088 # -- One cycle pulses reset
1089 # r1.slow_valid <= '0';
1090 # r1.write_bram <= '0';
1091 # r1.inc_acks <= '0';
1092 # r1.dec_acks <= '0';
1094 # r1.ls_valid <= '0';
1095 # -- complete tlbies and TLB loads in the third cycle
1096 # r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld);
1097 # One cycle pulses reset
1098 sync
+= r1
.slow_valid
.eq(0)
1099 sync
+= r1
.write_bram
.eq(0)
1100 sync
+= r1
.inc_acks
.eq(0)
1101 sync
+= r1
.dec_acks
.eq(0)
1103 sync
+= r1
.ls_valid
.eq(0)
1104 # complete tlbies and TLB loads in the third cycle
1105 sync
+= r1
.mmu_done
.eq(r0_valid
& (r0
.tlbie | r0
.tlbld
))
1107 # if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then
1108 with m
.If(req_op
== Op
.OP_LOAD_HIT
1109 | req_op
== Op
.OP_STCX_FAIL
):
1110 # if r0.mmu_req = '0' then
1111 with m
.If(~r0
.mmu_req
):
1112 # r1.ls_valid <= '1';
1113 sync
+= r1
.ls_valid
.eq(1)
1116 # r1.mmu_done <= '1';
1117 sync
+= r1
.mmu_done
.eq(1)
1121 # if r1.write_tag = '1' then
1122 with m
.If(r1
.write_tag
):
1123 # -- Store new tag in selected way
1124 # for i in 0 to NUM_WAYS-1 loop
1125 # Store new tag in selected way
1126 for i
in range(NUM_WAYS
):
1127 # if i = replace_way then
1128 with m
.If(i
== replace_way
):
1129 # cache_tags(r1.store_index)(
1130 # (i + 1) * TAG_WIDTH - 1
1131 # downto i * TAG_WIDTH
1133 # (TAG_WIDTH - 1 downto TAG_BITS => '0')
1137 ][i
* TAG_WIDTH
:(i
+1) * TAG_WIDTH
].eq(
1138 Const(TAG_WIDTH
, TAG_WIDTH
)
1143 # r1.store_way <= replace_way;
1144 # r1.write_tag <= '0';
1145 sync
+= r1
.store_way
.eq(replace_way
)
1146 sync
+= r1
.write_tag
.eq(0)
1149 # -- Take request from r1.req if there is one there,
1150 # -- else from req_op, ra, etc.
1151 # if r1.full = '1' then
1152 # Take request from r1.req if there is one there,
1153 # else from req_op, ra, etc.
1156 sync
+= req
.eq(r1
.req
)
1161 # req.valid := req_go;
1162 # req.mmu_req := r0.mmu_req;
1163 # req.dcbz := r0.req.dcbz;
1164 # req.real_addr := ra;
1165 sync
+= req
.op
.eq(req_op
)
1166 sync
+= req
.valid
.eq(req_go
)
1167 sync
+= req
.mmu_req
.eq(r0
.mmu_req
)
1168 sync
+= req
.dcbz
.eq(r0
.req
.dcbz
)
1169 sync
+= req
.real_addr
.eq(ra
)
1171 # -- Force data to 0 for dcbz
1172 # if r0.req.dcbz = '0' then
1173 with m
.If(~r0
.req
.dcbz
):
1174 # req.data := r0.req.data;
1175 sync
+= req
.data
.eq(r0
.req
.data
)
1179 # req.data := (others => '0');
1180 sync
+= req
.data
.eq(0)
1183 # -- Select all bytes for dcbz
1184 # -- and for cacheable loads
1185 # if r0.req.dcbz = '1'
1186 # or (r0.req.load = '1' and r0.req.nc = '0') then
1187 # Select all bytes for dcbz
1188 # and for cacheable loads
1189 with m
.If(r0
.req
.dcbz |
(r0
.req
.load
& ~r0
.req
.nc
):
1190 # req.byte_sel := (others => '1');
1191 sync
+= req
.byte_sel
.eq(1)
1195 # req.byte_sel := r0.req.byte_sel;
1196 sync
+= req
.byte_sel
.eq(r0
.req
.byte_sel
)
1199 # req.hit_way := req_hit_way;
1200 # req.same_tag := req_same_tag;
1201 sync
+= req
.hit_way
.eq(req_hit_way
)
1202 sync
+= req
.same_tag
.eq(req_same_tag
)
1204 # -- Store the incoming request from r0,
1205 # -- if it is a slow request
1206 # -- Note that r1.full = 1 implies req_op = OP_NONE
1207 # if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC
1208 # or req_op = OP_STORE_MISS
1209 # or req_op = OP_STORE_HIT then
1210 # Store the incoming request from r0,
1211 # if it is a slow request
1212 # Note that r1.full = 1 implies req_op = OP_NONE
1213 with m
.If(req_op
== Op
.OP_LOAD_MISS
1214 | req_op
== Op
.OP_LOAD_NC
1215 | req_op
== Op
.OP_STORE_MISS
1216 | req_op
== Op
.OP_STORE_HIT
):
1220 sync
+= r1
.full
.eq(1)
1224 # -- Main state machine
1226 # Main state machine
1227 with m
.Switch(r1
.state
):
1230 with m
.Case(State
.IDLE
)
1231 # r1.wb.adr <= req.real_addr(
1232 # r1.wb.adr'left downto 0
1234 # r1.wb.sel <= req.byte_sel;
1235 # r1.wb.dat <= req.data;
1236 # r1.dcbz <= req.dcbz;
1238 # -- Keep track of our index and way
1239 # -- for subsequent stores.
1240 # r1.store_index <= get_index(req.real_addr);
1241 # r1.store_row <= get_row(req.real_addr);
1243 # get_row_of_line(get_row(req.real_addr)) - 1;
1244 # r1.reload_tag <= get_tag(req.real_addr);
1245 # r1.req.same_tag <= '1';
1246 sync
+= r1
.wb
.adr
.eq(req
.real_addr
[0:r1
.wb
.adr
])
1247 sync
+= r1
.wb
.sel
.eq(req
.byte_sel
)
1248 sync
+= r1
.wb
.dat
.eq(req
.data
)
1249 sync
+= r1
.dcbz
.eq(req
.dcbz
)
1251 # Keep track of our index and way
1252 # for subsequent stores.
1253 sync
+= r1
.store_index
.eq(get_index(req
.real_addr
))
1254 sync
+= r1
.store_row
.eq(get_row(req
.real_addr
))
1255 sync
+= r1
.end_row_ix
.eq(
1256 get_row_of_line(get_row(req
.real_addr
))
1258 sync
+= r1
.reload_tag
.eq(get_tag(req
.real_addr
))
1259 sync
+= r1
.req
.same_tag
.eq(1)
1261 # if req.op = OP_STORE_HIT then
1262 with m
.If(req
.op
== Op
.OP_STORE_HIT
):
1263 # r1.store_way <= req.hit_way;
1264 sync
+= r1
.store_way
.eq(req
.hit_way
)
1267 # -- Reset per-row valid bits,
1268 # -- ready for handling OP_LOAD_MISS
1269 # for i in 0 to ROW_PER_LINE - 1 loop
1270 # Reset per-row valid bits,
1271 # ready for handling OP_LOAD_MISS
1272 for i
in range(ROW_PER_LINE
):
1273 # r1.rows_valid(i) <= '0';
1274 sync
+= r1
.rows_valid
[i
].eq(0)
1278 with m
.Switch(req
.op
):
1279 # when OP_LOAD_HIT =>
1280 with m
.Case(Op
.OP_LOAD_HIT
):
1281 # -- stay in IDLE state
1282 # stay in IDLE state
1285 # when OP_LOAD_MISS =>
1286 with m
.Case(Op
.OP_LOAD_MISS
):
1287 # -- Normal load cache miss,
1288 # -- start the reload machine
1289 # report "cache miss real addr:" &
1290 # to_hstring(req.real_addr) & " idx:" &
1291 # integer'image(get_index(req.real_addr)) &
1292 # " tag:" & to_hstring(get_tag(req.real_addr));
1293 # Normal load cache miss,
1294 # start the reload machine
1295 print(f
"cache miss real addr:" \
1296 f
"{req_real_addr}" \
1297 f
" idx:{get_index(req_real_addr)}" \
1298 f
" tag:{get_tag(req.real_addr)}")
1300 # -- Start the wishbone cycle
1304 # Start the wishbone cycle
1305 sync
+= r1
.wb
.we
.eq(0)
1306 sync
+= r1
.wb
.cyc
.eq(1)
1307 sync
+= r1
.wb
.stb
.eq(1)
1309 # -- Track that we had one request sent
1310 # r1.state <= RELOAD_WAIT_ACK;
1311 # r1.write_tag <= '1';
1312 # Track that we had one request sent
1313 sync
+= r1
.state
.eq(State
.RELOAD_WAIT_ACK
)
1314 sync
+= r1
.write_tag
.eq(1)
1316 # when OP_LOAD_NC =>
1317 with m
.Case(Op
.OP_LOAD_NC
):
1321 # r1.state <= NC_LOAD_WAIT_ACK;
1322 sync
+= r1
.wb
.cyc
.eq(1)
1323 sync
+= r1
.wb
.stb
.eq(1)
1324 sync
+= r1
.wb
.we
.eq(0)
1325 sync
+= r1
.state
.eq(State
.NC_LOAD_WAIT_ACK
)
1327 # when OP_STORE_HIT | OP_STORE_MISS =>
1328 with m
.Case(Op
.OP_STORE_HIT
1329 | Op
.OP_STORE_MISS
):
1330 # if req.dcbz = '0' then
1331 with m
.If(~req
.bcbz
):
1332 # r1.state <= STORE_WAIT_ACK;
1333 # r1.acks_pending <= to_unsigned(1, 3);
1335 # r1.slow_valid <= '1';
1336 sync
+= r1
.state
.eq(
1337 State
.STORE_WAIT_ACK
1339 sync
+= r1
.acks_pending
.eq(
1340 '''TODO to_unsignes(1,3)'''
1342 sync
+= r1
.full
.eq(0)
1343 sync
+= r1
.slow_valid
.eq(1)
1345 # if req.mmu_req = '0' then
1346 with m
.If(~req
.mmu_req
):
1347 # r1.ls_valid <= '1';
1348 sync
+= r1
.ls_valid
.eq(1)
1351 # r1.mmu_done <= '1';
1352 sync
+= r1
.mmu_done
.eq(1)
1355 # if req.op = OP_STORE_HIT then
1356 with m
.If(req
.op
== Op
.OP_STORE_HIT
):
1357 # r1.write_bram <= '1';
1358 sync
+= r1
.write_bram
.eq(1)
1363 # -- dcbz is handled much like a load
1364 # -- miss except that we are writing
1365 # -- to memory instead of reading
1366 # r1.state <= RELOAD_WAIT_ACK;
1367 # dcbz is handled much like a load
1368 # miss except that we are writing
1369 # to memory instead of reading
1370 sync
+= r1
.state
.eq(Op
.RELOAD_WAIT_ACK
)
1372 # if req.op = OP_STORE_MISS then
1373 with m
.If(req
.op
== Op
.OP_STORE_MISS
):
1374 # r1.write_tag <= '1';
1375 sync
+= r1
.write_tag
.eq(1)
1382 sync
+= r1
.wb
.we
.eq(1)
1383 sync
+= r1
.wb
.cyc
.eq(1)
1384 sync
+= r1
.wb
.stb
.eq(1)
1386 # -- OP_NONE and OP_BAD do nothing
1387 # -- OP_BAD & OP_STCX_FAIL were handled above already
1390 # when OP_STCX_FAIL =>
1391 # OP_NONE and OP_BAD do nothing
1392 # OP_BAD & OP_STCX_FAIL were
1393 # handled above already
1394 with m
.Case(Op
.OP_NONE
):
1397 with m
.Case(OP_BAD
):
1400 with m
.Case(OP_STCX_FAIL
):
1404 # when RELOAD_WAIT_ACK =>
1405 with m
.Case(State
.RELOAD_WAIT_ACK
):
1406 # -- Requests are all sent if stb is 0
1407 # Requests are all sent if stb is 0
1408 sync
+= stbs_done
.eq(~r1
.wb
.stb
)
1409 # stbs_done := r1.wb.stb = '0';
1411 # -- If we are still sending requests,
1412 # -- was one accepted?
1413 # if wishbone_in.stall = '0' and not stbs_done then
1414 # If we are still sending requests,
1416 with m
.If(~wb_in
.stall
& ~stbs_done
):
1417 # -- That was the last word ? We are done sending.
1418 # -- Clear stb and set stbs_done so we can handle
1419 # -- an eventual last ack on the same cycle.
1420 # if is_last_row_addr(
1421 # r1.wb.adr, r1.end_row_ix
1423 # That was the last word?
1424 # We are done sending.
1425 # Clear stb and set stbs_done
1426 # so we can handle an eventual
1427 # last ack on the same cycle.
1428 with m
.If(is_last_row_addr(
1429 r1
.wb
.adr
, r1
.end_row_ix
)):
1431 # stbs_done := true;
1432 sync
+= r1
.wb
.stb
.eq(0)
1433 sync
+= stbs_done
.eq(0)
1436 # -- Calculate the next row address
1437 # r1.wb.adr <= next_row_addr(r1.wb.adr);
1438 # Calculate the next row address
1439 sync
+= r1
.wb
.adr
.eq(next_row_addr(r1
.wb
.adr
))
1442 # -- Incoming acks processing
1443 # r1.forward_valid1 <= wishbone_in.ack;
1444 # Incoming acks processing
1445 sync
+= r1
.forward_valid1
.eq(wb_in
.ack
)
1447 # if wishbone_in.ack = '1' then
1448 with m
.If(wb_in
.ack
):
1450 # r1.store_row mod ROW_PER_LINE
1452 sync
+= r1
.rows_valid
[
1453 r1
.store_row
% ROW_PER_LINE
1456 # -- If this is the data we were looking for,
1457 # -- we can complete the request next cycle.
1458 # -- Compare the whole address in case the
1459 # -- request in r1.req is not the one that
1460 # -- started this refill.
1461 # if r1.full = '1' and r1.req.same_tag = '1'
1462 # and ((r1.dcbz = '1' and r1.req.dcbz = '1')
1463 # or (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS))
1464 # and r1.store_row = get_row(r1.req.real_addr) then
1465 # If this is the data we were looking for,
1466 # we can complete the request next cycle.
1467 # Compare the whole address in case the
1468 # request in r1.req is not the one that
1469 # started this refill.
1470 with m
.If(r1
.full
& r1
.req
.same_tag
&
1471 ((r1
.dcbz
& r1
.req
.dcbz
)
1473 r1
.req
.op
== Op
.OP_LOAD_MISS
)
1476 == get_row(r1
.req
.real_addr
):
1478 # r1.slow_valid <= '1';
1479 sync
+= r1
.full
.eq(0)
1480 sync
+= r1
.slow_valid
.eq(1)
1482 # if r1.mmu_req = '0' then
1483 with m
.If(~r1
.mmu_req
):
1484 # r1.ls_valid <= '1';
1485 sync
+= r1
.ls_valid
.eq(1)
1488 # r1.mmu_done <= '1';
1489 sync
+= r1
.mmu_done
.eq(1)
1491 # r1.forward_sel <= (others => '1');
1492 # r1.use_forward1 <= '1';
1493 sync
+= r1
.forward_sel
.eq(1)
1494 sync
+= r1
.use_forward1
.eq(1)
1497 # -- Check for completion
1498 # if stbs_done and is_last_row(r1.store_row,
1499 # r1.end_row_ix) then
1500 # Check for completion
1501 with m
.If(stbs_done
&
1502 is_last_row(r1
.store_row
,
1505 # -- Complete wishbone cycle
1507 # Complete wishbone cycle
1508 sync
+= r1
.wb
.cyc
.eq(0)
1510 # -- Cache line is now valid
1511 # cache_valids(r1.store_index)(
1514 # Cache line is now valid
1515 sync
+= cache_valid_bits
[
1517 ][r1
.store_way
].eq(1)
1520 sync
+= r1
.state
.eq(State
.IDLE
)
1523 # -- Increment store row counter
1524 # r1.store_row <= next_row(r1.store_row);
1525 # Increment store row counter
1526 sync
+= r1
.store_row
.eq(next_row(
1531 # when STORE_WAIT_ACK =>
1532 with m
.Case(State
.STORE_WAIT_ACK
):
1533 # stbs_done := r1.wb.stb = '0';
1534 # acks := r1.acks_pending;
1535 sync
+= stbs_done
.eq(~r1
.wb
.stb
)
1536 sync
+= acks
.eq(r1
.acks_pending
)
1538 # if r1.inc_acks /= r1.dec_acks then
1539 with m
.If(r1
.inc_acks
!= r1
.dec_acks
):
1541 # if r1.inc_acks = '1' then
1542 with m
.If(r1
.inc_acks
):
1544 sync
+= acks
.eq(acks
+ 1)
1549 sync
+= acks
.eq(acks
- 1)
1553 # r1.acks_pending <= acks;
1554 sync
+= r1
.acks_pending
.eq(acks
)
1556 # -- Clear stb when slave accepted request
1557 # if wishbone_in.stall = '0' then
1558 # Clear stb when slave accepted request
1559 with m
.If(~wb_in
.stall
):
1560 # -- See if there is another store waiting
1561 # -- to be done which is in the same real page.
1562 # if req.valid = '1' then
1563 # See if there is another store waiting
1564 # to be done which is in the same real page.
1565 with m
.If(req
.valid
):
1567 # SET_SIZE_BITS - 1 downto 0
1568 # ) <= req.real_addr(
1569 # SET_SIZE_BITS - 1 downto 0
1571 # r1.wb.dat <= req.data;
1572 # r1.wb.sel <= req.byte_sel;
1573 sync
+= r1
.wb
.adr
[0:SET_SIZE_BITS
].eq(
1574 req
.real_addr
[0:SET_SIZE_BITS
]
1578 # if acks < 7 and req.same_tag = '1'
1579 # and (req.op = OP_STORE_MISS
1580 # or req.op = OP_STORE_HIT) then
1581 with m
.Elif(acks
< 7 & req
.same_tag
&
1582 (req
.op
== Op
.Op_STORE_MISS
1583 | req
.op
== Op
.OP_SOTRE_HIT
)):
1585 # stbs_done := false;
1586 sync
+= r1
.wb
.stb
.eq(1)
1587 sync
+= stbs_done
.eq(0)
1589 # if req.op = OP_STORE_HIT then
1590 with m
.If(req
.op
== Op
.OP_STORE_HIT
):
1591 # r1.write_bram <= '1';
1592 sync
+= r1
.write_bram
.eq(1)
1595 # r1.slow_valid <= '1';
1596 sync
+= r1
.full
.eq(0)
1597 sync
+= r1
.slow_valid
.eq(1)
1599 # -- Store requests never come from the MMU
1600 # r1.ls_valid <= '1';
1601 # stbs_done := false;
1602 # r1.inc_acks <= '1';
1603 # Store requests never come from the MMU
1604 sync
+= r1
.ls_valid
.eq(1)
1605 sync
+= stbs_done
.eq(0)
1606 sync
+= r1
.inc_acks
.eq(1)
1610 # stbs_done := true;
1611 sync
+= r1
.wb
.stb
.eq(0)
1612 sync
+= stbs_done
.eq(1)
1616 # -- Got ack ? See if complete.
1617 # if wishbone_in.ack = '1' then
1618 # Got ack ? See if complete.
1619 with m
.If(wb_in
.ack
):
1620 # if stbs_done and acks = 1 then
1621 with m
.If(stbs_done
& acks
)
1625 sync
+= r1
.state
.eq(State
.IDLE
)
1626 sync
+= r1
.wb
.cyc
.eq(0)
1627 sync
+= r1
.wb
.stb
.eq(0)
1629 # r1.dec_acks <= '1';
1630 sync
+= r1
.dec_acks
.eq(1)
1633 # when NC_LOAD_WAIT_ACK =>
1634 with m
.Case(State
.NC_LOAD_WAIT_ACK
):
1635 # -- Clear stb when slave accepted request
1636 # if wishbone_in.stall = '0' then
1637 # Clear stb when slave accepted request
1638 with m
.If(~wb_in
.stall
):
1640 sync
+= r1
.wb
.stb
.eq(0)
1643 # -- Got ack ? complete.
1644 # if wishbone_in.ack = '1' then
1645 # Got ack ? complete.
1646 with m
.If(wb_in
.ack
):
1649 # r1.slow_valid <= '1';
1650 sync
+= r1
.state
.eq(State
.IDLE
)
1651 sync
+= r1
.full
.eq(0)
1652 sync
+= r1
.slow_valid
.eq(1)
1654 # if r1.mmu_req = '0' then
1655 with m
.If(~r1
.mmu_req
):
1656 # r1.ls_valid <= '1';
1657 sync
+= r1
.ls_valid
.eq(1)
1661 # r1.mmu_done <= '1';
1662 sync
+= r1
.mmu_done
.eq(1)
1665 # r1.forward_sel <= (others => '1');
1666 # r1.use_forward1 <= '1';
1669 sync
+= r1
.forward_sel
.eq(1)
1670 sync
+= r1
.use_forward1
.eq(1)
1671 sync
+= r1
.wb
.cyc
.eq(0)
1672 sync
+= r1
.wb
.stb
.eq(0)
1679 # dc_log: if LOG_LENGTH > 0 generate
1680 # TODO learn how to translate vhdl generate into nmigen
def dcache_log(self, r1, valid_ra, tlb_hit_way, stall_out,
               d_out, wb_in, log_out):
    """Debug logging: pack a snapshot of key dcache state into a
    20-bit word each clock and drive it on log_out.

    Port of the VHDL dc_log process.  The VHDL guards this with
    `if LOG_LENGTH > 0 generate`; that guard still needs porting
    (wrap the statements in `if LOG_LENGTH > 0:` at elaboration).
    """
    # 20-bit log word, assembled LSB-first below.
    log_data = Signal(20)

    # NOTE(review): the original wrote Const(r1.state, 3),
    # Const(tlb_hit_way, 3) and Const(req_op, 3) -- Const() takes a
    # plain integer, not a Signal, so that raises at elaboration
    # time.  Slice the signals to 3 bits instead, which matches the
    # VHDL to_unsigned(..., 3) conversions.
    # TODO(review): req_op, sync and comb are not parameters of this
    # method -- they must come from the enclosing elaborate() scope;
    # confirm how this helper is invoked.
    sync += log_data.eq(Cat(
        r1.state[:3], valid_ra, tlb_hit_way[:3],
        stall_out, req_op[:3], d_out.valid, d_out.error,
        r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
        r1.wb.adr[3:6]))   # VHDL: r1.wb.adr(5 downto 3)
    # log_out <= log_data;
    comb += log_out.eq(log_data)
1724 def elaborate(self
, platform
):
1725 LINE_SIZE
= self
.LINE_SIZE
1726 NUM_LINES
= self
.NUM_LINES
1727 NUM_WAYS
= self
.NUM_WAYS
1728 TLB_SET_SIZE
= self
.TLB_SET_SIZE
1729 TLB_NUM_WAYS
= self
.TLB_NUM_WAYS
1730 TLB_LG_PGSZ
= self
.TLB_LG_PGSZ
1731 LOG_LENGTH
= self
.LOG_LENGTH
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line".
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
# Use floor division here: the original's `/` produces a float,
# which breaks log2_int() and every slice bound derived from it.
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE
1746 # BRAM_ROWS is the number of rows in BRAM needed
1747 # to represent the full dcache
1748 BRAM_ROWS
= NUM_LINES
* ROW_PER_LINE
1751 # Bit fields counts in the address
1753 # REAL_ADDR_BITS is the number of real address
1754 # bits that we store
1757 # ROW_BITS is the number of bits to select a row
1758 ROW_BITS
= log2_int(BRAM_ROWS
)
1760 # ROW_LINE_BITS is the number of bits to select
1761 # a row within a line
1762 ROW_LINE_BITS
= log2_int(ROW_PER_LINE
)
1764 # LINE_OFF_BITS is the number of bits for
1765 # the offset in a cache line
1766 LINE_OFF_BITS
= log2_int(LINE_SIZE
)
1768 # ROW_OFF_BITS is the number of bits for
1769 # the offset in a row
1770 ROW_OFF_BITS
= log2_int(ROW_SIZE
)
1772 # INDEX_BITS is the number if bits to
1773 # select a cache line
1774 INDEX_BITS
= log2_int(NUM_LINES
)
1776 # SET_SIZE_BITS is the log base 2 of the set size
1777 SET_SIZE_BITS
= LINE_OFF_BITS
+ INDEX_BITS
1779 # TAG_BITS is the number of bits of
1780 # the tag part of the address
1781 TAG_BITS
= REAL_ADDR_BITS
- SET_SIZE_BITS
1783 # TAG_WIDTH is the width in bits of each way of the tag RAM
1784 TAG_WIDTH
= TAG_BITS
+ 7 - ((TAG_BITS
+ 7) % 8)
1786 # WAY_BITS is the number of bits to select a way
1787 WAY_BITS
= log2_int(NUM_WAYS
)
1789 # Example of layout for 32 lines of 64 bytes:
1791 # .. tag |index| line |
1793 # .. | |---| | ROW_LINE_BITS (3)
1794 # .. | |--- - --| LINE_OFF_BITS (6)
1795 # .. | |- --| ROW_OFF_BITS (3)
1796 # .. |----- ---| | ROW_BITS (8)
1797 # .. |-----| | INDEX_BITS (5)
1798 # .. --------| | TAG_BITS (45)
1800 TAG_RAM_WIDTH
= TAG_WIDTH
* NUM_WAYS
def CacheTagArray():
    """Build the tag storage: one tag-set entry per cache line."""
    return Array(CacheTagSet() for _ in range(NUM_LINES))
def CacheValidBitsArray():
    """Build the per-line valid-bit storage, one entry per line."""
    return Array(CacheWayValidBits() for _ in range(NUM_LINES))
def RowPerLineValidArray():
    """Build one valid bit per row of a cache line."""
    return Array(Signal() for _ in range(ROW_PER_LINE))
1811 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1812 cache_tags
= CacheTagArray()
1813 cache_tag_set
= Signal(TAG_RAM_WIDTH
)
1814 cache_valid_bits
= CacheValidBitsArray()
1816 # TODO attribute ram_style : string;
1817 # TODO attribute ram_style of cache_tags : signal is "distributed";
1820 TLB_SET_BITS
= log2_int(TLB_SET_SIZE
)
1821 TLB_WAY_BITS
= log2_int(TLB_NUM_WAYS
)
1822 TLB_EA_TAG_BITS
= 64 - (TLB_LG_PGSZ
+ TLB_SET_BITS
)
1823 TLB_TAG_WAY_BITS
= TLB_NUM_WAYS
* TLB_EA_TAG_BITS
1825 TLB_PTE_WAY_BITS
= TLB_NUM_WAYS
* TLB_PTE_BITS
;
1827 def TLBValidBitsArray():
1829 Signal(TLB_NUM_WAYS
) for x
in range(TLB_SET_SIZE
)
1834 Signal(TLB_TAG_WAY_BITS
) for x
in range (TLB_SET_SIZE
)
1839 Signal(TLB_PTE_WAY_BITS
) for x
in range(TLB_SET_SIZE
)
1843 return Array(Signal(NUM_WAYS
) for x
in range(TLB_NUM_WAYS
))
1845 """note: these are passed to nmigen.hdl.Memory as "attributes".
1846 don't know how, just that they are.
1848 dtlb_valid_bits
= TLBValidBitsArray()
1849 dtlb_tags
= TLBTagsArray()
1850 dtlb_ptes
= TLBPtesArray()
1851 # TODO attribute ram_style of
1852 # dtlb_tags : signal is "distributed";
1853 # TODO attribute ram_style of
1854 # dtlb_ptes : signal is "distributed";
1861 reservation
= Reservation()
1863 # Async signals on incoming request
1864 req_index
= Signal(NUM_LINES
)
1865 req_row
= Signal(BRAM_ROWS
)
1866 req_hit_way
= Signal(WAY_BITS
)
1867 req_tag
= Signal(TAG_BITS
)
1869 req_data
= Signal(64)
1870 req_same_tag
= Signal()
1873 early_req_row
= Signal(BRAM_ROWS
)
1875 cancel_store
= Signal()
1877 clear_rsrv
= Signal()
1882 use_forward1_next
= Signal()
1883 use_forward2_next
= Signal()
1885 # Cache RAM interface
1887 return Array(Signal(WB_DATA_BITS
) for x
in range(NUM_WAYS
))
1889 cache_out
= CacheRamOut()
1891 # PLRU output interface
1893 return Array(Signal(WAY_BITS
) for x
in range(Index()))
1895 plru_victim
= PLRUOut()
1896 replace_way
= Signal(WAY_BITS
)
1898 # Wishbone read/write/cache write formatting signals
1902 tlb_tag_way
= Signal(TLB_TAG_WAY_BITS
)
1903 tlb_pte_way
= Signal(TLB_PTE_WAY_BITS
)
1904 tlb_valid_way
= Signal(TLB_NUM_WAYS
)
1905 tlb_req_index
= Signal(TLB_SET_SIZE
)
1907 tlb_hit_way
= Signal(TLB_NUM_WAYS
)
1908 pte
= Signal(TLB_PTE_BITS
)
1909 ra
= Signal(REAL_ADDR_BITS
)
1911 perm_attr
= PermAttr()
1914 access_ok
= Signal()
1916 # TLB PLRU output interface
1919 Signal(TLB_WAY_BITS
) for x
in range(TLB_SET_SIZE
)
1922 tlb_plru_victim
= TLBPLRUOut()
1924 # Helper functions to decode incoming requests
1926 # Return the cache line index (tag index) for an address
def get_index(addr):
    """Return the cache line (tag) index field of *addr*."""
    index_field = addr[LINE_OFF_BITS:SET_SIZE_BITS]
    return index_field
1930 # Return the cache row index (data memory) for an address
1932 return addr
[ROW_OFF_BITS
:SET_SIZE_BITS
]
1934 # Return the index of a row within a line
def get_row_of_line(row):
    """Return the index of *row* within its cache line.

    The low ROW_LINE_BITS bits of the row number select the row
    inside a line.  The original allocated a Signal(ROW_BITS)
    temporary and sliced that -- a plain Python rebinding that, as
    written, never carried *row*'s value at all; slice the argument
    directly instead.
    """
    return row[0:ROW_LINE_BITS]
1940 # Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    """True when *addr*'s row-within-line field equals *last*
    (i.e. this is the last row of the cache line)."""
    row_in_line = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    return row_in_line == last
1944 # Returns whether this is the last row of a line
def is_last_row(row, last):
    """True when *row* is the final row of its cache line."""
    return last == get_row_of_line(row)
1948 # Return the address of the next row in the current cache line
def next_row_addr(addr):
    """Return the wishbone address of the next row in the current
    cache line.

    Only the row-index field (bits ROW_OFF_BITS..LINE_OFF_BITS) is
    incremented, so the address wraps within the line -- this keeps
    the generated adder to just ROW_LINE_BITS wide, mirroring the
    VHDL comment about the "3 bits adder".

    The original rebound row_idx to Signal(row_idx + 1) -- which
    creates a *new* signal shaped after the expression, not the
    incremented value -- and never returned a result.
    """
    row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS]
    # Truncate the +1 back to the field width so the carry does not
    # spill into the line-offset bits above it.
    next_idx = (row_idx + 1)[:LINE_OFF_BITS - ROW_OFF_BITS]
    return Cat(addr[:ROW_OFF_BITS], next_idx, addr[LINE_OFF_BITS:])
1960 # Return the next row in the current cache line. We use a
1961 # dedicated function in order to limit the size of the
1962 # generated adder to be only the bits within a cache line
1963 # (3 bits with default settings)
1965 row_v
= Signal(ROW_BITS
)
1966 row_idx
= Signal(ROW_LINE_BITS
)
1967 result
= Signal(ROW_BITS
)
1970 row_idx
= row_v
[ROW_LINE_BITS
]
1971 row_v
[0:ROW_LINE_BITS
] = Signal(row_idx
+ 1)
1974 # Get the tag value from the address
1976 return addr
[SET_SIZE_BITS
:REAL_ADDR_BITS
]
1978 # Read a tag from a tag memory row
def read_tag(way, tagset):
    """Extract the TAG_BITS-wide tag for *way* from a tag-RAM row.

    Each way occupies a TAG_WIDTH-wide slot; only the low TAG_BITS
    of the slot hold the tag.
    """
    base = way * TAG_WIDTH
    return tagset[base:base + TAG_BITS]
1982 # Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    """Extract the EA tag for TLB *way* from a TLB tag-memory row."""
    return tags[way * TLB_EA_TAG_BITS:(way + 1) * TLB_EA_TAG_BITS]
1989 # Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    """Write *tag* into the EA-tag field for TLB *way* of a TLB
    tag-memory row.

    The original signature had a stray ')' after *tags*
    (`def write_tlb_tag(way, tags), tag):`), which is a SyntaxError.
    """
    j = way * TLB_EA_TAG_BITS
    # NOTE(review): slice assignment assumes *tags* supports item
    # assignment (list-like); an nmigen Signal would need .eq() --
    # confirm against call sites.
    tags[j:j + TLB_EA_TAG_BITS] = tag
1996 # Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    """Extract the PTE for TLB *way* from a TLB PTE-memory row."""
    return ptes[way * TLB_PTE_BITS:(way + 1) * TLB_PTE_BITS]
def write_tlb_pte(way, ptes, newpte):
    """Write *newpte* into the PTE field for TLB *way* of a TLB
    PTE-memory row.

    The original wrote `return ptes[...] = newpte`, which is a
    SyntaxError (an assignment is a statement, not an expression);
    the VHDL counterpart is a procedure, so just assign.
    """
    j = way * TLB_PTE_BITS
    # NOTE(review): slice assignment assumes *ptes* is list-like;
    # an nmigen Signal would need .eq() -- confirm call sites.
    ptes[j:j + TLB_PTE_BITS] = newpte
# Geometry sanity checks.
# Fixes relative to the original text: assert syntax is
# `assert cond, "msg"` (the commas were missing, a SyntaxError);
# a power-of-2 test is `x & (x - 1) == 0`, not `x % 2 == 0`;
# one check used `=` instead of `==` and the `ROW_LINEBITS` /
# `wishbone_data_bits` names were typos for ROW_LINE_BITS and
# WB_DATA_BITS.
assert (LINE_SIZE % ROW_SIZE) == 0, \
    "LINE_SIZE not multiple of ROW_SIZE"
assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, \
    "LINE_SIZE not power of 2"
assert (NUM_LINES & (NUM_LINES - 1)) == 0, \
    "NUM_LINES not power of 2"
assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, \
    "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), \
    "geometry bits don't add up"
assert LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS), \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
    "geometry bits don't add up"
assert 64 == WB_DATA_BITS, \
    "Can't yet handle a wishbone width that isn't 64-bits"
# message tail reconstructed from microwatt dcache.vhdl -- confirm
assert SET_SIZE_BITS <= TLB_LG_PGSZ, \
    "Set indexed by virtual address"
2037 # we don't yet handle collisions between loadstore1 requests
2039 comb
+= m_out
.stall
.eq(0)
2041 # Hold off the request in r0 when r1 has an uncompleted request
2042 comb
+= r0_stall
.eq(r0_full
& r1
.full
)
2043 comb
+= r0_valid
.eq(r0_full
& ~r1
.full
)
2044 comb
+= stall_out
.eq(r0_stall
)
2046 # Wire up wishbone request latch out of stage 1
2047 comb
+= wishbone_out
.eq(r1
.wb
)
2053 # entity dcache_tb is
2056 # architecture behave of dcache_tb is
2057 # signal clk : std_ulogic;
2058 # signal rst : std_ulogic;
2060 # signal d_in : Loadstore1ToDcacheType;
2061 # signal d_out : DcacheToLoadstore1Type;
2063 # signal m_in : MmuToDcacheType;
2064 # signal m_out : DcacheToMmuType;
2066 # signal wb_bram_in : wishbone_master_out;
2067 # signal wb_bram_out : wishbone_slave_out;
2069 # constant clk_period : time := 10 ns;
2071 # dcache0: entity work.dcache
2084 # wishbone_out => wb_bram_in,
2085 # wishbone_in => wb_bram_out
2088 # -- BRAM Memory slave
2089 # bram0: entity work.wishbone_bram_wrapper
2091 # MEMORY_SIZE => 1024,
2092 # RAM_INIT_FILE => "icache_test.bin"
2097 # wishbone_in => wb_bram_in,
2098 # wishbone_out => wb_bram_out
2101 # clk_process: process
2104 # wait for clk_period/2;
2106 # wait for clk_period/2;
2109 # rst_process: process
2112 # wait for 2*clk_period;
2120 # d_in.valid <= '0';
2123 # d_in.addr <= (others => '0');
2124 # d_in.data <= (others => '0');
2125 # m_in.valid <= '0';
2126 # m_in.addr <= (others => '0');
2127 # m_in.pte <= (others => '0');
2129 # wait for 4*clk_period;
2130 # wait until rising_edge(clk);
2132 # -- Cacheable read of address 4
2135 # d_in.addr <= x"0000000000000004";
2136 # d_in.valid <= '1';
2137 # wait until rising_edge(clk);
2138 # d_in.valid <= '0';
2140 # wait until rising_edge(clk) and d_out.valid = '1';
2141 # assert d_out.data = x"0000000100000000"
2142 # report "data @" & to_hstring(d_in.addr) &
2143 # "=" & to_hstring(d_out.data) &
2144 # " expected 0000000100000000"
2146 # -- wait for clk_period;
2148 # -- Cacheable read of address 30
2151 # d_in.addr <= x"0000000000000030";
2152 # d_in.valid <= '1';
2153 # wait until rising_edge(clk);
2154 # d_in.valid <= '0';
2156 # wait until rising_edge(clk) and d_out.valid = '1';
2157 # assert d_out.data = x"0000000D0000000C"
2158 # report "data @" & to_hstring(d_in.addr) &
2159 # "=" & to_hstring(d_out.data) &
2160 # " expected 0000000D0000000C"
2163 # -- Non-cacheable read of address 100
2166 # d_in.addr <= x"0000000000000100";
2167 # d_in.valid <= '1';
2168 # wait until rising_edge(clk);
2169 # d_in.valid <= '0';
2170 # wait until rising_edge(clk) and d_out.valid = '1';
2171 # assert d_out.data = x"0000004100000040"
2172 # report "data @" & to_hstring(d_in.addr) &
2173 # "=" & to_hstring(d_out.data) &
2174 # " expected 0000004100000040"
2177 # wait until rising_edge(clk);
2178 # wait until rising_edge(clk);
2179 # wait until rising_edge(clk);
2180 # wait until rising_edge(clk);
def dcache_sim(dut):
    """Testbench stimulus for the dcache DUT.

    Port of the VHDL dcache_tb stimulus process: clear the inputs,
    then issue two cacheable reads (addresses 0x4 and 0x30) and one
    non-cacheable read (address 0x100), checking the data returned
    for each.

    Fixes relative to the original text: `d_in.adrr` typo, asserts
    that compared the Signal *object* (always truthy) instead of the
    yielded simulation value, and missing commas before the assert
    messages (a SyntaxError).
    """
    # Clear all request inputs.
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    for _ in range(4):
        yield

    # Cacheable read of address 4
    yield from _issue_read_and_check(dut, nc=0, addr=0x4,
                                     expected=0x0000000100000000)
    # Cacheable read of address 0x30
    yield from _issue_read_and_check(dut, nc=0, addr=0x30,
                                     expected=0x0000000D0000000C)
    # Non-cacheable read of address 0x100
    yield from _issue_read_and_check(dut, nc=1, addr=0x100,
                                     expected=0x0000004100000040)


def _issue_read_and_check(dut, nc, addr, expected):
    """Issue one load request, wait for d_out.valid, check the data."""
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    # wait until rising_edge(clk) and d_out.valid = '1'
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == expected, \
        f"data @{addr:016x}={data:016x} expected {expected:016x}" \
        " -!- severity failure"
2254 vl
= rtlil
.convert(dut
, ports
=[])
2255 with
open("test_dcache.il", "w") as f
:
2258 run_simulation(dut
, dcache_sim(), vcd_name
='test_dcache.vcd')
2260 if __name__
== '__main__':