# based on Anton Blanchard microwatt dcache.vhdl
from enum import Enum, unique

from nmigen import Module, Signal, Elaboratable, Cat, Const
from nmigen.cli import main
from nmigen.iocontrol import RecordObject
from nmigen.util import log2_int

from experiment.mem_types import (LoadStore1ToDCacheType,
                                  DCacheToLoadStore1Type,
                                  MMUToDCacheType,
                                  DCacheToMMUType)

from experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBAddrType, WBDataType, WBSelType,
                                 WBMasterOut, WBSlaveOut,
                                 WBMasterOutVector, WBSlaveOutVector,
                                 WBIOMasterOut, WBIOSlaveOut)
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission/attribute bits extracted from a page-table entry."""
    def __init__(self):
        super().__init__()
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        # NOTE(review): the declaration line for `priv` was lost in
        # extraction, but `perm_attr.priv` is driven elsewhere in this
        # file, so the field is restored here -- TODO confirm upstream.
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()
39 def extract_perm_attr(pte
):
# Type of operation on a "valid" input
@unique
class Op(Enum):
    """Decoded dcache operation for a latched request."""
    OP_NONE       = 0  # no operation (referenced elsewhere as Op.OP_NONE)
    OP_BAD        = 1  # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2  # conditional store w/o reservation
    OP_LOAD_HIT   = 3  # Cache hit on load
    OP_LOAD_MISS  = 4  # Load missing cache
    OP_LOAD_NC    = 5  # Non-cachable load
    OP_STORE_HIT  = 6  # Store hitting cache
    OP_STORE_MISS = 7  # Store missing cache
# Dcache state machine states (referenced elsewhere as State.<name>)
@unique
class State(Enum):
    """State for the store/reload state machine."""
    IDLE             = 0  # Normal load hit processing
    RELOAD_WAIT_ACK  = 1  # Cache reload wait ack
    STORE_WAIT_ACK   = 2  # Store wait ack
    NC_LOAD_WAIT_ACK = 3  # Non-cachable load wait ack
74 # In order to make timing, we use the BRAMs with
75 # an output buffer, which means that the BRAM
76 # output is delayed by an extra cycle.
78 # Thus, the dcache has a 2-stage internal pipeline
79 # for cache hits with no stalls.
81 # All other operations are handled via stalling
84 # The second stage can thus complete a hit at the same
85 # time as the first stage emits a stall for a complex op.
# Stage 0 register, basically contains just the latched request
class RegStage0(RecordObject):
    """Latched incoming request (from loadstore1 or the MMU)."""
    def __init__(self):
        super().__init__()
        self.req     = LoadStore1ToDCacheType()
        self.mmu_req = Signal()  # indicates source of request
class MemAccessRequest(RecordObject):
    """Memory access request carried into stage 1 / the state machine."""
    def __init__(self):
        super().__init__()
        self.valid     = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)
        self.hit_way   = Signal(WAY_BITS)
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    """Stage-1 pipeline register and slow-path state-machine state."""
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full    = Signal()  # have uncompleted request
        self.mmu_req = Signal()  # request is from MMU
        self.req     = MemAccessRequest()

        # Cache hit state
        self.hit_way        = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index      = Signal(NUM_LINES)
        self.cache_hit      = Signal()

        # TLB hit state
        self.tlb_hit       = Signal()
        self.tlb_hit_way   = Signal(TLB_NUM_WAYS)
        self.tlb_hit_index = Signal(TLB_SET_SIZE)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1  = Signal(64)
        self.forward_data2  = Signal(64)
        self.forward_sel1   = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1   = Signal(WAY_BITS)
        self.forward_row1   = Signal(BRAM_ROWS)
        self.use_forward1   = Signal()
        self.forward_sel    = Signal(8)

        # Cache miss state (reload state machine)
        # NOTE(review): a `self.state` field is read elsewhere in this
        # file (r1.state == State.RELOAD_WAIT_ACK) but its declaration
        # did not survive extraction -- TODO restore it from upstream.
        self.write_bram   = Signal()
        self.write_tag    = Signal()
        self.slow_valid   = Signal()
        # was `WishboneMasterOut()` -- no such name is imported anywhere
        # in this file; the wishbone master record type used elsewhere
        # (e.g. DCache.wb_out) is WBMasterOut
        self.wb           = WBMasterOut()
        self.reload_tag   = Signal(TAG_BITS)
        self.store_way    = Signal(WAY_BITS)
        self.store_row    = Signal(BRAM_ROWS)
        self.store_index  = Signal(NUM_LINES)
        self.end_row_ix   = Signal(ROW_LINE_BIT)
        self.rows_valid   = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks     = Signal()
        self.dec_acks     = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid      = Signal()
        self.ls_error      = Signal()
        self.mmu_done      = Signal()
        self.mmu_error     = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()
# Reservation information
class Reservation(RecordObject):
    """Load-and-reserve / store-conditional reservation tracking."""
    def __init__(self):
        super().__init__()
        # NOTE(review): `reservation.valid` is read and written elsewhere
        # in this file, so the field is restored here -- TODO confirm.
        self.valid = Signal()
        # TODO LINE_OFF_BITS is 6
        # was VHDL leakage `Signal(63 downto LINE_OFF_BITS)`: bits
        # 63..LINE_OFF_BITS inclusive, i.e. width 64 - LINE_OFF_BITS
        self.addr  = Signal(64 - LINE_OFF_BITS)
# Set associative dcache write-through
#
# TODO (in no specific order):
#
# * See list in icache.vhdl
# * Complete load misses on the cycle when WB data comes instead of
#   at the end of line (this requires dealing with requests coming in
class DCache(Elaboratable):
    """Set-associative write-through L1 data cache."""
    def __init__(self):
        # TODO: make these parameters of DCache at some point
        self.LINE_SIZE    = 64  # Line size in bytes
        self.NUM_LINES    = 32  # Number of lines in a set
        self.NUM_WAYS     = 4   # Number of ways
        self.TLB_SET_SIZE = 64  # L1 DTLB entries per set
        self.TLB_NUM_WAYS = 2   # L1 DTLB number of sets
        self.TLB_LG_PGSZ  = 12  # L1 DTLB log_2(page_size)
        self.LOG_LENGTH   = 0   # Non-zero to enable log data collection

        # LoadStore1 request/response interface
        self.d_in  = LoadStore1ToDCacheType()
        self.d_out = DCacheToLoadStore1Type()

        # MMU request/response interface
        self.m_in  = MMUToDCacheType()
        self.m_out = DCacheToMMUType()

        self.stall_out = Signal()

        # Wishbone master interface
        self.wb_out = WBMasterOut()
        self.wb_in  = WBSlaveOut()

        self.log_out = Signal(20)
212 # Latch the request in r0.req as long as we're not stalling
213 def stage_0(self
, m
, d_in
, m_in
):
219 # TODO, this goes in unit tests and formal proofs
220 # assert ~(d_in.valid & m_in.valid),
221 # "request collision loadstore vs MMU"
222 with m
.If(~
(d_in
.valid
& m_in
.valid
)):
223 #sync += Display("request collision loadstore vs MMU")
226 with m
.If(m_in
.valid
):
227 sync
+= r
.req
.valid
.eq(1)
228 sync
+= r
.req
.load
.eq(~
(m_in
.tlbie | m_in
.tlbld
))
229 sync
+= r
.req
.dcbz
.eq(0)
230 sync
+= r
.req
.nc
.eq(0)
231 sync
+= r
.req
.reserve
.eq(0)
232 sync
+= r
.req
.virt_mode
.eq(1)
233 sync
+= r
.req
.priv_mode
.eq(1)
234 sync
+= r
.req
.addr
.eq(m_in
.addr
)
235 sync
+= r
.req
.data
.eq(m_in
.pte
)
236 sync
+= r
.req
.byte_sel
.eq(-1) # Const -1 sets all to 0b111....
237 sync
+= r
.tlbie
.eq(m_in
.tlbie
)
238 sync
+= r
.doall
.eq(m_in
.doall
)
239 sync
+= r
.tlbld
.eq(m_in
.tlbld
)
240 sync
+= r
.mmu_req
.eq(1)
242 sync
+= r
.req
.eq(d_in
)
243 sync
+= r
.req
.tlbie
.eq(0)
244 sync
+= r
.req
.doall
.eq(0)
245 sync
+= r
.req
.tlbd
.eq(0)
246 sync
+= r
.req
.mmu_req
.eq(0)
247 with m
.If(~
(r1
.full
& r0_full
)):
249 sync
+= r0_full
.eq(r
.req
.valid
)
252 # Operates in the second cycle on the request latched in r0.req.
253 # TLB updates write the entry at the end of the second cycle.
254 def tlb_read(self
, m
, m_in
, d_in
, r0_stall
, tlb_valid_way
,
255 tlb_tag_way
, tlb_pte_way
, dtlb_valid_bits
,
256 dtlb_tags
, dtlb_ptes
):
261 index
= Signal(log2_int(TLB_SET_BITS
), False)
262 addrbits
= Signal(TLB_SET_BITS
)
265 amax
= TLB_LG_PGSZ
+ TLB_SET_BITS
267 with m
.If(m_in
.valid
):
268 comb
+= addrbits
.eq(m_in
.addr
[amin
: amax
])
270 comb
+= addrbits
.eq(d_in
.addr
[amin
: amax
])
271 comb
+= index
.eq(addrbits
)
273 # If we have any op and the previous op isn't finished,
274 # then keep the same output for next cycle.
275 with m
.If(~r0_stall
):
276 sync
+= tlb_valid_way
.eq(dtlb_valid_bits
[index
])
277 sync
+= tlb_tag_way
.eq(dtlb_tags
[index
])
278 sync
+= tlb_pte_way
.eq(dtlb_ptes
[index
])
281 def maybe_tlb_plrus(self
, m
, r1
, tlb_plru_victim
, acc
, acc_en
, lru
):
285 with m
.If(TLB_NUM_WAYS
> 1):
286 for i
in range(TLB_SET_SIZE
):
288 tlb_plru
= PLRU(TLB_WAY_BITS
)
289 tlb_plru_acc
= Signal(TLB_WAY_BITS
)
290 tlb_plru_acc_en
= Signal()
291 tlb_plru_out
= Signal(TLB_WAY_BITS
)
293 comb
+= tlb_plru
.acc
.eq(tlb_plru_acc
)
294 comb
+= tlb_plru
.acc_en
.eq(tlb_plru_acc_en
)
295 comb
+= tlb_plru
.lru
.eq(tlb_plru_out
)
298 with m
.If(r1
.tlb_hit_index
== i
):
299 comb
+= tlb_plru
.acc_en
.eq(r1
.tlb_hit
)
301 comb
+= tlb_plru
.acc_en
.eq(0)
302 comb
+= tlb_plru
.acc
.eq(r1
.tlb_hit_way
)
304 comb
+= tlb_plru_victim
[i
].eq(tlb_plru
.lru
)
306 def tlb_search(self
, tlb_req_index
, r0
, tlb_valid_way_ tlb_tag_way
,
307 tlb_pte_way
, pte
, tlb_hit
, valid_ra
, perm_attr
, ra
):
312 hitway
= Signal(TLB_WAY_BITS
)
314 eatag
= Signal(log2_int(TLB_EA_TAG_BITS
, False))
316 TLB_LG_END
= TLB_LG_PGSZ
+ TLB_SET_BITS
317 comb
+= tlb_req_index
.eq(r0
.req
.addr
[TLB_LG_PGSZ
: TLB_LG_END
])
318 comb
+= eatag
.eq(r0
.req
.addr
[TLB_LG_END
: 64 ])
320 for i
in range(TLB_NUM_WAYS
):
321 with m
.If(tlb_valid_way(i
)
322 & read_tlb_tag(i
, tlb_tag_way
) == eatag
):
326 comb
+= tlb_hit
.eq(hit
& r0_valid
)
327 comb
+= tlb_hit_way
.eq(hitway
)
330 comb
+= pte
.eq(read_tlb_pte(hitway
, tlb_pte_way
))
333 comb
+= valid_ra
.eq(tlb_hit | ~r0
.req
.virt_mode
)
334 with m
.If(r0
.req
.virt_mode
):
335 comb
+= ra
.eq(Cat(Const(0, ROW_OFF_BITS
),
336 r0
.req
.addr
[ROW_OFF_BITS
:TLB_LG_PGSZ
],
337 pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]))
338 comb
+= perm_attr
.eq(extract_perm_attr(pte
))
340 comb
+= ra
.eq(Cat(Const(0, ROW_OFF_BITS
),
341 r0
.rq
.addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
]))
343 comb
+= perm_attr
.reference
.eq(1)
344 comb
+= perm_attr
.changed
.eq(1)
345 comb
+= perm_attr
.priv
.eq(1)
346 comb
+= perm_attr
.nocache
.eq(0)
347 comb
+= perm_attr
.rd_perm
.eq(1)
348 comb
+= perm_attr
.wr_perm
.eq(1)
350 def tlb_update(self
, r0_valid
, r0
, dtlb_valid_bits
, tlb_req_index
,
351 tlb_hit_way
, tlb_hit
, tlb_plru_victim
, tlb_tag_way
,
352 dtlb_tags
, tlb_pte_way
, dtlb_ptes
, dtlb_valid_bits
):
357 # variable tlbie : std_ulogic;
358 # variable tlbwe : std_ulogic;
359 # variable repl_way : tlb_way_t;
360 # variable eatag : tlb_tag_t;
361 # variable tagset : tlb_way_tags_t;
362 # variable pteset : tlb_way_ptes_t;
363 #type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t;
364 # --> Array([Signal(log(way_tags length)) for i in range(number of tlbs)])
368 repl_way
= Signal(TLB_WAY_BITS
)
369 eatag
= Signal(log2_int(TLB_EA_TAG_BITS
, False))
370 tagset
= TLBWayTags()
371 pteset
= TLBWayPtes()
381 # if rising_edge(clk) then
382 # tlbie := r0_valid and r0.tlbie;
383 # tlbwe := r0_valid and r0.tlbldoi;
384 comb
+= tlbie
.eq(r0_valid
& r0
.tlbie
)
385 comb
+= tlbwe
.eq(r0_valid
& r0
.tlbldoi
)
387 with m
.If(tlbie
& r0
.doall
):
388 # clear all valid bits at once
389 for i
in range(TLB_SET_SIZE
):
390 sync
+= dtlb_valid_bits
[i
].eq(0)
394 sync
+= dtlb_valid_bits
[tlb_req_index
][tlb_hit_way
].eq(0)
397 comb
+= repl_way
.eq(tlb_hit_way
)
399 comb
+= repl_way
.eq(tlb_plru_victim
[tlb_req_index
])
400 comb
+= eatag
.eq(r0
.req
.addr
[TLB_LG_PGSZ
+ TLB_SET_BITS
:64])
401 comb
+= tagset
.eq(tlb_tag_way
)
402 sync
+= write_tlb_tag(repl_way
, tagset
, eatag
)
403 sync
+= dtlb_tags
[tlb_req_index
].eq(tagset
)
404 comb
+= pteset
.eq(tlb_pte_way
)
405 sync
+= write_tlb_pte(repl_way
, pteset
, r0
.req
.data
)
406 sync
+= dtlb_ptes
[tlb_req_index
].eq(pteset
)
407 sync
+= dtlb_valid_bits
[tlb_req_index
][repl_way
].eq(1)
410 # maybe_plrus: if NUM_WAYS > 1 generate
412 def maybe_plrus(self
, r1
):
418 # TODO learn translation of generate into nmgien @lkcl
419 # plrus: for i in 0 to NUM_LINES-1 generate
420 for i
in range(NUM_LINES
):
422 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
423 # signal plru_acc_en : std_ulogic;
424 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
425 plru
= PLRU(WAY_BITS
)
426 plru_acc
= Signal(WAY_BITS
)
427 plru_acc_en
= Signal()
428 plru_out
= Signal(WAY_BITS
)
431 # TODO learn tranlation of entity, generic map, port map in
433 # plru : entity work.plru
441 # acc_en => plru_acc_en,
444 comb
+= plru
.acc
.eq(plru_acc
)
445 comb
+= plru
.acc_en
.eq(plru_acc_en
)
446 comb
+= plru
.lru
.eq(plru_out
)
451 # if r1.hit_index = i then
453 with m
.If(r1
.hit_index
== i
):
454 # plru_acc_en <= r1.cache_hit;
455 comb
+= plru_acc_en
.eq(r1
.cache_hit
)
458 # plru_acc_en <= '0';
459 comb
+= plru_acc_en
.eq(0)
461 # plru_acc <= std_ulogic_vector(to_unsigned(
462 # r1.hit_way, WAY_BITS
464 # plru_victim(i) <= plru_out;
465 comb
+= plru_acc
.eq(r1
.hit_way
)
466 comb
+= plru_victim
[i
].eq(plru_out
)
471 # -- Cache tag RAM read port
472 # cache_tag_read : process(clk)
473 # Cache tag RAM read port
474 def cache_tag_read(self
, r0_stall
, req_index
, m_in
, d_in
,
475 cache_tag_set
, cache_tags
):
480 # variable index : index_t;
481 index
= Signal(NUM_LINES
)
486 # if rising_edge(clk) then
487 # if r0_stall = '1' then
489 # index := req_index;
490 sync
+= index
.eq(req_index
)
492 # elsif m_in.valid = '1' then
493 with m
.Elif(m_in
.valid
):
494 # index := get_index(m_in.addr);
495 sync
+= index
.eq(get_index(m_in
.addr
))
499 # index := get_index(d_in.addr);
500 sync
+= index
.eq(get_index(d_in
.addr
))
502 # cache_tag_set <= cache_tags(index);
503 sync
+= cache_tag_set
.eq(cache_tags
[index
])
507 # Cache request parsing and hit detection
508 def dcache_request(self
, r0
, ra
, req_index
, req_row
, req_tag
,
509 r0_valid
, r1
, cache_valid_bits
, replace_way
,
510 use_forward1_next
, use_forward2_next
,
511 req_hit_way
, plru_victim
, rc_ok
, perm_attr
,
512 valid_ra
, perm_ok
, access_ok
, req_op
, req_ok
,
513 r0_stall
, m_in
, early_req_row
, d_in
):
518 # variable is_hit : std_ulogic;
519 # variable hit_way : way_t;
520 # variable op : op_t;
521 # variable opsel : std_ulogic_vector(2 downto 0);
522 # variable go : std_ulogic;
523 # variable nc : std_ulogic;
524 # variable s_hit : std_ulogic;
525 # variable s_tag : cache_tag_t;
526 # variable s_pte : tlb_pte_t;
527 # variable s_ra : std_ulogic_vector(
528 # REAL_ADDR_BITS - 1 downto 0
530 # variable hit_set : std_ulogic_vector(
531 # TLB_NUM_WAYS - 1 downto 0
533 # variable hit_way_set : hit_way_set_t;
534 # variable rel_matches : std_ulogic_vector(
535 # TLB_NUM_WAYS - 1 downto 0
539 hit_way
= Signal(WAY_BITS
)
545 s_tag
= Signal(TAG_BITS
)
546 s_pte
= Signal(TLB_PTE_BITS
)
547 s_ra
= Signal(REAL_ADDR_BITS
)
548 hit_set
= Signal(TLB_NUM_WAYS
)
549 hit_way_set
= HitWaySet()
550 rel_matches
= Signal(TLB_NUM_WAYS
)
554 # -- Extract line, row and tag from request
555 # req_index <= get_index(r0.req.addr);
556 # req_row <= get_row(r0.req.addr);
557 # req_tag <= get_tag(ra);
559 # go := r0_valid and not (r0.tlbie or r0.tlbld)
560 # and not r1.ls_error;
561 # Extract line, row and tag from request
562 comb
+= req_index
.eq(get_index(r0
.req
.addr
))
563 comb
+= req_row
.eq(get_row(r0
.req
.addr
))
564 comb
+= req_tag
.eq(get_tag(ra
))
566 comb
+= go
.eq(r0_valid
& ~
(r0
.tlbie | r0
.tlbld
) & ~r1
.ls_error
)
571 # Test if pending request is a hit on any way
572 # In order to make timing in virtual mode,
573 # when we are using the TLB, we compare each
574 # way with each of the real addresses from each way of
575 # the TLB, and then decide later which match to use.
577 # if r0.req.virt_mode = '1' then
578 with m
.If(r0
.req
.virt_mode
):
579 # rel_matches := (others => '0');
580 comb
+= rel_matches
.eq(0)
581 # for j in tlb_way_t loop
582 for j
in range(TLB_NUM_WAYS
):
583 # hit_way_set(j) := 0;
585 # s_pte := read_tlb_pte(j, tlb_pte_way);
586 # s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ)
587 # & r0.req.addr(TLB_LG_PGSZ - 1 downto 0);
588 # s_tag := get_tag(s_ra);
589 comb
+= hit_way_set
[j
].eq(0)
591 comb
+= s_pte
.eq(read_tlb_pte(j
, tlb_pte_way
))
593 r0
.req
.addr
[0:TLB_LG_PGSZ
],
594 s_pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]
596 comb
+= s_tag
.eq(get_tag(s_ra
))
598 # for i in way_t loop
599 for i
in range(NUM_WAYS
):
600 # if go = '1' and cache_valids(req_index)(i) = '1'
601 # and read_tag(i, cache_tag_set) = s_tag
602 # and tlb_valid_way(j) = '1' then
603 with m
.If(go
& cache_valid_bits
[req_index
][i
] &
604 read_tag(i
, cache_tag_set
) == s_tag
606 # hit_way_set(j) := i;
608 comb
+= hit_way_set
[j
].eq(i
)
612 # hit_set(j) := s_hit;
613 comb
+= hit_set
[j
].eq(s_hit
)
614 # if s_tag = r1.reload_tag then
615 with m
.If(s_tag
== r1
.reload_tag
):
616 # rel_matches(j) := '1';
617 comb
+= rel_matches
[j
].eq(1)
620 # if tlb_hit = '1' then
622 # is_hit := hit_set(tlb_hit_way);
623 # hit_way := hit_way_set(tlb_hit_way);
624 # rel_match := rel_matches(tlb_hit_way);
625 comb
+= is_hit
.eq(hit_set
[tlb_hit_way
])
626 comb
+= hit_way
.eq(hit_way_set
[tlb_hit_way
])
627 comb
+= rel_match
.eq(rel_matches
[tlb_hit_way
])
631 # s_tag := get_tag(r0.req.addr);
632 comb
+= s_tag
.eq(get_tag(r0
.req
.addr
))
633 # for i in way_t loop
634 for i
in range(NUM_WAYS
):
635 # if go = '1' and cache_valids(req_index)(i) = '1' and
636 # read_tag(i, cache_tag_set) = s_tag then
637 with m
.If(go
& cache_valid_bits
[req_index
][i
] &
638 read_tag(i
, cache_tag_set
) == s_tag
):
641 comb
+= hit_way
.eq(i
)
645 # if s_tag = r1.reload_tag then
646 with m
.If(s_tag
== r1
.reload_tag
):
648 comb
+= rel_match
.eq(1)
651 # req_same_tag <= rel_match;
652 comb
+= req_same_tag
.eq(rel_match
)
654 # if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index
655 # and rel_match = '1' then
656 # See if the request matches the line currently being reloaded
657 with m
.If(r1
.state
== State
.RELOAD_WAIT_ACK
& req_index
==
658 r1
.store_index
& rel_match
):
659 # For a store, consider this a hit even if the row isn't
660 # valid since it will be by the time we perform the store.
661 # For a load, check the appropriate row valid bit.
664 # or r1.rows_valid(req_row mod ROW_PER_LINE);
665 # hit_way := replace_way;
666 comb
+= is_hit
.eq(~r0
.req
.load
667 | r1
.rows_valid
[req_row
% ROW_PER_LINE
]
669 comb
+= hit_way
.eq(replace_way
)
672 # -- Whether to use forwarded data for a load or not
673 # Whether to use forwarded data for a load or not
674 # use_forward1_next <= '0';
675 comb
+= use_forward1_next
.eq(0)
676 # if get_row(r1.req.real_addr) = req_row
677 # and r1.req.hit_way = hit_way then
678 with m
.If(get_row(r1
.req
.real_addr
) == req_row
679 & r1
.req
.hit_way
== hit_way
)
680 # Only need to consider r1.write_bram here, since if we
681 # are writing refill data here, then we don't have a
682 # cache hit this cycle on the line being refilled.
683 # (There is the possibility that the load following the
684 # load miss that started the refill could be to the old
685 # contents of the victim line, since it is a couple of
686 # cycles after the refill starts before we see the updated
687 # cache tag. In that case we don't use the bypass.)
688 # use_forward1_next <= r1.write_bram;
689 comb
+= use_forward1_next
.eq(r1
.write_bram
)
691 # use_forward2_next <= '0';
692 comb
+= use_forward2_next
.eq(0)
693 # if r1.forward_row1 = req_row
694 # and r1.forward_way1 = hit_way then
695 with m
.If(r1
.forward_row1
== req_row
696 & r1
.forward_way1
== hit_way
):
697 # use_forward2_next <= r1.forward_valid1;
698 comb
+= use_forward2_next
.eq(r1
.forward_valid1
)
701 # The way that matched on a hit
702 # req_hit_way <= hit_way;
703 comb
+= req_hit_way
.eq(hit_way
)
705 # The way to replace on a miss
706 # if r1.write_tag = '1' then
707 with m
.If(r1
.write_tag
):
708 # replace_way <= to_integer(unsigned(
709 # plru_victim(r1.store_index)
711 replace_way
.eq(plru_victim
[r1
.store_index
])
714 # replace_way <= r1.store_way;
715 comb
+= replace_way
.eq(r1
.store_way
)
718 # work out whether we have permission for this access
719 # NB we don't yet implement AMR, thus no KUAP
720 # rc_ok <= perm_attr.reference and
721 # (r0.req.load or perm_attr.changed);
722 # perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and
723 # (perm_attr.wr_perm or (r0.req.load
724 # and perm_attr.rd_perm));
725 # access_ok <= valid_ra and perm_ok and rc_ok;
728 & (r0
.req
.load | perm_attr
.changed
)
730 comb
+= perm_ok
.eq((r0
.req
.prive_mode | ~perm_attr
.priv
)
732 |
(r0
.req
.load
& perm_attr
.rd_perm
)
734 comb
+= access_ok
.eq(valid_ra
& perm_ok
& rc_ok
)
735 # nc := r0.req.nc or perm_attr.nocache;
737 # Combine the request and cache hit status to decide what
738 # operation needs to be done
739 comb
+= nc
.eq(r0
.req
.nc | perm_attr
.nocache
)
740 comb
+= op
.eq(Op
.OP_NONE
)
743 # if access_ok = '0' then
744 with m
.If(~access_ok
):
746 comb
+= op
.eq(Op
.OP_BAD
)
747 # elsif cancel_store = '1' then
748 with m
.Elif(cancel_store
):
749 # op := OP_STCX_FAIL;
750 comb
+= op
.eq(Op
.OP_STCX_FAIL
)
753 # opsel := r0.req.load & nc & is_hit;
754 comb
+= opsel
.eq(Cat(is_hit
, nc
, r0
.req
.load
))
756 with m
.Switch(opsel
):
757 # when "101" => op := OP_LOAD_HIT;
758 # when "100" => op := OP_LOAD_MISS;
759 # when "110" => op := OP_LOAD_NC;
760 # when "001" => op := OP_STORE_HIT;
761 # when "000" => op := OP_STORE_MISS;
762 # when "010" => op := OP_STORE_MISS;
763 # when "011" => op := OP_BAD;
764 # when "111" => op := OP_BAD;
765 # when others => op := OP_NONE;
766 with m
.Case(Const(0b101, 3)):
767 comb
+= op
.eq(Op
.OP_LOAD_HIT
)
769 with m
.Case(Cosnt(0b100, 3)):
770 comb
+= op
.eq(Op
.OP_LOAD_MISS
)
772 with m
.Case(Const(0b110, 3)):
773 comb
+= op
.eq(Op
.OP_LOAD_NC
)
775 with m
.Case(Const(0b001, 3)):
776 comb
+= op
.eq(Op
.OP_STORE_HIT
)
778 with m
.Case(Const(0b000, 3)):
779 comb
+= op
.eq(Op
.OP_STORE_MISS
)
781 with m
.Case(Const(0b010, 3)):
782 comb
+= op
.eq(Op
.OP_STORE_MISS
)
784 with m
.Case(Const(0b011, 3)):
785 comb
+= op
.eq(Op
.OP_BAD
)
787 with m
.Case(Const(0b111, 3)):
788 comb
+= op
.eq(Op
.OP_BAD
)
791 comb
+= op
.eq(Op
.OP_NONE
)
797 comb
+= req_op
.eq(op
)
798 comb
+= req_go
.eq(go
)
800 # Version of the row number that is valid one cycle earlier
801 # in the cases where we need to read the cache data BRAM.
802 # If we're stalling then we need to keep reading the last
804 # if r0_stall = '0' then
805 with m
.If(~r0_stall
):
806 # if m_in.valid = '1' then
807 with m
.If(m_in
.valid
):
808 # early_req_row <= get_row(m_in.addr);
809 comb
+= early_req_row
.eq(get_row(m_in
.addr
))
812 # early_req_row <= get_row(d_in.addr);
813 comb
+= early_req_row
.eq(get_row(d_in
.addr
))
817 # early_req_row <= req_row;
818 comb
+= early_req_row
.eq(req_row
)
822 # Handle load-with-reservation and store-conditional instructions
823 def reservation_comb(self
, cancel_store
, set_rsrv
, clear_rsrv
,
824 r0_valid
, r0
, reservation
):
830 # cancel_store <= '0';
833 # if r0_valid = '1' and r0.req.reserve = '1' then
834 with m
.If(r0_valid
& r0
.req
.reserve
):
836 # -- XXX generate alignment interrupt if address
837 # -- is not aligned XXX or if r0.req.nc = '1'
838 # if r0.req.load = '1' then
839 # XXX generate alignment interrupt if address
840 # is not aligned XXX or if r0.req.nc = '1'
841 with m
.If(r0
.req
.load
):
842 # -- load with reservation
844 # load with reservation
848 # -- store conditional
851 comb
+= clear_rsrv
.eq(1)
852 # if reservation.valid = '0' or r0.req.addr(63
853 # downto LINE_OFF_BITS) /= reservation.addr then
854 with m
.If(~reservation
.valid
855 | r0
.req
.addr
[LINE_OFF_BITS
:64]):
856 # cancel_store <= '1';
857 comb
+= cancel_store
.eq(1)
863 def reservation_reg(self
, r0_valid
, access_ok
, clear_rsrv
,
870 # if rising_edge(clk) then
872 # reservation.valid <= '0';
873 # TODO understand how resets work in nmigen
874 # elsif r0_valid = '1' and access_ok = '1' then
875 with m
.Elif(r0_valid
& access_ok
):
876 # if clear_rsrv = '1' then
877 with m
.If(clear_rsrv
):
878 # reservation.valid <= '0';
879 sync
+= reservation
.valid
.ea(0)
880 # elsif set_rsrv = '1' then
881 with m
.Elif(set_rsrv
):
882 # reservation.valid <= '1';
883 # reservation.addr <=
884 # r0.req.addr(63 downto LINE_OFF_BITS);
885 sync
+= reservation
.valid
.eq(1)
886 sync
+= reservation
.addr
.eq(
887 r0
.req
.addr
[LINE_OFF_BITS
:64]
894 # Return data for loads & completion control logic
895 def writeback_control(self
, r1
, cache_out
, d_out
, m_out
):
900 # variable data_out : std_ulogic_vector(63 downto 0);
901 # variable data_fwd : std_ulogic_vector(63 downto 0);
902 # variable j : integer;
903 data_out
= Signal(64)
904 data_fwd
= Signal(64)
908 # -- Use the bypass if are reading the row that was
909 # -- written 1 or 2 cycles ago, including for the
910 # -- slow_valid = 1 case (i.e. completing a load
911 # -- miss or a non-cacheable load).
912 # if r1.use_forward1 = '1' then
913 # Use the bypass if are reading the row that was
914 # written 1 or 2 cycles ago, including for the
915 # slow_valid = 1 case (i.e. completing a load
916 # miss or a non-cacheable load).
917 with m
.If(r1
.use_forward1
):
918 # data_fwd := r1.forward_data1;
919 comb
+= data_fwd
.eq(r1
.forward_data1
)
922 # data_fwd := r1.forward_data2;
923 comb
+= data_fwd
.eq(r1
.forward_data2
)
926 # data_out := cache_out(r1.hit_way);
927 comb
+= data_out
.eq(cache_out
[r1
.hit_way
])
929 # for i in 0 to 7 loop
934 # if r1.forward_sel(i) = '1' then
935 with m
.If(r1
.forward_sel
[i
]):
936 # data_out(j + 7 downto j) := data_fwd(j + 7 downto j);
937 comb
+= data_out
[j
:j
+8].eq(data_fwd
[j
:j
+8])
941 # d_out.valid <= r1.ls_valid;
942 # d_out.data <= data_out;
943 # d_out.store_done <= not r1.stcx_fail;
944 # d_out.error <= r1.ls_error;
945 # d_out.cache_paradox <= r1.cache_paradox;
946 comb
+= d_out
.valid
.eq(r1
.ls_valid
)
947 comb
+= d_out
.data
.eq(data_out
)
948 comb
+= d_out
.store_done
.eq(~r1
.stcx_fail
)
949 comb
+= d_out
.error
.eq(r1
.ls_error
)
950 comb
+= d_out
.cache_paradox
.eq(r1
.cache_paradox
)
953 # m_out.done <= r1.mmu_done;
954 # m_out.err <= r1.mmu_error;
955 # m_out.data <= data_out;
956 comb
+= m_out
.done
.eq(r1
.mmu_done
)
957 comb
+= m_out
.err
.eq(r1
.mmu_error
)
958 comb
+= m_out
.data
.eq(data_out
)
960 # -- We have a valid load or store hit or we just completed
961 # -- a slow op such as a load miss, a NC load or a store
963 # -- Note: the load hit is delayed by one cycle. However it
964 # -- can still not collide with r.slow_valid (well unless I
965 # -- miscalculated) because slow_valid can only be set on a
966 # -- subsequent request and not on its first cycle (the state
967 # -- machine must have advanced), which makes slow_valid
968 # -- at least 2 cycles from the previous hit_load_valid.
970 # -- Sanity: Only one of these must be set in any given cycle
971 # assert (r1.slow_valid and r1.stcx_fail) /= '1'
972 # report "unexpected slow_valid collision with stcx_fail"
974 # assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid)
975 # /= '1' report "unexpected hit_load_delayed collision with
976 # slow_valid" severity FAILURE;
977 # We have a valid load or store hit or we just completed
978 # a slow op such as a load miss, a NC load or a store
980 # Note: the load hit is delayed by one cycle. However it
981 # can still not collide with r.slow_valid (well unless I
982 # miscalculated) because slow_valid can only be set on a
983 # subsequent request and not on its first cycle (the state
984 # machine must have advanced), which makes slow_valid
985 # at least 2 cycles from the previous hit_load_valid.
987 # Sanity: Only one of these must be set in any given cycle
988 assert (r1
.slow_valid
& r1
.stcx_fail
) != 1 "unexpected" \
989 "slow_valid collision with stcx_fail -!- severity FAILURE"
991 assert ((r1
.slow_valid | r1
.stcx_fail
) | r1
.hit_load_valid
) != 1
992 "unexpected hit_load_delayed collision with slow_valid -!-" \
995 # if r1.mmu_req = '0' then
996 with m
.If(~r1
._mmu_req
):
997 # -- Request came from loadstore1...
998 # -- Load hit case is the standard path
999 # if r1.hit_load_valid = '1' then
1000 # Request came from loadstore1...
1001 # Load hit case is the standard path
1002 with m
.If(r1
.hit_load_valid
):
1004 # "completing load hit data=" & to_hstring(data_out);
1005 print(f
"completing load hit data={data_out}")
1008 # -- error cases complete without stalling
1009 # if r1.ls_error = '1' then
1010 # error cases complete without stalling
1011 with m
.If(r1
.ls_error
):
1012 # report "completing ld/st with error";
1013 print("completing ld/st with error")
1016 # -- Slow ops (load miss, NC, stores)
1017 # if r1.slow_valid = '1' then
1018 # Slow ops (load miss, NC, stores)
1019 with m
.If(r1
.slow_valid
):
1021 # "completing store or load miss data="
1022 # & to_hstring(data_out);
1023 print(f
"completing store or load miss data={data_out}")
1028 # -- Request came from MMU
1029 # if r1.hit_load_valid = '1' then
1030 # Request came from MMU
1031 with m
.If(r1
.hit_load_valid
):
1032 # report "completing load hit to MMU, data="
1033 # & to_hstring(m_out.data);
1034 print(f
"completing load hit to MMU, data={m_out.data}")
1037 # -- error cases complete without stalling
1038 # if r1.mmu_error = '1' then
1039 # report "completing MMU ld with error";
1040 # error cases complete without stalling
1041 with m
.If(r1
.mmu_error
):
1042 print("combpleting MMU ld with error")
1045 # -- Slow ops (i.e. load miss)
1046 # if r1.slow_valid = '1' then
1047 # Slow ops (i.e. load miss)
1048 with m
.If(r1
.slow_valid
):
1049 # report "completing MMU load miss, data="
1050 # & to_hstring(m_out.data);
1051 print("completing MMU load miss, data={m_out.data}")
1056 # -- Generate a cache RAM for each way. This handles the normal
1057 # -- reads, writes from reloads and the special store-hit update
1060 # -- Note: the BRAMs have an extra read buffer, meaning the output
1061 # -- is pipelined an extra cycle. This differs from the
1062 # -- icache. The writeback logic needs to take that into
1063 # -- account by using 1-cycle delayed signals for load hits.
1065 # rams: for i in 0 to NUM_WAYS-1 generate
1066 # Generate a cache RAM for each way. This handles the normal
1067 # reads, writes from reloads and the special store-hit update
1070 # Note: the BRAMs have an extra read buffer, meaning the output
1071 # is pipelined an extra cycle. This differs from the
1072 # icache. The writeback logic needs to take that into
1073 # account by using 1-cycle delayed signals for load hits.
1075 for i
in range(NUM_WAYS
):
1076 # signal do_read : std_ulogic;
1077 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
1078 # signal do_write : std_ulogic;
1079 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
1081 # std_ulogic_vector(wishbone_data_bits-1 downto 0);
1082 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
1083 # signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
1084 # signal dout : cache_row_t;
1086 rd_addr
= Signal(ROW_BITS
)
1088 wr_addr
= Signal(ROW_BITS
)
1089 wr_data
= Signal(WB_DATA_BITS
)
1090 wr_sel
= Signal(ROW_SIZE
)
1091 wr_sel_m
= Signal(ROW_SIZE
)
1092 _d_out
= Signal(WB_DATA_BITS
)
1095 # way: entity work.cache_ram
1097 # ROW_BITS => ROW_BITS,
1098 # WIDTH => wishbone_data_bits,
1104 # rd_addr => rd_addr,
1106 # wr_sel => wr_sel_m,
1107 # wr_addr => wr_addr,
1108 # wr_data => wr_data
1111 way
= CacheRam(ROW_BITS
, WB_DATA_BITS
, True)
1112 comb
+= way
.rd_en
.eq(do_read
)
1113 comb
+= way
.rd_addr
.eq(rd_addr
)
1114 comb
+= way
.rd_data
.eq(_d_out
)
1115 comb
+= way
.wr_sel
.eq(wr_sel_m
)
1116 comb
+= way
.wr_addr
.eq(wr_addr
)
1117 comb
+= way
.wr_data
.eq(wr_data
)
1120 # -- Cache hit reads
1123 # std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS));
1124 # cache_out(i) <= dout;
1126 comb
+= do_read
.eq(1)
1127 comb
+= rd_addr
.eq(Signal(BRAM_ROWS
))
1128 comb
+= cache_out
[i
].eq(dout
)
1132 # -- Defaults to wishbone read responses (cache refill)
1134 # -- For timing, the mux on wr_data/sel/addr is not
1135 # -- dependent on anything other than the current state.
1138 # Defaults to wishbone read responses (cache refill)
1140 # For timing, the mux on wr_data/sel/addr is not
1141 # dependent on anything other than the current state.
1142 # wr_sel_m <= (others => '0');
1143 comb
+= wr_sel_m
.eq(0)
1146 comb
+= do_write
.eq(0)
1147 # if r1.write_bram = '1' then
1148 with m
.If(r1
.write_bram
):
1149 # -- Write store data to BRAM. This happens one
1150 # -- cycle after the store is in r0.
1151 # Write store data to BRAM. This happens one
1152 # cycle after the store is in r0.
1153 # wr_data <= r1.req.data;
1154 # wr_sel <= r1.req.byte_sel;
1155 # wr_addr <= std_ulogic_vector(to_unsigned(
1156 # get_row(r1.req.real_addr), ROW_BITS
1158 comb
+= wr_data
.eq(r1
.req
.data
)
1159 comb
+= wr_sel
.eq(r1
.req
.byte_sel
)
1160 comb
+= wr_addr
.eq(Signal(get_row(r1
.req
.real_addr
)))
1162 # if i = r1.req.hit_way then
1163 with m
.If(i
== r1
.req
.hit_way
):
1165 comb
+= do_write
.eq(1)
1169 # -- Otherwise, we might be doing a reload or a DCBZ
1170 # if r1.dcbz = '1' then
1171 # Otherwise, we might be doing a reload or a DCBZ
1173 # wr_data <= (others => '0');
1174 comb
+= wr_data
.eq(0)
1177 # wr_data <= wishbone_in.dat;
1178 comb
+= wr_data
.eq(wishbone_in
.dat
)
1181 # wr_addr <= std_ulogic_vector(to_unsigned(
1182 # r1.store_row, ROW_BITS
1184 # wr_sel <= (others => '1');
1185 comb
+= wr_addr
.eq(Signal(r1
.store_row
))
1186 comb
+= wr_sel
.eq(1)
1188 # if r1.state = RELOAD_WAIT_ACK and
1189 # wishbone_in.ack = '1' and replace_way = i then
1190 with m
.If(r1
.state
== State
.RELOAD_WAIT_ACK
1191 & wishbone_in
.ack
& relpace_way
== i
):
1193 comb
+= do_write
.eq(1)
1197 # -- Mask write selects with do_write since BRAM
1198 # -- doesn't have a global write-enable
1199 # if do_write = '1' then
1200 # -- Mask write selects with do_write since BRAM
1201 # -- doesn't have a global write-enable
1202 with m
.If(do_write
):
1203 # wr_sel_m <= wr_sel;
1204 comb
+= wr_sel_m
.eq(wr_sel
)
1209 # Cache hit synchronous machine for the easy case.
1210 # This handles load hits.
1211 # It also handles error cases (TLB miss, cache paradox)
def dcache_fast_hit(self, req_op, r0_valid, r1):
    """Cache hit synchronous machine for the easy case.

    Handles load hits, and error cases (TLB miss, cache paradox),
    latching hit/error status and TLB-hit bookkeeping into r1 each
    clock.

    NOTE(review): the body also references m, sync, Op, r0, req_index,
    req_tag, req_hit_way, valid_ra, rc_ok, perm_ok, access_ok, tlb_hit,
    tlb_hit_way and tlb_req_index, which are not parameters -- the
    parameter list appears truncated in transcription; these must be in
    scope (TODO: confirm against the full file and pass explicitly).
    """
    # if req_op /= OP_NONE then report ... (debug)
    with m.If(req_op != Op.OP_NONE):
        # NOTE: elaboration-time debug print (runs once at build time,
        # not per simulated cycle) -- kept from the draft.
        print(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}"
              f" idx:{req_index} tag:{req_tag} way: {req_hit_way}")

    with m.If(r0_valid):
        sync += r1.mmu_req.eq(r0.mmu_req)

    # Fast path for load/store hits.
    # Set signals for the writeback controls.
    sync += r1.hit_way.eq(req_hit_way)
    sync += r1.hit_index.eq(req_index)

    with m.If(req_op == Op.OP_LOAD_HIT):
        sync += r1.hit_load_valid.eq(1)
    with m.Else():
        sync += r1.hit_load_valid.eq(0)

    # BUGFIX: parenthesise the comparisons.  Python (and hence nmigen)
    # binds | tighter than ==, so the draft evaluated
    # req_op == (Op.OP_LOAD_HIT | req_op) == Op.OP_STORE_HIT.
    with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
        sync += r1.cache_hit.eq(1)
    with m.Else():
        sync += r1.cache_hit.eq(0)

    with m.If(req_op == Op.OP_BAD):
        print(f"Signalling ld/st error valid_ra={valid_ra}"
              f" rc_ok={rc_ok} perm_ok={perm_ok}")
        # MMU-originated requests report via mmu_error, loadstore ones
        # via ls_error; cache_paradox flags the cacheability mismatch.
        sync += r1.ls_error.eq(~r0.mmu_req)
        sync += r1.mmu_error.eq(r0.mmu_req)
        sync += r1.cache_paradox.eq(access_ok)
    with m.Else():
        sync += r1.ls_error.eq(0)
        sync += r1.mmu_error.eq(0)
        sync += r1.cache_paradox.eq(0)

    with m.If(req_op == Op.OP_STCX_FAIL):
        # BUGFIX: restored -- the assignment itself was lost in
        # transcription; its VHDL comment (r1.stcx_fail <= '1')
        # survived.
        sync += r1.stcx_fail.eq(1)
    with m.Else():
        sync += r1.stcx_fail.eq(0)

    # Record TLB hit information for updating TLB PLRU
    sync += r1.tlb_hit.eq(tlb_hit)
    sync += r1.tlb_hit_way.eq(tlb_hit_way)
    sync += r1.tlb_hit_index.eq(tlb_req_index)
1315 # Memory accesses are handled by this state machine:
1317 # * Cache load miss/reload (in conjunction with "rams")
1318 # * Load hits for non-cachable forms
1319 # * Stores (the collision case is handled in "rams")
1321 # All wishbone requests generation is done here.
1322 # This machine operates at stage 1.
def dcache_slow(self, r1, use_forward1_next, cache_valid_bits, r0,
                r0_valid, req_op, cache_tag, req_go, ra, wb_in):
    """Memory-access state machine (stage 1).

    Handles:
     * cache load miss/reload (in conjunction with "rams")
     * load hits for non-cachable forms
     * stores (the collision case is handled in "rams")

    All wishbone request generation is done here.

    NOTE(review): the body also references m, sync, Op, State, Signal,
    MemAccessRequest, Const, use_forward2_next, replace_way,
    req_hit_way, req_same_tag, cache_tags, NUM_LINES, NUM_WAYS,
    TAG_WIDTH, ROW_PER_LINE, SET_SIZE_BITS and the get_*/next_*/
    is_last_* helpers, which are not parameters -- they must be in
    scope (TODO: pass in explicitly once the class settles).
    """
    # VHDL variables, modelled as Signals in the draft.
    # NOTE(review): VHDL variable assignment takes effect immediately
    # within the process, whereas Signal + sync is registered; these
    # probably want comb assignments -- TODO revisit.
    stbs_done = Signal()
    req = MemAccessRequest()
    # BUGFIX: restored -- declaration lost in transcription; the VHDL
    # comment "variable acks : unsigned(2 downto 0)" survived.
    acks = Signal(3)

    sync += r1.use_forward1.eq(use_forward1_next)
    sync += r1.forward_sel.eq(0)
    with m.If(use_forward1_next):
        sync += r1.forward_sel.eq(r1.req.byte_sel)
    with m.Elif(use_forward2_next):
        sync += r1.forward_sel.eq(r1.forward_sel1)

    sync += r1.forward_data2.eq(r1.forward_data1)
    with m.If(r1.write_bram):
        sync += r1.forward_data1.eq(r1.req.data)
        sync += r1.forward_sel1.eq(r1.req.byte_sel)
        sync += r1.forward_way1.eq(r1.req.hit_way)
        sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
        sync += r1.forward_valid1.eq(1)
    with m.Else():
        with m.If(r1.dcbz):
            sync += r1.forward_data1.eq(0)
        with m.Else():
            sync += r1.forward_data1.eq(wb_in.dat)
        # BUGFIX: (others => '1') must set *all* bits; .eq(1) only set
        # bit 0.  .eq(-1) fills the signal with ones.
        sync += r1.forward_sel1.eq(-1)
        sync += r1.forward_way1.eq(replace_way)
        sync += r1.forward_row1.eq(r1.store_row)
        sync += r1.forward_valid1.eq(0)

    # On reset, clear all valid bits to force misses.
    # TODO figure out how the reset signal works in nmigen -- the sync
    # domain's implicit reset would normally cover this.
    with m.If("""TODO RST???"""):
        for i in range(NUM_LINES):
            sync += cache_valid_bits[i].eq(0)
        sync += r1.state.eq(State.IDLE)
        sync += r1.full.eq(0)
        sync += r1.slow_valid.eq(0)
        sync += r1.wb.cyc.eq(0)
        sync += r1.wb.stb.eq(0)
        sync += r1.ls_valid.eq(0)
        sync += r1.mmu_done.eq(0)
        # Not useful normally but helps avoiding tons of sim warnings
        sync += r1.wb.adr.eq(0)
    with m.Else():
        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)
        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        # BUGFIX: parenthesise == against | (precedence).
        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    sync += cache_tags[r1.store_index][
                        i * TAG_WIDTH:(i + 1) * TAG_WIDTH].eq(
                        # TODO(review): the VHDL stores r1.reload_tag
                        # zero-padded to TAG_WIDTH; Const(TAG_WIDTH,
                        # TAG_WIDTH) from the draft looks wrong --
                        # confirm and replace with the padded tag.
                        Const(TAG_WIDTH, TAG_WIDTH))
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            sync += req.eq(r1.req)
        with m.Else():
            sync += req.op.eq(req_op)
            sync += req.valid.eq(req_go)
            sync += req.mmu_req.eq(r0.mmu_req)
            sync += req.dcbz.eq(r0.req.dcbz)
            sync += req.real_addr.eq(ra)
            # Force data to 0 for dcbz
            with m.If(~r0.req.dcbz):
                sync += req.data.eq(r0.req.data)
            with m.Else():
                sync += req.data.eq(0)
            # Select all bytes for dcbz and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                sync += req.byte_sel.eq(-1)  # (others => '1')
            with m.Else():
                sync += req.byte_sel.eq(r0.req.byte_sel)
            sync += req.hit_way.eq(req_hit_way)
            sync += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0, if it is a slow request.
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            # BUGFIX: restored r1.req capture -- lost in transcription
            # (the VHDL stores req alongside setting full).
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):
            with m.Case(State.IDLE):  # BUGFIX: colon was missing
                # BUGFIX: slicing by a Signal (real_addr[0:r1.wb.adr])
                # is meaningless; the VHDL takes the low adr'length
                # bits.
                sync += r1.wb.adr.eq(req.real_addr[0:len(r1.wb.adr)])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(get_index(req.real_addr))
                sync += r1.store_row.eq(get_row(req.real_addr))
                sync += r1.end_row_ix.eq(
                    get_row_of_line(get_row(req.real_addr)) - 1)
                sync += r1.reload_tag.eq(get_tag(req.real_addr))
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        # Normal load cache miss,
                        # start the reload machine.
                        # BUGFIX: the draft interpolated the undefined
                        # name req_real_addr; elaboration-time print.
                        print(f"cache miss real addr:{req.real_addr}"
                              f" idx:{get_index(req.real_addr)}"
                              f" tag:{get_tag(req.real_addr)}")
                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    # BUGFIX: OP_STORE_HIT | OP_STORE_MISS bitwise-ORed
                    # the enum values (6|7 == 7); nmigen Case takes
                    # multiple alternatives as separate arguments.
                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):  # BUGFIX: was req.bcbz
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            # to_unsigned(1, 3) is simply the value 1
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)
                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)
                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss
                            # except that we are writing to memory
                            # instead of reading.
                            # BUGFIX: was Op.RELOAD_WAIT_ACK --
                            # RELOAD_WAIT_ACK is a State, not an Op.
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)
                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing;
                    # OP_BAD & OP_STCX_FAIL were handled above already.
                    # BUGFIX: Op. prefix was missing on these Cases.
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                # Requests are all sent if stb is 0
                sync += stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests,
                # was one accepted?
                with m.If(~wb_in.stall & ~stbs_done):
                    # That was the last word?  We are done sending.
                    # Clear stb and set stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    with m.If(is_last_row_addr(r1.wb.adr,
                                               r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        # BUGFIX: VHDL sets stbs_done := true here;
                        # the draft wrote .eq(0).
                        sync += stbs_done.eq(1)
                    with m.Else():
                        # Calculate the next row address
                        sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    # mark this row of the line as now valid
                    sync += r1.rows_valid[
                        r1.store_row % ROW_PER_LINE].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    # BUGFIX: parenthesised per the VHDL condition.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz)
                               | (~r1.dcbz &
                                  (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row ==
                               get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(-1)  # all ones
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(stbs_done &
                              is_last_row(r1.store_row,
                                          r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)
                        # Cache line is now valid
                        sync += cache_valid_bits[
                            r1.store_index][r1.store_way].eq(1)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                sync += stbs_done.eq(~r1.wb.stb)
                sync += acks.eq(r1.acks_pending)
                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        sync += acks.eq(acks + 1)
                    with m.Else():
                        sync += acks.eq(acks - 1)
                sync += r1.acks_pending.eq(acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(
                            req.real_addr[0:SET_SIZE_BITS])
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)
                    # BUGFIX: (acks < 7) parenthesised -- & binds
                    # tighter than <, so the draft computed
                    # acks < (7 & req.same_tag); also fixed the
                    # Op_STORE_MISS / OP_SOTRE_HIT typos.
                    with m.Elif((acks < 7) & req.same_tag &
                                ((req.op == Op.OP_STORE_MISS)
                                 | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        sync += stbs_done.eq(0)
                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        sync += stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        sync += stbs_done.eq(1)

                # Got ack?  See if complete.
                with m.If(wb_in.ack):
                    # BUGFIX: condition was "stbs_done & acks" with a
                    # missing colon; VHDL says "stbs_done and acks = 1".
                    with m.If(stbs_done & (acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)
                # Got ack?  Complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)
                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)
                    sync += r1.forward_sel.eq(-1)  # all ones
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)
2022 # dc_log: if LOG_LENGTH > 0 generate
2023 # TODO learn how to tranlate vhdl generate into nmigen
# dc_log: if LOG_LENGTH > 0 generate
# TODO learn how to translate vhdl generate into nmigen
def dcache_log(self, r1, valid_ra, tlb_hit_way, stall_out,
               d_out, wb_in, log_out):
    """Latch a 20-bit debug trace word each cycle and drive log_out.

    NOTE(review): req_op, m, sync and comb are referenced but are not
    parameters -- they must be in scope (TODO: pass in explicitly).
    """
    # signal log_data : std_ulogic_vector(19 downto 0)
    log_data = Signal(20)

    # NOTE(review): Const(signal, 3) cannot encode a *live* Signal; the
    # VHDL truncates these values to 3 bits, which in nmigen would be a
    # plain slice (e.g. r1.state[:3]).  Kept as in the draft, marked
    # for rework.
    # BUGFIX: the Cat(...) call was truncated mid-expression in the
    # draft (unclosed parens); closed here, with the lost final element
    # reconstructed from the VHDL comment "r1.wb.adr(5 downto 3)" --
    # TODO confirm against the original.
    sync += log_data.eq(Cat(
        Const(r1.state, 3), valid_ra, Const(tlb_hit_way, 3),
        stall_out, Const(req_op, 3), d_out.valid, d_out.error,
        r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
        r1.wb.adr[3:6]))

    # log_out <= log_data
    comb += log_out.eq(log_data)
2067 def elaborate(self
, platform
):
2068 LINE_SIZE
= self
.LINE_SIZE
2069 NUM_LINES
= self
.NUM_LINES
2070 NUM_WAYS
= self
.NUM_WAYS
2071 TLB_SET_SIZE
= self
.TLB_SET_SIZE
2072 TLB_NUM_WAYS
= self
.TLB_NUM_WAYS
2073 TLB_LG_PGSZ
= self
.TLB_LG_PGSZ
2074 LOG_LENGTH
= self
.LOG_LENGTH
2076 # BRAM organisation: We never access more than
2077 # -- wishbone_data_bits at a time so to save
2078 # -- resources we make the array only that wide, and
2079 # -- use consecutive indices for to make a cache "line"
2081 # -- ROW_SIZE is the width in bytes of the BRAM
2082 # -- (based on WB, so 64-bits)
2083 ROW_SIZE
= WB_DATA_BITS
/ 8;
2085 # ROW_PER_LINE is the number of row (wishbone
2086 # transactions) in a line
2087 ROW_PER_LINE
= LINE_SIZE
// ROW_SIZE
2089 # BRAM_ROWS is the number of rows in BRAM needed
2090 # to represent the full dcache
2091 BRAM_ROWS
= NUM_LINES
* ROW_PER_LINE
2094 # Bit fields counts in the address
2096 # REAL_ADDR_BITS is the number of real address
2097 # bits that we store
2100 # ROW_BITS is the number of bits to select a row
2101 ROW_BITS
= log2_int(BRAM_ROWS
)
2103 # ROW_LINE_BITS is the number of bits to select
2104 # a row within a line
2105 ROW_LINE_BITS
= log2_int(ROW_PER_LINE
)
2107 # LINE_OFF_BITS is the number of bits for
2108 # the offset in a cache line
2109 LINE_OFF_BITS
= log2_int(LINE_SIZE
)
2111 # ROW_OFF_BITS is the number of bits for
2112 # the offset in a row
2113 ROW_OFF_BITS
= log2_int(ROW_SIZE
)
2115 # INDEX_BITS is the number if bits to
2116 # select a cache line
2117 INDEX_BITS
= log2_int(NUM_LINES
)
2119 # SET_SIZE_BITS is the log base 2 of the set size
2120 SET_SIZE_BITS
= LINE_OFF_BITS
+ INDEX_BITS
2122 # TAG_BITS is the number of bits of
2123 # the tag part of the address
2124 TAG_BITS
= REAL_ADDR_BITS
- SET_SIZE_BITS
2126 # TAG_WIDTH is the width in bits of each way of the tag RAM
2127 TAG_WIDTH
= TAG_BITS
+ 7 - ((TAG_BITS
+ 7) % 8)
2129 # WAY_BITS is the number of bits to select a way
2130 WAY_BITS
= log2_int(NUM_WAYS
)
2132 # Example of layout for 32 lines of 64 bytes:
2134 # .. tag |index| line |
2136 # .. | |---| | ROW_LINE_BITS (3)
2137 # .. | |--- - --| LINE_OFF_BITS (6)
2138 # .. | |- --| ROW_OFF_BITS (3)
2139 # .. |----- ---| | ROW_BITS (8)
2140 # .. |-----| | INDEX_BITS (5)
2141 # .. --------| | TAG_BITS (45)
2143 TAG_RAM_WIDTH
= TAG_WIDTH
* NUM_WAYS
2145 def CacheTagArray():
2146 return Array(CacheTagSet() for x
in range(NUM_LINES
))
2148 def CacheValidBitsArray():
2149 return Array(CacheWayValidBits() for x
in range(NUM_LINES
))
2151 def RowPerLineValidArray():
2152 return Array(Signal() for x
in range(ROW_PER_LINE
))
2154 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
2155 cache_tags
= CacheTagArray()
2156 cache_tag_set
= Signal(TAG_RAM_WIDTH
)
2157 cache_valid_bits
= CacheValidBitsArray()
2159 # TODO attribute ram_style : string;
2160 # TODO attribute ram_style of cache_tags : signal is "distributed";
2163 TLB_SET_BITS
= log2_int(TLB_SET_SIZE
)
2164 TLB_WAY_BITS
= log2_int(TLB_NUM_WAYS
)
2165 TLB_EA_TAG_BITS
= 64 - (TLB_LG_PGSZ
+ TLB_SET_BITS
)
2166 TLB_TAG_WAY_BITS
= TLB_NUM_WAYS
* TLB_EA_TAG_BITS
2168 TLB_PTE_WAY_BITS
= TLB_NUM_WAYS
* TLB_PTE_BITS
;
2170 def TLBValidBitsArray():
2172 Signal(TLB_NUM_WAYS
) for x
in range(TLB_SET_SIZE
)
2177 Signal(TLB_TAG_WAY_BITS
) for x
in range (TLB_SET_SIZE
)
2182 Signal(TLB_PTE_WAY_BITS
) for x
in range(TLB_SET_SIZE
)
2186 return Array(Signal(NUM_WAYS
) for x
in range(TLB_NUM_WAYS
))
2188 """note: these are passed to nmigen.hdl.Memory as "attributes".
2189 don't know how, just that they are.
2191 dtlb_valid_bits
= TLBValidBitsArray()
2192 dtlb_tags
= TLBTagsArray()
2193 dtlb_ptes
= TLBPtesArray()
2194 # TODO attribute ram_style of
2195 # dtlb_tags : signal is "distributed";
2196 # TODO attribute ram_style of
2197 # dtlb_ptes : signal is "distributed";
2204 reservation
= Reservation()
2206 # Async signals on incoming request
2207 req_index
= Signal(NUM_LINES
)
2208 req_row
= Signal(BRAM_ROWS
)
2209 req_hit_way
= Signal(WAY_BITS
)
2210 req_tag
= Signal(TAG_BITS
)
2212 req_data
= Signal(64)
2213 req_same_tag
= Signal()
2216 early_req_row
= Signal(BRAM_ROWS
)
2218 cancel_store
= Signal()
2220 clear_rsrv
= Signal()
2225 use_forward1_next
= Signal()
2226 use_forward2_next
= Signal()
2228 # Cache RAM interface
2230 return Array(Signal(WB_DATA_BITS
) for x
in range(NUM_WAYS
))
2232 cache_out
= CacheRamOut()
2234 # PLRU output interface
2236 return Array(Signal(WAY_BITS
) for x
in range(Index()))
2238 plru_victim
= PLRUOut()
2239 replace_way
= Signal(WAY_BITS
)
2241 # Wishbone read/write/cache write formatting signals
2245 tlb_tag_way
= Signal(TLB_TAG_WAY_BITS
)
2246 tlb_pte_way
= Signal(TLB_PTE_WAY_BITS
)
2247 tlb_valid_way
= Signal(TLB_NUM_WAYS
)
2248 tlb_req_index
= Signal(TLB_SET_SIZE
)
2250 tlb_hit_way
= Signal(TLB_NUM_WAYS
)
2251 pte
= Signal(TLB_PTE_BITS
)
2252 ra
= Signal(REAL_ADDR_BITS
)
2254 perm_attr
= PermAttr()
2257 access_ok
= Signal()
2259 # TLB PLRU output interface
2262 Signal(TLB_WAY_BITS
) for x
in range(TLB_SET_SIZE
)
2265 tlb_plru_victim
= TLBPLRUOut()
2267 # Helper functions to decode incoming requests
2269 # Return the cache line index (tag index) for an address
2270 def get_index(addr
):
2271 return addr
[LINE_OFF_BITS
:SET_SIZE_BITS
]
2273 # Return the cache row index (data memory) for an address
2275 return addr
[ROW_OFF_BITS
:SET_SIZE_BITS
]
2277 # Return the index of a row within a line
2278 def get_row_of_line(row
):
2279 row_v
= Signal(ROW_BITS
)
2281 return row_v
[0:ROW_LINE_BITS
]
2283 # Returns whether this is the last row of a line
2284 def is_last_row_addr(addr
, last
):
2285 return addr
[ROW_OFF_BITS
:LINE_OFF_BITS
] == last
2287 # Returns whether this is the last row of a line
2288 def is_last_row(row
, last
):
2289 return get_row_of_line(row
) == last
2291 # Return the address of the next row in the current cache line
2292 def next_row_addr(addr
):
2293 row_idx
= Signal(ROW_LINE_BITS
)
2294 result
= WBAddrType()
2295 # Is there no simpler way in VHDL to
2296 # generate that 3 bits adder ?
2297 row_idx
= addr
[ROW_OFF_BITS
:LINE_OFF_BITS
]
2298 row_idx
= Signal(row_idx
+ 1)
2300 result
[ROW_OFF_BITS
:LINE_OFF_BITS
] = row_idx
2303 # Return the next row in the current cache line. We use a
2304 # dedicated function in order to limit the size of the
2305 # generated adder to be only the bits within a cache line
2306 # (3 bits with default settings)
2308 row_v
= Signal(ROW_BITS
)
2309 row_idx
= Signal(ROW_LINE_BITS
)
2310 result
= Signal(ROW_BITS
)
2313 row_idx
= row_v
[ROW_LINE_BITS
]
2314 row_v
[0:ROW_LINE_BITS
] = Signal(row_idx
+ 1)
2317 # Get the tag value from the address
2319 return addr
[SET_SIZE_BITS
:REAL_ADDR_BITS
]
2321 # Read a tag from a tag memory row
2322 def read_tag(way
, tagset
):
2323 return tagset
[way
*TAG_WIDTH
:way
* TAG_WIDTH
+ TAG_BITS
]
2325 # Read a TLB tag from a TLB tag memory row
2326 def read_tlb_tag(way
, tags
):
2329 j
= way
* TLB_EA_TAG_BITS
2330 return tags
[j
:j
+ TLB_EA_TAG_BITS
]
2332 # Write a TLB tag to a TLB tag memory row
2333 def write_tlb_tag(way
, tags
), tag
):
2336 j
= way
* TLB_EA_TAG_BITS
2337 tags
[j
:j
+ TLB_EA_TAG_BITS
] = tag
2339 # Read a PTE from a TLB PTE memory row
2340 def read_tlb_pte(way
, ptes
):
2343 j
= way
* TLB_PTE_BITS
2344 return ptes
[j
:j
+ TLB_PTE_BITS
]
2346 def write_tlb_pte(way
, ptes
,newpte
):
2349 j
= way
* TLB_PTE_BITS
2350 return ptes
[j
:j
+ TLB_PTE_BITS
] = newpte
2352 assert (LINE_SIZE
% ROW_SIZE
) == 0 "LINE_SIZE not " \
2353 "multiple of ROW_SIZE"
2355 assert (LINE_SIZE
% 2) == 0 "LINE_SIZE not power of 2"
2357 assert (NUM_LINES
% 2) == 0 "NUM_LINES not power of 2"
2359 assert (ROW_PER_LINE
% 2) == 0 "ROW_PER_LINE not" \
2362 assert ROW_BITS
== (INDEX_BITS
+ ROW_LINE_BITS
) \
2363 "geometry bits don't add up"
2365 assert (LINE_OFF_BITS
= ROW_OFF_BITS
+ ROW_LINEBITS
) \
2366 "geometry bits don't add up"
2368 assert REAL_ADDR_BITS
== (TAG_BITS
+ INDEX_BITS \
2369 + LINE_OFF_BITS
) "geometry bits don't add up"
2371 assert REAL_ADDR_BITS
== (TAG_BITS
+ ROW_BITS
+ ROW_OFF_BITS
) \
2372 "geometry bits don't add up"
2374 assert 64 == wishbone_data_bits
"Can't yet handle a" \
2375 "wishbone width that isn't 64-bits"
2377 assert SET_SIZE_BITS
<= TLB_LG_PGSZ
"Set indexed by" \
2380 # we don't yet handle collisions between loadstore1 requests
2382 comb
+= m_out
.stall
.eq(0)
2384 # Hold off the request in r0 when r1 has an uncompleted request
2385 comb
+= r0_stall
.eq(r0_full
& r1
.full
)
2386 comb
+= r0_valid
.eq(r0_full
& ~r1
.full
)
2387 comb
+= stall_out
.eq(r0_stall
)
2389 # Wire up wishbone request latch out of stage 1
2390 comb
+= wishbone_out
.eq(r1
.wb
)
2396 # entity dcache_tb is
2399 # architecture behave of dcache_tb is
2400 # signal clk : std_ulogic;
2401 # signal rst : std_ulogic;
2403 # signal d_in : Loadstore1ToDcacheType;
2404 # signal d_out : DcacheToLoadstore1Type;
2406 # signal m_in : MmuToDcacheType;
2407 # signal m_out : DcacheToMmuType;
2409 # signal wb_bram_in : wishbone_master_out;
2410 # signal wb_bram_out : wishbone_slave_out;
2412 # constant clk_period : time := 10 ns;
2414 # dcache0: entity work.dcache
2427 # wishbone_out => wb_bram_in,
2428 # wishbone_in => wb_bram_out
2431 # -- BRAM Memory slave
2432 # bram0: entity work.wishbone_bram_wrapper
2434 # MEMORY_SIZE => 1024,
2435 # RAM_INIT_FILE => "icache_test.bin"
2440 # wishbone_in => wb_bram_in,
2441 # wishbone_out => wb_bram_out
2444 # clk_process: process
2447 # wait for clk_period/2;
2449 # wait for clk_period/2;
2452 # rst_process: process
2455 # wait for 2*clk_period;
2463 # d_in.valid <= '0';
2466 # d_in.addr <= (others => '0');
2467 # d_in.data <= (others => '0');
2468 # m_in.valid <= '0';
2469 # m_in.addr <= (others => '0');
2470 # m_in.pte <= (others => '0');
2472 # wait for 4*clk_period;
2473 # wait until rising_edge(clk);
2475 # -- Cacheable read of address 4
2478 # d_in.addr <= x"0000000000000004";
2479 # d_in.valid <= '1';
2480 # wait until rising_edge(clk);
2481 # d_in.valid <= '0';
2483 # wait until rising_edge(clk) and d_out.valid = '1';
2484 # assert d_out.data = x"0000000100000000"
2485 # report "data @" & to_hstring(d_in.addr) &
2486 # "=" & to_hstring(d_out.data) &
2487 # " expected 0000000100000000"
2489 # -- wait for clk_period;
2491 # -- Cacheable read of address 30
2494 # d_in.addr <= x"0000000000000030";
2495 # d_in.valid <= '1';
2496 # wait until rising_edge(clk);
2497 # d_in.valid <= '0';
2499 # wait until rising_edge(clk) and d_out.valid = '1';
2500 # assert d_out.data = x"0000000D0000000C"
2501 # report "data @" & to_hstring(d_in.addr) &
2502 # "=" & to_hstring(d_out.data) &
2503 # " expected 0000000D0000000C"
2506 # -- Non-cacheable read of address 100
2509 # d_in.addr <= x"0000000000000100";
2510 # d_in.valid <= '1';
2511 # wait until rising_edge(clk);
2512 # d_in.valid <= '0';
2513 # wait until rising_edge(clk) and d_out.valid = '1';
2514 # assert d_out.data = x"0000004100000040"
2515 # report "data @" & to_hstring(d_in.addr) &
2516 # "=" & to_hstring(d_out.data) &
2517 # " expected 0000004100000040"
2520 # wait until rising_edge(clk);
2521 # wait until rising_edge(clk);
2522 # wait until rising_edge(clk);
2523 # wait until rising_edge(clk);
def dcache_sim(dut):
    """Test-bench stimulus process: drive the dcache through two cacheable
    reads and one non-cacheable read, checking the data returned.

    Python port of the (commented-out) VHDL dcache_tb process above; the
    expected data values come from that testbench's BRAM init file.
    """
    # clear all request inputs
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)   # was "adrr" -- typo, AttributeError at run time
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    for _ in range(4):
        yield

    # Cacheable read of address 4
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
    yield dut.d_in.valid.eq(1)
    yield                       # wait until rising_edge(clk)
    yield dut.d_in.valid.eq(0)
    # spin until the cache answers; the original loop had no body and
    # would never advance the clock
    while not (yield dut.d_out.valid):
        yield
    # read the settled value: comparing the Signal object itself (as the
    # original did) is always truthy and never actually checks the data.
    # the assert was also missing its comma (SyntaxError) and the f-prefix
    # on the first message fragment.
    data = yield dut.d_out.data
    assert data == 0x0000000100000000, \
        f"data @0000000000000004={data:016x} expected 0000000100000000"

    # Cacheable read of address 30
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
    yield dut.d_in.valid.eq(1)
    yield                       # wait until rising_edge(clk)
    yield dut.d_in.valid.eq(0)
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000000D0000000C, \
        f"data @0000000000000030={data:016x} expected 0000000D0000000C"

    # Non-cacheable read of address 100
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(1)
    yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
    yield dut.d_in.valid.eq(1)
    yield                       # wait until rising_edge(clk)
    yield dut.d_in.valid.eq(0)
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000004100000040, \
        f"data @0000000000000100={data:016x} expected 0000004100000040"
# Convert to RTLIL for inspection, then run the simulation testbench.
vl = rtlil.convert(dut, ports=[])
with open("test_dcache.il", "w") as f:
    # NOTE(review): the with-body was lost in this copy; writing the
    # converted RTLIL is the only sensible content -- confirm upstream
    f.write(vl)
# was "dcache_sim()" -- the stimulus generator requires the dut argument
run_simulation(dut, dcache_sim(dut), vcd_name='test_dcache.vcd')
2603 if __name__
== '__main__':