3 based on Anton Blanchard microwatt dcache.vhdl
7 from enum
import Enum
, unique
9 from nmigen
import Module
, Signal
, Elaboratable
,
11 from nmigen
.cli
import main
12 from nmigen
.iocontrol
import RecordObject
13 from nmigen
.util
import log2_int
15 from experiment
.mem_types
import LoadStore1ToDcacheType
,
16 DcacheToLoadStore1Type
,
20 from experiment
.wb_types
import WB_ADDR_BITS
, WB_DATA_BITS
, WB_SEL_BITS
,
21 WBAddrType
, WBDataType
, WBSelType
,
22 WbMasterOut
, WBSlaveOut
,
23 WBMasterOutVector
, WBSlaveOutVector
,
24 WBIOMasterOut
, WBIOSlaveOut
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission/attribute bits extracted from a page-table entry."""
    def __init__(self):
        super().__init__()
        self.reference = Signal()  # R bit: page has been referenced
        self.changed   = Signal()  # C bit: page has been modified
        self.nocache   = Signal()  # non-cacheable mapping
        # NOTE(review): perm_attr.priv is written elsewhere in this file
        # (tlb_search real-mode defaults) — TODO confirm this field list
        # matches the original, some source lines are missing here
        self.priv      = Signal()  # privileged-access-only mapping
        self.rd_perm   = Signal()  # read permitted
        self.wr_perm   = Signal()  # write permitted
39 def extract_perm_attr(pte
):
# Type of operation on a "valid" input
@unique
class Op(Enum):
    """Decoded dcache operation, selected from the request + hit status."""
    OP_NONE      = 0  # no operation ("others" case in the VHDL decode)
    OP_BAD       = 1  # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2  # conditional store w/o reservation
    OP_LOAD_HIT  = 3  # Cache hit on load
    OP_LOAD_MISS = 4  # Load missing cache
    OP_LOAD_NC   = 5  # Non-cachable load
    OP_STORE_HIT = 6  # Store hitting cache
    OP_STORE_MISS = 7  # Store missing cache
# Cache state machine states (dcache_slow / reload state machine)
@unique
class State(Enum):
    """State of the dcache's internal (non-hit-path) state machine."""
    IDLE             = 0  # Normal load hit processing
    RELOAD_WAIT_ACK  = 1  # Cache reload wait ack
    STORE_WAIT_ACK   = 2  # Store wait ack
    NC_LOAD_WAIT_ACK = 3  # Non-cachable load wait ack
74 # In order to make timing, we use the BRAMs with
75 # an output buffer, which means that the BRAM
76 # output is delayed by an extra cycle.
78 # Thus, the dcache has a 2-stage internal pipeline
79 # for cache hits with no stalls.
81 # All other operations are handled via stalling
84 # The second stage can thus complete a hit at the same
85 # time as the first stage emits a stall for a complex op.
# Stage 0 register, basically contains just the latched request
class RegStage0(RecordObject):
    """Latched input request (from loadstore1 or the MMU)."""
    def __init__(self):
        super().__init__()
        self.req     = LoadStore1ToDcacheType()
        # NOTE(review): stage_0 writes r.tlbie / r.doall / r.tlbld —
        # TODO confirm these fields belong here; the source lines that
        # would declare them are missing from this view
        self.tlbie   = Signal()
        self.doall   = Signal()
        self.tlbld   = Signal()
        self.mmu_req = Signal()  # indicates source of request
# Memory-access request latched for the slow (non-hit) paths
class MemAccessRequest(RecordObject):
    """Request descriptor carried into the reload/store state machine."""
    def __init__(self):
        super().__init__()
        self.valid     = Signal()                 # request pending
        self.real_addr = Signal(REAL_ADDR_BITS)   # translated address
        self.data      = Signal(64)               # store data
        self.byte_sel  = Signal(8)                # byte enables
        self.hit_way   = Signal(WAY_BITS)         # way that hit (if any)
        self.same_tag  = Signal()                 # matches line being reloaded
        self.mmu_req   = Signal()                 # request came from MMU
# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    """Stage-1 pipeline register and slow-path state-machine state."""
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full    = Signal()   # have uncompleted request
        self.mmu_req = Signal()   # request is from MMU
        self.req     = MemAccessRequest()

        # Cache hit state (load hit path, delayed one cycle)
        self.hit_way        = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index      = Signal(NUM_LINES)
        self.cache_hit      = Signal()

        # TLB hit state
        self.tlb_hit       = Signal()
        self.tlb_hit_way   = Signal(TLB_NUM_WAYS)
        self.tlb_hit_index = Signal(TLB_SET_SIZE)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1  = Signal(64)
        self.forward_data2  = Signal(64)
        self.forward_sel1   = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1   = Signal(WAY_BITS)
        self.forward_row1   = Signal(BRAM_ROWS)
        self.use_forward1   = Signal()
        self.forward_sel    = Signal(8)

        # Cache miss state (reload state machine)
        # NOTE(review): dcache_request reads r1.state — TODO confirm a
        # state field belongs here; the declaring line is missing from
        # this view
        self.state        = Signal(State)
        self.write_bram   = Signal()
        self.write_tag    = Signal()
        self.slow_valid   = Signal()
        # was WishboneMasterOut(): not a name imported or defined in this
        # file — use WBMasterOut, consistent with Dcache.__init__
        self.wb           = WBMasterOut()
        self.reload_tag   = Signal(TAG_BITS)
        self.store_way    = Signal(WAY_BITS)
        self.store_row    = Signal(BRAM_ROWS)
        self.store_index  = Signal(NUM_LINES)
        self.end_row_ix   = Signal(ROW_LINE_BIT)
        self.rows_valid   = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks     = Signal()
        self.dec_acks     = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid      = Signal()
        self.ls_error      = Signal()
        self.mmu_done      = Signal()
        self.mmu_error     = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()
# Reservation information (for larx/stcx)
class Reservation(RecordObject):
    """Load-reservation address and valid bit."""
    def __init__(self):
        super().__init__()
        # NOTE(review): reservation.valid is read and written elsewhere
        # in this file — TODO confirm it belongs here; its declaring
        # line is missing from this view
        self.valid = Signal()
        # VHDL was "63 downto LINE_OFF_BITS" — `downto` is not Python;
        # the width is 64 - LINE_OFF_BITS bits.
        # TODO LINE_OFF_BITS is 6
        self.addr  = Signal(64 - LINE_OFF_BITS)
180 # Set associative dcache write-through
182 # TODO (in no specific order):
184 # * See list in icache.vhdl
185 # * Complete load misses on the cycle when WB data comes instead of
186 # at the end of line (this requires dealing with requests coming in
188 class Dcache(Elaboratable
):
190 # TODO: make these parameters of Dcache at some point
191 self
.LINE_SIZE
= 64 # Line size in bytes
192 self
.NUM_LINES
= 32 # Number of lines in a set
193 self
.NUM_WAYS
= 4 # Number of ways
194 self
.TLB_SET_SIZE
= 64 # L1 DTLB entries per set
195 self
.TLB_NUM_WAYS
= 2 # L1 DTLB number of sets
196 self
.TLB_LG_PGSZ
= 12 # L1 DTLB log_2(page_size)
197 self
.LOG_LENGTH
= 0 # Non-zero to enable log data collection
199 self
.d_in
= LoadStore1ToDcacheType()
200 self
.d_out
= DcacheToLoadStore1Type()
202 self
.m_in
= MmuToDcacheType()
203 self
.m_out
= DcacheToMmuType()
205 self
.stall_out
= Signal()
207 self
.wb_out
= WBMasterOut()
208 self
.wb_in
= WBSlaveOut()
210 self
.log_out
= Signal(20)
212 # Latch the request in r0.req as long as we're not stalling
213 def stage_0(self
, m
, d_in
, m_in
):
217 # variable r : reg_stage_0_t;
222 # if rising_edge(clk) then
223 # assert (d_in.valid and m_in.valid) = '0'
224 # report "request collision loadstore vs MMU";
225 assert ~
(d_in
.valid
& m_in
.valid
) "request collision
228 # if m_in.valid = '1' then
229 with m
.If(m_in
.valid
):
230 # r.req.valid := '1';
231 # r.req.load := not (m_in.tlbie or m_in.tlbld);
234 # r.req.reserve := '0';
235 # r.req.virt_mode := '0';
236 # r.req.priv_mode := '1';
237 # r.req.addr := m_in.addr;
238 # r.req.data := m_in.pte;
239 # r.req.byte_sel := (others => '1');
240 # r.tlbie := m_in.tlbie;
241 # r.doall := m_in.doall;
242 # r.tlbld := m_in.tlbld;
244 sync
+= r
.req
.valid
.eq(1)
245 sync
+= r
.req
.load
.eq(~
(m_in
.tlbie | m_in
.tlbld
))
246 sync
+= r
.req
.priv_mode
.eq(1)
247 sync
+= r
.req
.addr
.eq(m_in
.addr
)
248 sync
+= r
.req
.data
.eq(m_in
.pte
)
249 sync
+= r
.req
.byte_sel
.eq(1)
250 sync
+= r
.tlbie
.eq(m_in
.tlbie
)
251 sync
+= r
.doall
.eq(m_in
.doall
)
252 sync
+= r
.tlbld
.eq(m_in
.tlbld
)
253 sync
+= r
.mmu_req
.eq(1)
261 sync
+= r
.req
.eq(d_in
)
265 # elsif r1.full = '0' or r0_full = '0' then
266 with m
.If(~r1
.full | ~r0_full
):
268 # r0_full <= r.req.valid;
270 sync
+= r0_full
.eq(r
.req
.valid
)
276 # Operates in the second cycle on the request latched in r0.req.
277 # TLB updates write the entry at the end of the second cycle.
278 def tlb_read(self
, m
, m_in
, d_in
, r0_stall
, tlb_valid_way
,
279 tlb_tag_way
, tlb_pte_way
, dtlb_valid_bits
,
280 dtlb_tags
, dtlb_ptes
):
285 # variable index : tlb_index_t;
286 # variable addrbits :
287 # std_ulogic_vector(TLB_SET_BITS - 1 downto 0);
289 addrbits
= Signal(TLB_SET_BITS
)
295 # if rising_edge(clk) then
296 # if m_in.valid = '1' then
297 with m
.If(m_in
.valid
):
298 # addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS
299 # - 1 downto TLB_LG_PGSZ);
300 sync
+= addrbits
.eq(m_in
.addr
[
301 TLB_LG_PGSZ
:TLB_LG_PGSZ
+ TLB_SET_BITS
305 # addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS
306 # - 1 downto TLB_LG_PGSZ);
307 sync
+= addrbits
.eq(d_in
.addr
[
308 TLB_LG_PGSZ
:TLB_LG_PGSZ
+ TLB_SET_BITS
312 # index := to_integer(unsigned(addrbits));
313 sync
+= index
.eq(addrbits
)
314 # -- If we have any op and the previous op isn't
315 # -- finished, then keep the same output for next cycle.
316 # if r0_stall = '0' then
317 # If we have any op and the previous op isn't finished,
318 # then keep the same output for next cycle.
319 with m
.If(~r0_stall
):
320 sync
+= tlb_valid_way
.eq(dtlb_valid_bits
[index
])
321 sync
+= tlb_tag_way
.eq(dtlb_tags
[index
])
322 sync
+= tlb_pte_way
.eq(dtlb_ptes
[index
])
327 # -- Generate TLB PLRUs
328 # maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate
330 def maybe_tlb_plrus(self
, m
, r1
, tlb_plru_victim
, acc
, acc_en
, lru
):
334 with m
.If(TLB_NUM_WAYS
> 1):
335 for i
in range(TLB_SET_SIZE
):
337 tlb_plru
= PLRU(TLB_WAY_BITS
)
338 tlb_plru_acc
= Signal(TLB_WAY_BITS
)
339 tlb_plru_acc_en
= Signal()
340 tlb_plru_out
= Signal(TLB_WAY_BITS
)
342 comb
+= tlb_plru
.acc
.eq(tlb_plru_acc
)
343 comb
+= tlb_plru
.acc_en
.eq(tlb_plru_acc_en
)
344 comb
+= tlb_plru
.lru
.eq(tlb_plru_out
)
347 with m
.If(r1
.tlb_hit_index
== i
):
348 comb
+= tlb_plru
.acc_en
.eq(
353 comb
+= tlb_plru
.acc_en
.eq(0)
354 comb
+= tlb_plru
.acc
.eq(
358 comb
+= tlb_plru_victim
[i
].eq(tlb_plru
.lru
)
360 def tlb_search(self
, tlb_req_index
, r0
, tlb_valid_way_ tlb_tag_way
,
361 tlb_pte_way
, pte
, tlb_hit
, valid_ra
, perm_attr
, ra
):
366 # variable hitway : tlb_way_t;
367 # variable hit : std_ulogic;
368 # variable eatag : tlb_tag_t;
379 # to_integer(unsigned(r0.req.addr(
380 # TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ
384 # eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS);
385 # for i in tlb_way_t loop
386 # if tlb_valid_way(i) = '1' and
387 # read_tlb_tag(i, tlb_tag_way) = eatag then
392 # tlb_hit <= hit and r0_valid;
393 # tlb_hit_way <= hitway;
394 comb
+= tlb_req_index
.eq(r0
.req
.addr
[
395 TLB_LG_PGSZ
:TLB_LG_PGSZ
+ TLB_SET_BITS
398 comb
+= eatag
.eq(r0
.req
.addr
[
399 TLB_LG_PGSZ
+ TLB_SET_BITS
:64
403 with m
.If(tlb_valid_way(i
)
404 & read_tlb_tag(i
, tlb_tag_way
) == eatag
):
409 comb
+= tlb_hit
.eq(hit
& r0_valid
)
410 comb
+= tlb_hit_way
.eq(hitway
)
412 # if tlb_hit = '1' then
414 # pte <= read_tlb_pte(hitway, tlb_pte_way);
415 comb
+= pte
.eq(read_tlb_pte(hitway
, tlb_pte_way
))
418 # pte <= (others => '0');
421 # valid_ra <= tlb_hit or not r0.req.virt_mode;
422 comb
+= valid_ra
.eq(tlb_hit | ~r0
.req
.virt_mode
)
423 # if r0.req.virt_mode = '1' then
424 with m
.If(r0
.req
.virt_mode
):
425 # ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
426 # r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) &
427 # (ROW_OFF_BITS-1 downto 0 => '0');
428 # perm_attr <= extract_perm_attr(pte);
430 Const(ROW_OFF_BITS
, ROW_OFF_BITS
),
431 r0
.req
.addr
[ROW_OFF_BITS
:TLB_LG_PGSZ
],
432 pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]
434 comb
+= perm_attr
.eq(extract_perm_attr(pte
))
438 # REAL_ADDR_BITS - 1 downto ROW_OFF_BITS
439 # ) & (ROW_OFF_BITS-1 downto 0 => '0');
441 Const(ROW_OFF_BITS
, ROW_OFF_BITS
),
442 r0
.rq
.addr
[ROW_OFF_BITS
:REAL_ADDR_BITS
]
445 # perm_attr <= real_mode_perm_attr;
446 comb
+= perm_attr
.reference
.eq(1)
447 comb
+= perm_attr
.changed
.eq(1)
448 comb
+= perm_attr
.priv
.eq(1)
449 comb
+= perm_attr
.nocache
.eq(0)
450 comb
+= perm_attr
.rd_perm
.eq(1)
451 comb
+= perm_attr
.wr_perm
.eq(1)
455 def tlb_update(self
, r0_valid
, r0
, dtlb_valid_bits
, tlb_req_index
,
456 tlb_hit_way
, tlb_hit
, tlb_plru_victim
, tlb_tag_way
,
457 dtlb_tags
, tlb_pte_way
, dtlb_ptes
, dtlb_valid_bits
):
462 # variable tlbie : std_ulogic;
463 # variable tlbwe : std_ulogic;
464 # variable repl_way : tlb_way_t;
465 # variable eatag : tlb_tag_t;
466 # variable tagset : tlb_way_tags_t;
467 # variable pteset : tlb_way_ptes_t;
472 tagset
= TLBWayTags()
473 pteset
= TLBWayPtes()
483 # if rising_edge(clk) then
484 # tlbie := r0_valid and r0.tlbie;
485 # tlbwe := r0_valid and r0.tlbldoi;
486 sync
+= tlbie
.eq(r0_valid
& r0
.tlbie
)
487 sync
+= tlbwe
.eq(r0_valid
& r0
.tlbldoi
)
489 # if rst = '1' or (tlbie = '1' and r0.doall = '1') then
490 # with m.If (TODO understand how signal resets work in nmigen)
491 # -- clear all valid bits at once
492 # for i in tlb_index_t loop
493 # dtlb_valids(i) <= (others => '0');
495 # clear all valid bits at once
496 for i
in range(TLB_SET_SIZE
):
497 sync
+= dtlb_valid_bits
[i
].eq(0)
499 # elsif tlbie = '1' then
501 # if tlb_hit = '1' then
503 # dtlb_valids(tlb_req_index)(tlb_hit_way) <= '0';
504 sync
+= dtlb_valid_bits
[tlb_req_index
][tlb_hit_way
].eq(0)
506 # elsif tlbwe = '1' then
508 # if tlb_hit = '1' then
510 # repl_way := tlb_hit_way;
511 sync
+= repl_way
.eq(tlb_hit_way
)
514 # repl_way := to_integer(unsigned(
515 # tlb_plru_victim(tlb_req_index)));
516 sync
+= repl_way
.eq(tlb_plru_victim
[tlb_req_index
])
518 # eatag := r0.req.addr(
519 # 63 downto TLB_LG_PGSZ + TLB_SET_BITS
521 # tagset := tlb_tag_way;
522 # write_tlb_tag(repl_way, tagset, eatag);
523 # dtlb_tags(tlb_req_index) <= tagset;
524 # pteset := tlb_pte_way;
525 # write_tlb_pte(repl_way, pteset, r0.req.data);
526 # dtlb_ptes(tlb_req_index) <= pteset;
527 # dtlb_valids(tlb_req_index)(repl_way) <= '1';
528 sync
+= eatag
.eq(r0
.req
.addr
[TLB_LG_PGSZ
+ TLB_SET_BITS
:64])
529 sync
+= tagset
.eq(tlb_tag_way
)
530 sync
+= write_tlb_tag(repl_way
, tagset
, eatag
)
531 sync
+= dtlb_tags
[tlb_req_index
].eq(tagset
)
532 sync
+= pteset
.eq(tlb_pte_way
)
533 sync
+= write_tlb_pte(repl_way
, pteset
, r0
.req
.data
)
534 sync
+= dtlb_ptes
[tlb_req_index
].eq(pteset
)
535 sync
+= dtlb_valid_bits
[tlb_req_index
][repl_way
].eq(1)
541 # maybe_plrus: if NUM_WAYS > 1 generate
543 def maybe_plrus(self
, r1
):
549 # TODO learn translation of generate into nmgien @lkcl
550 # plrus: for i in 0 to NUM_LINES-1 generate
551 for i
in range(NUM_LINES
):
553 # signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
554 # signal plru_acc_en : std_ulogic;
555 # signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
556 plru
= PLRU(WAY_BITS
)
557 plru_acc
= Signal(WAY_BITS
)
558 plru_acc_en
= Signal()
559 plru_out
= Signal(WAY_BITS
)
562 # TODO learn tranlation of entity, generic map, port map in
564 # plru : entity work.plru
572 # acc_en => plru_acc_en,
575 comb
+= plru
.acc
.eq(plru_acc
)
576 comb
+= plru
.acc_en
.eq(plru_acc_en
)
577 comb
+= plru
.lru
.eq(plru_out
)
582 # if r1.hit_index = i then
584 with m
.If(r1
.hit_index
== i
):
585 # plru_acc_en <= r1.cache_hit;
586 comb
+= plru_acc_en
.eq(r1
.cache_hit
)
589 # plru_acc_en <= '0';
590 comb
+= plru_acc_en
.eq(0)
592 # plru_acc <= std_ulogic_vector(to_unsigned(
593 # r1.hit_way, WAY_BITS
595 # plru_victim(i) <= plru_out;
596 comb
+= plru_acc
.eq(r1
.hit_way
)
597 comb
+= plru_victim
[i
].eq(plru_out
)
602 # -- Cache tag RAM read port
603 # cache_tag_read : process(clk)
604 # Cache tag RAM read port
605 def cache_tag_read(self
, r0_stall
, req_index
, m_in
, d_in
,
606 cache_tag_set
, cache_tags
):
611 # variable index : index_t;
612 index
= Signal(NUM_LINES
)
617 # if rising_edge(clk) then
618 # if r0_stall = '1' then
620 # index := req_index;
621 sync
+= index
.eq(req_index
)
623 # elsif m_in.valid = '1' then
624 with m
.Elif(m_in
.valid
):
625 # index := get_index(m_in.addr);
626 sync
+= index
.eq(get_index(m_in
.addr
))
630 # index := get_index(d_in.addr);
631 sync
+= index
.eq(get_index(d_in
.addr
))
633 # cache_tag_set <= cache_tags(index);
634 sync
+= cache_tag_set
.eq(cache_tags
[index
])
638 # Cache request parsing and hit detection
639 def dcache_request(self
, r0
, ra
, req_index
, req_row
, req_tag
,
640 r0_valid
, r1
, cache_valid_bits
, replace_way
,
641 use_forward1_next
, use_forward2_next
,
642 req_hit_way
, plru_victim
, rc_ok
, perm_attr
,
643 valid_ra
, perm_ok
, access_ok
, req_op
, req_ok
,
644 r0_stall
, m_in
, early_req_row
, d_in
):
649 # variable is_hit : std_ulogic;
650 # variable hit_way : way_t;
651 # variable op : op_t;
652 # variable opsel : std_ulogic_vector(2 downto 0);
653 # variable go : std_ulogic;
654 # variable nc : std_ulogic;
655 # variable s_hit : std_ulogic;
656 # variable s_tag : cache_tag_t;
657 # variable s_pte : tlb_pte_t;
658 # variable s_ra : std_ulogic_vector(
659 # REAL_ADDR_BITS - 1 downto 0
661 # variable hit_set : std_ulogic_vector(
662 # TLB_NUM_WAYS - 1 downto 0
664 # variable hit_way_set : hit_way_set_t;
665 # variable rel_matches : std_ulogic_vector(
666 # TLB_NUM_WAYS - 1 downto 0
670 hit_way
= Signal(WAY_BITS
)
676 s_tag
= Signal(TAG_BITS
)
677 s_pte
= Signal(TLB_PTE_BITS
)
678 s_ra
= Signal(REAL_ADDR_BITS
)
679 hit_set
= Signal(TLB_NUM_WAYS
)
680 hit_way_set
= HitWaySet()
681 rel_matches
= Signal(TLB_NUM_WAYS
)
685 # -- Extract line, row and tag from request
686 # req_index <= get_index(r0.req.addr);
687 # req_row <= get_row(r0.req.addr);
688 # req_tag <= get_tag(ra);
690 # go := r0_valid and not (r0.tlbie or r0.tlbld)
691 # and not r1.ls_error;
692 # Extract line, row and tag from request
693 comb
+= req_index
.eq(get_index(r0
.req
.addr
))
694 comb
+= req_row
.eq(get_row(r0
.req
.addr
))
695 comb
+= req_tag
.eq(get_tag(ra
))
697 comb
+= go
.eq(r0_valid
& ~
(r0
.tlbie | r0
.tlbld
) & ~r1
.ls_error
)
702 # Test if pending request is a hit on any way
703 # In order to make timing in virtual mode,
704 # when we are using the TLB, we compare each
705 # way with each of the real addresses from each way of
706 # the TLB, and then decide later which match to use.
708 # if r0.req.virt_mode = '1' then
709 with m
.If(r0
.req
.virt_mode
):
710 # rel_matches := (others => '0');
711 comb
+= rel_matches
.eq(0)
712 # for j in tlb_way_t loop
713 for j
in range(TLB_NUM_WAYS
):
714 # hit_way_set(j) := 0;
716 # s_pte := read_tlb_pte(j, tlb_pte_way);
717 # s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ)
718 # & r0.req.addr(TLB_LG_PGSZ - 1 downto 0);
719 # s_tag := get_tag(s_ra);
720 comb
+= hit_way_set
[j
].eq(0)
722 comb
+= s_pte
.eq(read_tlb_pte(j
, tlb_pte_way
))
724 r0
.req
.addr
[0:TLB_LG_PGSZ
],
725 s_pte
[TLB_LG_PGSZ
:REAL_ADDR_BITS
]
727 comb
+= s_tag
.eq(get_tag(s_ra
))
729 # for i in way_t loop
730 for i
in range(NUM_WAYS
):
731 # if go = '1' and cache_valids(req_index)(i) = '1'
732 # and read_tag(i, cache_tag_set) = s_tag
733 # and tlb_valid_way(j) = '1' then
734 with m
.If(go
& cache_valid_bits
[req_index
][i
] &
735 read_tag(i
, cache_tag_set
) == s_tag
737 # hit_way_set(j) := i;
739 comb
+= hit_way_set
[j
].eq(i
)
743 # hit_set(j) := s_hit;
744 comb
+= hit_set
[j
].eq(s_hit
)
745 # if s_tag = r1.reload_tag then
746 with m
.If(s_tag
== r1
.reload_tag
):
747 # rel_matches(j) := '1';
748 comb
+= rel_matches
[j
].eq(1)
751 # if tlb_hit = '1' then
753 # is_hit := hit_set(tlb_hit_way);
754 # hit_way := hit_way_set(tlb_hit_way);
755 # rel_match := rel_matches(tlb_hit_way);
756 comb
+= is_hit
.eq(hit_set
[tlb_hit_way
])
757 comb
+= hit_way
.eq(hit_way_set
[tlb_hit_way
])
758 comb
+= rel_match
.eq(rel_matches
[tlb_hit_way
])
762 # s_tag := get_tag(r0.req.addr);
763 comb
+= s_tag
.eq(get_tag(r0
.req
.addr
))
764 # for i in way_t loop
765 for i
in range(NUM_WAYS
):
766 # if go = '1' and cache_valids(req_index)(i) = '1' and
767 # read_tag(i, cache_tag_set) = s_tag then
768 with m
.If(go
& cache_valid_bits
[req_index
][i
] &
769 read_tag(i
, cache_tag_set
) == s_tag
):
772 comb
+= hit_way
.eq(i
)
776 # if s_tag = r1.reload_tag then
777 with m
.If(s_tag
== r1
.reload_tag
):
779 comb
+= rel_match
.eq(1)
782 # req_same_tag <= rel_match;
783 comb
+= req_same_tag
.eq(rel_match
)
785 # if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index
786 # and rel_match = '1' then
787 # See if the request matches the line currently being reloaded
788 with m
.If(r1
.state
== State
.RELOAD_WAIT_ACK
& req_index
==
789 r1
.store_index
& rel_match
):
790 # For a store, consider this a hit even if the row isn't
791 # valid since it will be by the time we perform the store.
792 # For a load, check the appropriate row valid bit.
795 # or r1.rows_valid(req_row mod ROW_PER_LINE);
796 # hit_way := replace_way;
797 comb
+= is_hit
.eq(~r0
.req
.load
798 | r1
.rows_valid
[req_row
% ROW_PER_LINE
]
800 comb
+= hit_way
.eq(replace_way
)
803 # -- Whether to use forwarded data for a load or not
804 # Whether to use forwarded data for a load or not
805 # use_forward1_next <= '0';
806 comb
+= use_forward1_next
.eq(0)
807 # if get_row(r1.req.real_addr) = req_row
808 # and r1.req.hit_way = hit_way then
809 with m
.If(get_row(r1
.req
.real_addr
) == req_row
810 & r1
.req
.hit_way
== hit_way
)
811 # Only need to consider r1.write_bram here, since if we
812 # are writing refill data here, then we don't have a
813 # cache hit this cycle on the line being refilled.
814 # (There is the possibility that the load following the
815 # load miss that started the refill could be to the old
816 # contents of the victim line, since it is a couple of
817 # cycles after the refill starts before we see the updated
818 # cache tag. In that case we don't use the bypass.)
819 # use_forward1_next <= r1.write_bram;
820 comb
+= use_forward1_next
.eq(r1
.write_bram
)
822 # use_forward2_next <= '0';
823 comb
+= use_forward2_next
.eq(0)
824 # if r1.forward_row1 = req_row
825 # and r1.forward_way1 = hit_way then
826 with m
.If(r1
.forward_row1
== req_row
827 & r1
.forward_way1
== hit_way
):
828 # use_forward2_next <= r1.forward_valid1;
829 comb
+= use_forward2_next
.eq(r1
.forward_valid1
)
832 # The way that matched on a hit
833 # req_hit_way <= hit_way;
834 comb
+= req_hit_way
.eq(hit_way
)
836 # The way to replace on a miss
837 # if r1.write_tag = '1' then
838 with m
.If(r1
.write_tag
):
839 # replace_way <= to_integer(unsigned(
840 # plru_victim(r1.store_index)
842 replace_way
.eq(plru_victim
[r1
.store_index
])
845 # replace_way <= r1.store_way;
846 comb
+= replace_way
.eq(r1
.store_way
)
849 # work out whether we have permission for this access
850 # NB we don't yet implement AMR, thus no KUAP
851 # rc_ok <= perm_attr.reference and
852 # (r0.req.load or perm_attr.changed);
853 # perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and
854 # (perm_attr.wr_perm or (r0.req.load
855 # and perm_attr.rd_perm));
856 # access_ok <= valid_ra and perm_ok and rc_ok;
859 & (r0
.req
.load | perm_attr
.changed
)
861 comb
+= perm_ok
.eq((r0
.req
.prive_mode | ~perm_attr
.priv
)
863 |
(r0
.req
.load
& perm_attr
.rd_perm
)
865 comb
+= access_ok
.eq(valid_ra
& perm_ok
& rc_ok
)
866 # nc := r0.req.nc or perm_attr.nocache;
868 # Combine the request and cache hit status to decide what
869 # operation needs to be done
870 comb
+= nc
.eq(r0
.req
.nc | perm_attr
.nocache
)
871 comb
+= op
.eq(Op
.OP_NONE
)
874 # if access_ok = '0' then
875 with m
.If(~access_ok
):
877 comb
+= op
.eq(Op
.OP_BAD
)
878 # elsif cancel_store = '1' then
879 with m
.Elif(cancel_store
):
880 # op := OP_STCX_FAIL;
881 comb
+= op
.eq(Op
.OP_STCX_FAIL
)
884 # opsel := r0.req.load & nc & is_hit;
885 comb
+= opsel
.eq(Cat(is_hit
, nc
, r0
.req
.load
))
887 with m
.Switch(opsel
):
888 # when "101" => op := OP_LOAD_HIT;
889 # when "100" => op := OP_LOAD_MISS;
890 # when "110" => op := OP_LOAD_NC;
891 # when "001" => op := OP_STORE_HIT;
892 # when "000" => op := OP_STORE_MISS;
893 # when "010" => op := OP_STORE_MISS;
894 # when "011" => op := OP_BAD;
895 # when "111" => op := OP_BAD;
896 # when others => op := OP_NONE;
897 with m
.Case(Const(0b101, 3)):
898 comb
+= op
.eq(Op
.OP_LOAD_HIT
)
900 with m
.Case(Cosnt(0b100, 3)):
901 comb
+= op
.eq(Op
.OP_LOAD_MISS
)
903 with m
.Case(Const(0b110, 3)):
904 comb
+= op
.eq(Op
.OP_LOAD_NC
)
906 with m
.Case(Const(0b001, 3)):
907 comb
+= op
.eq(Op
.OP_STORE_HIT
)
909 with m
.Case(Const(0b000, 3)):
910 comb
+= op
.eq(Op
.OP_STORE_MISS
)
912 with m
.Case(Const(0b010, 3)):
913 comb
+= op
.eq(Op
.OP_STORE_MISS
)
915 with m
.Case(Const(0b011, 3)):
916 comb
+= op
.eq(Op
.OP_BAD
)
918 with m
.Case(Const(0b111, 3)):
919 comb
+= op
.eq(Op
.OP_BAD
)
922 comb
+= op
.eq(Op
.OP_NONE
)
928 comb
+= req_op
.eq(op
)
929 comb
+= req_go
.eq(go
)
931 # Version of the row number that is valid one cycle earlier
932 # in the cases where we need to read the cache data BRAM.
933 # If we're stalling then we need to keep reading the last
935 # if r0_stall = '0' then
936 with m
.If(~r0_stall
):
937 # if m_in.valid = '1' then
938 with m
.If(m_in
.valid
):
939 # early_req_row <= get_row(m_in.addr);
940 comb
+= early_req_row
.eq(get_row(m_in
.addr
))
943 # early_req_row <= get_row(d_in.addr);
944 comb
+= early_req_row
.eq(get_row(d_in
.addr
))
948 # early_req_row <= req_row;
949 comb
+= early_req_row
.eq(req_row
)
953 # Handle load-with-reservation and store-conditional instructions
954 def reservation_comb(self
, cancel_store
, set_rsrv
, clear_rsrv
,
955 r0_valid
, r0
, reservation
):
961 # cancel_store <= '0';
964 # if r0_valid = '1' and r0.req.reserve = '1' then
965 with m
.If(r0_valid
& r0
.req
.reserve
):
967 # -- XXX generate alignment interrupt if address
968 # -- is not aligned XXX or if r0.req.nc = '1'
969 # if r0.req.load = '1' then
970 # XXX generate alignment interrupt if address
971 # is not aligned XXX or if r0.req.nc = '1'
972 with m
.If(r0
.req
.load
):
973 # -- load with reservation
975 # load with reservation
979 # -- store conditional
982 comb
+= clear_rsrv
.eq(1)
983 # if reservation.valid = '0' or r0.req.addr(63
984 # downto LINE_OFF_BITS) /= reservation.addr then
985 with m
.If(~reservation
.valid
986 | r0
.req
.addr
[LINE_OFF_BITS
:64]):
987 # cancel_store <= '1';
988 comb
+= cancel_store
.eq(1)
994 def reservation_reg(self
, r0_valid
, access_ok
, clear_rsrv
,
1001 # if rising_edge(clk) then
1003 # reservation.valid <= '0';
1004 # TODO understand how resets work in nmigen
1005 # elsif r0_valid = '1' and access_ok = '1' then
1006 with m
.Elif(r0_valid
& access_ok
):
1007 # if clear_rsrv = '1' then
1008 with m
.If(clear_rsrv
):
1009 # reservation.valid <= '0';
1010 sync
+= reservation
.valid
.ea(0)
1011 # elsif set_rsrv = '1' then
1012 with m
.Elif(set_rsrv
):
1013 # reservation.valid <= '1';
1014 # reservation.addr <=
1015 # r0.req.addr(63 downto LINE_OFF_BITS);
1016 sync
+= reservation
.valid
.eq(1)
1017 sync
+= reservation
.addr
.eq(
1018 r0
.req
.addr
[LINE_OFF_BITS
:64]
1025 # Return data for loads & completion control logic
1026 def writeback_control(self
, r1
, cache_out
, d_out
, m_out
):
1031 # variable data_out : std_ulogic_vector(63 downto 0);
1032 # variable data_fwd : std_ulogic_vector(63 downto 0);
1033 # variable j : integer;
1034 data_out
= Signal(64)
1035 data_fwd
= Signal(64)
1039 # -- Use the bypass if are reading the row that was
1040 # -- written 1 or 2 cycles ago, including for the
1041 # -- slow_valid = 1 case (i.e. completing a load
1042 # -- miss or a non-cacheable load).
1043 # if r1.use_forward1 = '1' then
1044 # Use the bypass if are reading the row that was
1045 # written 1 or 2 cycles ago, including for the
1046 # slow_valid = 1 case (i.e. completing a load
1047 # miss or a non-cacheable load).
1048 with m
.If(r1
.use_forward1
):
1049 # data_fwd := r1.forward_data1;
1050 comb
+= data_fwd
.eq(r1
.forward_data1
)
1053 # data_fwd := r1.forward_data2;
1054 comb
+= data_fwd
.eq(r1
.forward_data2
)
1057 # data_out := cache_out(r1.hit_way);
1058 comb
+= data_out
.eq(cache_out
[r1
.hit_way
])
1060 # for i in 0 to 7 loop
1065 # if r1.forward_sel(i) = '1' then
1066 with m
.If(r1
.forward_sel
[i
]):
1067 # data_out(j + 7 downto j) := data_fwd(j + 7 downto j);
1068 comb
+= data_out
[j
:j
+8].eq(data_fwd
[j
:j
+8])
1072 # d_out.valid <= r1.ls_valid;
1073 # d_out.data <= data_out;
1074 # d_out.store_done <= not r1.stcx_fail;
1075 # d_out.error <= r1.ls_error;
1076 # d_out.cache_paradox <= r1.cache_paradox;
1077 comb
+= d_out
.valid
.eq(r1
.ls_valid
)
1078 comb
+= d_out
.data
.eq(data_out
)
1079 comb
+= d_out
.store_done
.eq(~r1
.stcx_fail
)
1080 comb
+= d_out
.error
.eq(r1
.ls_error
)
1081 comb
+= d_out
.cache_paradox
.eq(r1
.cache_paradox
)
1084 # m_out.done <= r1.mmu_done;
1085 # m_out.err <= r1.mmu_error;
1086 # m_out.data <= data_out;
1087 comb
+= m_out
.done
.eq(r1
.mmu_done
)
1088 comb
+= m_out
.err
.eq(r1
.mmu_error
)
1089 comb
+= m_out
.data
.eq(data_out
)
1091 # -- We have a valid load or store hit or we just completed
1092 # -- a slow op such as a load miss, a NC load or a store
1094 # -- Note: the load hit is delayed by one cycle. However it
1095 # -- can still not collide with r.slow_valid (well unless I
1096 # -- miscalculated) because slow_valid can only be set on a
1097 # -- subsequent request and not on its first cycle (the state
1098 # -- machine must have advanced), which makes slow_valid
1099 # -- at least 2 cycles from the previous hit_load_valid.
1101 # -- Sanity: Only one of these must be set in any given cycle
1102 # assert (r1.slow_valid and r1.stcx_fail) /= '1'
1103 # report "unexpected slow_valid collision with stcx_fail"
1105 # assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid)
1106 # /= '1' report "unexpected hit_load_delayed collision with
1107 # slow_valid" severity FAILURE;
1108 # We have a valid load or store hit or we just completed
1109 # a slow op such as a load miss, a NC load or a store
1111 # Note: the load hit is delayed by one cycle. However it
1112 # can still not collide with r.slow_valid (well unless I
1113 # miscalculated) because slow_valid can only be set on a
1114 # subsequent request and not on its first cycle (the state
1115 # machine must have advanced), which makes slow_valid
1116 # at least 2 cycles from the previous hit_load_valid.
1118 # Sanity: Only one of these must be set in any given cycle
1119 assert (r1
.slow_valid
& r1
.stcx_fail
) != 1 "unexpected" \
1120 "slow_valid collision with stcx_fail -!- severity FAILURE"
1122 assert ((r1
.slow_valid | r1
.stcx_fail
) | r1
.hit_load_valid
) != 1
1123 "unexpected hit_load_delayed collision with slow_valid -!-" \
1126 # if r1.mmu_req = '0' then
1127 with m
.If(~r1
._mmu_req
):
1128 # -- Request came from loadstore1...
1129 # -- Load hit case is the standard path
1130 # if r1.hit_load_valid = '1' then
1131 # Request came from loadstore1...
1132 # Load hit case is the standard path
1133 with m
.If(r1
.hit_load_valid
):
1135 # "completing load hit data=" & to_hstring(data_out);
1136 print(f
"completing load hit data={data_out}")
1139 # -- error cases complete without stalling
1140 # if r1.ls_error = '1' then
1141 # error cases complete without stalling
1142 with m
.If(r1
.ls_error
):
1143 # report "completing ld/st with error";
1144 print("completing ld/st with error")
1147 # -- Slow ops (load miss, NC, stores)
1148 # if r1.slow_valid = '1' then
1149 # Slow ops (load miss, NC, stores)
1150 with m
.If(r1
.slow_valid
):
1152 # "completing store or load miss data="
1153 # & to_hstring(data_out);
1154 print(f
"completing store or load miss data={data_out}")
1159 # -- Request came from MMU
1160 # if r1.hit_load_valid = '1' then
1161 # Request came from MMU
1162 with m
.If(r1
.hit_load_valid
):
1163 # report "completing load hit to MMU, data="
1164 # & to_hstring(m_out.data);
1165 print(f
"completing load hit to MMU, data={m_out.data}")
1168 # -- error cases complete without stalling
1169 # if r1.mmu_error = '1' then
1170 # report "completing MMU ld with error";
1171 # error cases complete without stalling
1172 with m
.If(r1
.mmu_error
):
1173 print("combpleting MMU ld with error")
1176 # -- Slow ops (i.e. load miss)
1177 # if r1.slow_valid = '1' then
1178 # Slow ops (i.e. load miss)
1179 with m
.If(r1
.slow_valid
):
1180 # report "completing MMU load miss, data="
1181 # & to_hstring(m_out.data);
1182 print("completing MMU load miss, data={m_out.data}")
1187 # -- Generate a cache RAM for each way. This handles the normal
1188 # -- reads, writes from reloads and the special store-hit update
1191 # -- Note: the BRAMs have an extra read buffer, meaning the output
1192 # -- is pipelined an extra cycle. This differs from the
1193 # -- icache. The writeback logic needs to take that into
1194 # -- account by using 1-cycle delayed signals for load hits.
1196 # rams: for i in 0 to NUM_WAYS-1 generate
1197 # Generate a cache RAM for each way. This handles the normal
1198 # reads, writes from reloads and the special store-hit update
1201 # Note: the BRAMs have an extra read buffer, meaning the output
1202 # is pipelined an extra cycle. This differs from the
1203 # icache. The writeback logic needs to take that into
1204 # account by using 1-cycle delayed signals for load hits.
1206 for i
in range(NUM_WAYS
):
1207 # signal do_read : std_ulogic;
1208 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
1209 # signal do_write : std_ulogic;
1210 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
1212 # std_ulogic_vector(wishbone_data_bits-1 downto 0);
1213 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
1214 # signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
1215 # signal dout : cache_row_t;
1217 rd_addr
= Signal(ROW_BITS
)
1219 wr_addr
= Signal(ROW_BITS
)
1220 wr_data
= Signal(WB_DATA_BITS
)
1221 wr_sel
= Signal(ROW_SIZE
)
1222 wr_sel_m
= Signal(ROW_SIZE
)
1223 _d_out
= Signal(WB_DATA_BITS
)
1226 # way: entity work.cache_ram
1228 # ROW_BITS => ROW_BITS,
1229 # WIDTH => wishbone_data_bits,
1235 # rd_addr => rd_addr,
1237 # wr_sel => wr_sel_m,
1238 # wr_addr => wr_addr,
1239 # wr_data => wr_data
1242 way
= CacheRam(ROW_BITS
, WB_DATA_BITS
, True)
1243 comb
+= way
.rd_en
.eq(do_read
)
1244 comb
+= way
.rd_addr
.eq(rd_addr
)
1245 comb
+= way
.rd_data
.eq(_d_out
)
1246 comb
+= way
.wr_sel
.eq(wr_sel_m
)
1247 comb
+= way
.wr_addr
.eq(wr_addr
)
1248 comb
+= way
.wr_data
.eq(wr_data
)
1251 # -- Cache hit reads
1254 # std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS));
1255 # cache_out(i) <= dout;
1257 comb
+= do_read
.eq(1)
1258 comb
+= rd_addr
.eq(Signal(BRAM_ROWS
))
1259 comb
+= cache_out
[i
].eq(dout
)
1263 # -- Defaults to wishbone read responses (cache refill)
1265 # -- For timing, the mux on wr_data/sel/addr is not
1266 # -- dependent on anything other than the current state.
1269 # Defaults to wishbone read responses (cache refill)
1271 # For timing, the mux on wr_data/sel/addr is not
1272 # dependent on anything other than the current state.
1273 # wr_sel_m <= (others => '0');
1274 comb
+= wr_sel_m
.eq(0)
1277 comb
+= do_write
.eq(0)
1278 # if r1.write_bram = '1' then
1279 with m
.If(r1
.write_bram
):
1280 # -- Write store data to BRAM. This happens one
1281 # -- cycle after the store is in r0.
1282 # Write store data to BRAM. This happens one
1283 # cycle after the store is in r0.
1284 # wr_data <= r1.req.data;
1285 # wr_sel <= r1.req.byte_sel;
1286 # wr_addr <= std_ulogic_vector(to_unsigned(
1287 # get_row(r1.req.real_addr), ROW_BITS
1289 comb
+= wr_data
.eq(r1
.req
.data
)
1290 comb
+= wr_sel
.eq(r1
.req
.byte_sel
)
1291 comb
+= wr_addr
.eq(Signal(get_row(r1
.req
.real_addr
)))
1293 # if i = r1.req.hit_way then
1294 with m
.If(i
== r1
.req
.hit_way
):
1296 comb
+= do_write
.eq(1)
1300 # -- Otherwise, we might be doing a reload or a DCBZ
1301 # if r1.dcbz = '1' then
1302 # Otherwise, we might be doing a reload or a DCBZ
1304 # wr_data <= (others => '0');
1305 comb
+= wr_data
.eq(0)
1308 # wr_data <= wishbone_in.dat;
1309 comb
+= wr_data
.eq(wishbone_in
.dat
)
1312 # wr_addr <= std_ulogic_vector(to_unsigned(
1313 # r1.store_row, ROW_BITS
1315 # wr_sel <= (others => '1');
1316 comb
+= wr_addr
.eq(Signal(r1
.store_row
))
1317 comb
+= wr_sel
.eq(1)
1319 # if r1.state = RELOAD_WAIT_ACK and
1320 # wishbone_in.ack = '1' and replace_way = i then
1321 with m
.If(r1
.state
== State
.RELOAD_WAIT_ACK
1322 & wishbone_in
.ack
& relpace_way
== i
):
1324 comb
+= do_write
.eq(1)
1328 # -- Mask write selects with do_write since BRAM
1329 # -- doesn't have a global write-enable
1330 # if do_write = '1' then
1331 # -- Mask write selects with do_write since BRAM
1332 # -- doesn't have a global write-enable
1333 with m
.If(do_write
):
1334 # wr_sel_m <= wr_sel;
1335 comb
+= wr_sel_m
.eq(wr_sel
)
1340 # Cache hit synchronous machine for the easy case.
1341 # This handles load hits.
1342 # It also handles error cases (TLB miss, cache paradox)
def dcache_fast_hit(self, req_op, r0_valid, r1):
    """Cache hit synchronous machine for the easy case.

    Handles load hits and the error cases (TLB miss, cache paradox),
    latching the results into the stage-1 register set ``r1``.  All
    assignments here are in the ``sync`` domain (VHDL
    ``rising_edge(clk)`` process).

    NOTE(review): ``m``, ``sync``, ``r0``, ``req_hit_way``,
    ``req_index``, ``req_tag``, ``valid_ra``, ``rc_ok``, ``perm_ok``,
    ``access_ok``, ``tlb_hit``, ``tlb_hit_way`` and ``tlb_req_index``
    are free names and must be in scope where this is elaborated --
    TODO: pass them in (or take them from ``self``) once the port
    settles down.
    """
    # Elaboration-time debug trace of any non-idle operation.
    with m.If(req_op != Op.OP_NONE):
        print(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}"
              f"idx:{req_index} tag:{req_tag} way: {req_hit_way}")

    # Remember which requester (LSU vs MMU) this request came from.
    with m.If(r0_valid):
        sync += r1.mmu_req.eq(r0.mmu_req)

    # Fast path for load/store hits.
    # Set signals for the writeback controls.
    sync += r1.hit_way.eq(req_hit_way)
    sync += r1.hit_index.eq(req_index)

    with m.If(req_op == Op.OP_LOAD_HIT):
        sync += r1.hit_load_valid.eq(1)
    with m.Else():
        sync += r1.hit_load_valid.eq(0)

    # FIX(review): was `req_op == Op.OP_LOAD_HIT | req_op ==
    # Op.OP_STORE_HIT`; Python's `|` binds tighter than `==`, turning
    # that into a chained comparison rather than the intended OR of
    # two equality tests.  Each comparison must be parenthesised.
    with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
        sync += r1.cache_hit.eq(1)
    with m.Else():
        sync += r1.cache_hit.eq(0)

    with m.If(req_op == Op.OP_BAD):
        print(f"Signalling ld/st error valid_ra={valid_ra}"
              f"rc_ok={rc_ok} perm_ok={perm_ok}")
        # MMU-originated requests report through mmu_error,
        # LSU-originated ones through ls_error.
        sync += r1.ls_error.eq(~r0.mmu_req)
        sync += r1.mmu_error.eq(r0.mmu_req)
        sync += r1.cache_paradox.eq(access_ok)
    with m.Else():
        sync += r1.ls_error.eq(0)
        sync += r1.mmu_error.eq(0)
        sync += r1.cache_paradox.eq(0)

    with m.If(req_op == Op.OP_STCX_FAIL):
        # FIX(review): restored from the VHDL (`r1.stcx_fail <= '1';`);
        # the assignment was missing from the transcription.
        sync += r1.stcx_fail.eq(1)
    with m.Else():
        sync += r1.stcx_fail.eq(0)

    # Record TLB hit information for updating TLB PLRU.
    sync += r1.tlb_hit.eq(tlb_hit)
    sync += r1.tlb_hit_way.eq(tlb_hit_way)
    sync += r1.tlb_hit_index.eq(tlb_req_index)
1446 # Memory accesses are handled by this state machine:
1448 # * Cache load miss/reload (in conjunction with "rams")
1449 # * Load hits for non-cachable forms
1450 # * Stores (the collision case is handled in "rams")
1452 # All wishbone requests generation is done here.
1453 # This machine operates at stage 1.
def dcache_slow(self, r1, use_forward1_next, cache_valid_bits, r0,
                r0_valid, req_op, cache_tag, req_go, ra, wb_in):
    """Memory-access state machine (stage 1).

    Handles: cache load miss/reload (in conjunction with "rams"),
    load hits for non-cachable forms, and stores (the collision case
    is handled in "rams").  All wishbone request generation is done
    here.  Assignments are in the ``sync`` domain.

    NOTE(review): ``m``, ``sync``, ``use_forward2_next``,
    ``replace_way``, ``req_hit_way``, ``req_same_tag``, ``cache_tags``
    and the ``get_*``/``*_row*`` helpers are free names that must be
    in scope at elaboration -- TODO: thread them through properly.
    """
    # VHDL process variables.  NOTE(review): modelling these as sync
    # Signals changes their intra-cycle (variable) semantics from the
    # original VHDL -- TODO confirm/restructure.
    stbs_done = Signal()
    req       = MemAccessRequest()
    acks      = Signal(3)  # was `variable acks : unsigned(2 downto 0);`

    sync += r1.use_forward1.eq(use_forward1_next)
    sync += r1.forward_sel.eq(0)
    with m.If(use_forward1_next):
        sync += r1.forward_sel.eq(r1.req.byte_sel)
    with m.Elif(use_forward2_next):
        sync += r1.forward_sel.eq(r1.forward_sel1)

    sync += r1.forward_data2.eq(r1.forward_data1)
    with m.If(r1.write_bram):
        sync += r1.forward_data1.eq(r1.req.data)
        sync += r1.forward_sel1.eq(r1.req.byte_sel)
        sync += r1.forward_way1.eq(r1.req.hit_way)
        sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
        sync += r1.forward_valid1.eq(1)
    with m.Else():
        with m.If(r1.dcbz):
            sync += r1.forward_data1.eq(0)
        with m.Else():
            sync += r1.forward_data1.eq(wb_in.dat)
        # FIX(review): `(others => '1')` must set every bit; `.eq(1)`
        # only sets bit 0.  -1 sign-extends to all-ones.
        sync += r1.forward_sel1.eq(-1)
        sync += r1.forward_way1.eq(replace_way)
        sync += r1.forward_row1.eq(r1.store_row)
        sync += r1.forward_valid1.eq(0)

    # On reset, clear all valid bits to force misses.
    # NOTE(review): explicit reset handling is normally implicit in
    # nmigen's sync domain -- TODO confirm this is the right idiom.
    from nmigen import ResetSignal
    with m.If(ResetSignal()):
        for i in range(NUM_LINES):
            sync += cache_valid_bits[i].eq(0)
        sync += r1.state.eq(State.IDLE)
        sync += r1.full.eq(0)
        sync += r1.slow_valid.eq(0)
        sync += r1.wb.cyc.eq(0)
        sync += r1.wb.stb.eq(0)
        sync += r1.ls_valid.eq(0)
        sync += r1.mmu_done.eq(0)
        # Not useful normally but helps avoiding tons of sim warnings.
        sync += r1.wb.adr.eq(0)
    with m.Else():
        # One cycle pulses reset.
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)
        sync += r1.ls_valid.eq(0)
        # Complete tlbies and TLB loads in the third cycle.
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way.
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    # NOTE(review): the VHDL zero-extends r1.reload_tag
                    # to TAG_WIDTH; the transcription had a meaningless
                    # Const(TAG_WIDTH, TAG_WIDTH) here -- TODO confirm.
                    sync += cache_tags[r1.store_index][
                        i * TAG_WIDTH:(i + 1) * TAG_WIDTH].eq(
                            r1.reload_tag)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            sync += req.eq(r1.req)
        with m.Else():
            sync += req.op.eq(req_op)
            sync += req.valid.eq(req_go)
            sync += req.mmu_req.eq(r0.mmu_req)
            sync += req.dcbz.eq(r0.req.dcbz)
            sync += req.real_addr.eq(ra)

            # Force data to 0 for dcbz.
            with m.If(~r0.req.dcbz):
                sync += req.data.eq(r0.req.data)
            with m.Else():
                sync += req.data.eq(0)

            # Select all bytes for dcbz and for cacheable loads.
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                sync += req.byte_sel.eq(-1)  # all ones
            with m.Else():
                sync += req.byte_sel.eq(r0.req.byte_sel)

            sync += req.hit_way.eq(req_hit_way)
            sync += req.same_tag.eq(req_same_tag)

            # Store the incoming request from r0 if it is a slow
            # request.  Note that r1.full = 1 implies req_op = OP_NONE.
            with m.If((req_op == Op.OP_LOAD_MISS)
                      | (req_op == Op.OP_LOAD_NC)
                      | (req_op == Op.OP_STORE_MISS)
                      | (req_op == Op.OP_STORE_HIT)):
                sync += r1.full.eq(1)

        # Main state machine.
        with m.Switch(r1.state):
            with m.Case(State.IDLE):
                # FIX(review): the slice bound was the Signal itself
                # (`[0:r1.wb.adr]`); use its width instead.
                sync += r1.wb.adr.eq(req.real_addr[0:len(r1.wb.adr)])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(get_index(req.real_addr))
                sync += r1.store_row.eq(get_row(req.real_addr))
                sync += r1.end_row_ix.eq(
                    get_row_of_line(get_row(req.real_addr)) - 1)
                sync += r1.reload_tag.eq(get_tag(req.real_addr))
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS.
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # Stay in IDLE state.
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        # Normal load cache miss,
                        # start the reload machine.
                        print(f"cache miss real addr:{req.real_addr}"
                              f" idx:{get_index(req.real_addr)}"
                              f" tag:{get_tag(req.real_addr)}")

                        # Start the wishbone cycle.
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent.
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    # FIX(review): `Case(A | B)` bitwise-ORs the two
                    # enum values into one wrong pattern; m.Case takes
                    # multiple values as separate arguments.
                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):  # was typo `req.bcbz`
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)
                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)
                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss
                            # except that we are writing to memory
                            # instead of reading.
                            # FIX(review): was `Op.RELOAD_WAIT_ACK`;
                            # the state lives in the State enum.
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)
                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing;
                    # OP_BAD & OP_STCX_FAIL were handled above already.
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                # Requests are all sent if stb is 0.
                sync += stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If(~wb_in.stall & ~stbs_done):
                    # That was the last word?  We are done sending.
                    # Clear stb and set stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    with m.If(is_last_row_addr(r1.wb.adr,
                                               r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        # FIX(review): VHDL sets `stbs_done := true`
                        # here; the transcription cleared it.
                        sync += stbs_done.eq(1)
                    with m.Else():
                        # Calculate the next row address.
                        sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))

                # Incoming acks processing.
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    sync += r1.rows_valid[
                        r1.store_row % ROW_PER_LINE].eq(1)

                    # If this is the data we were looking for, we can
                    # complete the request next cycle.  Compare the
                    # whole address in case the request in r1.req is
                    # not the one that started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz &
                                (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row ==
                               get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(-1)  # all ones
                        sync += r1.use_forward1.eq(1)

                    # Check for completion.
                    with m.If(stbs_done &
                              is_last_row(r1.store_row,
                                          r1.end_row_ix)):
                        # Complete wishbone cycle.
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid.
                        sync += cache_valid_bits[
                            r1.store_index][r1.store_way].eq(1)

                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter.
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                sync += stbs_done.eq(~r1.wb.stb)
                sync += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        sync += acks.eq(acks + 1)
                    with m.Else():
                        sync += acks.eq(acks - 1)

                sync += r1.acks_pending.eq(acks)

                # Clear stb when slave accepted request.
                with m.If(~wb_in.stall):
                    # See if there is another store waiting to be done
                    # which is in the same real page.
                    with m.If(req.valid):
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(
                            req.real_addr[0:SET_SIZE_BITS])
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    # FIX(review): `acks < 7 & req.same_tag` parsed as
                    # `acks < (7 & req.same_tag)`; parenthesise.  Also
                    # fixed typos Op_STORE_MISS / OP_SOTRE_HIT.
                    with m.Elif((acks < 7) & req.same_tag &
                                ((req.op == Op.OP_STORE_MISS)
                                 | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        sync += stbs_done.eq(0)
                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        # Store requests never come from the MMU.
                        sync += r1.ls_valid.eq(1)
                        sync += stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        sync += stbs_done.eq(1)

                # Got ack?  See if complete.
                with m.If(wb_in.ack):
                    # FIX(review): was `stbs_done & acks` -- the VHDL
                    # tests `acks = 1`.
                    with m.If(stbs_done & (acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request.
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack?  Complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)
                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)
                    sync += r1.forward_sel.eq(-1)  # all ones
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)
2153 # dc_log: if LOG_LENGTH > 0 generate
# TODO: learn how to translate a VHDL 'generate' block into nmigen
def dcache_log(self, r1, valid_ra, tlb_hit_way, stall_out,
               d_out, wb_in, log_out):
    """Debug/trace logging (VHDL `dc_log: if LOG_LENGTH > 0 generate`).

    Packs a snapshot of the dcache's interesting control signals into
    a 20-bit word each cycle and drives it out on ``log_out``.

    NOTE(review): ``m``, ``sync``, ``comb`` and ``req_op`` are free
    names that must be in scope at elaboration -- ``req_op`` in
    particular is used but not a parameter; TODO pass it in.
    """
    log_data = Signal(20)

    # FIX(review): the transcription wrapped live Signals in Const(),
    # which requires Python integers; take 3-bit slices instead.
    # NOTE(review): nmigen Cat() is LSB-first, the reverse of VHDL `&`
    # concatenation -- TODO confirm the intended bit layout, and that
    # the r1.wb.adr element (VHDL `r1.wb.adr(5 downto 3)`) belongs at
    # the end; it was missing from the transcription.
    sync += log_data.eq(Cat(
        r1.state[:3], valid_ra, tlb_hit_way[:3],
        stall_out, req_op[:3], d_out.valid, d_out.error,
        r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
        r1.wb.adr[3:6]))

    comb += log_out.eq(log_data)
def elaborate(self, platform):
    """Elaborate the data cache.

    Sets up cache/TLB geometry constants, declares storage arrays and
    helper signals, defines address-decoding helper functions, checks
    the geometry with assertions, and wires up the top-level
    combinatorial connections.

    NOTE(review): this is a work-in-progress translation of microwatt
    dcache.vhdl.  A number of source lines were lost in extraction
    (constructor headers, return statements, assert continuations);
    the lost spots and the remaining syntax/logic defects are flagged
    inline rather than guessed at.
    """
    # Local aliases for the geometry parameters held on the instance.
    LINE_SIZE = self.LINE_SIZE
    NUM_LINES = self.NUM_LINES
    NUM_WAYS = self.NUM_WAYS
    TLB_SET_SIZE = self.TLB_SET_SIZE
    TLB_NUM_WAYS = self.TLB_NUM_WAYS
    TLB_LG_PGSZ = self.TLB_LG_PGSZ
    LOG_LENGTH = self.LOG_LENGTH

    # BRAM organisation: We never access more than
    # -- wishbone_data_bits at a time so to save
    # -- resources we make the array only that wide, and
    # -- use consecutive indices for to make a cache "line"
    # -- ROW_SIZE is the width in bytes of the BRAM
    # -- (based on WB, so 64-bits)
    # NOTE(review): true division yields a float here (the trailing
    # semicolon is also redundant); should be WB_DATA_BITS // 8.
    ROW_SIZE = WB_DATA_BITS / 8;

    # ROW_PER_LINE is the number of row (wishbone
    # transactions) in a line
    ROW_PER_LINE = LINE_SIZE // ROW_SIZE

    # BRAM_ROWS is the number of rows in BRAM needed
    # to represent the full dcache
    BRAM_ROWS = NUM_LINES * ROW_PER_LINE

    # Bit fields counts in the address

    # REAL_ADDR_BITS is the number of real address
    # bits that we store
    # NOTE(review): the REAL_ADDR_BITS definition itself is not
    # visible in this chunk although it is used below — TODO confirm
    # it is defined elsewhere in the file.

    # ROW_BITS is the number of bits to select a row
    ROW_BITS = log2_int(BRAM_ROWS)

    # ROW_LINE_BITS is the number of bits to select
    # a row within a line
    ROW_LINE_BITS = log2_int(ROW_PER_LINE)

    # LINE_OFF_BITS is the number of bits for
    # the offset in a cache line
    LINE_OFF_BITS = log2_int(LINE_SIZE)

    # ROW_OFF_BITS is the number of bits for
    # the offset in a row
    ROW_OFF_BITS = log2_int(ROW_SIZE)

    # INDEX_BITS is the number if bits to
    # select a cache line
    INDEX_BITS = log2_int(NUM_LINES)

    # SET_SIZE_BITS is the log base 2 of the set size
    SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

    # TAG_BITS is the number of bits of
    # the tag part of the address
    TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

    # TAG_WIDTH is the width in bits of each way of the tag RAM
    # (the tag rounded up to the next multiple of 8 bits)
    TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

    # WAY_BITS is the number of bits to select a way
    WAY_BITS = log2_int(NUM_WAYS)

    # Example of layout for 32 lines of 64 bytes:
    # .. tag    |index| line  |
    # ..        |     |---|   | ROW_LINE_BITS (3)
    # ..        |     |--- - --| LINE_OFF_BITS (6)
    # ..        |         |- --| ROW_OFF_BITS (3)
    # ..        |----- ---|   | ROW_BITS (8)
    # ..        |-----|       | INDEX_BITS (5)
    # .. --------|            | TAG_BITS (45)

    TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

    # One tag set per cache line.
    def CacheTagArray():
        return Array(CacheTagSet() for x in range(NUM_LINES))

    # Per-line valid bits, one per way.
    def CacheValidBitsArray():
        return Array(CacheWayValidBits() for x in range(NUM_LINES))

    # One valid bit per row of a line (reload-progress tracking).
    def RowPerLineValidArray():
        return Array(Signal() for x in range(ROW_PER_LINE))

    # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
    cache_tags = CacheTagArray()
    cache_tag_set = Signal(TAG_RAM_WIDTH)
    cache_valid_bits = CacheValidBitsArray()

    # TODO attribute ram_style : string;
    # TODO attribute ram_style of cache_tags : signal is "distributed";

    # TLB geometry.
    TLB_SET_BITS = log2_int(TLB_SET_SIZE)
    TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
    TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
    TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
    # NOTE(review): TLB_PTE_BITS is not defined anywhere in this
    # chunk — TODO confirm it is defined elsewhere. Trailing
    # semicolon is redundant.
    TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;

    # NOTE(review): the bodies of the next few array constructors were
    # truncated in extraction — the "return Array(" wrappers and the
    # TLBTagsArray() / TLBPtesArray() / HitWaySet() headers are
    # missing, leaving bare generator expressions (syntax errors).
    def TLBValidBitsArray():
        Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE)
        Signal(TLB_TAG_WAY_BITS) for x in range(TLB_SET_SIZE)
        Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE)
        return Array(Signal(NUM_WAYS) for x in range(TLB_NUM_WAYS))

    """note: these are passed to nmigen.hdl.Memory as "attributes".
    don't know how, just that they are.
    """
    dtlb_valid_bits = TLBValidBitsArray()
    dtlb_tags = TLBTagsArray()
    dtlb_ptes = TLBPtesArray()
    # TODO attribute ram_style of
    #  dtlb_tags : signal is "distributed";
    # TODO attribute ram_style of
    #  dtlb_ptes : signal is "distributed";

    # Reservation (stcx.) tracking state.
    reservation = Reservation()

    # Async signals on incoming request
    # NOTE(review): several widths below look like element *counts*,
    # not bit widths (e.g. Signal(NUM_LINES) vs Signal(INDEX_BITS),
    # Signal(BRAM_ROWS) vs Signal(ROW_BITS)) — TODO confirm.
    req_index = Signal(NUM_LINES)
    req_row = Signal(BRAM_ROWS)
    req_hit_way = Signal(WAY_BITS)
    req_tag = Signal(TAG_BITS)
    req_data = Signal(64)
    req_same_tag = Signal()

    early_req_row = Signal(BRAM_ROWS)

    cancel_store = Signal()
    clear_rsrv = Signal()

    use_forward1_next = Signal()
    use_forward2_next = Signal()

    # Cache RAM interface
    # NOTE(review): the "def CacheRamOut():" header was lost in
    # extraction; only its return statement survives here.
    return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))

    cache_out = CacheRamOut()

    # PLRU output interface
    # NOTE(review): the "def PLRUOut():" header was lost; "Index()"
    # also looks wrong as a range bound — TODO confirm intended bound.
    return Array(Signal(WAY_BITS) for x in range(Index()))

    plru_victim = PLRUOut()
    replace_way = Signal(WAY_BITS)

    # Wishbone read/write/cache write formatting signals
    # (their declarations are not visible in this chunk)

    # TLB lookup signals.
    tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
    tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
    tlb_valid_way = Signal(TLB_NUM_WAYS)
    # NOTE(review): again looks like a count rather than a bit width
    # (TLB_SET_SIZE vs TLB_SET_BITS) — TODO confirm.
    tlb_req_index = Signal(TLB_SET_SIZE)
    tlb_hit_way = Signal(TLB_NUM_WAYS)
    pte = Signal(TLB_PTE_BITS)
    ra = Signal(REAL_ADDR_BITS)
    perm_attr = PermAttr()
    access_ok = Signal()

    # TLB PLRU output interface
    # NOTE(review): the "def TLBPLRUOut(): return Array(" wrapper was
    # lost in extraction; only the generator expression survives.
    Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE)

    tlb_plru_victim = TLBPLRUOut()

    # Helper functions to decode incoming requests

    # Return the cache line index (tag index) for an address
    def get_index(addr):
        return addr[LINE_OFF_BITS:SET_SIZE_BITS]

    # Return the cache row index (data memory) for an address
    # NOTE(review): the "def get_row(addr):" header was lost.
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

    # Return the index of a row within a line
    # NOTE(review): the assignment driving row_v from `row` appears to
    # be missing; as written, row_v is returned undriven.
    def get_row_of_line(row):
        row_v = Signal(ROW_BITS)
        return row_v[0:ROW_LINE_BITS]

    # Returns whether this is the last row of a line
    def is_last_row_addr(addr, last):
        return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

    # Returns whether this is the last row of a line
    def is_last_row(row, last):
        return get_row_of_line(row) == last

    # Return the address of the next row in the current cache line
    # NOTE(review): nmigen signals cannot be assigned with "=" nor
    # slice-assigned like this, and "return result" is missing.
    def next_row_addr(addr):
        row_idx = Signal(ROW_LINE_BITS)
        result = WBAddrType()
        # Is there no simpler way in VHDL to
        # generate that 3 bits adder ?
        row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS]
        row_idx = Signal(row_idx + 1)
        result[ROW_OFF_BITS:LINE_OFF_BITS] = row_idx

    # Return the next row in the current cache line. We use a
    # dedicated function in order to limit the size of the
    # generated adder to be only the bits within a cache line
    # (3 bits with default settings)
    # NOTE(review): the "def next_row(row):" header and the trailing
    # "return" were lost in extraction, so these statements dangle at
    # elaborate() level.
    row_v = Signal(ROW_BITS)
    row_idx = Signal(ROW_LINE_BITS)
    result = Signal(ROW_BITS)
    row_idx = row_v[ROW_LINE_BITS]
    row_v[0:ROW_LINE_BITS] = Signal(row_idx + 1)

    # Get the tag value from the address
    # NOTE(review): the "def get_tag(addr):" header was lost.
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

    # Read a tag from a tag memory row
    def read_tag(way, tagset):
        return tagset[way*TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]

    # Read a TLB tag from a TLB tag memory row
    def read_tlb_tag(way, tags):
        j = way * TLB_EA_TAG_BITS
        return tags[j:j + TLB_EA_TAG_BITS]

    # Write a TLB tag to a TLB tag memory row
    # NOTE(review): stray ")" after "tags" — the signature should read
    # (way, tags, tag).
    def write_tlb_tag(way, tags), tag):
        j = way * TLB_EA_TAG_BITS
        tags[j:j + TLB_EA_TAG_BITS] = tag

    # Read a PTE from a TLB PTE memory row
    def read_tlb_pte(way, ptes):
        j = way * TLB_PTE_BITS
        return ptes[j:j + TLB_PTE_BITS]

    # Write a PTE into a TLB PTE memory row
    # NOTE(review): "return <assignment>" is not valid Python; the
    # assignment should stand alone.
    def write_tlb_pte(way, ptes,newpte):
        j = way * TLB_PTE_BITS
        return ptes[j:j + TLB_PTE_BITS] = newpte

    # Geometry sanity checks.
    # NOTE(review): every assert below is missing the comma before its
    # message string (syntax errors as written); the "% 2" tests check
    # evenness, not power-of-two; one assert uses "=" instead of "==";
    # ROW_LINEBITS and wishbone_data_bits look like typos for
    # ROW_LINE_BITS and WB_DATA_BITS; two message continuations were
    # lost in extraction.
    assert (LINE_SIZE % ROW_SIZE) == 0 "LINE_SIZE not " \
        "multiple of ROW_SIZE"

    assert (LINE_SIZE % 2) == 0 "LINE_SIZE not power of 2"

    assert (NUM_LINES % 2) == 0 "NUM_LINES not power of 2"

    assert (ROW_PER_LINE % 2) == 0 "ROW_PER_LINE not"

    assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS) \
        "geometry bits don't add up"

    assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) \
        "geometry bits don't add up"

    assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS \
        + LINE_OFF_BITS) "geometry bits don't add up"

    assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS) \
        "geometry bits don't add up"

    assert 64 == wishbone_data_bits "Can't yet handle a" \
        "wishbone width that isn't 64-bits"

    assert SET_SIZE_BITS <= TLB_LG_PGSZ "Set indexed by"

    # we don't yet handle collisions between loadstore1 requests
    # NOTE(review): `comb`, `m_out`, `r0_stall`, `r0_full`, `r0_valid`,
    # `r1`, `stall_out` and `wishbone_out` are free names here — the
    # Module()/domain setup and signal unpacking presumably fell in an
    # extraction gap.  TODO confirm.
    comb += m_out.stall.eq(0)

    # Hold off the request in r0 when r1 has an uncompleted request
    comb += r0_stall.eq(r0_full & r1.full)
    comb += r0_valid.eq(r0_full & ~r1.full)
    comb += stall_out.eq(r0_stall)

    # Wire up wishbone request latch out of stage 1
    comb += wishbone_out.eq(r1.wb)
2527 # entity dcache_tb is
2530 # architecture behave of dcache_tb is
2531 # signal clk : std_ulogic;
2532 # signal rst : std_ulogic;
2534 # signal d_in : Loadstore1ToDcacheType;
2535 # signal d_out : DcacheToLoadstore1Type;
2537 # signal m_in : MmuToDcacheType;
2538 # signal m_out : DcacheToMmuType;
2540 # signal wb_bram_in : wishbone_master_out;
2541 # signal wb_bram_out : wishbone_slave_out;
2543 # constant clk_period : time := 10 ns;
2545 # dcache0: entity work.dcache
2558 # wishbone_out => wb_bram_in,
2559 # wishbone_in => wb_bram_out
2562 # -- BRAM Memory slave
2563 # bram0: entity work.wishbone_bram_wrapper
2565 # MEMORY_SIZE => 1024,
2566 # RAM_INIT_FILE => "icache_test.bin"
2571 # wishbone_in => wb_bram_in,
2572 # wishbone_out => wb_bram_out
2575 # clk_process: process
2578 # wait for clk_period/2;
2580 # wait for clk_period/2;
2583 # rst_process: process
2586 # wait for 2*clk_period;
2594 # d_in.valid <= '0';
2597 # d_in.addr <= (others => '0');
2598 # d_in.data <= (others => '0');
2599 # m_in.valid <= '0';
2600 # m_in.addr <= (others => '0');
2601 # m_in.pte <= (others => '0');
2603 # wait for 4*clk_period;
2604 # wait until rising_edge(clk);
2606 # -- Cacheable read of address 4
2609 # d_in.addr <= x"0000000000000004";
2610 # d_in.valid <= '1';
2611 # wait until rising_edge(clk);
2612 # d_in.valid <= '0';
2614 # wait until rising_edge(clk) and d_out.valid = '1';
2615 # assert d_out.data = x"0000000100000000"
2616 # report "data @" & to_hstring(d_in.addr) &
2617 # "=" & to_hstring(d_out.data) &
2618 # " expected 0000000100000000"
2620 # -- wait for clk_period;
2622 # -- Cacheable read of address 30
2625 # d_in.addr <= x"0000000000000030";
2626 # d_in.valid <= '1';
2627 # wait until rising_edge(clk);
2628 # d_in.valid <= '0';
2630 # wait until rising_edge(clk) and d_out.valid = '1';
2631 # assert d_out.data = x"0000000D0000000C"
2632 # report "data @" & to_hstring(d_in.addr) &
2633 # "=" & to_hstring(d_out.data) &
2634 # " expected 0000000D0000000C"
2637 # -- Non-cacheable read of address 100
2640 # d_in.addr <= x"0000000000000100";
2641 # d_in.valid <= '1';
2642 # wait until rising_edge(clk);
2643 # d_in.valid <= '0';
2644 # wait until rising_edge(clk) and d_out.valid = '1';
2645 # assert d_out.data = x"0000004100000040"
2646 # report "data @" & to_hstring(d_in.addr) &
2647 # "=" & to_hstring(d_out.data) &
2648 # " expected 0000004100000040"
2651 # wait until rising_edge(clk);
2652 # wait until rising_edge(clk);
2653 # wait until rising_edge(clk);
2654 # wait until rising_edge(clk);
def dcache_sim(dut):
    """Generator-based test stimulus for the dcache.

    Mirrors the commented-out VHDL testbench above: resets the
    loadstore1 and MMU request inputs, then issues two cacheable reads
    (addresses 0x04 and 0x30) and one non-cacheable read (0x100),
    checking the returned data against the values the wishbone BRAM
    model is expected to hold.

    Fixes over the original: `d_in.adrr` typo corrected to `d_in.addr`;
    response data is sampled with `yield` before comparing (comparing
    the Signal object itself cannot work); the assert message strings
    were missing their comma separator; the clock ticks implied by the
    VHDL "wait until rising_edge(clk)" lines are restored.
    """

    def do_read(addr, nc, expected):
        """Issue one load request and wait for / check the response."""
        yield dut.d_in.load.eq(1)
        yield dut.d_in.nc.eq(nc)
        yield dut.d_in.addr.eq(Const(addr, 64))
        yield dut.d_in.valid.eq(1)
        yield                       # one clock with valid asserted
        yield dut.d_in.valid.eq(0)
        # wait until rising_edge(clk) and d_out.valid = '1'
        while not (yield dut.d_out.valid):
            yield
        data = yield dut.d_out.data
        assert data == expected, \
            f"data @{addr:016x}={data:016x} " \
            f"expected {expected:016x} -!- severity failure"

    # Reset all request inputs (d_in and m_in) to idle.
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)       # was "adrr" — attribute typo
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    for _ in range(4):
        yield

    # Cacheable read of address 4
    yield from do_read(0x0000000000000004, 0, 0x0000000100000000)

    # Cacheable read of address 30
    yield from do_read(0x0000000000000030, 0, 0x0000000D0000000C)

    # Non-cacheable read of address 100
    yield from do_read(0x0000000000000100, 1, 0x0000004100000040)
# Convert the dcache design to RTLIL for inspection.
# NOTE(review): `dut` and `rtlil` come from lines lost in extraction
# (presumably a test_dcache() wrapper that creates dut = Dcache() and
# imports nmigen.back.rtlil) — TODO confirm against upstream.
vl = rtlil.convert(dut, ports=[])
with open("test_dcache.il", "w") as f:
    # NOTE(review): the body of this `with` block (presumably
    # f.write(vl)) was lost in extraction, leaving it empty.

# Run the generator-based stimulus against the design, dumping a VCD.
# NOTE(review): dcache_sim() is called without its required `dut`
# argument — should read dcache_sim(dut).  TODO confirm and fix.
run_simulation(dut, dcache_sim(), vcd_name='test_dcache.vcd')
2734 if __name__
== '__main__':