1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Const,
10 Array, ResetSignal)
11 from nmigen.cli import main
12 from nmutil.iocontrol import RecordObject
13 from nmigen.utils import log2_int
14
15 from experiment.mem_types import (LoadStore1ToDCacheType,
16 DCacheToLoadStore1Type,
17 MMUToDCacheType,
18 DCacheToMMUType)
19
20 from experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
21 WBAddrType, WBDataType, WBSelType,
22 WBMasterOut, WBSlaveOut,
23 WBMasterOutVector, WBSlaveOutVector,
24 WBIOMasterOut, WBIOSlaveOut)
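
# NOTE: PLRU and CacheRam are instantiated below but not yet imported in
# this work-in-progress file; they are assumed to be provided by the
# accompanying nmutil / experiment packages in this repository.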
25
26
27 # Record for storing permission, attribute, etc. bits from a PTE
28 class PermAttr(RecordObject):
29 def __init__(self):
30 super().__init__()
31 self.reference = Signal()
32 self.changed = Signal()
33 self.nocache = Signal()
34 self.priv = Signal()
35 self.rd_perm = Signal()
36 self.wr_perm = Signal()
37
38
39 def extract_perm_attr(pte):
40 pa = PermAttr()
41 pa.reference = pte[8]
42 pa.changed = pte[7]
43 pa.nocache = pte[5]
44 pa.priv = pte[3]
45 pa.rd_perm = pte[2]
46 pa.wr_perm = pte[1]
47 return pa
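
# (The bit positions above follow the radix PTE layout assumed by the
# original microwatt dcache.vhdl.)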
48
49
50 # Type of operation on a "valid" input
51 @unique
52 class Op(Enum):
53 OP_NONE = 0
54 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
55 OP_STCX_FAIL = 2 # conditional store w/o reservation
56 OP_LOAD_HIT = 3 # Cache hit on load
57 OP_LOAD_MISS = 4 # Load missing cache
58 OP_LOAD_NC = 5 # Non-cachable load
59 OP_STORE_HIT = 6 # Store hitting cache
60 OP_STORE_MISS = 7 # Store missing cache
61
62
63 # Cache state machine
64 @unique
65 class State(Enum):
66 IDLE = 0 # Normal load hit processing
67 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
68 STORE_WAIT_ACK = 2 # Store wait ack
69 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
70
71
72 # Dcache operations:
73 #
74 # In order to make timing, we use the BRAMs with
75 # an output buffer, which means that the BRAM
76 # output is delayed by an extra cycle.
77 #
78 # Thus, the dcache has a 2-stage internal pipeline
79 # for cache hits with no stalls.
80 #
81 # All other operations are handled via stalling
82 # in the first stage.
83 #
84 # The second stage can thus complete a hit at the same
85 # time as the first stage emits a stall for a complex op.
86 #
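# As a sketch (inferred from the description above, not from simulation),
# a load hit flows through the pipeline as:
#
#   cycle 0: request latched into r0 (stage 0)
#   cycle 1: BRAM read issued, tag compare / hit detection (stage 1)
#   cycle 2: buffered BRAM output valid, data returned
#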
87 # Stage 0 register, basically contains just the latched request
88 class RegStage0(RecordObject):
89 def __init__(self):
90 super().__init__()
91 self.req = LoadStore1ToDCacheType()
92 self.tlbie = Signal()
93 self.doall = Signal()
94 self.tlbld = Signal()
95 self.mmu_req = Signal() # indicates source of request
96
97
98 class MemAccessRequest(RecordObject):
99 def __init__(self):
100 super().__init__()
101 self.op = Signal(Op)
102 self.valid = Signal()
103 self.dcbz = Signal()
104 self.real_addr = Signal(REAL_ADDR_BITS)
105 self.data = Signal(64)
106 self.byte_sel = Signal(8)
107 self.hit_way = Signal(WAY_BITS)
108 self.same_tag = Signal()
109 self.mmu_req = Signal()
110
111
112 # First stage register, contains state for stage 1 of load hits
113 # and for the state machine used by all other operations
114 class RegStage1(RecordObject):
115 def __init__(self):
116 super().__init__()
117 # Info about the request
118 self.full = Signal() # have uncompleted request
119 self.mmu_req = Signal() # request is from MMU
120 self.req = MemAccessRequest()
121
122 # Cache hit state
123 self.hit_way = Signal(WAY_BITS)
124 self.hit_load_valid = Signal()
125 self.hit_index = Signal(NUM_LINES)
126 self.cache_hit = Signal()
127
128 # TLB hit state
129 self.tlb_hit = Signal()
130 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
131 self.tlb_hit_index = Signal(TLB_SET_SIZE)
132
133 # 2-stage data buffer for data forwarded from writes to reads
134 self.forward_data1 = Signal(64)
135 self.forward_data2 = Signal(64)
136 self.forward_sel1 = Signal(8)
137 self.forward_valid1 = Signal()
138 self.forward_way1 = Signal(WAY_BITS)
139 self.forward_row1 = Signal(BRAM_ROWS)
140 self.use_forward1 = Signal()
141 self.forward_sel = Signal(8)
142
143 # Cache miss state (reload state machine)
144 self.state = Signal(State)
145 self.dcbz = Signal()
146 self.write_bram = Signal()
147 self.write_tag = Signal()
148 self.slow_valid = Signal()
149 self.wb = WBMasterOut()
150 self.reload_tag = Signal(TAG_BITS)
151 self.store_way = Signal(WAY_BITS)
152 self.store_row = Signal(BRAM_ROWS)
153 self.store_index = Signal(NUM_LINES)
154 self.end_row_ix = Signal(ROW_LINE_BITS)
155 self.rows_valid = RowPerLineValidArray()
156 self.acks_pending = Signal(3)
157 self.inc_acks = Signal()
158 self.dec_acks = Signal()
159
160 # Signals to complete (possibly with error)
161 self.ls_valid = Signal()
162 self.ls_error = Signal()
163 self.mmu_done = Signal()
164 self.mmu_error = Signal()
165 self.cache_paradox = Signal()
166
167 # Signal to complete a failed stcx.
168 self.stcx_fail = Signal()
169
170
171 # Reservation information
172 class Reservation(RecordObject):
173 def __init__(self):
174 super().__init__()
175 self.valid = Signal()
176 # TODO LINE_OFF_BITS is 6
177 self.addr = Signal(64 - LINE_OFF_BITS) # VHDL: addr(63 downto LINE_OFF_BITS)
178
179
180 # Set associative dcache write-through
181 #
182 # TODO (in no specific order):
183 #
184 # * See list in icache.vhdl
185 # * Complete load misses on the cycle when WB data comes instead of
186 # at the end of line (this requires dealing with requests coming in
187 # while not idle...)
188 class DCache(Elaboratable):
189 def __init__(self):
190 # TODO: make these parameters of DCache at some point
191 self.LINE_SIZE = 64 # Line size in bytes
192 self.NUM_LINES = 32 # Number of lines in a set
193 self.NUM_WAYS = 4 # Number of ways
194 self.TLB_SET_SIZE = 64 # L1 DTLB entries per set
195 self.TLB_NUM_WAYS = 2 # L1 DTLB number of sets
196 self.TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
197 self.LOG_LENGTH = 0 # Non-zero to enable log data collection
198
199 self.d_in = LoadStore1ToDCacheType()
200 self.d_out = DCacheToLoadStore1Type()
201
202 self.m_in = MMUToDCacheType()
203 self.m_out = DCacheToMMUType()
204
205 self.stall_out = Signal()
206
207 self.wb_out = WBMasterOut()
208 self.wb_in = WBSlaveOut()
209
210 self.log_out = Signal(20)
211
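# A minimal instantiation sketch (the right-hand-side names here are
# hypothetical caller signals, not part of this file):
#
#     m = Module()
#     m.submodules.dcache = dcache = DCache()
#     m.d.comb += dcache.d_in.eq(loadstore_request)
#     m.d.comb += dcache.m_in.eq(mmu_request)
#     # dcache.d_out / dcache.m_out / dcache.stall_out are wired back
#     # to loadstore1 and to the MMU respectively
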
212 # Latch the request in r0.req as long as we're not stalling
213 def stage_0(self, m, d_in, m_in):
214 comb = m.d.comb
215 sync = m.d.sync
216
217 r = RegStage0()
218
219 # TODO, this goes in unit tests and formal proofs
220 # assert ~(d_in.valid & m_in.valid),
221 # "request collision loadstore vs MMU"
222 with m.If(~(d_in.valid & m_in.valid)):
223 #sync += Display("request collision loadstore vs MMU")
224 pass
225
226 with m.If(m_in.valid):
227 sync += r.req.valid.eq(1)
228 sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
229 sync += r.req.dcbz.eq(0)
230 sync += r.req.nc.eq(0)
231 sync += r.req.reserve.eq(0)
232 sync += r.req.virt_mode.eq(1)
233 sync += r.req.priv_mode.eq(1)
234 sync += r.req.addr.eq(m_in.addr)
235 sync += r.req.data.eq(m_in.pte)
236 sync += r.req.byte_sel.eq(-1) # Const -1 sets all bits to 1
237 sync += r.tlbie.eq(m_in.tlbie)
238 sync += r.doall.eq(m_in.doall)
239 sync += r.tlbld.eq(m_in.tlbld)
240 sync += r.mmu_req.eq(1)
241 with m.Else():
242 sync += r.req.eq(d_in)
243 sync += r.tlbie.eq(0)
244 sync += r.doall.eq(0)
245 sync += r.tlbld.eq(0)
246 sync += r.mmu_req.eq(0)
247 with m.If(~(r1.full & r0_full)):
248 sync += r0.eq(r)
249 sync += r0_full.eq(r.req.valid)
250
251 # TLB
252 # Operates in the second cycle on the request latched in r0.req.
253 # TLB updates write the entry at the end of the second cycle.
254 def tlb_read(self, m, m_in, d_in, r0_stall, tlb_valid_way,
255 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
256 dtlb_tags, dtlb_ptes):
257
258 comb = m.d.comb
259 sync = m.d.sync
260
261 index = Signal(TLB_SET_BITS)
262 addrbits = Signal(TLB_SET_BITS)
263
264 amin = TLB_LG_PGSZ
265 amax = TLB_LG_PGSZ + TLB_SET_BITS
266
267 with m.If(m_in.valid):
268 comb += addrbits.eq(m_in.addr[amin : amax])
269 with m.Else():
270 comb += addrbits.eq(d_in.addr[amin : amax])
271 comb += index.eq(addrbits)
272
273 # If we have any op and the previous op isn't finished,
274 # then keep the same output for next cycle.
275 with m.If(~r0_stall):
276 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
277 sync += tlb_tag_way.eq(dtlb_tags[index])
278 sync += tlb_pte_way.eq(dtlb_ptes[index])
279
280 # Generate TLB PLRUs
281 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
282 comb = m.d.comb
283 sync = m.d.sync
284
285 if TLB_NUM_WAYS > 1: # VHDL "generate": an elaboration-time condition
286 for i in range(TLB_SET_SIZE):
287 # TLB PLRU interface
288 tlb_plru = PLRU(TLB_WAY_BITS)
289 setattr(m.submodules, "tlb_plru%d" % i, tlb_plru)
290 tlb_plru_acc = Signal(TLB_WAY_BITS)
291 tlb_plru_acc_en = Signal()
292 tlb_plru_out = Signal(TLB_WAY_BITS)
293
294 comb += tlb_plru.acc.eq(tlb_plru_acc)
295 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
296 comb += tlb_plru_out.eq(tlb_plru.lru) # lru is a PLRU output
296
297 # PLRU interface
298 with m.If(r1.tlb_hit_index == i):
299 comb += tlb_plru.acc_en.eq(r1.tlb_hit)
300 with m.Else():
301 comb += tlb_plru.acc_en.eq(0)
302 comb += tlb_plru.acc.eq(r1.tlb_hit_way)
303
304 comb += tlb_plru_victim[i].eq(tlb_plru.lru)
305
306 def tlb_search(self, m, tlb_req_index, r0, tlb_valid_way, tlb_tag_way,
307 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
308
309 comb = m.d.comb
310 sync = m.d.sync
311
312 hitway = Signal(TLB_WAY_BITS)
313 hit = Signal()
314 eatag = Signal(TLB_EA_TAG_BITS)
315
316 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
317 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
318 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
319
320 for i in range(TLB_NUM_WAYS):
321 with m.If(tlb_valid_way[i] &
322 (read_tlb_tag(i, tlb_tag_way) == eatag)):
323 comb += hitway.eq(i)
324 comb += hit.eq(1)
325
326 comb += tlb_hit.eq(hit & r0_valid)
327 comb += tlb_hit_way.eq(hitway)
328
329 with m.If(tlb_hit):
330 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
331 with m.Else():
332 comb += pte.eq(0)
333 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
334 with m.If(r0.req.virt_mode):
335 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
336 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
337 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
338 comb += perm_attr.eq(extract_perm_attr(pte))
339 with m.Else():
340 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
341 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
342
343 comb += perm_attr.reference.eq(1)
344 comb += perm_attr.changed.eq(1)
345 comb += perm_attr.priv.eq(1)
346 comb += perm_attr.nocache.eq(0)
347 comb += perm_attr.rd_perm.eq(1)
348 comb += perm_attr.wr_perm.eq(1)
349
350 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
351 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
352 dtlb_tags, tlb_pte_way, dtlb_ptes):
353
354 comb = m.d.comb
355 sync = m.d.sync
356
357 # variable tlbie : std_ulogic;
358 # variable tlbwe : std_ulogic;
359 # variable repl_way : tlb_way_t;
360 # variable eatag : tlb_tag_t;
361 # variable tagset : tlb_way_tags_t;
362 # variable pteset : tlb_way_ptes_t;
363 #type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t;
364 # --> Array([Signal(log(way_tags length)) for i in range(number of tlbs)])
365
366 tlbie = Signal()
367 tlbwe = Signal()
368 repl_way = Signal(TLB_WAY_BITS)
369 eatag = Signal(TLB_EA_TAG_BITS)
370 tagset = TLBWayTags()
371 pteset = TLBWayPtes()
372
373 comb += tlbie.eq(r0_valid & r0.tlbie)
374 comb += tlbwe.eq(r0_valid & r0.tlbld)
375
376 with m.If(tlbie & r0.doall):
377 # clear all valid bits at once
378 for i in range(TLB_SET_SIZE):
379 sync += dtlb_valid_bits[i].eq(0)
380
381 with m.Elif(tlbie):
382 with m.If(tlb_hit):
383 sync += dtlb_valid_bits[tlb_req_index][tlb_hit_way].eq(0)
384 with m.Elif(tlbwe):
385 with m.If(tlb_hit):
386 comb += repl_way.eq(tlb_hit_way)
387 with m.Else():
388 comb += repl_way.eq(tlb_plru_victim[tlb_req_index])
389 comb += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
390 comb += tagset.eq(tlb_tag_way)
391 sync += write_tlb_tag(repl_way, tagset, eatag)
392 sync += dtlb_tags[tlb_req_index].eq(tagset)
393 comb += pteset.eq(tlb_pte_way)
394 sync += write_tlb_pte(repl_way, pteset, r0.req.data)
395 sync += dtlb_ptes[tlb_req_index].eq(pteset)
396 sync += dtlb_valid_bits[tlb_req_index][repl_way].eq(1)
397
398 # Generate PLRUs
399 def maybe_plrus(self, m, r1, plru_victim):
400
401 comb = m.d.comb
402 sync = m.d.sync
403
404 for i in range(NUM_LINES):
405 # PLRU interface (cache PLRUs are WAY_BITS wide)
406 plru = PLRU(WAY_BITS)
407 setattr(m.submodules, "plru%d" % i, plru)
408 plru_acc = Signal(WAY_BITS)
409 plru_acc_en = Signal()
410 plru_out = Signal(WAY_BITS)
411
412 comb += plru.acc.eq(plru_acc)
413 comb += plru.acc_en.eq(plru_acc_en)
414 comb += plru_out.eq(plru.lru) # lru is a PLRU output
415
416 with m.If(r1.hit_index == i):
417 comb += plru_acc_en.eq(r1.cache_hit)
418
419 comb += plru_acc.eq(r1.hit_way)
420 comb += plru_victim[i].eq(plru_out)
421
422 # Cache tag RAM read port
423 def cache_tag_read(self, r0_stall, req_index, m_in, d_in,
424 cache_tag_set, cache_tags):
425
426 comb = m.d.comb
427 sync = m.d.sync
428
429 index = Signal(INDEX_BITS)
430
431 with m.If(r0_stall):
432 comb += index.eq(req_index)
433 with m.Elif(m_in.valid):
434 comb += index.eq(get_index(m_in.addr))
435 with m.Else():
436 comb += index.eq(get_index(d_in.addr))
437 sync += cache_tag_set.eq(cache_tags[index])
438
439 # Cache request parsing and hit detection
440 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
441 r0_valid, r1, cache_valid_bits, replace_way,
442 use_forward1_next, use_forward2_next,
443 req_hit_way, plru_victim, rc_ok, perm_attr,
444 valid_ra, perm_ok, access_ok, req_op, req_ok,
445 r0_stall, m_in, early_req_row, d_in):
446
447 comb = m.d.comb
448 sync = m.d.sync
449
450 is_hit = Signal()
451 hit_way = Signal(WAY_BITS)
452 op = Signal(Op)
453 opsel = Signal(3)
454 go = Signal()
455 nc = Signal()
456 s_hit = Signal()
457 s_tag = Signal(TAG_BITS)
458 s_pte = Signal(TLB_PTE_BITS)
459 s_ra = Signal(REAL_ADDR_BITS)
460 hit_set = Signal(TLB_NUM_WAYS)
461 hit_way_set = HitWaySet()
462 rel_matches = Signal(TLB_NUM_WAYS)
463 rel_match = Signal()
464
465 # Extract line, row and tag from request
466 comb += req_index.eq(get_index(r0.req.addr))
467 comb += req_row.eq(get_row(r0.req.addr))
468 comb += req_tag.eq(get_tag(ra))
469
470 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
471
472 # Test if pending request is a hit on any way
473 # In order to make timing in virtual mode,
474 # when we are using the TLB, we compare each
475 # way with each of the real addresses from each way of
476 # the TLB, and then decide later which match to use.
477
478 with m.If(r0.req.virt_mode):
479 comb += rel_matches.eq(0)
480 for j in range(TLB_NUM_WAYS):
481 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
482 comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
483 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
484 comb += s_tag.eq(get_tag(s_ra))
485
486 for i in range(NUM_WAYS):
487 with m.If(go & cache_valid_bits[req_index][i] &
488 (read_tag(i, cache_tag_set) == s_tag)
489 & tlb_valid_way[j]):
490 comb += hit_way_set[j].eq(i)
491 comb += s_hit.eq(1)
492 comb += hit_set[j].eq(s_hit)
493 with m.If(s_tag == r1.reload_tag):
494 comb += rel_matches[j].eq(1)
495 with m.If(tlb_hit):
496 comb += is_hit.eq(hit_set[tlb_hit_way])
497 comb += hit_way.eq(hit_way_set[tlb_hit_way])
498 comb += rel_match.eq(rel_matches[tlb_hit_way])
499 with m.Else():
500 comb += s_tag.eq(get_tag(r0.req.addr))
501 for i in range(NUM_WAYS):
502 with m.If(go & cache_valid_bits[req_index][i] &
503 (read_tag(i, cache_tag_set) == s_tag)):
504 comb += hit_way.eq(i)
505 comb += is_hit.eq(1)
506 with m.If(s_tag == r1.reload_tag):
507 comb += rel_match.eq(1)
508 comb += req_same_tag.eq(rel_match)
509
510 # See if the request matches the line currently being reloaded
511 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
512 (req_index == r1.store_index) & rel_match):
513 # For a store, consider this a hit even if the row isn't
514 # valid since it will be by the time we perform the store.
515 # For a load, check the appropriate row valid bit.
516 valid = r1.rows_valid[req_row % ROW_PER_LINE]
517 comb += is_hit.eq(~r0.req.load | valid)
518 comb += hit_way.eq(replace_way)
519
520 # Whether to use forwarded data for a load or not
521 comb += use_forward1_next.eq(0)
522 with m.If((get_row(r1.req.real_addr) == req_row)
523 & (r1.req.hit_way == hit_way)):
524 # Only need to consider r1.write_bram here, since if we
525 # are writing refill data here, then we don't have a
526 # cache hit this cycle on the line being refilled.
527 # (There is the possibility that the load following the
528 # load miss that started the refill could be to the old
529 # contents of the victim line, since it is a couple of
530 # cycles after the refill starts before we see the updated
531 # cache tag. In that case we don't use the bypass.)
532 comb += use_forward1_next.eq(r1.write_bram)
533 comb += use_forward2_next.eq(0)
534 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
535 comb += use_forward2_next.eq(r1.forward_valid1)
536
537 # The way that matched on a hit
538 comb += req_hit_way.eq(hit_way)
539
540 # The way to replace on a miss
541 with m.If(r1.write_tag):
542 comb += replace_way.eq(plru_victim[r1.store_index])
543 with m.Else():
544 comb += replace_way.eq(r1.store_way)
545
546 # work out whether we have permission for this access
547 # NB we don't yet implement AMR, thus no KUAP
548 comb += rc_ok.eq( perm_attr.reference
549 & (r0.req.load | perm_attr.changed)
550 )
551 comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
552 & (perm_attr.wr_perm
553 | (r0.req.load & perm_attr.rd_perm))
554 )
555 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
556 # Combine the request and cache hit status to decide what
557 # operation needs to be done
558 comb += nc.eq(r0.req.nc | perm_attr.nocache)
559 comb += op.eq(Op.OP_NONE)
560 with m.If(go):
561 with m.If(~access_ok):
562 comb += op.eq(Op.OP_BAD)
563 with m.Elif(cancel_store):
564 comb += op.eq(Op.OP_STCX_FAIL)
565 with m.Else():
566 comb += opsel.eq(Cat(is_hit, nc, r0.req.load)) # bit0=hit, bit2=load
567 with m.Switch(opsel):
568 with m.Case(Const(0b101, 3)):
569 comb += op.eq(Op.OP_LOAD_HIT)
570 with m.Case(Const(0b100, 3)):
571 comb += op.eq(Op.OP_LOAD_MISS)
572 with m.Case(Const(0b110, 3)):
573 comb += op.eq(Op.OP_LOAD_NC)
574 with m.Case(Const(0b001, 3)):
575 comb += op.eq(Op.OP_STORE_HIT)
576 with m.Case(Const(0b000, 3)):
577 comb += op.eq(Op.OP_STORE_MISS)
578 with m.Case(Const(0b010, 3)):
579 comb += op.eq(Op.OP_STORE_MISS)
580 with m.Case(Const(0b011, 3)):
581 comb += op.eq(Op.OP_BAD)
582 with m.Case(Const(0b111, 3)):
583 comb += op.eq(Op.OP_BAD)
584 with m.Default():
585 comb += op.eq(Op.OP_NONE)
586 comb += req_op.eq(op)
587 comb += req_go.eq(go)
588
589 # Version of the row number that is valid one cycle earlier
590 # in the cases where we need to read the cache data BRAM.
591 # If we're stalling then we need to keep reading the last
592 # row requested.
593 with m.If(~r0_stall):
594 with m.If(m_in.valid):
595 comb += early_req_row.eq(get_row(m_in.addr))
596 with m.Else():
597 comb += early_req_row.eq(get_row(d_in.addr))
598 with m.Else():
599 comb += early_req_row.eq(req_row)
600
601 # Handle load-with-reservation and store-conditional instructions
602 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
603 r0_valid, r0, reservation):
604
605 comb = m.d.comb
606 sync = m.d.sync
607
608 with m.If(r0_valid & r0.req.reserve):
609
610 # XXX generate alignment interrupt if address
611 # is not aligned XXX or if r0.req.nc = '1'
612 with m.If(r0.req.load):
613 comb += set_rsrv.eq(1) # load with reservation
614 with m.Else():
615 comb += clear_rsrv.eq(1) # store conditional
616 with m.If(~reservation.valid |
617 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
617 comb += cancel_store.eq(1)
618
619 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
620 reservation, r0):
621
622 comb = m.d.comb
623 sync = m.d.sync
624
625 with m.If(r0_valid & access_ok):
626 with m.If(clear_rsrv):
627 sync += reservation.valid.eq(0)
628 with m.Elif(set_rsrv):
629 sync += reservation.valid.eq(1)
630 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
631
632 # Return data for loads & completion control logic
633 def writeback_control(self, m, r1, cache_out, d_out, m_out):
634
635 comb = m.d.comb
636 sync = m.d.sync
637
638 data_out = Signal(64)
639 data_fwd = Signal(64)
640 j = Signal()
641
642 # Use the bypass if are reading the row that was
643 # written 1 or 2 cycles ago, including for the
644 # slow_valid = 1 case (i.e. completing a load
645 # miss or a non-cacheable load).
646 with m.If(r1.use_forward1):
647 comb += data_fwd.eq(r1.forward_data1)
648 with m.Else():
649 comb += data_fwd.eq(r1.forward_data2)
650
651 comb += data_out.eq(cache_out[r1.hit_way])
652
653 for i in range(8):
654 with m.If(r1.forward_sel[i]):
655 dsel = data_fwd.word_select(i, 8)
656 comb += data_out.word_select(i, 8).eq(dsel)
657
658 comb += d_out.valid.eq(r1.ls_valid)
659 comb += d_out.data.eq(data_out)
660 comb += d_out.store_done.eq(~r1.stcx_fail)
661 comb += d_out.error.eq(r1.ls_error)
662 comb += d_out.cache_paradox.eq(r1.cache_paradox)
663
664 # Outputs to MMU
665 comb += m_out.done.eq(r1.mmu_done)
666 comb += m_out.err.eq(r1.mmu_error)
667 comb += m_out.data.eq(data_out)
668
669 # We have a valid load or store hit or we just completed
670 # a slow op such as a load miss, a NC load or a store
671 #
672 # Note: the load hit is delayed by one cycle. However it
673 # can still not collide with r.slow_valid (well unless I
674 # miscalculated) because slow_valid can only be set on a
675 # subsequent request and not on its first cycle (the state
676 # machine must have advanced), which makes slow_valid
677 # at least 2 cycles from the previous hit_load_valid.
678
679 # Sanity: Only one of these must be set in any given cycle
680
681 if False: # TODO: need Display to get this to work
682 assert (r1.slow_valid & r1.stcx_fail) != 1, \
683 "unexpected slow_valid collision with stcx_fail " \
684 "-!- severity FAILURE"
685
686 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
687 "unexpected hit_load_delayed collision with slow_valid " \
688 "-!- severity FAILURE"
688
689 with m.If(~r1.mmu_req):
690 # Request came from loadstore1...
691 # Load hit case is the standard path
692 with m.If(r1.hit_load_valid):
693 # report
694 # "completing load hit data=" & to_hstring(data_out);
695 #Display(f"completing load hit data={data_out}")
696 pass
697
698 # error cases complete without stalling
699 with m.If(r1.ls_error):
700 # Display("completing ld/st with error")
701 pass
702
703 # Slow ops (load miss, NC, stores)
704 with m.If(r1.slow_valid):
705 #Display(f"completing store or load miss data={data_out}")
706 pass
707
708 with m.Else():
709 # Request came from MMU
710 with m.If(r1.hit_load_valid):
711 # Display(f"completing load hit to MMU, data={m_out.data}")
712 pass
713 # error cases complete without stalling
714 with m.If(r1.mmu_error):
715 #Display("combpleting MMU ld with error")
716 pass
717
718 # Slow ops (i.e. load miss)
719 with m.If(r1.slow_valid):
720 #Display("completing MMU load miss, data={m_out.data}")
721 pass
722
723 # Generate a cache RAM for each way. This handles the normal
724 # reads, writes from reloads and the special store-hit update
725 # path as well.
726 #
727 # Note: the BRAMs have an extra read buffer, meaning the output
728 # is pipelined an extra cycle. This differs from the
729 # icache. The writeback logic needs to take that into
730 # account by using 1-cycle delayed signals for load hits.
731 def rams(self, m):
732 comb = m.d.comb
733 sync = m.d.sync
734
735 for i in range(NUM_WAYS):
733 # signal do_read : std_ulogic;
734 # signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
735 # signal do_write : std_ulogic;
736 # signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
737 # signal wr_data :
738 # std_ulogic_vector(wishbone_data_bits-1 downto 0);
739 # signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
740 # signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
741 # signal dout : cache_row_t;
742 do_read = Signal()
743 rd_addr = Signal(ROW_BITS)
744 do_write = Signal()
745 wr_addr = Signal(ROW_BITS)
746 wr_data = Signal(WB_DATA_BITS)
747 wr_sel = Signal(ROW_SIZE)
748 wr_sel_m = Signal(ROW_SIZE)
749 _d_out = Signal(WB_DATA_BITS)
750
751 # begin
752 # way: entity work.cache_ram
753 # generic map (
754 # ROW_BITS => ROW_BITS,
755 # WIDTH => wishbone_data_bits,
756 # ADD_BUF => true
757 # )
758 # port map (
759 # clk => clk,
760 # rd_en => do_read,
761 # rd_addr => rd_addr,
762 # rd_data => dout,
763 # wr_sel => wr_sel_m,
764 # wr_addr => wr_addr,
765 # wr_data => wr_data
766 # );
767 # process(all)
768 way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
769 setattr(m.submodules, "cacheram_%d" % i, way)
770 comb += way.rd_en.eq(do_read)
771 comb += way.rd_addr.eq(rd_addr)
772 comb += _d_out.eq(way.rd_data) # rd_data is the RAM's output
773 comb += way.wr_sel.eq(wr_sel_m)
774 comb += way.wr_addr.eq(wr_addr)
775 comb += way.wr_data.eq(wr_data)
775
776 # begin
777 # -- Cache hit reads
778 # do_read <= '1';
779 # rd_addr <=
780 # std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS));
781 # cache_out(i) <= dout;
782 # Cache hit reads
783 comb += do_read.eq(1)
784 comb += rd_addr.eq(early_req_row)
785 comb += cache_out[i].eq(_d_out)
786
787 # -- Write mux:
788 # --
789 # -- Defaults to wishbone read responses (cache refill)
790 # --
791 # -- For timing, the mux on wr_data/sel/addr is not
792 # -- dependent on anything other than the current state.
793 # Write mux:
794 #
795 # Defaults to wishbone read responses (cache refill)
796 #
797 # For timing, the mux on wr_data/sel/addr is not
798 # dependent on anything other than the current state.
799 # wr_sel_m <= (others => '0');
800 comb += wr_sel_m.eq(0)
801
802 # do_write <= '0';
803 comb += do_write.eq(0)
804 # if r1.write_bram = '1' then
805 with m.If(r1.write_bram):
806 # -- Write store data to BRAM. This happens one
807 # -- cycle after the store is in r0.
808 # Write store data to BRAM. This happens one
809 # cycle after the store is in r0.
810 # wr_data <= r1.req.data;
811 # wr_sel <= r1.req.byte_sel;
812 # wr_addr <= std_ulogic_vector(to_unsigned(
813 # get_row(r1.req.real_addr), ROW_BITS
814 # ));
815 comb += wr_data.eq(r1.req.data)
816 comb += wr_sel.eq(r1.req.byte_sel)
817 comb += wr_addr.eq(get_row(r1.req.real_addr))
818
819 # if i = r1.req.hit_way then
820 with m.If(i == r1.req.hit_way):
821 # do_write <= '1';
822 comb += do_write.eq(1)
823 # end if;
824 # else
825 with m.Else():
826 # -- Otherwise, we might be doing a reload or a DCBZ
827 # if r1.dcbz = '1' then
828 # Otherwise, we might be doing a reload or a DCBZ
829 with m.If(r1.dcbz):
830 # wr_data <= (others => '0');
831 comb += wr_data.eq(0)
832 # else
833 with m.Else():
834 # wr_data <= wishbone_in.dat;
835 comb += wr_data.eq(wb_in.dat)
836 # end if;
837
838 # wr_addr <= std_ulogic_vector(to_unsigned(
839 # r1.store_row, ROW_BITS
840 # ));
841 # wr_sel <= (others => '1');
842 comb += wr_addr.eq(r1.store_row)
843 comb += wr_sel.eq(-1) # (others => '1')
844
845 # if r1.state = RELOAD_WAIT_ACK and
846 # wishbone_in.ack = '1' and replace_way = i then
847 with m.If((r1.state == State.RELOAD_WAIT_ACK)
848 & wb_in.ack & (replace_way == i)):
849 # do_write <= '1';
850 comb += do_write.eq(1)
851 # end if;
852 # end if;
853
854 # -- Mask write selects with do_write since BRAM
855 # -- doesn't have a global write-enable
856 # if do_write = '1' then
857 # -- Mask write selects with do_write since BRAM
858 # -- doesn't have a global write-enable
859 with m.If(do_write):
860 # wr_sel_m <= wr_sel;
861 comb += wr_sel_m.eq(wr_sel)
862 # end if;
863 # end process;
864 # end generate;
865
866 # Cache hit synchronous machine for the easy case.
867 # This handles load hits.
868 # It also handles error cases (TLB miss, cache paradox)
869 def dcache_fast_hit(self, m, req_op, r0_valid, r1):
870
871 comb = m.d.comb
872 sync = m.d.sync
873
874 # begin
875 # if rising_edge(clk) then
876 # if req_op /= OP_NONE then
877 with m.If(req_op != Op.OP_NONE):
878 # report "op:" & op_t'image(req_op) &
879 # " addr:" & to_hstring(r0.req.addr) &
880 # " nc:" & std_ulogic'image(r0.req.nc) &
881 # " idx:" & integer'image(req_index) &
882 # " tag:" & to_hstring(req_tag) &
883 # " way: " & integer'image(req_hit_way);
884 print(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
885 f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
886 )
887 # end if;
888 # if r0_valid = '1' then
889 with m.If(r0_valid):
890 # r1.mmu_req <= r0.mmu_req;
891 sync += r1.mmu_req.eq(r0.mmu_req)
892 # end if;
893
894 # -- Fast path for load/store hits.
895 # -- Set signals for the writeback controls.
896 # r1.hit_way <= req_hit_way;
897 # r1.hit_index <= req_index;
898 # Fast path for load/store hits.
899 # Set signals for the writeback controls.
900 sync += r1.hit_way.eq(req_hit_way)
901 sync += r1.hit_index.eq(req_index)
902
903 # if req_op = OP_LOAD_HIT then
904 with m.If(req_op == Op.OP_LOAD_HIT):
905 # r1.hit_load_valid <= '1';
906 sync += r1.hit_load_valid.eq(1)
907
908 # else
909 with m.Else():
910 # r1.hit_load_valid <= '0';
911 sync += r1.hit_load_valid.eq(0)
912 # end if;
913
914 # if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then
915 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
916 # r1.cache_hit <= '1';
917 sync += r1.cache_hit.eq(1)
918 # else
919 with m.Else():
920 # r1.cache_hit <= '0';
921 sync += r1.cache_hit.eq(0)
922 # end if;
923
924 # if req_op = OP_BAD then
925 with m.If(req_op == Op.OP_BAD):
926 # report "Signalling ld/st error valid_ra=" &
927 # std_ulogic'image(valid_ra) & " rc_ok=" &
928 # std_ulogic'image(rc_ok) & " perm_ok=" &
929 # std_ulogic'image(perm_ok);
930 print(f"Signalling ld/st error valid_ra={valid_ra}"
931 f"rc_ok={rc_ok} perm_ok={perm_ok}"
932
933 # r1.ls_error <= not r0.mmu_req;
934 # r1.mmu_error <= r0.mmu_req;
935 # r1.cache_paradox <= access_ok;
936 sync += r1.ls_error.eq(~r0.mmu_req)
937 sync += r1.mmu_error.eq(r0.mmu_req)
938 sync += r1.cache_paradox.eq(access_ok)
939
940 # else
941 with m.Else():
942 # r1.ls_error <= '0';
943 # r1.mmu_error <= '0';
944 # r1.cache_paradox <= '0';
945 sync += r1.ls_error.eq(0)
946 sync += r1.mmu_error.eq(0)
947 sync += r1.cache_paradox.eq(0)
948 # end if;
949 #
950 # if req_op = OP_STCX_FAIL then
951 with m.If(req_op == Op.OP_STCX_FAIL):
952 # r1.stcx_fail <= '1';
953 sync += r1.stcx_fail.eq(1)
954
955 # else
956 with m.Else():
957 # r1.stcx_fail <= '0';
958 sync += r1.stcx_fail.eq(0)
959 # end if;
960 #
961 # -- Record TLB hit information for updating TLB PLRU
962 # r1.tlb_hit <= tlb_hit;
963 # r1.tlb_hit_way <= tlb_hit_way;
964 # r1.tlb_hit_index <= tlb_req_index;
965 # Record TLB hit information for updating TLB PLRU
966 sync += r1.tlb_hit.eq(tlb_hit)
967 sync += r1.tlb_hit_way.eq(tlb_hit_way)
968 sync += r1.tlb_hit_index.eq(tlb_req_index)
969 # end if;
970 # end process;
971
972 # Memory accesses are handled by this state machine:
973 #
974 # * Cache load miss/reload (in conjunction with "rams")
975 # * Load hits for non-cachable forms
976 # * Stores (the collision case is handled in "rams")
977 #
978 # All wishbone requests generation is done here.
979 # This machine operates at stage 1.
980 def dcache_slow(self, m, r1, use_forward1_next, cache_valid_bits, r0,
981 r0_valid, req_op, cache_tag, req_go, ra, wb_in):
982
983 comb = m.d.comb
984 sync = m.d.sync
985
986 # variable stbs_done : boolean;
987 # variable req : mem_access_request_t;
988 # variable acks : unsigned(2 downto 0);
989 stbs_done = Signal()
990 req = MemAccessRequest()
991 acks = Signal(3)
992
996
997 # begin
998 # if rising_edge(clk) then
999 # r1.use_forward1 <= use_forward1_next;
1000 # r1.forward_sel <= (others => '0');
1001 sync += r1.use_forward1.eq(use_forward1_next)
1002 sync += r1.forward_sel.eq(0)
1003
1004 # if use_forward1_next = '1' then
1005 with m.If(use_forward1_next):
1006 # r1.forward_sel <= r1.req.byte_sel;
1007 sync += r1.forward_sel.eq(r1.req.byte_sel)
1008
1009 # elsif use_forward2_next = '1' then
1010 with m.Elif(use_forward2_next):
1011 # r1.forward_sel <= r1.forward_sel1;
1012 sync += r1.forward_sel.eq(r1.forward_sel1)
1013 # end if;
1014
1015 # r1.forward_data2 <= r1.forward_data1;
1016 sync += r1.forward_data2.eq(r1.forward_data1)
1017
1018 # if r1.write_bram = '1' then
1019 with m.If(r1.write_bram):
1020 # r1.forward_data1 <= r1.req.data;
1021 # r1.forward_sel1 <= r1.req.byte_sel;
1022 # r1.forward_way1 <= r1.req.hit_way;
1023 # r1.forward_row1 <= get_row(r1.req.real_addr);
1024 # r1.forward_valid1 <= '1';
1025 sync += r1.forward_data1.eq(r1.req.data)
1026 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1027 sync += r1.forward_way1.eq(r1.req.hit_way)
1028 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1029 sync += r1.forward_valid1.eq(1)
1030 # else
1031 with m.Else():
1032
1033 # if r1.dcbz = '1' then
1034 with m.If(r1.dcbz):
1035 # r1.forward_data1 <= (others => '0');
1036 sync += r1.forward_data1.eq(0)
1037
1038 # else
1039 with m.Else():
1040 # r1.forward_data1 <= wishbone_in.dat;
1041 sync += r1.forward_data1.eq(wb_in.dat)
1042 # end if;
1043
1044 # r1.forward_sel1 <= (others => '1');
1045 # r1.forward_way1 <= replace_way;
1046 # r1.forward_row1 <= r1.store_row;
1047 # r1.forward_valid1 <= '0';
1048 sync += r1.forward_sel1.eq(-1) # (others => '1')
1049 sync += r1.forward_way1.eq(replace_way)
1050 sync += r1.forward_row1.eq(r1.store_row)
1051 sync += r1.forward_valid1.eq(0)
1052 # end if;
1053
1054 # -- On reset, clear all valid bits to force misses
1055 # if rst = '1' then
1056 # On reset, clear all valid bits to force misses.
1057 # nmigen's sync domain has an implicit reset; ResetSignal() is
1058 # used here to mirror the explicit VHDL reset branch.
1059 with m.If(ResetSignal()):
1059 # for i in index_t loop
1060 for i in range(NUM_LINES):
1061 # cache_valids(i) <= (others => '0');
1062 sync += cache_valid_bits[i].eq(0)
1063 # end loop;
1064
1065 # r1.state <= IDLE;
1066 # r1.full <= '0';
1067 # r1.slow_valid <= '0';
1068 # r1.wb.cyc <= '0';
1069 # r1.wb.stb <= '0';
1070 # r1.ls_valid <= '0';
1071 # r1.mmu_done <= '0';
1072 sync += r1.state.eq(State.IDLE)
1073 sync += r1.full.eq(0)
1074 sync += r1.slow_valid.eq(0)
1075 sync += r1.wb.cyc.eq(0)
1076 sync += r1.wb.stb.eq(0)
1077 sync += r1.ls_valid.eq(0)
1078 sync += r1.mmu_done.eq(0)
1079
1080 # -- Not useful normally but helps avoiding
1081 # -- tons of sim warnings
1082 # Not useful normally but helps avoiding
1083 # tons of sim warnings
1084 # r1.wb.adr <= (others => '0');
1085 sync += r1.wb.adr.eq(0)
1086 # else
1087 with m.Else():
1088 # -- One cycle pulses reset
1089 # r1.slow_valid <= '0';
1090 # r1.write_bram <= '0';
1091 # r1.inc_acks <= '0';
1092 # r1.dec_acks <= '0';
1093 #
1094 # r1.ls_valid <= '0';
1095 # -- complete tlbies and TLB loads in the third cycle
1096 # r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld);
1097 # One cycle pulses reset
1098 sync += r1.slow_valid.eq(0)
1099 sync += r1.write_bram.eq(0)
1100 sync += r1.inc_acks.eq(0)
1101 sync += r1.dec_acks.eq(0)
1102
1103 sync += r1.ls_valid.eq(0)
1104 # complete tlbies and TLB loads in the third cycle
1105 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1106
1107 # if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then
1108 with m.If((req_op == Op.OP_LOAD_HIT)
1109 | (req_op == Op.OP_STCX_FAIL)):
1110 # if r0.mmu_req = '0' then
1111 with m.If(~r0.mmu_req):
1112 # r1.ls_valid <= '1';
1113 sync += r1.ls_valid.eq(1)
1114 # else
1115 with m.Else():
1116 # r1.mmu_done <= '1';
1117 sync += r1.mmu_done.eq(1)
1118 # end if;
1119 # end if;
1120
1121 # if r1.write_tag = '1' then
1122 with m.If(r1.write_tag):
1123 # -- Store new tag in selected way
1124 # for i in 0 to NUM_WAYS-1 loop
1125 # Store new tag in selected way
1126 for i in range(NUM_WAYS):
1127 # if i = replace_way then
1128 with m.If(i == replace_way):
1129 # cache_tags(r1.store_index)(
1130 # (i + 1) * TAG_WIDTH - 1
1131 # downto i * TAG_WIDTH
1132 # ) <=
1133 # (TAG_WIDTH - 1 downto TAG_BITS => '0')
1134 # & r1.reload_tag;
1135 # the assignment zero-extends r1.reload_tag to
1136 # TAG_WIDTH, matching the VHDL's explicit padding
1137 sync += cache_tag[r1.store_index][
1138 i * TAG_WIDTH:(i + 1) * TAG_WIDTH
1139 ].eq(r1.reload_tag)
1141 # end if;
1142 # end loop;
1143 # r1.store_way <= replace_way;
1144 # r1.write_tag <= '0';
1145 sync += r1.store_way.eq(replace_way)
1146 sync += r1.write_tag.eq(0)
1147 # end if;
1148
1149 # -- Take request from r1.req if there is one there,
1150 # -- else from req_op, ra, etc.
1151 # if r1.full = '1' then
1152 # Take request from r1.req if there is one there,
1153 # else from req_op, ra, etc.
1154 with m.If(r1.full):
1155 # req := r1.req;
1156 comb += req.eq(r1.req) # "req" is a VHDL variable: drive it combinatorially
1157
1158 # else
1159 with m.Else():
1160 # req.op := req_op;
1161 # req.valid := req_go;
1162 # req.mmu_req := r0.mmu_req;
1163 # req.dcbz := r0.req.dcbz;
1164 # req.real_addr := ra;
1165 comb += req.op.eq(req_op)
1166 comb += req.valid.eq(req_go)
1167 comb += req.mmu_req.eq(r0.mmu_req)
1168 comb += req.dcbz.eq(r0.req.dcbz)
1169 comb += req.real_addr.eq(ra)
1170
1171 # -- Force data to 0 for dcbz
1172 # if r0.req.dcbz = '0' then
1173 with m.If(~r0.req.dcbz):
1174 # req.data := r0.req.data;
1175 comb += req.data.eq(r0.req.data)
1176
1177 # else
1178 with m.Else():
1179 # req.data := (others => '0');
1180 comb += req.data.eq(0)
1181 # end if;
1182
1183 # -- Select all bytes for dcbz
1184 # -- and for cacheable loads
1185 # if r0.req.dcbz = '1'
1186 # or (r0.req.load = '1' and r0.req.nc = '0') then
1187 # Select all bytes for dcbz
1188 # and for cacheable loads
1189 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1190 # req.byte_sel := (others => '1');
1191 comb += req.byte_sel.eq(-1) # all ones
1192
1193 # else
1194 with m.Else():
1195 # req.byte_sel := r0.req.byte_sel;
1196 comb += req.byte_sel.eq(r0.req.byte_sel)
1197 # end if;
1198
1199 # req.hit_way := req_hit_way;
1200 # req.same_tag := req_same_tag;
1201 comb += req.hit_way.eq(req_hit_way)
1202 comb += req.same_tag.eq(req_same_tag)
1203
1204 # -- Store the incoming request from r0,
1205 # -- if it is a slow request
1206 # -- Note that r1.full = 1 implies req_op = OP_NONE
1207 # if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC
1208 # or req_op = OP_STORE_MISS
1209 # or req_op = OP_STORE_HIT then
1210 # Store the incoming request from r0,
1211 # if it is a slow request
1212 # Note that r1.full = 1 implies req_op = OP_NONE
1213 with m.If((req_op == Op.OP_LOAD_MISS)
1214 | (req_op == Op.OP_LOAD_NC)
1215 | (req_op == Op.OP_STORE_MISS)
1216 | (req_op == Op.OP_STORE_HIT)):
1217 # r1.req <= req;
1218 # r1.full <= '1';
1219 sync += r1.req.eq(req)
1220 sync += r1.full.eq(1)
1221 # end if;
1222 # end if;
1223 #
1224 # -- Main state machine
1225 # case r1.state is
1226 # Main state machine
1227 with m.Switch(r1.state):
1228
1229 # when IDLE =>
1230 with m.Case(State.IDLE):
1231 # r1.wb.adr <= req.real_addr(
1232 # r1.wb.adr'left downto 0
1233 # );
1234 # r1.wb.sel <= req.byte_sel;
1235 # r1.wb.dat <= req.data;
1236 # r1.dcbz <= req.dcbz;
1237 #
1238 # -- Keep track of our index and way
1239 # -- for subsequent stores.
1240 # r1.store_index <= get_index(req.real_addr);
1241 # r1.store_row <= get_row(req.real_addr);
1242 # r1.end_row_ix <=
1243 # get_row_of_line(get_row(req.real_addr)) - 1;
1244 # r1.reload_tag <= get_tag(req.real_addr);
1245 # r1.req.same_tag <= '1';
1246 sync += r1.wb.adr.eq(req.real_addr[0:len(r1.wb.adr)])
1247 sync += r1.wb.sel.eq(req.byte_sel)
1248 sync += r1.wb.dat.eq(req.data)
1249 sync += r1.dcbz.eq(req.dcbz)
1250
1251 # Keep track of our index and way
1252 # for subsequent stores.
1253 sync += r1.store_index.eq(get_index(req.real_addr))
1254 sync += r1.store_row.eq(get_row(req.real_addr))
1255 sync += r1.end_row_ix.eq(
1256 get_row_of_line(get_row(req.real_addr)) - 1
1257 )
1258 sync += r1.reload_tag.eq(get_tag(req.real_addr))
1259 sync += r1.req.same_tag.eq(1)
1260
1261 # if req.op = OP_STORE_HIT then
1262 with m.If(req.op == Op.OP_STORE_HIT):
1263 # r1.store_way <= req.hit_way;
1264 sync += r1.store_way.eq(req.hit_way)
1265 # end if;
1266
1267 # -- Reset per-row valid bits,
1268 # -- ready for handling OP_LOAD_MISS
1269 # for i in 0 to ROW_PER_LINE - 1 loop
1270 # Reset per-row valid bits,
1271 # ready for handling OP_LOAD_MISS
1272 for i in range(ROW_PER_LINE):
1273 # r1.rows_valid(i) <= '0';
1274 sync += r1.rows_valid[i].eq(0)
1275 # end loop;
1276
1277 # case req.op is
1278 with m.Switch(req.op):
1279 # when OP_LOAD_HIT =>
1280 with m.Case(Op.OP_LOAD_HIT):
1281 # -- stay in IDLE state
1282 # stay in IDLE state
1283 pass
1284
1285 # when OP_LOAD_MISS =>
1286 with m.Case(Op.OP_LOAD_MISS):
1287 # -- Normal load cache miss,
1288 # -- start the reload machine
1289 # report "cache miss real addr:" &
1290 # to_hstring(req.real_addr) & " idx:" &
1291 # integer'image(get_index(req.real_addr)) &
1292 # " tag:" & to_hstring(get_tag(req.real_addr));
1293 # Normal load cache miss,
1294 # start the reload machine
1295 print(f"cache miss real addr:" \
1296 f"{req_real_addr}" \
1297 f" idx:{get_index(req_real_addr)}" \
1298 f" tag:{get_tag(req.real_addr)}")
1299
1300 # -- Start the wishbone cycle
1301 # r1.wb.we <= '0';
1302 # r1.wb.cyc <= '1';
1303 # r1.wb.stb <= '1';
1304 # Start the wishbone cycle
1305 sync += r1.wb.we.eq(0)
1306 sync += r1.wb.cyc.eq(1)
1307 sync += r1.wb.stb.eq(1)
1308
1309 # -- Track that we had one request sent
1310 # r1.state <= RELOAD_WAIT_ACK;
1311 # r1.write_tag <= '1';
1312 # Track that we had one request sent
1313 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1314 sync += r1.write_tag.eq(1)
1315
1316 # when OP_LOAD_NC =>
1317 with m.Case(Op.OP_LOAD_NC):
1318 # r1.wb.cyc <= '1';
1319 # r1.wb.stb <= '1';
1320 # r1.wb.we <= '0';
1321 # r1.state <= NC_LOAD_WAIT_ACK;
1322 sync += r1.wb.cyc.eq(1)
1323 sync += r1.wb.stb.eq(1)
1324 sync += r1.wb.we.eq(0)
1325 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1326
1327 # when OP_STORE_HIT | OP_STORE_MISS =>
1328 with m.Case(Op.OP_STORE_HIT,
1329 Op.OP_STORE_MISS):
1330 # if req.dcbz = '0' then
1331 with m.If(~req.dcbz):
1332 # r1.state <= STORE_WAIT_ACK;
1333 # r1.acks_pending <= to_unsigned(1, 3);
1334 # r1.full <= '0';
1335 # r1.slow_valid <= '1';
1336 sync += r1.state.eq(
1337 State.STORE_WAIT_ACK
1338 )
1339 sync += r1.acks_pending.eq(1) # to_unsigned(1, 3)
1342 sync += r1.full.eq(0)
1343 sync += r1.slow_valid.eq(1)
1344
1345 # if req.mmu_req = '0' then
1346 with m.If(~req.mmu_req):
1347 # r1.ls_valid <= '1';
1348 sync += r1.ls_valid.eq(1)
1349 # else
1350 with m.Else():
1351 # r1.mmu_done <= '1';
1352 sync += r1.mmu_done.eq(1)
1353 # end if;
1354
1355 # if req.op = OP_STORE_HIT then
1356 with m.If(req.op == Op.OP_STORE_HIT):
1357 # r1.write_bram <= '1';
1358 sync += r1.write_bram.eq(1)
1359 # end if;
1360
1361 # else
1362 with m.Else():
1363 # -- dcbz is handled much like a load
1364 # -- miss except that we are writing
1365 # -- to memory instead of reading
1366 # r1.state <= RELOAD_WAIT_ACK;
1367 # dcbz is handled much like a load
1368 # miss except that we are writing
1369 # to memory instead of reading
1370 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1371
1372 # if req.op = OP_STORE_MISS then
1373 with m.If(req.op == Op.OP_STORE_MISS):
1374 # r1.write_tag <= '1';
1375 sync += r1.write_tag.eq(1)
1376 # end if;
1377 # end if;
1378
1379 # r1.wb.we <= '1';
1380 # r1.wb.cyc <= '1';
1381 # r1.wb.stb <= '1';
1382 sync += r1.wb.we.eq(1)
1383 sync += r1.wb.cyc.eq(1)
1384 sync += r1.wb.stb.eq(1)
1385
1386 # -- OP_NONE and OP_BAD do nothing
1387 # -- OP_BAD & OP_STCX_FAIL were handled above already
1388 # when OP_NONE =>
1389 # when OP_BAD =>
1390 # when OP_STCX_FAIL =>
1391 # OP_NONE and OP_BAD do nothing
1392 # OP_BAD & OP_STCX_FAIL were
1393 # handled above already
1394 with m.Case(Op.OP_NONE):
1395 pass
1396
1397 with m.Case(Op.OP_BAD):
1398 pass
1399
1400 with m.Case(Op.OP_STCX_FAIL):
1401 pass
1402 # end case;
1403
1404 # when RELOAD_WAIT_ACK =>
1405 with m.Case(State.RELOAD_WAIT_ACK):
1406 # -- Requests are all sent if stb is 0
1407 # stbs_done := r1.wb.stb = '0';
1408 # Requests are all sent if stb is 0
1409 comb += stbs_done.eq(~r1.wb.stb)
1410
1411 # -- If we are still sending requests,
1412 # -- was one accepted?
1413 # if wishbone_in.stall = '0' and not stbs_done then
1414 # If we are still sending requests,
1415 # was one accepted?
1416 with m.If(~wb_in.stall & ~stbs_done):
1417 # -- That was the last word ? We are done sending.
1418 # -- Clear stb and set stbs_done so we can handle
1419 # -- an eventual last ack on the same cycle.
1420 # if is_last_row_addr(
1421 # r1.wb.adr, r1.end_row_ix
1422 # ) then
1423 # That was the last word?
1424 # We are done sending.
1425 # Clear stb and set stbs_done
1426 # so we can handle an eventual
1427 # last ack on the same cycle.
1428 with m.If(is_last_row_addr(
1429 r1.wb.adr, r1.end_row_ix)):
1430 # r1.wb.stb <= '0';
1431 # stbs_done := true;
1432 sync += r1.wb.stb.eq(0)
1433 comb += stbs_done.eq(1) # stbs_done := true
1434 # end if;
1435
1436 # -- Calculate the next row address
1437 # r1.wb.adr <= next_row_addr(r1.wb.adr);
1438 # Calculate the next row address
1439 sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))
1440 # end if;
1441
1442 # -- Incoming acks processing
1443 # r1.forward_valid1 <= wishbone_in.ack;
1444 # Incoming acks processing
1445 sync += r1.forward_valid1.eq(wb_in.ack)
1446
1447 # if wishbone_in.ack = '1' then
1448 with m.If(wb_in.ack):
1449 # r1.rows_valid(
1450 # r1.store_row mod ROW_PER_LINE
1451 # ) <= '1';
1452 sync += r1.rows_valid[
1453 r1.store_row % ROW_PER_LINE
1454 ].eq(1)
1455
1456 # -- If this is the data we were looking for,
1457 # -- we can complete the request next cycle.
1458 # -- Compare the whole address in case the
1459 # -- request in r1.req is not the one that
1460 # -- started this refill.
1461 # if r1.full = '1' and r1.req.same_tag = '1'
1462 # and ((r1.dcbz = '1' and r1.req.dcbz = '1')
1463 # or (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS))
1464 # and r1.store_row = get_row(r1.req.real_addr) then
1465 # If this is the data we were looking for,
1466 # we can complete the request next cycle.
1467 # Compare the whole address in case the
1468 # request in r1.req is not the one that
1469 # started this refill.
1470 with m.If(r1.full & r1.req.same_tag &
1471 ((r1.dcbz & r1.req.dcbz) |
1472 (~r1.dcbz &
1473 (r1.req.op == Op.OP_LOAD_MISS))) &
1474 (r1.store_row ==
1475 get_row(r1.req.real_addr))):
1477 # r1.full <= '0';
1478 # r1.slow_valid <= '1';
1479 sync += r1.full.eq(0)
1480 sync += r1.slow_valid.eq(1)
1481
1482 # if r1.mmu_req = '0' then
1483 with m.If(~r1.mmu_req):
1484 # r1.ls_valid <= '1';
1485 sync += r1.ls_valid.eq(1)
1486 # else
1487 with m.Else():
1488 # r1.mmu_done <= '1';
1489 sync += r1.mmu_done.eq(1)
1490 # end if;
1491 # r1.forward_sel <= (others => '1');
1492 # r1.use_forward1 <= '1';
1493 sync += r1.forward_sel.eq(-1) # (others => '1')
1494 sync += r1.use_forward1.eq(1)
1495 # end if;
1496
1497 # -- Check for completion
1498 # if stbs_done and is_last_row(r1.store_row,
1499 # r1.end_row_ix) then
1500 # Check for completion
1501 with m.If(stbs_done &
1502 is_last_row(r1.store_row,
1503 r1.end_row_ix)):
1504
1505 # -- Complete wishbone cycle
1506 # r1.wb.cyc <= '0';
1507 # Complete wishbone cycle
1508 sync += r1.wb.cyc.eq(0)
1509
1510 # -- Cache line is now valid
1511 # cache_valids(r1.store_index)(
1512 # r1.store_way
1513 # ) <= '1';
1514 # Cache line is now valid
1515 sync += cache_valid_bits[
1516 r1.store_index
1517 ][r1.store_way].eq(1)
1518
1519 # r1.state <= IDLE;
1520 sync += r1.state.eq(State.IDLE)
1521 # end if;
1522
1523 # -- Increment store row counter
1524 # r1.store_row <= next_row(r1.store_row);
1525 # Increment store row counter
1526 sync += r1.store_row.eq(next_row(
1527 r1.store_row
1528 ))
1529 # end if;
1530
1531 # when STORE_WAIT_ACK =>
1532 with m.Case(State.STORE_WAIT_ACK):
1533 # stbs_done := r1.wb.stb = '0';
1534 # acks := r1.acks_pending;
1535 comb += stbs_done.eq(~r1.wb.stb)
1536 comb += acks.eq(r1.acks_pending)
1537
1538 # if r1.inc_acks /= r1.dec_acks then
1539 with m.If(r1.inc_acks != r1.dec_acks):
1540
1541 # if r1.inc_acks = '1' then
1542 with m.If(r1.inc_acks):
1543 # acks := acks + 1;
1544 comb += acks.eq(r1.acks_pending + 1)
1545
1546 # else
1547 with m.Else():
1548 # acks := acks - 1;
1549 comb += acks.eq(r1.acks_pending - 1)
1550 # end if;
1551 # end if;
1552
1553 # r1.acks_pending <= acks;
1554 sync += r1.acks_pending.eq(acks)
1555
1556 # -- Clear stb when slave accepted request
1557 # if wishbone_in.stall = '0' then
1558 # Clear stb when slave accepted request
1559 with m.If(~wb_in.stall):
1560 # -- See if there is another store waiting
1561 # -- to be done which is in the same real page.
1562 # if req.valid = '1' then
1563 # See if there is another store waiting
1564 # to be done which is in the same real page.
1565 with m.If(req.valid):
1566 # r1.wb.adr(
1567 # SET_SIZE_BITS - 1 downto 0
1568 # ) <= req.real_addr(
1569 # SET_SIZE_BITS - 1 downto 0
1570 # );
1571 # r1.wb.dat <= req.data;
1572 # r1.wb.sel <= req.byte_sel;
1573 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(
1574 req.real_addr[0:SET_SIZE_BITS]
1575 )
1576 sync += r1.wb.dat.eq(req.data)
1577 sync += r1.wb.sel.eq(req.byte_sel)
1576 # end if;
1577
1578 # if acks < 7 and req.same_tag = '1'
1579 # and (req.op = OP_STORE_MISS
1580 # or req.op = OP_STORE_HIT) then
1581 with m.If((acks < 7) & req.same_tag &
1582 ((req.op == Op.OP_STORE_MISS)
1583 | (req.op == Op.OP_STORE_HIT))):
1584 # r1.wb.stb <= '1';
1585 # stbs_done := false;
1586 sync += r1.wb.stb.eq(1)
1587 sync += stbs_done.eq(0)
1588
1589 # if req.op = OP_STORE_HIT then
1590 with m.If(req.op == Op.OP_STORE_HIT):
1591 # r1.write_bram <= '1';
1592 sync += r1.write_bram.eq(1)
1593 # end if;
1594 # r1.full <= '0';
1595 # r1.slow_valid <= '1';
1596 sync += r1.full.eq(0)
1597 sync += r1.slow_valid.eq(1)
1598
1599 # -- Store requests never come from the MMU
1600 # r1.ls_valid <= '1';
1601 # stbs_done := false;
1602 # r1.inc_acks <= '1';
1603 # Store requests never come from the MMU
1604 sync += r1.ls_valid.eq(1)
1605 sync += stbs_done.eq(0)
1606 sync += r1.inc_acks.eq(1)
1607 # else
1608 with m.Else():
1609 # r1.wb.stb <= '0';
1610 # stbs_done := true;
1611 sync += r1.wb.stb.eq(0)
1612 sync += stbs_done.eq(1)
1613 # end if;
1614 # end if;
1615
1616 # -- Got ack ? See if complete.
1617 # if wishbone_in.ack = '1' then
1618 # Got ack ? See if complete.
1619 with m.If(wb_in.ack):
1620 # if stbs_done and acks = 1 then
1621 with m.If(stbs_done & acks)
1622 # r1.state <= IDLE;
1623 # r1.wb.cyc <= '0';
1624 # r1.wb.stb <= '0';
1625 sync += r1.state.eq(State.IDLE)
1626 sync += r1.wb.cyc.eq(0)
1627 sync += r1.wb.stb.eq(0)
1628 # end if;
1629 # r1.dec_acks <= '1';
1630 sync += r1.dec_acks.eq(1)
1631 # end if;
1632
1633 # when NC_LOAD_WAIT_ACK =>
1634 with m.Case(State.NC_LOAD_WAIT_ACK):
1635 # -- Clear stb when slave accepted request
1636 # if wishbone_in.stall = '0' then
1637 # Clear stb when slave accepted request
1638 with m.If(~wb_in.stall):
1639 # r1.wb.stb <= '0';
1640 sync += r1.wb.stb.eq(0)
1641 # end if;
1642
1643 # -- Got ack ? complete.
1644 # if wishbone_in.ack = '1' then
1645 # Got ack ? complete.
1646 with m.If(wb_in.ack):
1647 # r1.state <= IDLE;
1648 # r1.full <= '0';
1649 # r1.slow_valid <= '1';
1650 sync += r1.state.eq(State.IDLE)
1651 sync += r1.full.eq(0)
1652 sync += r1.slow_valid.eq(1)
1653
1654 # if r1.mmu_req = '0' then
1655 with m.If(~r1.mmu_req):
1656 # r1.ls_valid <= '1';
1657 sync += r1.ls_valid.eq(1)
1658
1659 # else
1660 with m.Else():
1661 # r1.mmu_done <= '1';
1662 sync += r1.mmu_done.eq(1)
1663 # end if;
1664
1665 # r1.forward_sel <= (others => '1');
1666 # r1.use_forward1 <= '1';
1667 # r1.wb.cyc <= '0';
1668 # r1.wb.stb <= '0';
1669 sync += r1.forward_sel.eq(1)
1670 sync += r1.use_forward1.eq(1)
1671 sync += r1.wb.cyc.eq(0)
1672 sync += r1.wb.stb.eq(0)
1673 # end if;
1674 # end case;
1675 # end if;
1676 # end if;
1677 # end process;
1678
1679 # dc_log: if LOG_LENGTH > 0 generate
1680 # TODO learn how to translate vhdl generate into nmigen
1681 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out,
1682 d_out, wb_in, log_out):
1683
1684 comb = m.d.comb
1685 sync = m.d.sync
1686
1687 # signal log_data : std_ulogic_vector(19 downto 0);
1688 log_data = Signal(20)
1691
1692 # begin
1693 # dcache_log: process(clk)
1694 # begin
1695 # if rising_edge(clk) then
1696 # log_data <= r1.wb.adr(5 downto 3) &
1697 # wishbone_in.stall &
1698 # wishbone_in.ack &
1699 # r1.wb.stb & r1.wb.cyc &
1700 # d_out.error &
1701 # d_out.valid &
1702 # std_ulogic_vector(
1703 # to_unsigned(op_t'pos(req_op), 3)) &
1704 # stall_out &
1705 # std_ulogic_vector(
1706 # to_unsigned(tlb_hit_way, 3)) &
1707 # valid_ra &
1708 # std_ulogic_vector(
1709 # to_unsigned(state_t'pos(r1.state), 3));
1710 # Cat() is LSB-first, so the VHDL concatenation is listed in
1711 # reverse; field widths follow the signals and may need padding
1712 # to match the 3-bit fields of the VHDL original
1713 sync += log_data.eq(Cat(
1714 r1.state, valid_ra, tlb_hit_way,
1715 stall_out, req_op, d_out.valid, d_out.error,
1716 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1717 r1.wb.adr[3:6]
1718 ))
1716 # end if;
1717 # end process;
1718 # log_out <= log_data;
1719 # log_out <= log_data (a plain combinatorial copy of the register)
1720 comb += log_out.eq(log_data)
1721 # end generate;
1722 # end;
1723
1724 def elaborate(self, platform):
1725 LINE_SIZE = self.LINE_SIZE
1726 NUM_LINES = self.NUM_LINES
1727 NUM_WAYS = self.NUM_WAYS
1728 TLB_SET_SIZE = self.TLB_SET_SIZE
1729 TLB_NUM_WAYS = self.TLB_NUM_WAYS
1730 TLB_LG_PGSZ = self.TLB_LG_PGSZ
1731 LOG_LENGTH = self.LOG_LENGTH
1732
1733 # BRAM organisation: We never access more than
1734 # -- wishbone_data_bits at a time so to save
1735 # -- resources we make the array only that wide, and
1736 # -- use consecutive indices for to make a cache "line"
1737 # --
1738 # -- ROW_SIZE is the width in bytes of the BRAM
1739 # -- (based on WB, so 64-bits)
1740 ROW_SIZE = WB_DATA_BITS // 8
1741
1742 # ROW_PER_LINE is the number of row (wishbone
1743 # transactions) in a line
1744 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
1745
1746 # BRAM_ROWS is the number of rows in BRAM needed
1747 # to represent the full dcache
1748 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
1749
1750
1751 # Bit fields counts in the address
1752
1753 # REAL_ADDR_BITS is the number of real address
1754 # bits that we store
1755 REAL_ADDR_BITS = 56
1756
1757 # ROW_BITS is the number of bits to select a row
1758 ROW_BITS = log2_int(BRAM_ROWS)
1759
1760 # ROW_LINE_BITS is the number of bits to select
1761 # a row within a line
1762 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
1763
1764 # LINE_OFF_BITS is the number of bits for
1765 # the offset in a cache line
1766 LINE_OFF_BITS = log2_int(LINE_SIZE)
1767
1768 # ROW_OFF_BITS is the number of bits for
1769 # the offset in a row
1770 ROW_OFF_BITS = log2_int(ROW_SIZE)
1771
1772 # INDEX_BITS is the number of bits to
1773 # select a cache line
1774 INDEX_BITS = log2_int(NUM_LINES)
1775
1776 # SET_SIZE_BITS is the log base 2 of the set size
1777 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
1778
1779 # TAG_BITS is the number of bits of
1780 # the tag part of the address
1781 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
1782
1783 # TAG_WIDTH is the width in bits of each way of the tag RAM
1784 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
1785
1786 # WAY_BITS is the number of bits to select a way
1787 WAY_BITS = log2_int(NUM_WAYS)
1788
1789 # Example of layout for 32 lines of 64 bytes:
1790 #
1791 # .. tag |index| line |
1792 # .. | row | |
1793 # .. | |---| | ROW_LINE_BITS (3)
1794 # .. | |--- - --| LINE_OFF_BITS (6)
1795 # .. | |- --| ROW_OFF_BITS (3)
1796 # .. |----- ---| | ROW_BITS (8)
1797 # .. |-----| | INDEX_BITS (5)
1798 # .. --------| | TAG_BITS (45)
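
# With the default parameters (LINE_SIZE=64, NUM_LINES=32, NUM_WAYS=4,
# WB_DATA_BITS=64, REAL_ADDR_BITS=56) the constants above work out to:
#
#   ROW_SIZE      = 8 bytes   ROW_PER_LINE  = 8    BRAM_ROWS     = 256
#   ROW_BITS      = 8         ROW_LINE_BITS = 3    ROW_OFF_BITS  = 3
#   LINE_OFF_BITS = 6         INDEX_BITS    = 5    SET_SIZE_BITS = 11
#   TAG_BITS      = 45        TAG_WIDTH     = 48   WAY_BITS      = 2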
1799
1800 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
1801
1802 def CacheTagArray():
1803 return Array(CacheTagSet() for x in range(NUM_LINES))
1804
1805 def CacheValidBitsArray():
1806 return Array(CacheWayValidBits() for x in range(NUM_LINES))
1807
1808 def RowPerLineValidArray():
1809 return Array(Signal() for x in range(ROW_PER_LINE))
1810
1811 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1812 cache_tags = CacheTagArray()
1813 cache_tag_set = Signal(TAG_RAM_WIDTH)
1814 cache_valid_bits = CacheValidBitsArray()
1815
1816 # TODO attribute ram_style : string;
1817 # TODO attribute ram_style of cache_tags : signal is "distributed";
1818
1819 # L1 TLB
1820 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
1821 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
1822 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
1823 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
1824 TLB_PTE_BITS = 64
1825 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
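
# With the defaults (TLB_SET_SIZE=64, TLB_NUM_WAYS=2, TLB_LG_PGSZ=12):
#   TLB_SET_BITS = 6, TLB_WAY_BITS = 1,
#   TLB_EA_TAG_BITS = 64 - (12 + 6) = 46,
#   TLB_TAG_WAY_BITS = 92, TLB_PTE_WAY_BITS = 128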
1826
1827 def TLBValidBitsArray():
1828 return Array(
1829 Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE)
1830 )
1831
1832 def TLBTagsArray():
1833 return Array(
1834 Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE)
1835 )
1836
1837 def TLBPtesArray():
1838 return Array(
1839 Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE)
1840 )
1841
1842 def HitWaySet():
1843 return Array(Signal(NUM_WAYS) for x in range(TLB_NUM_WAYS))
1844
1845 """note: these are passed to nmigen.hdl.Memory as "attributes".
1846 don't know how, just that they are.
1847 """
1848 dtlb_valid_bits = TLBValidBitsArray()
1849 dtlb_tags = TLBTagsArray()
1850 dtlb_ptes = TLBPtesArray()
1851 # TODO attribute ram_style of
1852 # dtlb_tags : signal is "distributed";
1853 # TODO attribute ram_style of
1854 # dtlb_ptes : signal is "distributed";
1855
r0 = RegStage0()
r0_full = Signal()

r1 = RegStage1()

reservation = Reservation()

# Async signals on incoming request; the index/row signals are
# binary selectors, hence INDEX_BITS/ROW_BITS wide
req_index = Signal(INDEX_BITS)
req_row = Signal(ROW_BITS)
req_hit_way = Signal(WAY_BITS)
req_tag = Signal(TAG_BITS)
req_op = Signal(Op)
req_data = Signal(64)
req_same_tag = Signal()
req_go = Signal()

early_req_row = Signal(ROW_BITS)

cancel_store = Signal()
set_rsrv = Signal()
clear_rsrv = Signal()

r0_valid = Signal()
r0_stall = Signal()

use_forward1_next = Signal()
use_forward2_next = Signal()

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))

cache_out = CacheRamOut()

# PLRU output interface (one victim-way selector per cache line)
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

plru_victim = PLRUOut()
replace_way = Signal(WAY_BITS)

# Wishbone read/write/cache write formatting signals
bus_sel = Signal(8)

# TLB signals
tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
tlb_valid_way = Signal(TLB_NUM_WAYS)
tlb_req_index = Signal(TLB_SET_BITS)
tlb_hit = Signal()
tlb_hit_way = Signal(TLB_WAY_BITS)
pte = Signal(TLB_PTE_BITS)
ra = Signal(REAL_ADDR_BITS)
valid_ra = Signal()
perm_attr = PermAttr()
rc_ok = Signal()
perm_ok = Signal()
access_ok = Signal()

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(
        Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE)
    )

tlb_plru_victim = TLBPLRUOut()

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[0:ROW_LINE_BITS]

# Returns whether the address is in the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last
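
# e.g. with ROW_PER_LINE == 8 the caller passes last == 7: a row whose
# low ROW_LINE_BITS are all-ones is the final wishbone beat of its line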

# Return the address of the next row in the current cache line
def next_row_addr(addr):
    # the VHDL asked "Is there no simpler way in VHDL to generate
    # that 3 bits adder?"; in nmigen we add one to the row-index
    # slice and re-concatenate, keeping the adder ROW_LINE_BITS wide
    row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
    return Cat(addr[:ROW_OFF_BITS],
               row_idx[:ROW_LINE_BITS],
               addr[LINE_OFF_BITS:])

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_idx = row[:ROW_LINE_BITS] + 1
    return Cat(row_idx[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

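# Worked example (plain integers, ROW_LINE_BITS == 3 assumed):
#
#     row = 0b00001111             # last row of a line
#     low = (row + 1) & 0b111      # -> 0: the 3-bit adder wraps
#     nxt = (row & ~0b111) | low   # -> 0b00001000, row 0 of the same line
#
# the Cat() above performs the same recombination in hardware.
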
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

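# Worked example of the full decode (plain integers rather than the
# nmigen bit-slices used above; assumes the 32-line/64-byte geometry
# from the diagram earlier):
#
#     addr   = 0x12345
#     offset =  addr       & (LINE_SIZE - 1)    # 0x05
#     index  = (addr >> 6) & (NUM_LINES - 1)    # 13
#     tag    =  addr >> 11                      # 36
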
# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset[way * TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    j = way * TLB_EA_TAG_BITS
    return tags[j:j + TLB_EA_TAG_BITS]

# Write a TLB tag to a TLB tag memory row: returns the assignment
# statement for the caller to add to a comb/sync domain
def write_tlb_tag(way, tags, tag):
    j = way * TLB_EA_TAG_BITS
    return tags[j:j + TLB_EA_TAG_BITS].eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    j = way * TLB_PTE_BITS
    return ptes[j:j + TLB_PTE_BITS]

# Write a PTE to a TLB PTE memory row: likewise returns the
# assignment statement
def write_tlb_pte(way, ptes, newpte):
    j = way * TLB_PTE_BITS
    return ptes[j:j + TLB_PTE_BITS].eq(newpte)

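# Minimal usage sketch for the helpers above (hedged: assumes this
# runs inside elaborate(), where comb/sync statement lists exist, and
# "new_pte" is a hypothetical 64-bit value):
#
#     pte_row = dtlb_ptes[tlb_req_index]
#     pte     = read_tlb_pte(way, pte_row)            # 64-bit slice
#     sync   += write_tlb_pte(way, pte_row, new_pte)  # assign slice
#
# each row packs TLB_NUM_WAYS fields side by side, so way 2's PTE
# occupies bits [128:192] of its row when TLB_PTE_BITS == 64.
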
assert (LINE_SIZE % ROW_SIZE) == 0, \
    "LINE_SIZE not multiple of ROW_SIZE"

# (x & (x - 1)) == 0 is the usual power-of-two check
assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, \
    "LINE_SIZE not power of 2"

assert (NUM_LINES & (NUM_LINES - 1)) == 0, \
    "NUM_LINES not power of 2"

assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, \
    "ROW_PER_LINE not power of 2"

assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), \
    "geometry bits don't add up"

assert LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS), \
    "geometry bits don't add up"

assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
    "geometry bits don't add up"

assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
    "geometry bits don't add up"

assert WB_DATA_BITS == 64, \
    "Can't yet handle a wishbone width that isn't 64-bits"

assert SET_SIZE_BITS <= TLB_LG_PGSZ, \
    "Set indexed by virtual address"

# we don't yet handle collisions between loadstore1 requests
# and MMU requests
comb += m_out.stall.eq(0)

# Hold off the request in r0 when r1 has an uncompleted request
comb += r0_stall.eq(r0_full & r1.full)
comb += r0_valid.eq(r0_full & ~r1.full)
comb += stall_out.eq(r0_stall)
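
# the r0 handshake at a glance (descriptive summary of the three
# statements above):
#
#   r0_full  r1.full | r0_stall  r0_valid
#      0        x    |    0         0      (nothing latched)
#      1        0    |    0         1      (request proceeds)
#      1        1    |    1         0      (hold until r1 drains)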

# Wire up wishbone request latch out of stage 1
comb += wishbone_out.eq(r1.wb)


# dcache_tb.vhdl
#
# entity dcache_tb is
# end dcache_tb;
#
# architecture behave of dcache_tb is
#     signal clk : std_ulogic;
#     signal rst : std_ulogic;
#
#     signal d_in : Loadstore1ToDcacheType;
#     signal d_out : DcacheToLoadstore1Type;
#
#     signal m_in : MmuToDcacheType;
#     signal m_out : DcacheToMmuType;
#
#     signal wb_bram_in : wishbone_master_out;
#     signal wb_bram_out : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     dcache0: entity work.dcache
#         generic map(
#
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             d_in => d_in,
#             d_out => d_out,
#             m_in => m_in,
#             m_out => m_out,
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#             );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#             );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         -- Clear stuff
#         d_in.valid <= '0';
#         d_in.load <= '0';
#         d_in.nc <= '0';
#         d_in.addr <= (others => '0');
#         d_in.data <= (others => '0');
#         m_in.valid <= '0';
#         m_in.addr <= (others => '0');
#         m_in.pte <= (others => '0');
#
#         wait for 4*clk_period;
#         wait until rising_edge(clk);
#
#         -- Cacheable read of address 4
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000004";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000100000000"
#             report "data @" & to_hstring(d_in.addr) &
#             "=" & to_hstring(d_out.data) &
#             " expected 0000000100000000"
#             severity failure;
#         -- wait for clk_period;
#
#         -- Cacheable read of address 30
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000030";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000D0000000C"
#             report "data @" & to_hstring(d_in.addr) &
#             "=" & to_hstring(d_out.data) &
#             " expected 0000000D0000000C"
#             severity failure;
#
#         -- Non-cacheable read of address 100
#         d_in.load <= '1';
#         d_in.nc <= '1';
#         d_in.addr <= x"0000000000000100";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000004100000040"
#             report "data @" & to_hstring(d_in.addr) &
#             "=" & to_hstring(d_out.data) &
#             " expected 0000004100000040"
#             severity failure;
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         std.env.finish;
#     end process;
# end;


# simulation/conversion imports (assumed module paths for the nmigen
# in use here; these are not in the imports at the top of the file)
from nmigen import Const
from nmigen.back import rtlil
from nmigen.compat.sim import run_simulation


def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield
    # wait_until rising_edge(clk)
    yield

    # Cacheable read of address 4
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
    yield dut.d_in.valid.eq(1)
    # wait-until rising_edge(clk)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    addr = yield dut.d_in.addr
    data = yield dut.d_out.data
    assert data == 0x0000000100000000, \
        f"data @{addr:x}={data:x} expected 0000000100000000"

    # Cacheable read of address 30
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    addr = yield dut.d_in.addr
    data = yield dut.d_out.data
    assert data == 0x0000000D0000000C, \
        f"data @{addr:x}={data:x} expected 0000000D0000000C"

    # Non-cacheable read of address 100
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(1)
    yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    addr = yield dut.d_in.addr
    data = yield dut.d_out.data
    assert data == 0x0000004100000040, \
        f"data @{addr:x}={data:x} expected 0000004100000040"

    yield
    yield
    yield
    yield


def test_dcache():
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    run_simulation(dut, dcache_sim(dut), vcd_name='test_dcache.vcd')


if __name__ == '__main__':
    test_dcache()