1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Const, Array)
from nmigen.cli import main, rtlil
from nmigen.compat.sim import run_simulation
from nmigen.utils import log2_int
# note: RecordObject comes from nmutil (there is no nmigen.iocontrol)
from nmutil.iocontrol import RecordObject

from experiment.mem_types import (LoadStore1ToDCacheType,
                                  DCacheToLoadStore1Type,
                                  MMUToDCacheType,
                                  DCacheToMMUType)

from experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBAddrType, WBDataType, WBSelType,
                                 WBMasterOut, WBSlaveOut,
                                 WBMasterOutVector, WBSlaveOutVector,
                                 WBIOMasterOut, WBIOSlaveOut)

# PLRU and CacheRam are used below; the module paths are assumed to
# follow the experiment.* layout used above
from experiment.plru import PLRU
from experiment.cache_ram import CacheRam


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self):
        super().__init__()
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    pa.reference = pte[8]
    pa.changed = pte[7]
    pa.nocache = pte[5]
    pa.priv = pte[3]
    pa.rd_perm = pte[2]
    pa.wr_perm = pte[1]
    return pa

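# A PermAttr extracted this way is wired straight into a record,
# e.g. (as done in tlb_search below):
#
#     comb += perm_attr.eq(extract_perm_attr(pte))
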

# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1        # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2  # conditional store w/o reservation
    OP_LOAD_HIT = 3   # Cache hit on load
    OP_LOAD_MISS = 4  # Load missing cache
    OP_LOAD_NC = 5    # Non-cachable load
    OP_STORE_HIT = 6  # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with an output buffer,
# which means that the BRAM output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline for cache hits
# with no stalls.
#
# All other operations are handled via stalling in the first stage.
#
# The second stage can thus complete a hit at the same time as the
# first stage emits a stall for a complex op.
#
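# A rough cycle-by-cycle sketch of a load hit (illustrative only):
#
#   cycle 0: request latched into r0 (stage_0)
#   cycle 1: tag/TLB compare, BRAM read issued (stage 1)
#   cycle 2: BRAM output buffer valid, data returned
#            (writeback_control)
#
# A "slow" op (miss, NC load, store) stalls in stage 1 instead,
# while a previously-started hit can still complete in stage 2.
#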
# Stage 0 register, basically contains just the latched request
class RegStage0(RecordObject):
    def __init__(self):
        super().__init__()
        self.req = LoadStore1ToDCacheType()
        self.tlbie = Signal()
        self.doall = Signal()
        self.tlbld = Signal()
        self.mmu_req = Signal() # indicates source of request


class MemAccessRequest(RecordObject):
    def __init__(self):
        super().__init__()
        self.op = Signal(Op)  # enum-shaped Signal, not a bare Op()
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest()

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(NUM_LINES)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = Signal()
        self.tlb_hit_way = Signal(TLB_NUM_WAYS)
        self.tlb_hit_index = Signal(TLB_SET_SIZE)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(BRAM_ROWS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut()
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(BRAM_ROWS)
        self.store_index = Signal(NUM_LINES)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        # one bit per address bit above the line offset
        # (63 downto LINE_OFF_BITS in the VHDL; LINE_OFF_BITS is 6)
        self.addr = Signal(64 - LINE_OFF_BITS)


# Set associative dcache write-through
#
# TODO (in no specific order):
#
# * See list in icache.vhdl
# * Complete load misses on the cycle when WB data comes instead of
#   at the end of line (this requires dealing with requests coming in
#   while not idle...)
class DCache(Elaboratable):
    def __init__(self):
        # TODO: make these parameters of DCache at some point
        self.LINE_SIZE = 64    # Line size in bytes
        self.NUM_LINES = 32    # Number of lines in a set
        self.NUM_WAYS = 4      # Number of ways
        self.TLB_SET_SIZE = 64 # L1 DTLB entries per set
        self.TLB_NUM_WAYS = 2  # L1 DTLB number of sets
        self.TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
        self.LOG_LENGTH = 0    # Non-zero to enable log data collection

        self.d_in = LoadStore1ToDCacheType()
        self.d_out = DCacheToLoadStore1Type()

        self.m_in = MMUToDCacheType()
        self.m_out = DCacheToMMUType()

        self.stall_out = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(20)

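    # Rough usage sketch (see the testbench at the bottom of this
    # file for the real thing):
    #
    #     dut = DCache()
    #     # drive dut.d_in (loadstore1) or dut.m_in (MMU), watch
    #     # dut.d_out / dut.m_out, and connect dut.wb_out / dut.wb_in
    #     # to a wishbone slave such as a BRAM model
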
    # Latch the request in r0.req as long as we're not stalling
    def stage_0(self, m, r0, r1, r0_full, d_in, m_in):
        comb = m.d.comb
        sync = m.d.sync

        r = RegStage0()

        # TODO, this goes in unit tests and formal proofs
        # assert ~(d_in.valid & m_in.valid),
        # "request collision loadstore vs MMU"
        with m.If(~(d_in.valid & m_in.valid)):
            #sync += Display("request collision loadstore vs MMU")
            pass

        # r is a combinatorial staging record, latched into r0 below
        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(1)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(-1) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)

    # TLB
    # Operates in the second cycle on the request latched in r0.req.
    # TLB updates write the entry at the end of the second cycle.
    def tlb_read(self, m, m_in, d_in, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):

        comb = m.d.comb
        sync = m.d.sync

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])

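    # With the defaults (TLB_LG_PGSZ=12, TLB_SET_BITS=6) the set index
    # is simply EA[12:18]: e.g. EA 0x13456 selects dtlb entry
    # (0x13456 >> 12) & 0x3f = 0x13.
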
    # Generate TLB PLRUs
    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 1:
            return  # no PLRU needed with a single way (VHDL generate)

        for i in range(TLB_SET_SIZE):
            # TLB PLRU interface
            tlb_plru = PLRU(TLB_WAY_BITS)
            setattr(m.submodules, "tlb_plru_%d" % i, tlb_plru)
            tlb_plru_acc = Signal(TLB_WAY_BITS)
            tlb_plru_acc_en = Signal()
            tlb_plru_out = Signal(TLB_WAY_BITS)

            comb += tlb_plru.acc.eq(tlb_plru_acc)
            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
            comb += tlb_plru_out.eq(tlb_plru.lru)

            # PLRU interface
            with m.If(r1.tlb_hit_index == i):
                comb += tlb_plru_acc_en.eq(r1.tlb_hit)
            with m.Else():
                comb += tlb_plru_acc_en.eq(0)
            comb += tlb_plru_acc.eq(r1.tlb_hit_way)

            comb += tlb_plru_victim[i].eq(tlb_plru_out)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid, tlb_valid_way,
                   tlb_tag_way, tlb_pte_way, pte, tlb_hit, tlb_hit_way,
                   valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64])

        for i in range(TLB_NUM_WAYS):
            with m.If(tlb_valid_way[i]
                      & (read_tlb_tag(i, tlb_tag_way) == eatag)):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        with m.Else():
            comb += pte.eq(0)
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.eq(extract_perm_attr(pte))
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

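    # Worked example for tlb_search with the defaults (TLB_LG_PGSZ=12,
    # TLB_SET_BITS=6): tlb_req_index = EA[12:18] and the EA tag is
    # EA[18:64], i.e. TLB_EA_TAG_BITS = 64 - 18 = 46 bits wide; the
    # real address swaps pte[12:56] in above the untranslated page
    # offset.
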
    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        comb = m.d.comb
        sync = m.d.sync

        # variable tlbie : std_ulogic;
        # variable tlbwe : std_ulogic;
        # variable repl_way : tlb_way_t;
        # variable eatag : tlb_tag_t;
        # variable tagset : tlb_way_tags_t;
        # variable pteset : tlb_way_ptes_t;
        # type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t;
        # --> Array([Signal(log(way_tags length)) for i in range(number of tlbs)])

        tlbie = Signal()
        tlbwe = Signal()
        repl_way = Signal(TLB_WAY_BITS)
        eatag = Signal(TLB_EA_TAG_BITS)
        tagset = TLBWayTags()
        pteset = TLBWayPtes()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid_bits[i].eq(0)

        with m.Elif(tlbie):
            with m.If(tlb_hit):
                sync += dtlb_valid_bits[tlb_req_index][tlb_hit_way].eq(0)
        with m.Elif(tlbwe):
            with m.If(tlb_hit):
                comb += repl_way.eq(tlb_hit_way)
            with m.Else():
                comb += repl_way.eq(tlb_plru_victim[tlb_req_index])
            comb += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
            comb += tagset.eq(tlb_tag_way)
            sync += write_tlb_tag(repl_way, tagset, eatag)
            sync += dtlb_tags[tlb_req_index].eq(tagset)
            comb += pteset.eq(tlb_pte_way)
            sync += write_tlb_pte(repl_way, pteset, r0.req.data)
            sync += dtlb_ptes[tlb_req_index].eq(pteset)
            sync += dtlb_valid_bits[tlb_req_index][repl_way].eq(1)

    # Generate PLRUs
    def maybe_plrus(self, m, r1, plru_victim):

        comb = m.d.comb
        sync = m.d.sync

        for i in range(NUM_LINES):
            # PLRU interface (cache ways, hence WAY_BITS)
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc = Signal(WAY_BITS)
            plru_acc_en = Signal()
            plru_out = Signal(WAY_BITS)

            comb += plru.acc.eq(plru_acc)
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru_out.eq(plru.lru)

            with m.If(r1.hit_index == i):
                comb += plru_acc_en.eq(r1.cache_hit)

            comb += plru_acc.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru_out)

    # Cache tag RAM read port
    def cache_tag_read(self, m, r0_stall, req_index, m_in, d_in,
                       cache_tag_set, cache_tags):

        comb = m.d.comb
        sync = m.d.sync

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    # Cache request parsing and hit detection
    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valid_bits, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       r0_stall, m_in, early_req_row, d_in,
                       tlb_valid_way, tlb_pte_way, tlb_hit, tlb_hit_way,
                       cache_tag_set, req_same_tag, cancel_store):

        comb = m.d.comb
        sync = m.d.sync

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        s_hit = Signal()
        s_tag = Signal(TAG_BITS)
        s_pte = Signal(TLB_PTE_BITS)
        s_ra = Signal(REAL_ADDR_BITS)
        hit_set = Signal(TLB_NUM_WAYS)
        hit_way_set = HitWaySet()
        rel_matches = Signal(TLB_NUM_WAYS)
        rel_match = Signal()

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)

        # Test if pending request is a hit on any way.
        # In order to make timing in virtual mode, when we are using
        # the TLB, we compare each way with each of the real addresses
        # from each way of the TLB, and then decide later which
        # match to use.

        with m.If(r0.req.virt_mode):
            comb += rel_matches.eq(0)
            for j in range(TLB_NUM_WAYS):
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS):
                    with m.If(go & cache_valid_bits[req_index][i] &
                              (read_tag(i, cache_tag_set) == s_tag)
                              & tlb_valid_way[j]):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == r1.reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            comb += s_tag.eq(get_tag(r0.req.addr))
            for i in range(NUM_WAYS):
                with m.If(go & cache_valid_bits[req_index][i] &
                          (read_tag(i, cache_tag_set) == s_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == r1.reload_tag):
                comb += rel_match.eq(1)
        comb += req_same_tag.eq(rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & rel_match):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            valid = r1.rows_valid[req_row % ROW_PER_LINE]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        comb += use_forward1_next.eq(0)
        with m.If((get_row(r1.req.real_addr) == req_row)
                  & (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        comb += use_forward2_next.eq(0)
        with m.If((r1.forward_row1 == req_row) &
                  (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
                           & (perm_attr.wr_perm
                              | (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(Const(0b101, 3)):
                        comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(Const(0b100, 3)):
                        comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(Const(0b110, 3)):
                        comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(Const(0b001, 3)):
                        comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(Const(0b000, 3)):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(Const(0b010, 3)):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(Const(0b011, 3)):
                        comb += op.eq(Op.OP_BAD)
                    with m.Case(Const(0b111, 3)):
                        comb += op.eq(Op.OP_BAD)
                    with m.Default():
                        comb += op.eq(Op.OP_NONE)
        comb += req_op.eq(op)
        comb += req_go.eq(go)
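
        # opsel decode summary (Cat is LSB-first, so opsel is
        # {load, nc, is_hit} with load as the MSB):
        #
        #   load nc hit   op
        #    1   0   1    OP_LOAD_HIT
        #    1   0   0    OP_LOAD_MISS
        #    1   1   x    OP_LOAD_NC (110) / OP_BAD (111: NC hit)
        #    0   0   1    OP_STORE_HIT
        #    0   0   0    OP_STORE_MISS
        #    0   1   x    OP_STORE_MISS (010) / OP_BAD (011: NC hit)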

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    # Handle load-with-reservation and store-conditional instructions
    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & r0.req.reserve):

            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                with m.If(~reservation.valid |
                          (r0.req.addr[LINE_OFF_BITS:64] !=
                           reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv,
                        clear_rsrv, reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
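
    # Flow (mirrors microwatt): a load-with-reservation sets
    # reservation.valid and records the line address; a later
    # store-conditional clears it, and is cancelled (OP_STCX_FAIL)
    # when no reservation is held or the line address differs.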

    # Return data for loads & completion control logic
    def writeback_control(self, m, r1, cache_out, d_out, m_out):

        comb = m.d.comb
        sync = m.d.sync

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out[r1.hit_way])

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail " \
                "-!- severity FAILURE"

            assert ((r1.slow_valid | r1.stcx_fail) |
                    r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with " \
                "slow_valid -!- severity FAILURE"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                #Display(f"completing load hit data={data_out}")
                pass

            # error cases complete without stalling
            with m.If(r1.ls_error):
                # Display("completing ld/st with error")
                pass

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                #Display(f"completing store or load miss data={data_out}")
                pass

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                # Display(f"completing load hit to MMU, data={m_out.data}")
                pass
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                #Display("completing MMU ld with error")
                pass

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                #Display("completing MMU load miss, data={m_out.data}")
                pass

    # Generate a cache RAM for each way. This handles the normal
    # reads, writes from reloads and the special store-hit update
    # path as well.
    #
    # Note: the BRAMs have an extra read buffer, meaning the output
    # is pipelined an extra cycle. This differs from the
    # icache. The writeback logic needs to take that into
    # account by using 1-cycle delayed signals for load hits.
    def rams(self, m, r1, early_req_row, cache_out, replace_way,
             wishbone_in):

        comb = m.d.comb
        sync = m.d.sync

        for i in range(NUM_WAYS):
            do_read = Signal()
            rd_addr = Signal(ROW_BITS)
            do_write = Signal()
            wr_addr = Signal(ROW_BITS)
            wr_data = Signal(WB_DATA_BITS)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS)

            # begin
            # way: entity work.cache_ram
            #     generic map (
            #         ROW_BITS => ROW_BITS,
            #         WIDTH => wishbone_data_bits,
            #         ADD_BUF => true
            #         )
            #     port map (
            #         clk => clk,
            #         rd_en => do_read,
            #         rd_addr => rd_addr,
            #         rd_data => dout,
            #         wr_sel => wr_sel_m,
            #         wr_addr => wr_addr,
            #         wr_data => wr_data
            #         );
            # process(all)
            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)
            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            comb += cache_out[i].eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wishbone_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wishbone_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
            #        f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
            #       )
            pass

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) |
                  (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #         f"rc_ok={rc_ok} perm_ok={perm_ok}"
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    # * Cache load miss/reload (in conjunction with "rams")
    # * Load hits for non-cachable forms
    # * Stores (the collision case is handled in "rams")
    #
    # All wishbone request generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valid_bits, r0, r0_valid, req_op, cache_tag,
                    req_go, ra, wb_in, req_hit_way, req_same_tag,
                    replace_way, stbs_done):

        comb = m.d.comb
        sync = m.d.sync

        req = MemAccessRequest()
        acks = Signal(3)
        adjust_acks = Signal(3)

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    idx = r1.store_index
                    sync += cache_tag[idx][i * TAG_WIDTH:
                                           (i + 1) * TAG_WIDTH].eq(
                                               r1.reload_tag)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                # r1.wb.adr'left downto 0 in the VHDL: take the
                # bottom len(r1.wb.adr) bits of the real address
                sync += r1.wb.adr.eq(req.real_addr[0:len(r1.wb.adr)])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(get_index(req.real_addr))
                sync += r1.store_row.eq(get_row(req.real_addr))
                sync += r1.end_row_ix.eq(
                    get_row_of_line(get_row(req.real_addr))
                )
                sync += r1.reload_tag.eq(get_tag(req.real_addr))
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        #Display(f"cache miss real addr:" \
                        #        f"{req_real_addr}" \
                        #        f" idx:{get_index(req_real_addr)}" \
                        #        f" tag:{get_tag(req.real_addr)}")

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                # Requests are all sent if stb is 0
                comb += stbs_done.eq(~r1.wb.stb)

                with m.If(~wb_in.stall & ~stbs_done):
                    # That was the last word?
                    # We are done sending.
                    # Clear stb and set stbs_done
                    # so we can handle an eventual
                    # last ack on the same cycle.
                    with m.If(is_last_row_addr(
                              r1.wb.adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                    # Calculate the next row address
                    sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    # XXX needs an Array bit-accessor here
                    sync += r1.rows_valid[
                                r1.store_row % ROW_PER_LINE].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz &
                                (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row ==
                               get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(stbs_done &
                              is_last_row(r1.store_row,
                                          r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = cache_valid_bits[r1.store_index]
                        sync += cv[r1.store_way].eq(1)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                comb += stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)
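
                # acks_pending is a 3-bit counter, so at most 7
                # wishbone stores can be in flight; the
                # (adjust_acks < 7) guard below stops stb being
                # re-asserted when the counter would saturate.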

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        ra = req.real_addr[0:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    # (separate check, as in the VHDL: the address
                    # update above and the stb decision are
                    # independent)
                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    # dc_log: if LOG_LENGTH > 0 generate
    # TODO learn how to translate vhdl generate into nmigen
    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out,
                   d_out, wb_in, log_out, req_op):

        comb = m.d.comb
        sync = m.d.sync

        # signal log_data : std_ulogic_vector(19 downto 0);
        log_data = Signal(20)

        # begin
        # dcache_log: process(clk)
        # begin
        #     if rising_edge(clk) then
        #         log_data <= r1.wb.adr(5 downto 3) &
        #                     wishbone_in.stall &
        #                     wishbone_in.ack &
        #                     r1.wb.stb & r1.wb.cyc &
        #                     d_out.error &
        #                     d_out.valid &
        #                     std_ulogic_vector(
        #                         to_unsigned(op_t'pos(req_op), 3)) &
        #                     stall_out &
        #                     std_ulogic_vector(
        #                         to_unsigned(tlb_hit_way, 3)) &
        #                     valid_ra &
        #                     std_ulogic_vector(
        #                         to_unsigned(state_t'pos(r1.state), 3));
        # field widths here follow the signal definitions rather than
        # the VHDL's fixed 3-bit to_unsigned conversions
        sync += log_data.eq(Cat(
                    r1.state, valid_ra, tlb_hit_way[:3],
                    stall_out, req_op, d_out.valid, d_out.error,
                    r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                    r1.wb.adr[3:6]
                ))
        # end if;
        # end process;
        # log_out <= log_data;
        # TODO ??? I am very confused need help
        comb += log_out.eq(log_data)
        # end generate;
        # end;

    def elaborate(self, platform):
        LINE_SIZE = self.LINE_SIZE
        NUM_LINES = self.NUM_LINES
        NUM_WAYS = self.NUM_WAYS
        TLB_SET_SIZE = self.TLB_SET_SIZE
        TLB_NUM_WAYS = self.TLB_NUM_WAYS
        TLB_LG_PGSZ = self.TLB_LG_PGSZ
        LOG_LENGTH = self.LOG_LENGTH

        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        # BRAM organisation: We never access more than
        # -- wishbone_data_bits at a time so to save
        # -- resources we make the array only that wide, and
        # -- use consecutive indices for to make a cache "line"
        # --
        # -- ROW_SIZE is the width in bytes of the BRAM
        # -- (based on WB, so 64-bits)
        ROW_SIZE = WB_DATA_BITS // 8

        # ROW_PER_LINE is the number of row (wishbone
        # transactions) in a line
        ROW_PER_LINE = LINE_SIZE // ROW_SIZE

        # BRAM_ROWS is the number of rows in BRAM needed
        # to represent the full dcache
        BRAM_ROWS = NUM_LINES * ROW_PER_LINE


        # Bit fields counts in the address

        # REAL_ADDR_BITS is the number of real address
        # bits that we store
        REAL_ADDR_BITS = 56

        # ROW_BITS is the number of bits to select a row
        ROW_BITS = log2_int(BRAM_ROWS)

        # ROW_LINE_BITS is the number of bits to select
        # a row within a line
        ROW_LINE_BITS = log2_int(ROW_PER_LINE)

        # LINE_OFF_BITS is the number of bits for
        # the offset in a cache line
        LINE_OFF_BITS = log2_int(LINE_SIZE)

        # ROW_OFF_BITS is the number of bits for
        # the offset in a row
        ROW_OFF_BITS = log2_int(ROW_SIZE)

        # INDEX_BITS is the number of bits to
        # select a cache line
        INDEX_BITS = log2_int(NUM_LINES)

        # SET_SIZE_BITS is the log base 2 of the set size
        SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

        # TAG_BITS is the number of bits of
        # the tag part of the address
        TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

        # TAG_WIDTH is the width in bits of each way of the tag RAM
        TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

        # WAY_BITS is the number of bits to select a way
        WAY_BITS = log2_int(NUM_WAYS)

        # Example of layout for 32 lines of 64 bytes:
        #
        # ..  tag    |index|  line  |
        # ..         |   row   |    |
        # ..         |     |---|    | ROW_LINE_BITS  (3)
        # ..         |     |--- - --| LINE_OFF_BITS (6)
        # ..         |         |- --| ROW_OFF_BITS  (3)
        # ..         |----- ---|    | ROW_BITS      (8)
        # ..         |-----|        | INDEX_BITS    (5)
        # .. --------|              | TAG_BITS      (45)

        TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

        def CacheTagArray():
            return Array(CacheTagSet() for x in range(NUM_LINES))

        def CacheValidBitsArray():
            return Array(CacheWayValidBits() for x in range(NUM_LINES))

        def RowPerLineValidArray():
            return Array(Signal() for x in range(ROW_PER_LINE))

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valid_bits = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        # L1 TLB
        TLB_SET_BITS = log2_int(TLB_SET_SIZE)
        TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
        TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
        TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
        TLB_PTE_BITS = 64
        TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

        def TLBValidBitsArray():
            return Array(
                Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE)
            )

        def TLBTagsArray():
            return Array(
                Signal(TLB_TAG_WAY_BITS) for x in range(TLB_SET_SIZE)
            )

        def TLBPtesArray():
            return Array(
                Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE)
            )

        def HitWaySet():
            return Array(Signal(NUM_WAYS) for x in range(TLB_NUM_WAYS))

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags = TLBTagsArray()
        dtlb_ptes = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0()
        r0_full = Signal()

        r1 = RegStage1()

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(NUM_LINES)
        req_row = Signal(BRAM_ROWS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(BRAM_ROWS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        # Cache RAM interface
        def CacheRamOut():
            return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))

        cache_out = CacheRamOut()

        # PLRU output interface
        def PLRUOut():
            return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_SIZE)
        tlb_hit = Signal()
        tlb_hit_way = Signal(TLB_NUM_WAYS)
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr()
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        # TLB PLRU output interface
        def TLBPLRUOut():
            return Array(
                Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE)
            )

        tlb_plru_victim = TLBPLRUOut()

        # Helper functions to decode incoming requests
        #
        # Return the cache line index (tag index) for an address
        def get_index(addr):
            return addr[LINE_OFF_BITS:SET_SIZE_BITS]

        # Return the cache row index (data memory) for an address
        def get_row(addr):
            return addr[ROW_OFF_BITS:SET_SIZE_BITS]

        # Return the index of a row within a line
        def get_row_of_line(row):
            return row[0:ROW_LINE_BITS]

        # Returns whether this is the last row of a line
        def is_last_row_addr(addr, last):
            return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

        # Returns whether this is the last row of a line
        def is_last_row(row, last):
            return get_row_of_line(row) == last
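
        # Worked example with the default geometry, for a (real)
        # address such as 0x12345:
        #   get_index -> bits [6:11]  = (0x12345 >> 6) & 0x1f = 0xd
        #   get_row   -> bits [3:11]  = (0x12345 >> 3) & 0xff = 0x68
        #   get_tag   -> bits [11:56] = 0x12345 >> 11         = 0x24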

        # Return the address of the next row in the current cache line
        def next_row_addr(addr):
            # the VHDL uses a dedicated 3-bit adder here; we slice
            # out the row-within-line field, add 1 and let it wrap
            row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS]
            return Cat(addr[:ROW_OFF_BITS],
                       (row_idx + 1)[:ROW_LINE_BITS],
                       addr[LINE_OFF_BITS:])

        # Return the next row in the current cache line. We use a
        # dedicated function in order to limit the size of the
        # generated adder to be only the bits within a cache line
        # (3 bits with default settings)
        def next_row(row):
            row_idx = row[:ROW_LINE_BITS]
            return Cat((row_idx + 1)[:ROW_LINE_BITS],
                       row[ROW_LINE_BITS:])

        # Get the tag value from the address
        def get_tag(addr):
            return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

        # Read a tag from a tag memory row
        def read_tag(way, tagset):
            return tagset[way * TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]

        # Read a TLB tag from a TLB tag memory row.
        # "way" may be a Signal, so use word_select rather than
        # a python slice.
        def read_tlb_tag(way, tags):
            return tags.word_select(way, TLB_EA_TAG_BITS)

        # Write a TLB tag to a TLB tag memory row (returns an
        # assignment, used as sync += write_tlb_tag(...))
        def write_tlb_tag(way, tags, tag):
            return read_tlb_tag(way, tags).eq(tag)

        # Read a PTE from a TLB PTE memory row
        def read_tlb_pte(way, ptes):
            return ptes.word_select(way, TLB_PTE_BITS)

        # Write a PTE to a TLB PTE memory row (returns an assignment)
        def write_tlb_pte(way, ptes, newpte):
            return read_tlb_pte(way, ptes).eq(newpte)

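        # e.g. in tlb_update above:
        #
        #     sync += write_tlb_pte(repl_way, pteset, r0.req.data)
        #
        # which updates just the repl_way-th 64-bit slot of pteset.
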
        assert (LINE_SIZE % ROW_SIZE) == 0, \
            "LINE_SIZE not multiple of ROW_SIZE"

        assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, \
            "LINE_SIZE not power of 2"

        assert (NUM_LINES & (NUM_LINES - 1)) == 0, \
            "NUM_LINES not power of 2"

        assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, \
            "ROW_PER_LINE not power of 2"

        assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), \
            "geometry bits don't add up"

        assert LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS), \
            "geometry bits don't add up"

        assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS
                                  + LINE_OFF_BITS), \
            "geometry bits don't add up"

        assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
            "geometry bits don't add up"

        assert 64 == WB_DATA_BITS, \
            "Can't yet handle a wishbone width that isn't 64-bits"

        assert SET_SIZE_BITS <= TLB_LG_PGSZ, \
            "Set indexed by virtual address"

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # TODO: the stage_0 .. dcache_log methods above still need to
        # be called from here (and the module-level constants hoisted
        # out of elaborate) before this elaborates end-to-end.

        return m


# dcache_tb.vhdl
#
# entity dcache_tb is
# end dcache_tb;
#
# architecture behave of dcache_tb is
#     signal clk : std_ulogic;
#     signal rst : std_ulogic;
#
#     signal d_in : Loadstore1ToDcacheType;
#     signal d_out : DcacheToLoadstore1Type;
#
#     signal m_in : MmuToDcacheType;
#     signal m_out : DcacheToMmuType;
#
#     signal wb_bram_in : wishbone_master_out;
#     signal wb_bram_out : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     dcache0: entity work.dcache
#         generic map(
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             d_in => d_in,
#             d_out => d_out,
#             m_in => m_in,
#             m_out => m_out,
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#             );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#             );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         -- Clear stuff
#         d_in.valid <= '0';
#         d_in.load <= '0';
#         d_in.nc <= '0';
#         d_in.addr <= (others => '0');
#         d_in.data <= (others => '0');
#         m_in.valid <= '0';
#         m_in.addr <= (others => '0');
#         m_in.pte <= (others => '0');
#
#         wait for 4*clk_period;
#         wait until rising_edge(clk);
#
#         -- Cacheable read of address 4
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000004";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000100000000"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000100000000"
#             severity failure;
#         -- wait for clk_period;
#
#         -- Cacheable read of address 30
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000030";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000D0000000C"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000D0000000C"
#             severity failure;
#
#         -- Non-cacheable read of address 100
#         d_in.load <= '1';
#         d_in.nc <= '1';
#         d_in.addr <= x"0000000000000100";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000004100000040"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000004100000040"
#             severity failure;
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         std.env.finish;
#     end process;
# end;
def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield
    # wait_until rising_edge(clk)
    yield
    # Cacheable read of address 4
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
    yield dut.d_in.valid.eq(1)
    # wait-until rising_edge(clk)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000000100000000, \
        f"data @4={data:x} expected 0000000100000000 " \
        "-!- severity failure"

    # Cacheable read of address 30
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000000D0000000C, \
        f"data @30={data:x} expected 0000000D0000000C " \
        "-!- severity failure"

    # Non-cacheable read of address 100
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(1)
    yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000004100000040, \
        f"data @100={data:x} expected 0000004100000040 " \
        "-!- severity failure"

    yield
    yield
    yield
    yield


def test_dcache():
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    run_simulation(dut, dcache_sim(dut), vcd_name='test_dcache.vcd')


if __name__ == '__main__':
    test_dcache()