"""DCache

based on Anton Blanchard microwatt dcache.vhdl

"""

from enum import Enum, unique

from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.cli import rtlil


from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 32    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than WB_DATA_BITS
# at a time so to save resources we make the array only that
# wide, and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

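# Worked example with the defaults above (WB_DATA_BITS is asserted
# to be 64 further down): ROW_SIZE = 64 // 8 = 8 bytes,
# ROW_PER_LINE = 64 // 8 = 8 wishbone transfers per line, and
# BRAM_ROWS = 32 * 8 = 256 BRAM rows for the whole dcache.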

# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
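# Example: with TAG_BITS = 45 the tag is padded up to the next
# multiple of 8 bits, so TAG_WIDTH = 45 + 7 - ((45 + 7) % 8) = 48.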

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |---|    | ROW_LINE_BITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS  (6)
# ..         |         |- --| ROW_OFF_BITS   (3)
# ..         |----- ---|    | ROW_BITS       (8)
# ..         |-----|        | INDEX_BITS     (5)
# ..  --------|             | TAG_BITS       (45)

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

def CacheValidBitsArray():
    # one valid bit per way, for each cache line
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

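# With the defaults: TLB_SET_BITS = 6, TLB_WAY_BITS = 1,
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46, TLB_TAG_WAY_BITS = 2 * 46 = 92
# and TLB_PTE_WAY_BITS = 2 * 64 = 128.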
assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, "LINE_SIZE not power of 2"
assert (NUM_LINES & (NUM_LINES - 1)) == 0, "NUM_LINES not power of 2"
assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, \
        "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBValidBitsArray():
    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_NUM_WAYS))

def TLBTagsArray():
    return Array(Signal(TLB_TAG_WAY_BITS) for x in range(TLB_SET_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))

def HitWaySet():
    # one way number (WAY_BITS wide) per TLB way
    return Array(Signal(WAY_BITS) for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

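# For example, with ROW_LINE_BITS = 3 the adder spans only the low
# 3 bits and wraps within the line: next_row(0b0001111) gives
# 0b0001000 (row 15 wraps back to row 8, the first row of its line).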
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)

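# Putting the decode helpers together: with the default geometry a
# 56-bit real address splits as addr[0:3] byte-within-row,
# addr[3:11] BRAM row (get_row), addr[6:11] line index (get_index)
# and addr[11:56] tag (get_tag).
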
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self):
        super().__init__()
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    # each field becomes a combinatorial slice of the PTE
    pa = PermAttr()
    pa.reference = pte[8]
    pa.changed = pte[7]
    pa.nocache = pte[5]
    pa.priv = pte[3]
    pa.rd_perm = pte[2]
    pa.wr_perm = pte[1]
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cachable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
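# As an illustrative (approximate) timeline for a load hit through
# this pipeline: cycle 0 latches the request into r0 and launches the
# BRAM read using the "early" row address; cycle 1 performs the TLB
# lookup and tag comparison, recording the hit in r1; cycle 2 returns
# the (one-cycle-delayed) BRAM data through writeback_control.
#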
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self):
        super().__init__()
        self.req = LoadStore1ToDCacheType()
        self.tlbie = Signal()
        self.doall = Signal()
        self.tlbld = Signal()
        self.mmu_req = Signal() # indicates source of request


class MemAccessRequest(RecordObject):
    def __init__(self):
        super().__init__()
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest()

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = Signal()
        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut()
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)

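# The Reservation above is held at cache-line granularity: with
# LINE_OFF_BITS = 6, addr stores effective-address bits [6:64]
# (58 bits), and a store-conditional is cancelled (see
# reservation_comb below) unless those bits match the reservation.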

class DTLBUpdate(Elaboratable):
    def __init__(self, dtlb_valid_bits, dtlb_ptes):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.tlb_hit = Signal()
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.dtlb_valid_bits = dtlb_valid_bits
        self.dtlb_ptes = dtlb_ptes

        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        # registered copies of the selected TLB row
        vb = Signal(TLB_NUM_WAYS)     # valid bits
        tb = Signal(TLB_TAG_WAY_BITS) # tags
        db = Signal(TLB_PTE_WAY_BITS) # PTEs

        sync += vb.eq(self.dtlb_valid_bits[self.tlb_req_index])
        sync += db.eq(self.dtlb_ptes[self.tlb_req_index])

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += self.dtlb_valid_bits[i].eq(0)

        with m.Elif(self.tlbie):
            with m.If(self.tlb_hit):
                sync += vb.bit_select(self.tlb_hit_way, 1).eq(Const(0, 1))

        with m.Elif(self.tlbwe):

            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            sync += tb.eq(tagset)

            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            sync += db.eq(pteset)

            sync += vb.bit_select(self.repl_way, 1).eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through
    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType()
        self.d_out = DCacheToLoadStore1Type()

        self.m_in = MMUToDCacheType()
        self.m_out = DCacheToMMUType()

        self.stall_out = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0()

        # TODO, this goes in unit tests and formal proofs
        # (a loadstore1 request and an MMU request arriving
        # in the same cycle is a collision)
        with m.If(d_in.valid & m_in.valid):
            #sync += Display("request collision loadstore vs MMU")
            pass

        with m.If(m_in.valid):
            sync += r.req.valid.eq(1)
            sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            sync += r.req.dcbz.eq(0)
            sync += r.req.nc.eq(0)
            sync += r.req.reserve.eq(0)
            sync += r.req.virt_mode.eq(0) # MMU requests are in real mode
            sync += r.req.priv_mode.eq(1)
            sync += r.req.addr.eq(m_in.addr)
            sync += r.req.data.eq(m_in.pte)
            sync += r.req.byte_sel.eq(~0) # Const -1 sets all bits to 1
            sync += r.tlbie.eq(m_in.tlbie)
            sync += r.doall.eq(m_in.doall)
            sync += r.tlbld.eq(m_in.tlbld)
            sync += r.mmu_req.eq(1)
        with m.Else():
            sync += r.req.eq(d_in)
            sync += r.tlbie.eq(0)
            sync += r.doall.eq(0)
            sync += r.tlbld.eq(0)
            sync += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)

    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        # a PLRU is only meaningful with more than one way
        if TLB_NUM_WAYS <= 1:
            return

        for i in range(TLB_SET_SIZE):
            # TLB PLRU interface
            tlb_plru = PLRU(TLB_WAY_BITS)
            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
            tlb_plru_out = Signal(TLB_WAY_BITS)

            comb += tlb_plru_out.eq(tlb_plru.lru_o)

            # PLRU interface
            with m.If(r1.tlb_hit_index == i):
                comb += tlb_plru.acc_en.eq(r1.tlb_hit)
            with m.Else():
                comb += tlb_plru.acc_en.eq(0)
            comb += tlb_plru.acc.eq(r1.tlb_hit_way)

            comb += tlb_plru_victim[i].eq(tlb_plru_out)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal()
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        with m.Else():
            comb += pte.eq(0)
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.eq(extract_perm_attr(pte))
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        comb = m.d.comb

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        m.submodules.tlb_update = d = DTLBUpdate(dtlb_valid_bits, dtlb_ptes)
        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_hit_way.eq(tlb_hit_way)
        comb += d.tlb_tag_way.eq(tlb_tag_way)
        comb += d.tlb_pte_way.eq(tlb_pte_way)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit):
            comb += d.repl_way.eq(tlb_hit_way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        for i in range(NUM_LINES):
            # PLRU interface
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc = Signal(WAY_BITS)
            plru_acc_en = Signal()
            plru_out = Signal(WAY_BITS)

            comb += plru.acc.eq(plru_acc)
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru_out.eq(plru.lru_o)

            with m.If(r1.hit_index == i):
                comb += plru_acc_en.eq(r1.cache_hit)

            comb += plru_acc.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru_out)

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set,
                       cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valid_bits, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Array(Signal() for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()
        rel_matches = Array(Signal() for i in range(TLB_NUM_WAYS))
        rel_match = Signal()

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(r0.req.virt_mode):
            for j in range(TLB_NUM_WAYS):
                s_tag = Signal(TAG_BITS)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS):
                    is_tag_hit = Signal()
                    comb += is_tag_hit.eq(go & cache_valid_bits[req_index][i]
                                          & (read_tag(i, cache_tag_set)
                                             == s_tag)
                                          & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == r1.reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(r0.req.addr))
            for i in range(NUM_WAYS):
                is_tag_hit = Signal()
                comb += is_tag_hit.eq(go & cache_valid_bits[req_index][i]
                                      & (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == r1.reload_tag):
                comb += rel_match.eq(1)

        comb += req_same_tag.eq(rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & rel_match):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            valid = r1.rows_valid[req_row % ROW_PER_LINE]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        comb += use_forward1_next.eq(0)
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        comb += use_forward2_next.eq(0)
        with m.If((r1.forward_row1 == req_row) &
                  (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed)
                        )
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
                           & perm_attr.wr_perm
                           | (r0.req.load & perm_attr.rd_perm)
                          )
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                # Cat is LSB-first, so opsel is
                # {bit 2: load, bit 1: nc, bit 0: is_hit}
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101):
                        comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100):
                        comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110):
                        comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001):
                        comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011):
                        comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111):
                        comb += op.eq(Op.OP_BAD)
                    with m.Default():
                        comb += op.eq(Op.OP_NONE)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & r0.req.reserve):

            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                # cancel the store if there is no reservation or the
                # address (at line granularity) does not match it
                with m.If(~reservation.valid |
                          (r0.req.addr[LINE_OFF_BITS:64] !=
                           reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out[r1.hit_way])

        # Merge forwarded bytes into the BRAM data, byte by byte,
        # under control of r1.forward_sel (e.g. forward_sel = 0x0F
        # takes the low four bytes from the forward buffer and the
        # upper four from the cache RAM).
        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1,\
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                #Display(f"completing load hit data={data_out}")
                pass

            # error cases complete without stalling
            with m.If(r1.ls_error):
                #Display("completing ld/st with error")
                pass

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                #Display(f"completing store or load miss data={data_out}")
                pass

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                #Display(f"completing load hit to MMU, data={m_out.data}")
                pass
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                #Display("completing MMU ld with error")
                pass

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                #Display("completing MMU load miss, data={m_out.data}")
                pass

    def rams(self, m, r1, early_req_row, cache_out, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        wb_in = self.wb_in

        for i in range(NUM_WAYS):
            do_read = Signal()
            rd_addr = Signal(ROW_BITS)
            do_write = Signal()
            wr_addr = Signal(ROW_BITS)
            wr_data = Signal(WB_DATA_BITS)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS)

            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            comb += cache_out[i].eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wb_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
            #        f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
            #       )
            pass

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            #Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #        f"rc_ok={rc_ok} perm_ok={perm_ok}")
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
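    #
    # A summary of the state transitions implemented below:
    #
    #   IDLE             -> RELOAD_WAIT_ACK  on a load miss (or dcbz)
    #   IDLE             -> NC_LOAD_WAIT_ACK on a non-cacheable load
    #   IDLE             -> STORE_WAIT_ACK   on a store hit/miss
    #   RELOAD_WAIT_ACK  -> IDLE             when the last row is acked
    #   STORE_WAIT_ACK   -> IDLE             when all acks are received
    #   NC_LOAD_WAIT_ACK -> IDLE             on the single ack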
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valid_bits, r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        wb_in = self.wb_in

        req = MemAccessRequest()
        acks = Signal(3)
        adjust_acks = Signal(3)
        stbs_done = Signal()

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index])
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                # XXX check 'left downto. probably means len(r1.wb.adr)
                #     r1.wb.adr <= req.real_addr(
                #                   r1.wb.adr'left downto 0
                #                  );
                sync += r1.wb.adr.eq(req.real_addr)
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(get_index(req.real_addr))
                sync += r1.store_row.eq(get_row(req.real_addr))
                sync += r1.end_row_ix.eq(
                         get_row_of_line(get_row(req.real_addr))
                        )
                sync += r1.reload_tag.eq(get_tag(req.real_addr))
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        #Display(f"cache miss real addr:" \
                        #        f"{req_real_addr}" \
                        #        f" idx:{get_index(req_real_addr)}" \
                        #        f" tag:{get_tag(req.real_addr)}")

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss
                            # except that we are writing to memory
                            # instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                # Requests are all sent if stb is 0
                comb += stbs_done.eq(~r1.wb.stb)

                with m.If(~wb_in.stall & ~stbs_done):
                    # That was the last word?
                    # We are done sending.
                    # Clear stb and set stbs_done
                    # so we can handle an eventual
                    # last ack on the same cycle.
                    with m.If(is_last_row_addr(r1.wb.adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                    # Calculate the next row address in the
                    # current cache line
                    rarange = r1.wb.adr[ROW_OFF_BITS : LINE_OFF_BITS]
                    sync += rarange.eq(rarange + 1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    # XXX needs an Array bit-accessor here
                    sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz &
                                (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(stbs_done & is_last_row(r1.store_row,
                                                      r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS)
                        comb += cv.eq(cache_valid_bits[r1.store_index])
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_valid_bits[r1.store_index].eq(cv)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                comb += stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        _ra = req.real_addr[0:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):

        sync = m.d.sync
        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid,
                               d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                               r1.wb.adr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valid_bits = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags = TLBTagsArray()
        dtlb_ptes = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0()
        r0_full = Signal()

        r1 = RegStage1()

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out = CacheRamOut()

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = Signal()
        tlb_hit_way = Signal(TLB_WAY_BITS)
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr()
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together,
        # using shared signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set,
                            cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valid_bits, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_pte_way,
                            tlb_hit, tlb_hit_way, tlb_valid_way,
                            cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valid_bits, r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out,
        #                req_op)

        return m


# dcache_tb.vhdl
#
# entity dcache_tb is
# end dcache_tb;
#
# architecture behave of dcache_tb is
#     signal clk          : std_ulogic;
#     signal rst          : std_ulogic;
#
#     signal d_in         : Loadstore1ToDcacheType;
#     signal d_out        : DcacheToLoadstore1Type;
#
#     signal m_in         : MmuToDcacheType;
#     signal m_out        : DcacheToMmuType;
#
#     signal wb_bram_in   : wishbone_master_out;
#     signal wb_bram_out  : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     dcache0: entity work.dcache
#         generic map(
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             d_in => d_in,
#             d_out => d_out,
#             m_in => m_in,
#             m_out => m_out,
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#             );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE   => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#             );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         -- Clear stuff
#         d_in.valid <= '0';
#         d_in.load <= '0';
#         d_in.nc <= '0';
#         d_in.addr <= (others => '0');
#         d_in.data <= (others => '0');
#         m_in.valid <= '0';
#         m_in.addr <= (others => '0');
#         m_in.pte <= (others => '0');
#
#         wait for 4*clk_period;
#         wait until rising_edge(clk);
#
#         -- Cacheable read of address 4
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000004";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000100000000"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000100000000"
#             severity failure;
#         -- wait for clk_period;
#
#         -- Cacheable read of address 30
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000030";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000D0000000C"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000D0000000C"
#             severity failure;
#
#         -- Non-cacheable read of address 100
#         d_in.load <= '1';
#         d_in.nc <= '1';
#         d_in.addr <= x"0000000000000100";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000004100000040"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000004100000040"
#             severity failure;
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         std.env.finish;
#     end process;
# end;


def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield
    # wait_until rising_edge(clk)
    yield
    # Cacheable read of address 4
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
    yield dut.d_in.valid.eq(1)
    # wait-until rising_edge(clk)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000000100000000, \
        f"data={data:#x} expected 0x0000000100000000"

    # Cacheable read of address 30
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000000D0000000C, \
        f"data={data:#x} expected 0x0000000D0000000C"

    # Non-cacheable read of address 100
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(1)
    yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000004100000040, \
        f"data={data:#x} expected 0x0000004100000040"

    yield
    yield
    yield
    yield


def test_dcache():
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    #run_simulation(dut, dcache_sim(dut), vcd_name='test_dcache.vcd')


if __name__ == '__main__':
    test_dcache()