1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
from nmigen import (Module, Signal, Elaboratable,
Cat, Repl, Array, Const)
11 from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
14
from experiment.mem_types import (LoadStore1ToDCacheType,
DCacheToLoadStore1Type,
MMUToDCacheType,
DCacheToMMUType)
19
from experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
WBAddrType, WBDataType, WBSelType,
WBMasterOut, WBSlaveOut,
WBMasterOutVector, WBSlaveOutVector,
WBIOMasterOut, WBIOSlaveOut)
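# PLRU and CacheRam are used further down but are defined elsewhere in
# this repository; the import paths below are assumptions that follow the
# convention of the imports above and may need adjusting.
from experiment.plru import PLRU
from experiment.cache_ram import CacheRam

# used only by the convert/simulation helpers at the bottom of this file
from nmigen.back import rtlil
from nmigen.compat.sim import run_simulation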
25
26 # TODO: make these parameters of DCache at some point
27 LINE_SIZE = 64 # Line size in bytes
28 NUM_LINES = 32 # Number of lines in a set
29 NUM_WAYS = 4 # Number of ways
30 TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2 # L1 DTLB number of ways
32 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
33 LOG_LENGTH = 0 # Non-zero to enable log data collection
34
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
43
44 # ROW_PER_LINE is the number of row (wishbone
45 # transactions) in a line
46 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
47
48 # BRAM_ROWS is the number of rows in BRAM needed
49 # to represent the full dcache
50 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
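# A quick sanity check of the geometry above, assuming the default
# parameters (and WB_DATA_BITS = 64, as asserted further down):
#
# ROW_SIZE     = 64 // 8 = 8 bytes per BRAM row
# ROW_PER_LINE = 64 // 8 = 8 wishbone transfers per cache line
# BRAM_ROWS    = 32 * 8  = 256 rows in the data BRAM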
51
52
53 # Bit fields counts in the address
54
55 # REAL_ADDR_BITS is the number of real address
56 # bits that we store
57 REAL_ADDR_BITS = 56
58
59 # ROW_BITS is the number of bits to select a row
60 ROW_BITS = log2_int(BRAM_ROWS)
61
62 # ROW_LINE_BITS is the number of bits to select
63 # a row within a line
64 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
65
66 # LINE_OFF_BITS is the number of bits for
67 # the offset in a cache line
68 LINE_OFF_BITS = log2_int(LINE_SIZE)
69
70 # ROW_OFF_BITS is the number of bits for
71 # the offset in a row
72 ROW_OFF_BITS = log2_int(ROW_SIZE)
73
# INDEX_BITS is the number of bits to
75 # select a cache line
76 INDEX_BITS = log2_int(NUM_LINES)
77
78 # SET_SIZE_BITS is the log base 2 of the set size
79 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
80
81 # TAG_BITS is the number of bits of
82 # the tag part of the address
83 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
84
85 # TAG_WIDTH is the width in bits of each way of the tag RAM
86 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
87
88 # WAY_BITS is the number of bits to select a way
89 WAY_BITS = log2_int(NUM_WAYS)
90
# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |---|    | ROW_LINE_BITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (45)
101
102 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
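# With the default geometry the derived constants work out as follows
# (matching the layout sketch above):
#
# ROW_BITS      = log2(256) = 8     ROW_LINE_BITS = log2(8) = 3
# LINE_OFF_BITS = log2(64)  = 6     ROW_OFF_BITS  = log2(8) = 3
# INDEX_BITS    = log2(32)  = 5     SET_SIZE_BITS = 6 + 5   = 11
# TAG_BITS      = 56 - 11   = 45    WAY_BITS      = log2(4) = 2
# TAG_WIDTH     = 48 (TAG_BITS rounded up to a whole number of bytes)
# TAG_RAM_WIDTH = 48 * 4    = 192 bits of tag RAM per cache index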
103
def CacheTagArray():
return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

def CacheValidBitsArray():
return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))
109
110 def RowPerLineValidArray():
111 return Array(Signal() for x in range(ROW_PER_LINE))
112
113 # L1 TLB
114 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
115 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
116 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
117 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
118 TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
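# For reference, with the default TLB parameters:
#
# TLB_SET_BITS     = log2(64)      = 6
# TLB_WAY_BITS     = log2(2)       = 1
# TLB_EA_TAG_BITS  = 64 - (12 + 6) = 46
# TLB_TAG_WAY_BITS = 2 * 46        = 92
# TLB_PTE_WAY_BITS = 2 * 64        = 128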
120
121 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, "LINE_SIZE not power of 2"
assert (NUM_LINES & (NUM_LINES - 1)) == 0, "NUM_LINES not power of 2"
assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, "ROW_PER_LINE not power of 2"
125 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS), \
"geometry bits don't add up"
128 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
129 "geometry bits don't add up"
130 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
131 "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
133 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
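# The last assertion is what makes the cache virtually-indexed,
# physically-tagged: when the set index fits entirely inside the page
# offset (SET_SIZE_BITS <= TLB_LG_PGSZ) the index bits are the same in
# the effective and real address, so the tag/data RAMs can be read in
# parallel with the TLB lookup.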
134
135
136 def TLBValidBitsArray():
137 return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
138
139 def TLBTagsArray():
140 return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
141
142 def TLBPtesArray():
143 return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
144
145 def HitWaySet():
return Array(Signal(WAY_BITS) for x in range(TLB_NUM_WAYS))
147
148 # Cache RAM interface
149 def CacheRamOut():
150 return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))
151
152 # PLRU output interface
153 def PLRUOut():
return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
155
156 # TLB PLRU output interface
157 def TLBPLRUOut():
158 return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
159
160 # Helper functions to decode incoming requests
161 #
162 # Return the cache line index (tag index) for an address
163 def get_index(addr):
164 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
165
166 # Return the cache row index (data memory) for an address
167 def get_row(addr):
168 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
169
170 # Return the index of a row within a line
def get_row_of_line(row):
return row[0:ROW_LINE_BITS]
175
176 # Returns whether this is the last row of a line
177 def is_last_row_addr(addr, last):
178 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
179
180 # Returns whether this is the last row of a line
181 def is_last_row(row, last):
182 return get_row_of_line(row) == last
183
184 # Return the address of the next row in the current cache line
def next_row_addr(addr):
# increment only the row-within-line bits of the address
# (a 3-bit adder with the default settings)
row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
return Cat(addr[:ROW_OFF_BITS],
row_idx[:ROW_LINE_BITS],
addr[LINE_OFF_BITS:])
195
196 # Return the next row in the current cache line. We use a
197 # dedicated function in order to limit the size of the
198 # generated adder to be only the bits within a cache line
199 # (3 bits with default settings)
def next_row(row):
201 row_v = row[0:ROW_LINE_BITS] + 1
202 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
203
204 # Get the tag value from the address
205 def get_tag(addr):
206 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
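# As an illustration, with the default geometry a 56-bit real address
# is carved up by the helpers above as:
#
# addr[0:3]   byte offset within a wishbone row  (ROW_OFF_BITS)
# addr[3:11]  BRAM row number    -> get_row(addr)
# addr[6:11]  cache line index   -> get_index(addr)
# addr[11:56] tag                -> get_tag(addr)
#
# i.e. the low ROW_LINE_BITS of the row number select the row within a
# line and its upper bits equal the line index.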
207
208 # Read a tag from a tag memory row
209 def read_tag(way, tagset):
return tagset[way * TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]
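# e.g. with TAG_BITS = 45 and TAG_WIDTH = 48, way 1's tag occupies bits
# [48:93] of the tag-set row; the remaining 3 bits per way are padding so
# that each way's tag starts on a byte boundary.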
211
212 # Read a TLB tag from a TLB tag memory row
213 def read_tlb_tag(way, tags):
214 j = way * TLB_EA_TAG_BITS
215 return tags[j:j + TLB_EA_TAG_BITS]
216
217 # Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
j = way * TLB_EA_TAG_BITS
return tags[j:j + TLB_EA_TAG_BITS].eq(tag)
221
222 # Read a PTE from a TLB PTE memory row
223 def read_tlb_pte(way, ptes):
224 j = way * TLB_PTE_BITS
225 return ptes[j:j + TLB_PTE_BITS]
226
def write_tlb_pte(way, ptes, newpte):
228 j = way * TLB_PTE_BITS
229 return ptes[j:j + TLB_PTE_BITS].eq(newpte)
230
231
232 # Record for storing permission, attribute, etc. bits from a PTE
233 class PermAttr(RecordObject):
234 def __init__(self):
235 super().__init__()
236 self.reference = Signal()
237 self.changed = Signal()
238 self.nocache = Signal()
239 self.priv = Signal()
240 self.rd_perm = Signal()
241 self.wr_perm = Signal()
242
243
244 def extract_perm_attr(pte):
245 pa = PermAttr()
246 pa.reference = pte[8]
247 pa.changed = pte[7]
248 pa.nocache = pte[5]
249 pa.priv = pte[3]
250 pa.rd_perm = pte[2]
251 pa.wr_perm = pte[1]
return pa
253
254
255 # Type of operation on a "valid" input
256 @unique
257 class Op(Enum):
258 OP_NONE = 0
259 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
260 OP_STCX_FAIL = 2 # conditional store w/o reservation
261 OP_LOAD_HIT = 3 # Cache hit on load
262 OP_LOAD_MISS = 4 # Load missing cache
263 OP_LOAD_NC = 5 # Non-cachable load
264 OP_STORE_HIT = 6 # Store hitting cache
265 OP_STORE_MISS = 7 # Store missing cache
266
267
268 # Cache state machine
269 @unique
270 class State(Enum):
271 IDLE = 0 # Normal load hit processing
272 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
273 STORE_WAIT_ACK = 2 # Store wait ack
274 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
275
276
277 # Dcache operations:
278 #
279 # In order to make timing, we use the BRAMs with
280 # an output buffer, which means that the BRAM
281 # output is delayed by an extra cycle.
282 #
283 # Thus, the dcache has a 2-stage internal pipeline
284 # for cache hits with no stalls.
285 #
286 # All other operations are handled via stalling
287 # in the first stage.
288 #
289 # The second stage can thus complete a hit at the same
290 # time as the first stage emits a stall for a complex op.
291 #
292 # Stage 0 register, basically contains just the latched request
293
294 class RegStage0(RecordObject):
295 def __init__(self):
296 super().__init__()
297 self.req = LoadStore1ToDCacheType()
298 self.tlbie = Signal()
299 self.doall = Signal()
300 self.tlbld = Signal()
301 self.mmu_req = Signal() # indicates source of request
302
303
304 class MemAccessRequest(RecordObject):
305 def __init__(self):
306 super().__init__()
self.op = Signal(Op)
308 self.valid = Signal()
309 self.dcbz = Signal()
310 self.real_addr = Signal(REAL_ADDR_BITS)
311 self.data = Signal(64)
312 self.byte_sel = Signal(8)
313 self.hit_way = Signal(WAY_BITS)
314 self.same_tag = Signal()
315 self.mmu_req = Signal()
316
317
318 # First stage register, contains state for stage 1 of load hits
319 # and for the state machine used by all other operations
320 class RegStage1(RecordObject):
321 def __init__(self):
322 super().__init__()
323 # Info about the request
324 self.full = Signal() # have uncompleted request
325 self.mmu_req = Signal() # request is from MMU
326 self.req = MemAccessRequest()
327
328 # Cache hit state
329 self.hit_way = Signal(WAY_BITS)
330 self.hit_load_valid = Signal()
self.hit_index = Signal(INDEX_BITS)
332 self.cache_hit = Signal()
333
334 # TLB hit state
335 self.tlb_hit = Signal()
self.tlb_hit_way = Signal(TLB_WAY_BITS)
self.tlb_hit_index = Signal(TLB_SET_BITS)

339 # 2-stage data buffer for data forwarded from writes to reads
340 self.forward_data1 = Signal(64)
341 self.forward_data2 = Signal(64)
342 self.forward_sel1 = Signal(8)
343 self.forward_valid1 = Signal()
344 self.forward_way1 = Signal(WAY_BITS)
self.forward_row1 = Signal(ROW_BITS)
346 self.use_forward1 = Signal()
347 self.forward_sel = Signal(8)
348
349 # Cache miss state (reload state machine)
self.state = Signal(State)
351 self.dcbz = Signal()
352 self.write_bram = Signal()
353 self.write_tag = Signal()
354 self.slow_valid = Signal()
self.wb = WBMasterOut()
self.reload_tag = Signal(TAG_BITS)
self.store_way = Signal(WAY_BITS)
self.store_row = Signal(ROW_BITS)
self.store_index = Signal(INDEX_BITS)
self.end_row_ix = Signal(ROW_LINE_BITS)
361 self.rows_valid = RowPerLineValidArray()
362 self.acks_pending = Signal(3)
363 self.inc_acks = Signal()
364 self.dec_acks = Signal()
365
366 # Signals to complete (possibly with error)
367 self.ls_valid = Signal()
368 self.ls_error = Signal()
369 self.mmu_done = Signal()
370 self.mmu_error = Signal()
371 self.cache_paradox = Signal()
372
373 # Signal to complete a failed stcx.
374 self.stcx_fail = Signal()
375
376
377 # Reservation information
378 class Reservation(RecordObject):
379 def __init__(self):
380 super().__init__()
381 self.valid = Signal()
382 self.addr = Signal(64-LINE_OFF_BITS)
383
384
385 class DCache(Elaboratable):
386 """Set associative dcache write-through
387 TODO (in no specific order):
388 * See list in icache.vhdl
389 * Complete load misses on the cycle when WB data comes instead of
390 at the end of line (this requires dealing with requests coming in
391 while not idle...)
392 """
393 def __init__(self):
394 self.d_in = LoadStore1ToDCacheType()
395 self.d_out = DCacheToLoadStore1Type()
396
397 self.m_in = MMUToDCacheType()
398 self.m_out = DCacheToMMUType()
399
400 self.stall_out = Signal()
401
402 self.wb_out = WBMasterOut()
403 self.wb_in = WBSlaveOut()
404
405 self.log_out = Signal(20)
406
407 def stage_0(self, m, d_in, m_in):
408 """Latch the request in r0.req as long as we're not stalling
409 """
410 comb = m.d.comb
411 sync = m.d.sync
412
413 r = RegStage0()
414
415 # TODO, this goes in unit tests and formal proofs
with m.If(d_in.valid & m_in.valid):
#sync += Display("request collision loadstore vs MMU")
pass
419
420 with m.If(m_in.valid):
421 sync += r.req.valid.eq(1)
422 sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
423 sync += r.req.dcbz.eq(0)
424 sync += r.req.nc.eq(0)
425 sync += r.req.reserve.eq(0)
426 sync += r.req.virt_mode.eq(1)
427 sync += r.req.priv_mode.eq(1)
428 sync += r.req.addr.eq(m_in.addr)
429 sync += r.req.data.eq(m_in.pte)
430 sync += r.req.byte_sel.eq(-1) # Const -1 sets all to 0b111....
431 sync += r.tlbie.eq(m_in.tlbie)
432 sync += r.doall.eq(m_in.doall)
433 sync += r.tlbld.eq(m_in.tlbld)
434 sync += r.mmu_req.eq(1)
435 with m.Else():
436 sync += r.req.eq(d_in)
sync += r.tlbie.eq(0)
sync += r.doall.eq(0)
sync += r.tlbld.eq(0)
sync += r.mmu_req.eq(0)
441 with m.If(~(r1.full & r0_full)):
442 sync += r0.eq(r)
443 sync += r0_full.eq(r.req.valid)
444
445 def tlb_read(self, m, m_in, d_in, r0_stall, tlb_valid_way,
446 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
447 dtlb_tags, dtlb_ptes):
448 """TLB
449 Operates in the second cycle on the request latched in r0.req.
450 TLB updates write the entry at the end of the second cycle.
451 """
452 comb = m.d.comb
453 sync = m.d.sync
454
index = Signal(TLB_SET_BITS)
456 addrbits = Signal(TLB_SET_BITS)
457
458 amin = TLB_LG_PGSZ
459 amax = TLB_LG_PGSZ + TLB_SET_BITS
460
461 with m.If(m_in.valid):
462 comb += addrbits.eq(m_in.addr[amin : amax])
463 with m.Else():
464 comb += addrbits.eq(d_in.addr[amin : amax])
465 comb += index.eq(addrbits)
466
467 # If we have any op and the previous op isn't finished,
468 # then keep the same output for next cycle.
469 with m.If(~r0_stall):
470 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
471 sync += tlb_tag_way.eq(dtlb_tags[index])
472 sync += tlb_pte_way.eq(dtlb_ptes[index])
473
def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
475 """Generate TLB PLRUs
476 """
477 comb = m.d.comb
478 sync = m.d.sync
479
if TLB_NUM_WAYS > 1: # elaboration-time constant, not a circuit condition
481 for i in range(TLB_SET_SIZE):
482 # TLB PLRU interface
483 tlb_plru = PLRU(TLB_WAY_BITS)
484 tlb_plru_acc = Signal(TLB_WAY_BITS)
485 tlb_plru_acc_en = Signal()
486 tlb_plru_out = Signal(TLB_WAY_BITS)
487
488 comb += tlb_plru.acc.eq(tlb_plru_acc)
489 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
490 comb += tlb_plru.lru.eq(tlb_plru_out)
491
492 # PLRU interface
493 with m.If(r1.tlb_hit_index == i):
494 comb += tlb_plru.acc_en.eq(r1.tlb_hit)
495 with m.Else():
496 comb += tlb_plru.acc_en.eq(0)
497 comb += tlb_plru.acc.eq(r1.tlb_hit_way)
498
499 comb += tlb_plru_victim[i].eq(tlb_plru.lru)
500
def tlb_search(self, m, tlb_req_index, r0, tlb_valid_way, tlb_tag_way,
502 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
503
504 comb = m.d.comb
505 sync = m.d.sync
506
507 hitway = Signal(TLB_WAY_BITS)
508 hit = Signal()
eatag = Signal(TLB_EA_TAG_BITS)
510
511 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
512 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
513 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
514
515 for i in range(TLB_NUM_WAYS):
with m.If(tlb_valid_way[i]
& (read_tlb_tag(i, tlb_tag_way) == eatag)):
518 comb += hitway.eq(i)
519 comb += hit.eq(1)
520
521 comb += tlb_hit.eq(hit & r0_valid)
522 comb += tlb_hit_way.eq(hitway)
523
524 with m.If(tlb_hit):
525 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
526 with m.Else():
527 comb += pte.eq(0)
528 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
529 with m.If(r0.req.virt_mode):
530 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
531 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
532 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
533 comb += perm_attr.eq(extract_perm_attr(pte))
534 with m.Else():
535 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
537
538 comb += perm_attr.reference.eq(1)
539 comb += perm_attr.changed.eq(1)
540 comb += perm_attr.priv.eq(1)
541 comb += perm_attr.nocache.eq(0)
542 comb += perm_attr.rd_perm.eq(1)
543 comb += perm_attr.wr_perm.eq(1)
544
def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
dtlb_tags, tlb_pte_way, dtlb_ptes):
548
549 comb = m.d.comb
550 sync = m.d.sync
551
552 # variable tlbie : std_ulogic;
553 # variable tlbwe : std_ulogic;
554 # variable repl_way : tlb_way_t;
555 # variable eatag : tlb_tag_t;
556 # variable tagset : tlb_way_tags_t;
557 # variable pteset : tlb_way_ptes_t;
558 #type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t;
559 # --> Array([Signal(log(way_tags length)) for i in range(number of tlbs)])
560
561 tlbie = Signal()
562 tlbwe = Signal()
563 repl_way = Signal(TLB_WAY_BITS)
564 eatag = Signal(TLB_EA_TAG_BITS)
tagset = Signal(TLB_TAG_WAY_BITS)
pteset = Signal(TLB_PTE_WAY_BITS)
567
568 comb += tlbie.eq(r0_valid & r0.tlbie)
comb += tlbwe.eq(r0_valid & r0.tlbld)
570
571 with m.If(tlbie & r0.doall):
572 # clear all valid bits at once
573 for i in range(TLB_SET_SIZE):
574 sync += dtlb_valid_bits[i].eq(0)
575
576 with m.Elif(tlbie):
577 with m.If(tlb_hit):
578 sync += dtlb_valid_bits[tlb_req_index][tlb_hit_way].eq(0)
579 with m.Elif(tlbwe):
580 with m.If(tlb_hit):
581 comb += repl_way.eq(tlb_hit_way)
582 with m.Else():
583 comb += repl_way.eq(tlb_plru_victim[tlb_req_index])
584 comb += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
585 comb += tagset.eq(tlb_tag_way)
586 sync += write_tlb_tag(repl_way, tagset, eatag)
587 sync += dtlb_tags[tlb_req_index].eq(tagset)
588 comb += pteset.eq(tlb_pte_way)
589 sync += write_tlb_pte(repl_way, pteset, r0.req.data)
590 sync += dtlb_ptes[tlb_req_index].eq(pteset)
591 sync += dtlb_valid_bits[tlb_req_index][repl_way].eq(1)
592
def maybe_plrus(self, m, r1, plru_victim):
594 """Generate PLRUs
595 """
596 comb = m.d.comb
597 sync = m.d.sync
598
599 for i in range(NUM_LINES):
600 # PLRU interface
plru = PLRU(WAY_BITS)
setattr(m.submodules, "plru%d" % i, plru)
plru_acc = Signal(WAY_BITS)
plru_acc_en = Signal()
plru_out = Signal(WAY_BITS)
606
607 comb += plru.acc.eq(plru_acc)
608 comb += plru.acc_en.eq(plru_acc_en)
609 comb += plru.lru.eq(plru_out)
610
611 with m.If(r1.hit_index == i):
612 comb += plru_acc_en.eq(r1.cache_hit)
613
614 comb += plru_acc.eq(r1.hit_way)
615 comb += plru_victim[i].eq(plru_out)
616
617 def cache_tag_read(self, m, r0_stall, req_index, m_in, d_in,
618 cache_tag_set, cache_tags):
619 """Cache tag RAM read port
620 """
621 comb = m.d.comb
622 sync = m.d.sync
623
624 index = Signal(INDEX_BITS)
625
626 with m.If(r0_stall):
627 comb += index.eq(req_index)
628 with m.Elif(m_in.valid):
629 comb += index.eq(get_index(m_in.addr))
630 with m.Else():
631 comb += index.eq(get_index(d_in.addr))
632 sync += cache_tag_set.eq(cache_tags[index])
633
634 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
635 r0_valid, r1, cache_valid_bits, replace_way,
636 use_forward1_next, use_forward2_next,
637 req_hit_way, plru_victim, rc_ok, perm_attr,
valid_ra, perm_ok, access_ok, req_op, req_go,
639 r0_stall, m_in, early_req_row, d_in):
640 """Cache request parsing and hit detection
641 """
642
643 comb = m.d.comb
644 sync = m.d.sync
645
646 is_hit = Signal()
647 hit_way = Signal(WAY_BITS)
op = Signal(Op)
649 opsel = Signal(3)
650 go = Signal()
651 nc = Signal()
652 s_hit = Signal()
653 s_tag = Signal(TAG_BITS)
654 s_pte = Signal(TLB_PTE_BITS)
655 s_ra = Signal(REAL_ADDR_BITS)
656 hit_set = Signal(TLB_NUM_WAYS)
657 hit_way_set = HitWaySet()
658 rel_matches = Signal(TLB_NUM_WAYS)
659 rel_match = Signal()
660
661 # Extract line, row and tag from request
662 comb += req_index.eq(get_index(r0.req.addr))
663 comb += req_row.eq(get_row(r0.req.addr))
664 comb += req_tag.eq(get_tag(ra))
665
666 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
667
668 # Test if pending request is a hit on any way
669 # In order to make timing in virtual mode,
670 # when we are using the TLB, we compare each
671 # way with each of the real addresses from each way of
672 # the TLB, and then decide later which match to use.
673
674 with m.If(r0.req.virt_mode):
675 comb += rel_matches.eq(0)
676 for j in range(TLB_NUM_WAYS):
677 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
678 comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
679 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
680 comb += s_tag.eq(get_tag(s_ra))
681
682 for i in range(NUM_WAYS):
with m.If(go & cache_valid_bits[req_index][i] &
(read_tag(i, cache_tag_set) == s_tag)
& tlb_valid_way[j]):
686 comb += hit_way_set[j].eq(i)
687 comb += s_hit.eq(1)
688 comb += hit_set[j].eq(s_hit)
689 with m.If(s_tag == r1.reload_tag):
690 comb += rel_matches[j].eq(1)
691 with m.If(tlb_hit):
692 comb += is_hit.eq(hit_set[tlb_hit_way])
693 comb += hit_way.eq(hit_way_set[tlb_hit_way])
694 comb += rel_match.eq(rel_matches[tlb_hit_way])
695 with m.Else():
696 comb += s_tag.eq(get_tag(r0.req.addr))
697 for i in range(NUM_WAYS):
with m.If(go & cache_valid_bits[req_index][i] &
(read_tag(i, cache_tag_set) == s_tag)):
700 comb += hit_way.eq(i)
701 comb += is_hit.eq(1)
702 with m.If(s_tag == r1.reload_tag):
703 comb += rel_match.eq(1)
704 comb += req_same_tag.eq(rel_match)
705
706 # See if the request matches the line currently being reloaded
707 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
708 (req_index == r1.store_index) & rel_match):
709 # For a store, consider this a hit even if the row isn't
710 # valid since it will be by the time we perform the store.
711 # For a load, check the appropriate row valid bit.
712 valid = r1.rows_valid[req_row % ROW_PER_LINE]
713 comb += is_hit.eq(~r0.req.load | valid)
714 comb += hit_way.eq(replace_way)
715
716 # Whether to use forwarded data for a load or not
717 comb += use_forward1_next.eq(0)
718 with m.If((get_row(r1.req.real_addr) == req_row)
& (r1.req.hit_way == hit_way)):
720 # Only need to consider r1.write_bram here, since if we
721 # are writing refill data here, then we don't have a
722 # cache hit this cycle on the line being refilled.
723 # (There is the possibility that the load following the
724 # load miss that started the refill could be to the old
725 # contents of the victim line, since it is a couple of
726 # cycles after the refill starts before we see the updated
727 # cache tag. In that case we don't use the bypass.)
728 comb += use_forward1_next.eq(r1.write_bram)
729 comb += use_forward2_next.eq(0)
730 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
731 comb += use_forward2_next.eq(r1.forward_valid1)
732
733 # The way that matched on a hit
734 comb += req_hit_way.eq(hit_way)
735
736 # The way to replace on a miss
737 with m.If(r1.write_tag):
comb += replace_way.eq(plru_victim[r1.store_index])
739 with m.Else():
740 comb += replace_way.eq(r1.store_way)
741
742 # work out whether we have permission for this access
743 # NB we don't yet implement AMR, thus no KUAP
744 comb += rc_ok.eq(perm_attr.reference
745 & (r0.req.load | perm_attr.changed)
746 )
comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
& (perm_attr.wr_perm
| (r0.req.load & perm_attr.rd_perm))
)
751 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
752 # Combine the request and cache hit status to decide what
753 # operation needs to be done
754 comb += nc.eq(r0.req.nc | perm_attr.nocache)
755 comb += op.eq(Op.OP_NONE)
756 with m.If(go):
757 with m.If(~access_ok):
758 comb += op.eq(Op.OP_BAD)
759 with m.Elif(cancel_store):
760 comb += op.eq(Op.OP_STCX_FAIL)
761 with m.Else():
762 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
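# Cat() places is_hit at bit 0, nc at bit 1 and r0.req.load at bit 2,
# so the case values below read as (load, nc, is_hit): e.g. 0b101 is a
# cacheable load that hit and 0b100 is a cacheable load that missed.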
763 with m.Switch(opsel):
764 with m.Case(Const(0b101, 3)):
765 comb += op.eq(Op.OP_LOAD_HIT)
with m.Case(Const(0b100, 3)):
767 comb += op.eq(Op.OP_LOAD_MISS)
768 with m.Case(Const(0b110, 3)):
769 comb += op.eq(Op.OP_LOAD_NC)
770 with m.Case(Const(0b001, 3)):
771 comb += op.eq(Op.OP_STORE_HIT)
772 with m.Case(Const(0b000, 3)):
773 comb += op.eq(Op.OP_STORE_MISS)
774 with m.Case(Const(0b010, 3)):
775 comb += op.eq(Op.OP_STORE_MISS)
776 with m.Case(Const(0b011, 3)):
777 comb += op.eq(Op.OP_BAD)
778 with m.Case(Const(0b111, 3)):
779 comb += op.eq(Op.OP_BAD)
780 with m.Default():
781 comb += op.eq(Op.OP_NONE)
782 comb += req_op.eq(op)
783 comb += req_go.eq(go)
784
785 # Version of the row number that is valid one cycle earlier
786 # in the cases where we need to read the cache data BRAM.
787 # If we're stalling then we need to keep reading the last
788 # row requested.
789 with m.If(~r0_stall):
790 with m.If(m_in.valid):
791 comb += early_req_row.eq(get_row(m_in.addr))
792 with m.Else():
793 comb += early_req_row.eq(get_row(d_in.addr))
794 with m.Else():
795 comb += early_req_row.eq(req_row)
796
797 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
798 r0_valid, r0, reservation):
799 """Handle load-with-reservation and store-conditional instructions
800 """
801 comb = m.d.comb
802 sync = m.d.sync
803
804 with m.If(r0_valid & r0.req.reserve):
805
806 # XXX generate alignment interrupt if address
807 # is not aligned XXX or if r0.req.nc = '1'
808 with m.If(r0.req.load):
comb += set_rsrv.eq(1) # load with reservation
810 with m.Else():
811 comb += clear_rsrv.eq(1) # store conditional
with m.If(~reservation.valid |
(r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
813 comb += cancel_store.eq(1)
814
def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
reservation, r0):
817
818 comb = m.d.comb
819 sync = m.d.sync
820
821 with m.If(r0_valid & access_ok):
822 with m.If(clear_rsrv):
823 sync += reservation.valid.eq(0)
824 with m.Elif(set_rsrv):
825 sync += reservation.valid.eq(1)
826 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
827
828 def writeback_control(self, m, r1, cache_out, d_out, m_out):
829 """Return data for loads & completion control logic
830 """
831 comb = m.d.comb
832 sync = m.d.sync
833
834 data_out = Signal(64)
835 data_fwd = Signal(64)
836 j = Signal()
837
838 # Use the bypass if are reading the row that was
839 # written 1 or 2 cycles ago, including for the
840 # slow_valid = 1 case (i.e. completing a load
841 # miss or a non-cacheable load).
842 with m.If(r1.use_forward1):
843 comb += data_fwd.eq(r1.forward_data1)
844 with m.Else():
845 comb += data_fwd.eq(r1.forward_data2)
846
847 comb += data_out.eq(cache_out[r1.hit_way])
848
849 for i in range(8):
850 with m.If(r1.forward_sel[i]):
851 dsel = data_fwd.word_select(i, 8)
852 comb += data_out.word_select(i, 8).eq(dsel)
853
854 comb += d_out.valid.eq(r1.ls_valid)
855 comb += d_out.data.eq(data_out)
856 comb += d_out.store_done.eq(~r1.stcx_fail)
857 comb += d_out.error.eq(r1.ls_error)
858 comb += d_out.cache_paradox.eq(r1.cache_paradox)
859
860 # Outputs to MMU
861 comb += m_out.done.eq(r1.mmu_done)
862 comb += m_out.err.eq(r1.mmu_error)
863 comb += m_out.data.eq(data_out)
864
865 # We have a valid load or store hit or we just completed
866 # a slow op such as a load miss, a NC load or a store
867 #
868 # Note: the load hit is delayed by one cycle. However it
869 # can still not collide with r.slow_valid (well unless I
870 # miscalculated) because slow_valid can only be set on a
871 # subsequent request and not on its first cycle (the state
872 # machine must have advanced), which makes slow_valid
873 # at least 2 cycles from the previous hit_load_valid.
874
875 # Sanity: Only one of these must be set in any given cycle
876
877 if False: # TODO: need Display to get this to work
assert (r1.slow_valid & r1.stcx_fail) != 1, "unexpected " \
"slow_valid collision with stcx_fail -!- severity FAILURE"

assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
"unexpected hit_load_delayed collision with slow_valid -!- " \
"severity FAILURE"
884
with m.If(~r1.mmu_req):
886 # Request came from loadstore1...
887 # Load hit case is the standard path
888 with m.If(r1.hit_load_valid):
889 #Display(f"completing load hit data={data_out}")
890 pass
891
892 # error cases complete without stalling
893 with m.If(r1.ls_error):
894 # Display("completing ld/st with error")
895 pass
896
897 # Slow ops (load miss, NC, stores)
898 with m.If(r1.slow_valid):
899 #Display(f"completing store or load miss data={data_out}")
900 pass
901
902 with m.Else():
903 # Request came from MMU
904 with m.If(r1.hit_load_valid):
905 # Display(f"completing load hit to MMU, data={m_out.data}")
906 pass
907 # error cases complete without stalling
908 with m.If(r1.mmu_error):
909 #Display("combpleting MMU ld with error")
910 pass
911
912 # Slow ops (i.e. load miss)
913 with m.If(r1.slow_valid):
914 #Display("completing MMU load miss, data={m_out.data}")
915 pass
916
917 def rams(self, m):
918 """rams
919 Generate a cache RAM for each way. This handles the normal
920 reads, writes from reloads and the special store-hit update
921 path as well.
922
923 Note: the BRAMs have an extra read buffer, meaning the output
924 is pipelined an extra cycle. This differs from the
925 icache. The writeback logic needs to take that into
926 account by using 1-cycle delayed signals for load hits.
927 """
928 comb = m.d.comb
929
930 for i in range(NUM_WAYS):
931 do_read = Signal()
932 rd_addr = Signal(ROW_BITS)
933 do_write = Signal()
934 wr_addr = Signal(ROW_BITS)
935 wr_data = Signal(WB_DATA_BITS)
936 wr_sel = Signal(ROW_SIZE)
937 wr_sel_m = Signal(ROW_SIZE)
938 _d_out = Signal(WB_DATA_BITS)
939
940 way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
941 setattr(m.submodules, "cacheram_%d" % i, way)
942
943 comb += way.rd_en.eq(do_read)
944 comb += way.rd_addr.eq(rd_addr)
945 comb += _d_out.eq(way.rd_data)
946 comb += way.wr_sel.eq(wr_sel_m)
947 comb += way.wr_addr.eq(wr_addr)
948 comb += way.wr_data.eq(wr_data)
949
950 # Cache hit reads
951 comb += do_read.eq(1)
952 comb += rd_addr.eq(early_req_row)
953 comb += cache_out[i].eq(_d_out)
954
955 # Write mux:
956 #
957 # Defaults to wishbone read responses (cache refill)
958 #
959 # For timing, the mux on wr_data/sel/addr is not
960 # dependent on anything other than the current state.
961
962 with m.If(r1.write_bram):
963 # Write store data to BRAM. This happens one
964 # cycle after the store is in r0.
965 comb += wr_data.eq(r1.req.data)
966 comb += wr_sel.eq(r1.req.byte_sel)
967 comb += wr_addr.eq(get_row(r1.req.real_addr))
968
969 with m.If(i == r1.req.hit_way):
970 comb += do_write.eq(1)
971 with m.Else():
972 # Otherwise, we might be doing a reload or a DCBZ
973 with m.If(r1.dcbz):
974 comb += wr_data.eq(0)
975 with m.Else():
976 comb += wr_data.eq(wishbone_in.dat)
977 comb += wr_addr.eq(r1.store_row)
978 comb += wr_sel.eq(~0) # all 1s
979
980 with m.If((r1.state == State.RELOAD_WAIT_ACK)
& wishbone_in.ack & (replace_way == i)):
982 comb += do_write.eq(1)
983
984 # Mask write selects with do_write since BRAM
985 # doesn't have a global write-enable
986 with m.If(do_write):
987 comb += wr_sel_m.eq(wr_sel)
988
989 # Cache hit synchronous machine for the easy case.
990 # This handles load hits.
991 # It also handles error cases (TLB miss, cache paradox)
992 def dcache_fast_hit(self, m, req_op, r0_valid, r1, ):
993
994 comb = m.d.comb
995 sync = m.d.sync
996
997 with m.If(req_op != Op.OP_NONE):
998 #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
999 # f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
1000 # )
1001 pass
1002
1003 with m.If(r0_valid):
1004 sync += r1.mmu_req.eq(r0.mmu_req)
1005
1006 # Fast path for load/store hits.
1007 # Set signals for the writeback controls.
1008 sync += r1.hit_way.eq(req_hit_way)
1009 sync += r1.hit_index.eq(req_index)
1010
1011 with m.If(req_op == Op.OP_LOAD_HIT):
1012 sync += r1.hit_load_valid.eq(1)
1013 with m.Else():
1014 sync += r1.hit_load_valid.eq(0)
1015
1016 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1017 sync += r1.cache_hit.eq(1)
1018 with m.Else():
1019 sync += r1.cache_hit.eq(0)
1020
1021 with m.If(req_op == Op.OP_BAD):
1022 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1023 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1024 sync += r1.ls_error.eq(~r0.mmu_req)
1025 sync += r1.mmu_error.eq(r0.mmu_req)
1026 sync += r1.cache_paradox.eq(access_ok)
1027
1028 with m.Else():
1029 sync += r1.ls_error.eq(0)
1030 sync += r1.mmu_error.eq(0)
1031 sync += r1.cache_paradox.eq(0)
1032
1033 with m.If(req_op == Op.OP_STCX_FAIL):
sync += r1.stcx_fail.eq(1)
1035 with m.Else():
1036 sync += r1.stcx_fail.eq(0)
1037
1038 # Record TLB hit information for updating TLB PLRU
1039 sync += r1.tlb_hit.eq(tlb_hit)
1040 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1041 sync += r1.tlb_hit_index.eq(tlb_req_index)
1042
1043 # Memory accesses are handled by this state machine:
1044 #
1045 # * Cache load miss/reload (in conjunction with "rams")
1046 # * Load hits for non-cachable forms
1047 # * Stores (the collision case is handled in "rams")
1048 #
1049 # All wishbone requests generation is done here.
1050 # This machine operates at stage 1.
1051 def dcache_slow(self, m, r1, use_forward1_next, cache_valid_bits, r0,
1052 r0_valid, req_op, cache_tag, req_go, ra, wb_in):
1053
1054 comb = m.d.comb
1055 sync = m.d.sync
1056
req = MemAccessRequest()
acks = Signal(3)
adjust_acks = Signal(3)
stbs_done = Signal()
1060
1061 sync += r1.use_forward1.eq(use_forward1_next)
1062 sync += r1.forward_sel.eq(0)
1063
1064 with m.If(use_forward1_next):
1065 sync += r1.forward_sel.eq(r1.req.byte_sel)
1066 with m.Elif(use_forward2_next):
1067 sync += r1.forward_sel.eq(r1.forward_sel1)
1068
1069 sync += r1.forward_data2.eq(r1.forward_data1)
1070 with m.If(r1.write_bram):
1071 sync += r1.forward_data1.eq(r1.req.data)
1072 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1073 sync += r1.forward_way1.eq(r1.req.hit_way)
1074 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1075 sync += r1.forward_valid1.eq(1)
1076 with m.Else():
with m.If(r1.dcbz):
1078 sync += r1.forward_data1.eq(0)
1079 with m.Else():
1080 sync += r1.forward_data1.eq(wb_in.dat)
1081 sync += r1.forward_sel1.eq(~0) # all 1s
1082 sync += r1.forward_way1.eq(replace_way)
1083 sync += r1.forward_row1.eq(r1.store_row)
1084 sync += r1.forward_valid1.eq(0)
1085
1086 # One cycle pulses reset
1087 sync += r1.slow_valid.eq(0)
1088 sync += r1.write_bram.eq(0)
1089 sync += r1.inc_acks.eq(0)
1090 sync += r1.dec_acks.eq(0)
1091
1092 sync += r1.ls_valid.eq(0)
1093 # complete tlbies and TLB loads in the third cycle
1094 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1095
1096 with m.If((req_op == Op.OP_LOAD_HIT)
1097 | (req_op == Op.OP_STCX_FAIL)):
1098 with m.If(~r0.mmu_req):
1099 sync += r1.ls_valid.eq(1)
1100 with m.Else():
1101 sync += r1.mmu_done.eq(1)
1102
1103 with m.If(r1.write_tag):
1104 # Store new tag in selected way
1105 for i in range(NUM_WAYS):
1106 with m.If(i == replace_way):
1107 idx = r1.store_index
1108 trange = range(i * TAG_WIDTH, (i+1) * TAG_WIDTH)
1109 sync += cache_tag[idx][trange].eq(r1.reload_tag)
1110 sync += r1.store_way.eq(replace_way)
1111 sync += r1.write_tag.eq(0)
1112
1113 # Take request from r1.req if there is one there,
1114 # else from req_op, ra, etc.
with m.If(r1.full):
1116 comb += req.eq(r1.req)
1117 with m.Else():
1118 comb += req.op.eq(req_op)
1119 comb += req.valid.eq(req_go)
1120 comb += req.mmu_req.eq(r0.mmu_req)
1121 comb += req.dcbz.eq(r0.req.dcbz)
1122 comb += req.real_addr.eq(ra)
1123
1124 with m.If(~r0.req.dcbz):
1125 comb += req.data.eq(r0.req.data)
1126 with m.Else():
1127 comb += req.data.eq(0)
1128
1129 # Select all bytes for dcbz
1130 # and for cacheable loads
with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1132 comb += req.byte_sel.eq(~0) # all 1s
1133 with m.Else():
1134 comb += req.byte_sel.eq(r0.req.byte_sel)
1135 comb += req.hit_way.eq(req_hit_way)
1136 comb += req.same_tag.eq(req_same_tag)
1137
1138 # Store the incoming request from r0,
1139 # if it is a slow request
1140 # Note that r1.full = 1 implies req_op = OP_NONE
1141 with m.If((req_op == Op.OP_LOAD_MISS)
1142 | (req_op == Op.OP_LOAD_NC)
1143 | (req_op == Op.OP_STORE_MISS)
1144 | (req_op == Op.OP_STORE_HIT)):
sync += r1.req.eq(req)
1146 sync += r1.full.eq(1)
1147
1148 # Main state machine
1149 with m.Switch(r1.state):
1150
with m.Case(State.IDLE):
# r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0) in the
# VHDL: truncate the real address to the wishbone address width
sync += r1.wb.adr.eq(req.real_addr[0:len(r1.wb.adr)])
1157 sync += r1.wb.sel.eq(req.byte_sel)
1158 sync += r1.wb.dat.eq(req.data)
1159 sync += r1.dcbz.eq(req.dcbz)
1160
1161 # Keep track of our index and way
1162 # for subsequent stores.
1163 sync += r1.store_index.eq(get_index(req.real_addr))
1164 sync += r1.store_row.eq(get_row(req.real_addr))
1165 sync += r1.end_row_ix.eq(
1166 get_row_of_line(get_row(req.real_addr))
1167 )
1168 sync += r1.reload_tag.eq(get_tag(req.real_addr))
1169 sync += r1.req.same_tag.eq(1)
1170
1171 with m.If(req.op == Op.OP_STORE_HIT):
1172 sync += r1.store_way.eq(req.hit_way)
1173
1174 # Reset per-row valid bits,
1175 # ready for handling OP_LOAD_MISS
1176 for i in range(ROW_PER_LINE):
1177 sync += r1.rows_valid[i].eq(0)
1178
1179 with m.Switch(req.op):
1180 with m.Case(Op.OP_LOAD_HIT):
1181 # stay in IDLE state
1182 pass
1183
1184 with m.Case(Op.OP_LOAD_MISS):
1185 #Display(f"cache miss real addr:" \
1186 # f"{req_real_addr}" \
1187 # f" idx:{get_index(req_real_addr)}" \
1188 # f" tag:{get_tag(req.real_addr)}")
1189 pass
1190
1191 # Start the wishbone cycle
1192 sync += r1.wb.we.eq(0)
1193 sync += r1.wb.cyc.eq(1)
1194 sync += r1.wb.stb.eq(1)
1195
1196 # Track that we had one request sent
1197 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1198 sync += r1.write_tag.eq(1)
1199
1200 with m.Case(Op.OP_LOAD_NC):
1201 sync += r1.wb.cyc.eq(1)
1202 sync += r1.wb.stb.eq(1)
1203 sync += r1.wb.we.eq(0)
1204 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1205
1206 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
with m.If(~req.dcbz):
1208 sync += r1.state.eq(State.STORE_WAIT_ACK)
1209 sync += r1.acks_pending.eq(1)
1210 sync += r1.full.eq(0)
1211 sync += r1.slow_valid.eq(1)
1212
1213 with m.If(~req.mmu_req):
1214 sync += r1.ls_valid.eq(1)
1215 with m.Else():
1216 sync += r1.mmu_done.eq(1)
1217
1218 with m.If(req.op == Op.OP_STORE_HIT):
1219 sync += r1.write_bram.eq(1)
1220 with m.Else():
sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1222
1223 with m.If(req.op == Op.OP_STORE_MISS):
1224 sync += r1.write_tag.eq(1)
1225
1226 sync += r1.wb.we.eq(1)
1227 sync += r1.wb.cyc.eq(1)
1228 sync += r1.wb.stb.eq(1)
1229
1230 # OP_NONE and OP_BAD do nothing
1231 # OP_BAD & OP_STCX_FAIL were
1232 # handled above already
1233 with m.Case(Op.OP_NONE):
1234 pass
with m.Case(Op.OP_BAD):
pass
with m.Case(Op.OP_STCX_FAIL):
pass
1239
1240 with m.Case(State.RELOAD_WAIT_ACK):
1241 # Requests are all sent if stb is 0
1242 comb += stbs_done.eq(~r1.wb.stb)
1243
1244 with m.If(~wb_in.stall & ~stbs_done):
1245 # That was the last word?
1246 # We are done sending.
1247 # Clear stb and set stbs_done
1248 # so we can handle an eventual
1249 # last ack on the same cycle.
1250 with m.If(is_last_row_addr(
1251 r1.wb.adr, r1.end_row_ix)):
1252 sync += r1.wb.stb.eq(0)
1253 comb += stbs_done.eq(0)
1254
1255 # Calculate the next row address
1256 sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))
1257
1258 # Incoming acks processing
1259 sync += r1.forward_valid1.eq(wb_in.ack)
1260 with m.If(wb_in.ack):
1261 # XXX needs an Array bit-accessor here
1262 sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
1263
1264 # If this is the data we were looking for,
1265 # we can complete the request next cycle.
1266 # Compare the whole address in case the
1267 # request in r1.req is not the one that
1268 # started this refill.
1269 with m.If(r1.full & r1.req.same_tag &
1270 ((r1.dcbz & r1.req.dcbz) |
1271 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1272 (r1.store_row == get_row(r1.req.real_addr))):
1273 sync += r1.full.eq(0)
1274 sync += r1.slow_valid.eq(1)
1275 with m.If(~r1.mmu_req):
1276 sync += r1.ls_valid.eq(1)
1277 with m.Else():
1278 sync += r1.mmu_done.eq(1)
1279 sync += r1.forward_sel.eq(~0) # all 1s
1280 sync += r1.use_forward1.eq(1)
1281
1282 # Check for completion
1283 with m.If(stbs_done & is_last_row(r1.store_row,
1284 r1.end_row_ix)):
1285 # Complete wishbone cycle
1286 sync += r1.wb.cyc.eq(0)
1287
1288 # Cache line is now valid
1289 cv = cache_valid_bits[r1.store_index]
1290 sync += cv[r1.store_way].eq(1)
1291 sync += r1.state.eq(State.IDLE)
1292
1293 # Increment store row counter
1294 sync += r1.store_row.eq(next_row(r1.store_row))
1295
1296 with m.Case(State.STORE_WAIT_ACK):
1297 comb += stbs_done.eq(~r1.wb.stb)
1298 comb += acks.eq(r1.acks_pending)
1299
1300 with m.If(r1.inc_acks != r1.dec_acks):
1301 with m.If(r1.inc_acks):
1302 comb += adjust_acks.eq(acks + 1)
1303 with m.Else():
1304 comb += adjust_acks.eq(acks - 1)
1305 with m.Else():
1306 comb += adjust_acks.eq(acks)
1307
1308 sync += r1.acks_pending.eq(adjust_acks)
1309
1310 # Clear stb when slave accepted request
1311 with m.If(~wb_in.stall):
1312 # See if there is another store waiting
1313 # to be done which is in the same real page.
1314 with m.If(req.valid):
1315 ra = req.real_addr[0:SET_SIZE_BITS]
1316 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
1317 sync += r1.wb.dat.eq(req.data)
1318 sync += r1.wb.sel.eq(req.byte_sel)
1319
with m.Elif((adjust_acks < 7) & req.same_tag &
((req.op == Op.OP_STORE_MISS)
| (req.op == Op.OP_STORE_HIT))):
1323 sync += r1.wb.stb.eq(1)
1324 comb += stbs_done.eq(0)
1325
1326 with m.If(req.op == Op.OP_STORE_HIT):
1327 sync += r1.write_bram.eq(1)
1328 sync += r1.full.eq(0)
1329 sync += r1.slow_valid.eq(1)
1330
1331 # Store requests never come from the MMU
1332 sync += r1.ls_valid.eq(1)
1333 comb += stbs_done.eq(0)
1334 sync += r1.inc_acks.eq(1)
1335 with m.Else():
1336 sync += r1.wb.stb.eq(0)
1337 comb += stbs_done.eq(1)
1338
1339 # Got ack ? See if complete.
1340 with m.If(wb_in.ack):
with m.If(stbs_done & (adjust_acks == 1)):
1342 sync += r1.state.eq(State.IDLE)
1343 sync += r1.wb.cyc.eq(0)
1344 sync += r1.wb.stb.eq(0)
1345 sync += r1.dec_acks.eq(1)
1346
1347 with m.Case(State.NC_LOAD_WAIT_ACK):
1348 # Clear stb when slave accepted request
1349 with m.If(~wb_in.stall):
1350 sync += r1.wb.stb.eq(0)
1351
1352 # Got ack ? complete.
1353 with m.If(wb_in.ack):
1354 sync += r1.state.eq(State.IDLE)
1355 sync += r1.full.eq(0)
1356 sync += r1.slow_valid.eq(1)
1357
1358 with m.If(~r1.mmu_req):
1359 sync += r1.ls_valid.eq(1)
1360 with m.Else():
1361 sync += r1.mmu_done.eq(1)
1362
1363 sync += r1.forward_sel.eq(~0) # all 1s
1364 sync += r1.use_forward1.eq(1)
1365 sync += r1.wb.cyc.eq(0)
1366 sync += r1.wb.stb.eq(0)
1367
def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out,
req_op, d_out, wb_in, log_out):
1370
1371 sync = m.d.sync
1372
1373 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1374 stall_out, req_op[:3], d_out.valid, d_out.error,
1375 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1376 r1.wb.adr[3:6]))
1377
def elaborate(self, platform):
m = Module()
comb = m.d.comb
sync = m.d.sync

1380 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1381 cache_tags = CacheTagArray()
1382 cache_tag_set = Signal(TAG_RAM_WIDTH)
1383 cache_valid_bits = CacheValidBitsArray()
1384
1385 # TODO attribute ram_style : string;
1386 # TODO attribute ram_style of cache_tags : signal is "distributed";
1387
1388 """note: these are passed to nmigen.hdl.Memory as "attributes".
1389 don't know how, just that they are.
1390 """
1391 dtlb_valid_bits = TLBValidBitsArray()
1392 dtlb_tags = TLBTagsArray()
1393 dtlb_ptes = TLBPtesArray()
1394 # TODO attribute ram_style of
1395 # dtlb_tags : signal is "distributed";
1396 # TODO attribute ram_style of
1397 # dtlb_ptes : signal is "distributed";
1398
1399 r0 = RegStage0()
1400 r0_full = Signal()
1401
1402 r1 = RegStage1()
1403
1404 reservation = Reservation()
1405
1406 # Async signals on incoming request
req_index = Signal(INDEX_BITS)
req_row = Signal(ROW_BITS)
req_hit_way = Signal(WAY_BITS)
req_tag = Signal(TAG_BITS)
req_op = Signal(Op)
1412 req_data = Signal(64)
1413 req_same_tag = Signal()
1414 req_go = Signal()
1415
early_req_row = Signal(ROW_BITS)
1417
1418 cancel_store = Signal()
1419 set_rsrv = Signal()
1420 clear_rsrv = Signal()
1421
1422 r0_valid = Signal()
1423 r0_stall = Signal()
1424
1425 use_forward1_next = Signal()
1426 use_forward2_next = Signal()
1427
1428 cache_out = CacheRamOut()
1429
1430 plru_victim = PLRUOut()
1431 replace_way = Signal(WAY_BITS)
1432
1433 # Wishbone read/write/cache write formatting signals
1434 bus_sel = Signal(8)
1435
1436 # TLB signals
tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
tlb_valid_way = Signal(TLB_NUM_WAYS)
tlb_req_index = Signal(TLB_SET_BITS)
tlb_hit = Signal()
tlb_hit_way = Signal(TLB_WAY_BITS)
1443 pte = Signal(TLB_PTE_BITS)
1444 ra = Signal(REAL_ADDR_BITS)
1445 valid_ra = Signal()
1446 perm_attr = PermAttr()
1447 rc_ok = Signal()
1448 perm_ok = Signal()
1449 access_ok = Signal()
1450
1451 tlb_plru_victim = TLBPLRUOut()
1452
1453 # we don't yet handle collisions between loadstore1 requests
1454 # and MMU requests
comb += self.m_out.stall.eq(0)
1456
1457 # Hold off the request in r0 when r1 has an uncompleted request
comb += r0_stall.eq(r0_full & r1.full)
comb += r0_valid.eq(r0_full & ~r1.full)
comb += self.stall_out.eq(r0_stall)
1461
1462 # Wire up wishbone request latch out of stage 1
comb += self.wb_out.eq(r1.wb)

return m
1464
1465
1466
1467 # dcache_tb.vhdl
1468 #
1469 # entity dcache_tb is
1470 # end dcache_tb;
1471 #
1472 # architecture behave of dcache_tb is
1473 # signal clk : std_ulogic;
1474 # signal rst : std_ulogic;
1475 #
1476 # signal d_in : Loadstore1ToDcacheType;
1477 # signal d_out : DcacheToLoadstore1Type;
1478 #
1479 # signal m_in : MmuToDcacheType;
1480 # signal m_out : DcacheToMmuType;
1481 #
1482 # signal wb_bram_in : wishbone_master_out;
1483 # signal wb_bram_out : wishbone_slave_out;
1484 #
1485 # constant clk_period : time := 10 ns;
1486 # begin
1487 # dcache0: entity work.dcache
1488 # generic map(
1489 #
1490 # LINE_SIZE => 64,
1491 # NUM_LINES => 4
1492 # )
1493 # port map(
1494 # clk => clk,
1495 # rst => rst,
1496 # d_in => d_in,
1497 # d_out => d_out,
1498 # m_in => m_in,
1499 # m_out => m_out,
1500 # wishbone_out => wb_bram_in,
1501 # wishbone_in => wb_bram_out
1502 # );
1503 #
1504 # -- BRAM Memory slave
1505 # bram0: entity work.wishbone_bram_wrapper
1506 # generic map(
1507 # MEMORY_SIZE => 1024,
1508 # RAM_INIT_FILE => "icache_test.bin"
1509 # )
1510 # port map(
1511 # clk => clk,
1512 # rst => rst,
1513 # wishbone_in => wb_bram_in,
1514 # wishbone_out => wb_bram_out
1515 # );
1516 #
1517 # clk_process: process
1518 # begin
1519 # clk <= '0';
1520 # wait for clk_period/2;
1521 # clk <= '1';
1522 # wait for clk_period/2;
1523 # end process;
1524 #
1525 # rst_process: process
1526 # begin
1527 # rst <= '1';
1528 # wait for 2*clk_period;
1529 # rst <= '0';
1530 # wait;
1531 # end process;
1532 #
1533 # stim: process
1534 # begin
1535 # -- Clear stuff
1536 # d_in.valid <= '0';
1537 # d_in.load <= '0';
1538 # d_in.nc <= '0';
1539 # d_in.addr <= (others => '0');
1540 # d_in.data <= (others => '0');
1541 # m_in.valid <= '0';
1542 # m_in.addr <= (others => '0');
1543 # m_in.pte <= (others => '0');
1544 #
1545 # wait for 4*clk_period;
1546 # wait until rising_edge(clk);
1547 #
1548 # -- Cacheable read of address 4
1549 # d_in.load <= '1';
1550 # d_in.nc <= '0';
1551 # d_in.addr <= x"0000000000000004";
1552 # d_in.valid <= '1';
1553 # wait until rising_edge(clk);
1554 # d_in.valid <= '0';
1555 #
1556 # wait until rising_edge(clk) and d_out.valid = '1';
1557 # assert d_out.data = x"0000000100000000"
1558 # report "data @" & to_hstring(d_in.addr) &
1559 # "=" & to_hstring(d_out.data) &
1560 # " expected 0000000100000000"
1561 # severity failure;
1562 # -- wait for clk_period;
1563 #
1564 # -- Cacheable read of address 30
1565 # d_in.load <= '1';
1566 # d_in.nc <= '0';
1567 # d_in.addr <= x"0000000000000030";
1568 # d_in.valid <= '1';
1569 # wait until rising_edge(clk);
1570 # d_in.valid <= '0';
1571 #
1572 # wait until rising_edge(clk) and d_out.valid = '1';
1573 # assert d_out.data = x"0000000D0000000C"
1574 # report "data @" & to_hstring(d_in.addr) &
1575 # "=" & to_hstring(d_out.data) &
1576 # " expected 0000000D0000000C"
1577 # severity failure;
1578 #
1579 # -- Non-cacheable read of address 100
1580 # d_in.load <= '1';
1581 # d_in.nc <= '1';
1582 # d_in.addr <= x"0000000000000100";
1583 # d_in.valid <= '1';
1584 # wait until rising_edge(clk);
1585 # d_in.valid <= '0';
1586 # wait until rising_edge(clk) and d_out.valid = '1';
1587 # assert d_out.data = x"0000004100000040"
1588 # report "data @" & to_hstring(d_in.addr) &
1589 # "=" & to_hstring(d_out.data) &
1590 # " expected 0000004100000040"
1591 # severity failure;
1592 #
1593 # wait until rising_edge(clk);
1594 # wait until rising_edge(clk);
1595 # wait until rising_edge(clk);
1596 # wait until rising_edge(clk);
1597 #
1598 # std.env.finish;
1599 # end process;
1600 # end;
1601 def dcache_sim(dut):
1602 # clear stuff
1603 yield dut.d_in.valid.eq(0)
1604 yield dut.d_in.load.eq(0)
1605 yield dut.d_in.nc.eq(0)
yield dut.d_in.addr.eq(0)
1607 yield dut.d_in.data.eq(0)
1608 yield dut.m_in.valid.eq(0)
1609 yield dut.m_in.addr.eq(0)
1610 yield dut.m_in.pte.eq(0)
1611 # wait 4 * clk_period
1612 yield
1613 yield
1614 yield
1615 yield
1616 # wait_until rising_edge(clk)
1617 yield
1618 # Cacheable read of address 4
1619 yield dut.d_in.load.eq(1)
1620 yield dut.d_in.nc.eq(0)
1621 yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
1622 yield dut.d_in.valid.eq(1)
1623 # wait-until rising_edge(clk)
1624 yield
1625 yield dut.d_in.valid.eq(0)
1626 yield
1627 while not (yield dut.d_out.valid):
1628 yield
data = yield dut.d_out.data
assert data == 0x0000000100000000, \
f"data {data:#x} expected 0x0000000100000000 -!- severity failure"
1632
1633
1634 # Cacheable read of address 30
1635 yield dut.d_in.load.eq(1)
1636 yield dut.d_in.nc.eq(0)
1637 yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
1638 yield dut.d_in.valid.eq(1)
1639 yield
1640 yield dut.d_in.valid.eq(0)
1641 yield
1642 while not (yield dut.d_out.valid):
1643 yield
data = yield dut.d_out.data
assert data == 0x0000000D0000000C, \
f"data {data:#x} expected 0x0000000D0000000C -!- severity failure"
1647
1648 # Non-cacheable read of address 100
1649 yield dut.d_in.load.eq(1)
1650 yield dut.d_in.nc.eq(1)
1651 yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
1652 yield dut.d_in.valid.eq(1)
1653 yield
1654 yield dut.d_in.valid.eq(0)
1655 yield
1656 while not (yield dut.d_out.valid):
1657 yield
data = yield dut.d_out.data
assert data == 0x0000004100000040, \
f"data {data:#x} expected 0x0000004100000040 -!- severity failure"
1661
1662 yield
1663 yield
1664 yield
1665 yield
1666
1667
1668 def test_dcache():
1669 dut = DCache()
1670 vl = rtlil.convert(dut, ports=[])
1671 with open("test_dcache.il", "w") as f:
1672 f.write(vl)
1673
run_simulation(dut, dcache_sim(dut), vcd_name='test_dcache.vcd')
1675
1676 if __name__ == '__main__':
1677 test_dcache()
1678