"""DCache

based on Anton Blanchard microwatt dcache.vhdl

"""

from enum import Enum, unique

from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
from nmigen.cli import main, rtlil
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from nmigen.compat.sim import run_simulation

from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU

# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 32    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of sets
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
# wishbone_data_bits at a time so to save
# resources we make the array only that wide, and
# use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE


# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |---|    | ROW_LINE_BITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (45)

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

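# With the default geometry above, the derived constants work out as:
#
#   ROW_SIZE      = 8    bytes per wishbone transfer
#   ROW_PER_LINE  = 8    BRAM_ROWS     = 256
#   ROW_BITS      = 8    ROW_LINE_BITS = 3
#   LINE_OFF_BITS = 6    ROW_OFF_BITS  = 3
#   INDEX_BITS    = 5    SET_SIZE_BITS = 11
#   TAG_BITS      = 45   TAG_WIDTH     = 48 (tag padded to a byte multiple)
#   WAY_BITS      = 2    TAG_RAM_WIDTH = 192
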
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, "LINE_SIZE not power of 2"
assert (NUM_LINES & (NUM_LINES - 1)) == 0, "NUM_LINES not power of 2"
assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBValidBitsArray():
    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))

def TLBTagsArray():
    return Array(Signal(TLB_TAG_WAY_BITS) for x in range(TLB_SET_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS) for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[0:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the address of the next row in the current cache line
def next_row_addr(addr):
    # Add 1 to the row index only, so that the generated adder is
    # limited to the bits within a cache line (3 bits with default
    # settings)
    row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
    return Cat(addr[:ROW_OFF_BITS], row_idx[:ROW_LINE_BITS],
               addr[LINE_OFF_BITS:])

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset[way * TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.bit_select(way * TLB_EA_TAG_BITS, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.bit_select(way * TLB_PTE_BITS, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)

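# Illustrative sketch only (not used by the dcache): the Signal-slicing
# helpers above perform, in hardware, the same decomposition as this
# plain-integer version, which can help when checking the geometry
# asserts by hand.
def _split_addr_example(addr):
    """Pure-int mirror of get_index/get_row/get_tag."""
    index = (addr >> LINE_OFF_BITS) & (NUM_LINES - 1) # cache line (tag index)
    row = (addr >> ROW_OFF_BITS) & (BRAM_ROWS - 1)    # BRAM data row
    tag = addr >> SET_SIZE_BITS                       # tag to compare/store
    return index, row, tag
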

# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self):
        super().__init__()
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    pa.reference = pte[8]
    pa.changed = pte[7]
    pa.nocache = pte[5]
    pa.priv = pte[3]
    pa.rd_perm = pte[2]
    pa.wr_perm = pte[1]
    return pa

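# Note: the PTE bit positions used in extract_perm_attr (R=8, C=7,
# ATT0=5 for no-cache, EAA priv/read/write = 3/2/1) follow the radix
# PTE layout as used by microwatt's dcache.vhdl, from which this file
# is converted.
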
# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cachable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


280 #
281 # In order to make timing, we use the BRAMs with
282 # an output buffer, which means that the BRAM
283 # output is delayed by an extra cycle.
284 #
285 # Thus, the dcache has a 2-stage internal pipeline
286 # for cache hits with no stalls.
287 #
288 # All other operations are handled via stalling
289 # in the first stage.
290 #
291 # The second stage can thus complete a hit at the same
292 # time as the first stage emits a stall for a complex op.
293 #
294 # Stage 0 register, basically contains just the latched request
295
class RegStage0(RecordObject):
    def __init__(self):
        super().__init__()
        self.req = LoadStore1ToDCacheType()
        self.tlbie = Signal()
        self.doall = Signal()
        self.tlbld = Signal()
        self.mmu_req = Signal() # indicates source of request


class MemAccessRequest(RecordObject):
    def __init__(self):
        super().__init__()
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest()

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = Signal()
        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut()
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)


class DCache(Elaboratable):
    """Set associative dcache write-through
    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType()
        self.d_out = DCacheToLoadStore1Type()

        self.m_in = MMUToDCacheType()
        self.m_out = DCacheToMMUType()

        self.stall_out = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        r = RegStage0()

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            #sync += Display("request collision loadstore vs MMU")
            pass

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)

    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS > 1: # a PLRU only makes sense with >1 way
            for i in range(TLB_SET_SIZE):
                # TLB PLRU interface
                tlb_plru = PLRU(TLB_WAY_BITS)
                setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
                tlb_plru_acc = Signal(TLB_WAY_BITS)
                tlb_plru_acc_en = Signal()
                tlb_plru_out = Signal(TLB_WAY_BITS)

                comb += tlb_plru.acc.eq(tlb_plru_acc)
                comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
                comb += tlb_plru_out.eq(tlb_plru.lru)

                # PLRU interface
                with m.If(r1.tlb_hit_index == i):
                    comb += tlb_plru_acc_en.eq(r1.tlb_hit)
                with m.Else():
                    comb += tlb_plru_acc_en.eq(0)
                comb += tlb_plru_acc.eq(r1.tlb_hit_way)

                comb += tlb_plru_victim[i].eq(tlb_plru_out)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_pte_way,
                   pte, tlb_hit, tlb_hit_way, valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            with m.If(tlb_valid_way[i]
                      & (read_tlb_tag(i, tlb_tag_way) == eatag)):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        with m.Else():
            comb += pte.eq(0)
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.eq(extract_perm_attr(pte))
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()
        repl_way = Signal(TLB_WAY_BITS)
        eatag = Signal(TLB_EA_TAG_BITS)
        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid_bits[i].eq(0)

        with m.Elif(tlbie):
            with m.If(tlb_hit):
                sync += dtlb_valid_bits[tlb_req_index].bit_select(
                            tlb_hit_way, 1).eq(0)
        with m.Elif(tlbwe):
            with m.If(tlb_hit):
                comb += repl_way.eq(tlb_hit_way)
            with m.Else():
                comb += repl_way.eq(tlb_plru_victim[tlb_req_index])
            comb += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
            comb += tagset.eq(tlb_tag_way)
            sync += write_tlb_tag(repl_way, tagset, eatag)
            sync += dtlb_tags[tlb_req_index].eq(tagset)
            comb += pteset.eq(tlb_pte_way)
            sync += write_tlb_pte(repl_way, pteset, r0.req.data)
            sync += dtlb_ptes[tlb_req_index].eq(pteset)
            sync += dtlb_valid_bits[tlb_req_index].bit_select(
                        repl_way, 1).eq(1)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        for i in range(NUM_LINES):
            # PLRU interface
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc = Signal(WAY_BITS)
            plru_acc_en = Signal()
            plru_out = Signal(WAY_BITS)

            comb += plru.acc.eq(plru_acc)
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru_out.eq(plru.lru)

            with m.If(r1.hit_index == i):
                comb += plru_acc_en.eq(r1.cache_hit)

            comb += plru_acc.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru_out)

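    # The PLRU interface assumed by the two generators above (the module
    # comes from the "add PLRU microwatt conversion"): PLRU(BITS) exposes
    # an `acc` input naming the way just accessed, an `acc_en` update
    # strobe, and an `lru` output giving the victim way. Minimal sketch
    # for a single line:
    #
    #     plru = PLRU(WAY_BITS)
    #     m.submodules.plru0 = plru
    #     comb += plru.acc.eq(hit_way)       # way that just hit
    #     comb += plru.acc_en.eq(cache_hit)  # update only on a hit
    #     comb += victim.eq(plru.lru)        # least-recently-used way
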
    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set,
                       cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valid_bits, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_valid_way, tlb_tag_way, tlb_pte_way,
                       tlb_hit, tlb_hit_way, cache_tag_set,
                       cancel_store, req_same_tag,
                       r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Signal(TLB_NUM_WAYS)
        hit_way_set = HitWaySet()
        rel_matches = Signal(TLB_NUM_WAYS)
        rel_match = Signal()

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(r0.req.virt_mode):
            comb += rel_matches.eq(0)
            for j in range(TLB_NUM_WAYS):
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                s_tag = Signal(TAG_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS):
                    with m.If(go & cache_valid_bits[req_index][i] &
                              (read_tag(i, cache_tag_set) == s_tag)
                              & tlb_valid_way[j]):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == r1.reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set.bit_select(tlb_hit_way, 1))
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches.bit_select(tlb_hit_way, 1))
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(r0.req.addr))
            for i in range(NUM_WAYS):
                with m.If(go & cache_valid_bits[req_index][i] &
                          (read_tag(i, cache_tag_set) == s_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == r1.reload_tag):
                comb += rel_match.eq(1)
        comb += req_same_tag.eq(rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & rel_match):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            valid = r1.rows_valid[req_row % ROW_PER_LINE]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        comb += use_forward1_next.eq(0)
        with m.If((get_row(r1.req.real_addr) == req_row)
                  & (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        comb += use_forward2_next.eq(0)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
                           & (perm_attr.wr_perm
                              | (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
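                # opsel is Cat(is_hit, nc, r0.req.load), LSB first:
                # bit 0 = is_hit, bit 1 = nc, bit 2 = load. So 0b101 is
                # a cacheable load hit, 0b100 a load miss, 0b110 an NC
                # load, and 0b001/0b000 a store hit/miss; an NC access
                # that "hits" (0b011/0b111) is a cache paradox -> OP_BAD.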
                with m.Switch(opsel):
                    with m.Case(Const(0b101, 3)):
                        comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(Const(0b100, 3)):
                        comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(Const(0b110, 3)):
                        comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(Const(0b001, 3)):
                        comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(Const(0b000, 3)):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(Const(0b010, 3)):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(Const(0b011, 3)):
                        comb += op.eq(Op.OP_BAD)
                    with m.Case(Const(0b111, 3)):
                        comb += op.eq(Op.OP_BAD)
                    with m.Default():
                        comb += op.eq(Op.OP_NONE)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & r0.req.reserve):

            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                with m.If(~reservation.valid
                          | (r0.req.addr[LINE_OFF_BITS:64] !=
                             reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out[r1.hit_way])

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, "unexpected " \
                "slow_valid collision with stcx_fail -!- severity FAILURE"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid -!- " \
                "severity FAILURE"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                #Display(f"completing load hit data={data_out}")
                pass

            # error cases complete without stalling
            with m.If(r1.ls_error):
                # Display("completing ld/st with error")
                pass

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                #Display(f"completing store or load miss data={data_out}")
                pass

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                # Display(f"completing load hit to MMU, data={m_out.data}")
                pass
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                #Display("completing MMU ld with error")
                pass

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                #Display("completing MMU load miss, data={m_out.data}")
                pass

    def rams(self, m, r1, early_req_row, cache_out, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        wb_in = self.wb_in

        for i in range(NUM_WAYS):
            do_read = Signal()
            rd_addr = Signal(ROW_BITS)
            do_write = Signal()
            wr_addr = Signal(ROW_BITS)
            wr_data = Signal(WB_DATA_BITS)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS)

            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            comb += cache_out[i].eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wb_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
            #        f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
            #       )
            pass

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #         f"rc_ok={rc_ok} perm_ok={perm_ok}"
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valid_bits, r0, replace_way, req_hit_way,
                    req_same_tag, r0_valid, req_op, cache_tags,
                    req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        wb_in = self.wb_in

        req = MemAccessRequest()
        acks = Signal(3)
        adjust_acks = Signal(3)
        stbs_done = Signal()

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    idx = r1.store_index
                    sync += cache_tags[idx][i * TAG_WIDTH:
                                            (i + 1) * TAG_WIDTH
                                           ].eq(r1.reload_tag)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                # XXX check 'left downto. probably means len(r1.wb.adr)
                #     r1.wb.adr <= req.real_addr(
                #                   r1.wb.adr'left downto 0
                #                  );
                sync += r1.wb.adr.eq(req.real_addr[0:len(r1.wb.adr)])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(get_index(req.real_addr))
                sync += r1.store_row.eq(get_row(req.real_addr))
                sync += r1.end_row_ix.eq(
                         get_row_of_line(get_row(req.real_addr))
                        )
                sync += r1.reload_tag.eq(get_tag(req.real_addr))
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        #Display(f"cache miss real addr:" \
                        #        f"{req_real_addr}" \
                        #        f" idx:{get_index(req_real_addr)}" \
                        #        f" tag:{get_tag(req.real_addr)}")

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                # Requests are all sent if stb is 0
                comb += stbs_done.eq(~r1.wb.stb)

                with m.If(~wb_in.stall & ~stbs_done):
                    # That was the last word?
                    # We are done sending.
                    # Clear stb and set stbs_done
                    # so we can handle an eventual
                    # last ack on the same cycle.
                    with m.If(is_last_row_addr(
                              r1.wb.adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                    # Calculate the next row address
                    sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    # XXX needs an Array bit-accessor here
                    sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz &
                                (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(stbs_done & is_last_row(r1.store_row,
                                                      r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = cache_valid_bits[r1.store_index]
                        sync += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                comb += stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        ra = req.real_addr[0:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):

        sync = m.d.sync
        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                               r1.wb.adr[3:6]))

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valid_bits = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags = TLBTagsArray()
        dtlb_ptes = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0()
        r0_full = Signal()

        r1 = RegStage1()

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out = CacheRamOut()

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = Signal()
        tlb_hit_way = Signal(TLB_WAY_BITS)
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr()
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_pte_way,
                        pte, tlb_hit, tlb_hit_way, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valid_bits, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_valid_way, tlb_tag_way, tlb_pte_way,
                            tlb_hit, tlb_hit_way, cache_tag_set,
                            cancel_store, req_same_tag,
                            r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valid_bits, r0, replace_way, req_hit_way,
                         req_same_tag, r0_valid, req_op, cache_tags,
                         req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out, req_op)

        return m


# dcache_tb.vhdl
#
# entity dcache_tb is
# end dcache_tb;
#
# architecture behave of dcache_tb is
#     signal clk          : std_ulogic;
#     signal rst          : std_ulogic;
#
#     signal d_in         : Loadstore1ToDcacheType;
#     signal d_out        : DcacheToLoadstore1Type;
#
#     signal m_in         : MmuToDcacheType;
#     signal m_out        : DcacheToMmuType;
#
#     signal wb_bram_in   : wishbone_master_out;
#     signal wb_bram_out  : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     dcache0: entity work.dcache
#         generic map(
#
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             d_in => d_in,
#             d_out => d_out,
#             m_in => m_in,
#             m_out => m_out,
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#             );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE   => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#             );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         -- Clear stuff
#         d_in.valid <= '0';
#         d_in.load <= '0';
#         d_in.nc <= '0';
#         d_in.addr <= (others => '0');
#         d_in.data <= (others => '0');
#         m_in.valid <= '0';
#         m_in.addr <= (others => '0');
#         m_in.pte <= (others => '0');
#
#         wait for 4*clk_period;
#         wait until rising_edge(clk);
#
#         -- Cacheable read of address 4
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000004";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000100000000"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000100000000"
#             severity failure;
#         -- wait for clk_period;
#
#         -- Cacheable read of address 30
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000030";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000D0000000C"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000D0000000C"
#             severity failure;
#
#         -- Non-cacheable read of address 100
#         d_in.load <= '1';
#         d_in.nc <= '1';
#         d_in.addr <= x"0000000000000100";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000004100000040"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000004100000040"
#             severity failure;
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         std.env.finish;
#     end process;
# end;
def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield
    # wait_until rising_edge(clk)
    yield
    # Cacheable read of address 4
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
    yield dut.d_in.valid.eq(1)
    # wait-until rising_edge(clk)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000000100000000, \
        f"data @{dut.d_in.addr}={data:x} expected 0000000100000000 " \
        "-!- severity failure"


    # Cacheable read of address 30
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000000D0000000C, \
        f"data @{dut.d_in.addr}={data:x} expected 0000000D0000000C " \
        "-!- severity failure"

    # Non-cacheable read of address 100
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(1)
    yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000004100000040, \
        f"data @{dut.d_in.addr}={data:x} expected 0000004100000040 " \
        "-!- severity failure"

    yield
    yield
    yield
    yield


def test_dcache():
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    run_simulation(dut, dcache_sim(dut), vcd_name='test_dcache.vcd')

if __name__ == '__main__':
    test_dcache()