891ae27a100fe7f6cc36cac709d742fd4c18f624
[soc.git] / src / soc / experiment / dcache.py
1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import Module, Signal, Elaboratable,
10 Cat, Repl
11 from nmigen.cli import main
12 from nmigen.iocontrol import RecordObject
13 from nmigen.util import log2_int
14
15 from experiment.mem_types import LoadStore1ToDCacheType,
16 DCacheToLoadStore1Type,
17 MMUToDCacheType,
18 DCacheToMMUType
19
20 from experiment.wb_types import WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
21 WBAddrType, WBDataType, WBSelType,
22 WbMasterOut, WBSlaveOut,
23 WBMasterOutVector, WBSlaveOutVector,
24 WBIOMasterOut, WBIOSlaveOut
25
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 32    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of sets
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than WB_DATA_BITS
# at a time so, to save resources, we make the array only that
# wide, and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE


# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM,
# i.e. TAG_BITS rounded up to the next multiple of 8
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |---|    | ROW_LINE_BITS (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (45)

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
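
# A quick sanity check of the geometry above (editor's addition, not in
# the original file): with the default parameters, these derived values
# follow directly from the definitions and match the layout diagram.
def _example_geometry_check():
    assert ROW_SIZE == 8         # 64-bit wishbone data bus / 8
    assert ROW_PER_LINE == 8     # 64-byte line / 8-byte row
    assert BRAM_ROWS == 256      # 32 lines * 8 rows per line
    assert ROW_BITS == 8 and ROW_LINE_BITS == 3
    assert LINE_OFF_BITS == 6 and ROW_OFF_BITS == 3
    assert INDEX_BITS == 5 and SET_SIZE_BITS == 11
    assert TAG_BITS == 45        # 56 real-address bits - 11 set bits
    assert TAG_WIDTH == 48       # 45 rounded up to a whole number of bytes
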

def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, "LINE_SIZE not power of 2"
assert (NUM_LINES & (NUM_LINES - 1)) == 0, "NUM_LINES not power of 2"
assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, \
    "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS), \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
    "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
    "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBValidBitsArray():
    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))

def TLBTagsArray():
    return Array(Signal(TLB_TAG_WAY_BITS) for x in range(TLB_SET_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS) for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[0:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the address of the next row in the current cache line.
# Only the row-within-line field is incremented (a 3-bit adder
# with default settings), and the result is spliced back in so
# the address wraps within the line.
def next_row_addr(addr):
    row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
    return Cat(addr[:ROW_OFF_BITS],
               row_idx[:ROW_LINE_BITS],
               addr[LINE_OFF_BITS:])

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
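
# Editor's illustration (not in the original): a plain-Python mirror of
# the slices above, showing where each field of a real address lands.
# With the defaults, bits [0:6] are the line offset, [6:11] the index
# and [11:56] the tag.
def _example_address_fields(addr):
    index = (addr >> LINE_OFF_BITS) & ((1 << INDEX_BITS) - 1)
    row = (addr >> ROW_OFF_BITS) & ((1 << ROW_BITS) - 1)
    tag = (addr >> SET_SIZE_BITS) & ((1 << TAG_BITS) - 1)
    return index, row, tag
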

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset[way * TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]

# Read a TLB tag from a TLB tag memory row.  word_select is used
# so that `way` may be a Signal as well as a Python integer.
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
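
# Editor's illustration (not in the original): with TLB_NUM_WAYS = 2 and
# TLB_EA_TAG_BITS = 64 - (12 + 6) = 46, a TLB tag row packs way 0 into
# bits [0:46] and way 1 into bits [46:92]; a PTE row packs way 0 into
# bits [0:64] and way 1 into bits [64:128].
def _example_tlb_row_packing():
    assert TLB_EA_TAG_BITS == 46
    assert TLB_TAG_WAY_BITS == 92   # 2 * 46
    assert TLB_PTE_WAY_BITS == 128  # 2 * 64
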


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self):
        super().__init__()
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    pa.reference = pte[8]
    pa.changed = pte[7]
    pa.nocache = pte[5]
    pa.priv = pte[3]
    pa.rd_perm = pte[2]
    pa.wr_perm = pte[1]
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cacheable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cacheable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
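# An approximate hit timeline (editor's sketch, not in the original),
# given the buffered BRAM output described above:
#
#   cycle 0: request latched into r0
#   cycle 1: tag and TLB compare in stage 1; BRAM row read issued
#   cycle 2: buffered BRAM output available; load hit completes
#
# A complex op instead stalls in stage 0 while a previous hit can
# still complete in stage 1.
#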
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self):
        super().__init__()
        self.req = LoadStore1ToDCacheType()
        self.tlbie = Signal()
        self.doall = Signal()
        self.tlbld = Signal()
        self.mmu_req = Signal() # indicates source of request


class MemAccessRequest(RecordObject):
    def __init__(self):
        super().__init__()
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest()

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = Signal()
        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut()
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64 - LINE_OFF_BITS)


class DCache(Elaboratable):
    """Set associative dcache write-through
    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType()
        self.d_out = DCacheToLoadStore1Type()

        self.m_in = MMUToDCacheType()
        self.m_out = DCacheToMMUType()

        self.stall_out = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        # build the request combinatorially, then register it into r0
        # below (registering it twice would delay it by a cycle)
        r = RegStage0()

        # TODO, this goes in unit tests and formal proofs
        with m.If(~(d_in.valid & m_in.valid)):
            #sync += Display("request collision loadstore vs MMU")
            pass

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0) # MMU requests are in real mode
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)

    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS <= 1:
            return # no PLRU needed when there is only one way
        for i in range(TLB_SET_SIZE):
            # TLB PLRU interface
            tlb_plru = PLRU(TLB_WAY_BITS)
            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
            tlb_plru_acc = Signal(TLB_WAY_BITS)
            tlb_plru_acc_en = Signal()
            tlb_plru_out = Signal(TLB_WAY_BITS)

            comb += tlb_plru.acc.eq(tlb_plru_acc)
            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
            comb += tlb_plru_out.eq(tlb_plru.lru)

            # PLRU interface
            with m.If(r1.tlb_hit_index == i):
                comb += tlb_plru_acc_en.eq(r1.tlb_hit)
            with m.Else():
                comb += tlb_plru_acc_en.eq(0)
            comb += tlb_plru_acc.eq(r1.tlb_hit_way)

            comb += tlb_plru_victim[i].eq(tlb_plru_out)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            with m.If(tlb_valid_way[i]
                      & (read_tlb_tag(i, tlb_tag_way) == eatag)):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        with m.Else():
            comb += pte.eq(0)
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.eq(extract_perm_attr(pte))
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()
        repl_way = Signal(TLB_WAY_BITS)
        eatag = Signal(TLB_EA_TAG_BITS)
        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid_bits[i].eq(0)

        with m.Elif(tlbie):
            with m.If(tlb_hit):
                sync += dtlb_valid_bits[tlb_req_index][tlb_hit_way].eq(0)
        with m.Elif(tlbwe):
            with m.If(tlb_hit):
                comb += repl_way.eq(tlb_hit_way)
            with m.Else():
                comb += repl_way.eq(tlb_plru_victim[tlb_req_index])
            comb += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
            comb += tagset.eq(tlb_tag_way)
            comb += write_tlb_tag(repl_way, tagset, eatag)
            sync += dtlb_tags[tlb_req_index].eq(tagset)
            comb += pteset.eq(tlb_pte_way)
            comb += write_tlb_pte(repl_way, pteset, r0.req.data)
            sync += dtlb_ptes[tlb_req_index].eq(pteset)
            sync += dtlb_valid_bits[tlb_req_index][repl_way].eq(1)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        for i in range(NUM_LINES):
            # PLRU interface
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc = Signal(WAY_BITS)
            plru_acc_en = Signal()
            plru_out = Signal(WAY_BITS)

            comb += plru.acc.eq(plru_acc)
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru_out.eq(plru.lru)

            with m.If(r1.hit_index == i):
                comb += plru_acc_en.eq(r1.cache_hit)

            comb += plru_acc.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru_out)

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set,
                       cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valid_bits, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_valid_way, tlb_hit_way, tlb_hit, tlb_pte_way,
                       cache_tag_set, req_same_tag, cancel_store,
                       r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Signal(TLB_NUM_WAYS)
        hit_way_set = HitWaySet()
        rel_matches = Signal(TLB_NUM_WAYS)
        rel_match = Signal()

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)

        # Test if pending request is a hit on any way.
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(r0.req.virt_mode):
            comb += rel_matches.eq(0)
            for j in range(TLB_NUM_WAYS):
                # per-TLB-way signals, so the comparisons for one
                # way do not override those of another
                s_hit = Signal(name="s_hit%d" % j)
                s_pte = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
                s_ra = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS):
                    with m.If(go & cache_valid_bits[req_index][i] &
                              (read_tag(i, cache_tag_set) == s_tag)
                              & tlb_valid_way[j]):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == r1.reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(r0.req.addr))
            for i in range(NUM_WAYS):
                with m.If(go & cache_valid_bits[req_index][i] &
                          (read_tag(i, cache_tag_set) == s_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == r1.reload_tag):
                comb += rel_match.eq(1)
        comb += req_same_tag.eq(rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & rel_match):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            valid = r1.rows_valid[req_row % ROW_PER_LINE]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        comb += use_forward1_next.eq(0)
        with m.If((get_row(r1.req.real_addr) == req_row)
                  & (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        comb += use_forward2_next.eq(0)
        with m.If((r1.forward_row1 == req_row) &
                  (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed)
                         )
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
                           & (perm_attr.wr_perm
                              | (r0.req.load & perm_attr.rd_perm))
                           )
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101):
                        comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100):
                        comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110):
                        comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001):
                        comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011):
                        comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111):
                        comb += op.eq(Op.OP_BAD)
                    with m.Default():
                        comb += op.eq(Op.OP_NONE)
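                    # Editor's note: opsel = Cat(is_hit, nc, load) is
                    # packed LSB-first, so bit 0 = is_hit, bit 1 = nc,
                    # bit 2 = load; e.g. 0b101 is a cacheable load hit,
                    # while 0b011/0b111 (cache hit on a non-cacheable
                    # access) decode to OP_BAD.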
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & r0.req.reserve):

            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                with m.If(~reservation.valid
                          | (r0.req.addr[LINE_OFF_BITS:64] !=
                             reservation.addr)):
                    comb += cancel_store.eq(1)

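    # Editor's note (not in the original): the intended sequence is
    # that lwarx sets the reservation on its line address, and a later
    # stcx. with no valid reservation (or to a different line) asserts
    # cancel_store, which dcache_request turns into OP_STCX_FAIL.
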
    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out[r1.hit_way])

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail " \
                "-!- severity FAILURE"

            assert ((r1.slow_valid | r1.stcx_fail)
                    | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid " \
                "-!- severity FAILURE"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                #Display(f"completing load hit data={data_out}")
                pass

            # error cases complete without stalling
            with m.If(r1.ls_error):
                # Display("completing ld/st with error")
                pass

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                #Display(f"completing store or load miss data={data_out}")
                pass

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                # Display(f"completing load hit to MMU, data={m_out.data}")
                pass
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                #Display("completing MMU ld with error")
                pass

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                #Display("completing MMU load miss, data={m_out.data}")
                pass

    def rams(self, m, r1, early_req_row, cache_out, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        wb_in = self.wb_in

        for i in range(NUM_WAYS):
            do_read = Signal()
            rd_addr = Signal(ROW_BITS)
            do_write = Signal()
            wr_addr = Signal(ROW_BITS)
            wr_data = Signal(WB_DATA_BITS)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS)

            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            comb += cache_out[i].eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wb_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
            #        f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
            #       )
            pass

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #         f"rc_ok={rc_ok} perm_ok={perm_ok}")
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cacheable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
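    #
    # State transitions, as coded below (editor's summary, not in the
    # original):
    #
    #   IDLE             -> RELOAD_WAIT_ACK  (load miss, or dcbz)
    #   IDLE             -> NC_LOAD_WAIT_ACK (non-cacheable load)
    #   IDLE             -> STORE_WAIT_ACK   (store hit/miss, not dcbz)
    #   RELOAD_WAIT_ACK  -> IDLE             (last row of line acked)
    #   STORE_WAIT_ACK   -> IDLE             (all pending acks received)
    #   NC_LOAD_WAIT_ACK -> IDLE             (single ack received)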
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valid_bits, r0, replace_way, req_hit_way,
                    req_same_tag, r0_valid, req_op, cache_tags,
                    req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        wb_in = self.wb_in

        req = MemAccessRequest()
        acks = Signal(3)
        adjust_acks = Signal(3)
        stbs_done = Signal()

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    idx = r1.store_index
                    sync += cache_tags[idx][
                                i * TAG_WIDTH:(i + 1) * TAG_WIDTH
                            ].eq(r1.reload_tag)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                # XXX 'r1.wb.adr'left downto 0' in the original VHDL
                # means the full width of r1.wb.adr, i.e. len(r1.wb.adr)
                sync += r1.wb.adr.eq(req.real_addr[0:len(r1.wb.adr)])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(get_index(req.real_addr))
                sync += r1.store_row.eq(get_row(req.real_addr))
                sync += r1.end_row_ix.eq(
                            get_row_of_line(get_row(req.real_addr)))
                sync += r1.reload_tag.eq(get_tag(req.real_addr))
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        #Display(f"cache miss real addr:" \
                        #        f"{req_real_addr}" \
                        #        f" idx:{get_index(req_real_addr)}" \
                        #        f" tag:{get_tag(req.real_addr)}")

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                # Requests are all sent if stb is 0
                comb += stbs_done.eq(~r1.wb.stb)

                with m.If(~wb_in.stall & ~stbs_done):
                    # That was the last word?
                    # We are done sending.
                    # Clear stb and set stbs_done
                    # so we can handle an eventual
                    # last ack on the same cycle.
                    with m.If(is_last_row_addr(
                              r1.wb.adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                    # Calculate the next row address
                    sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    # XXX needs an Array bit-accessor here
                    sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz &
                                (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row ==
                               get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(stbs_done & is_last_row(r1.store_row,
                                                      r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = cache_valid_bits[r1.store_index]
                        sync += cv[r1.store_way].eq(1)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                comb += stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(
                                    req.real_addr[0:SET_SIZE_BITS])
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.Elif((adjust_acks < 7) & req.same_tag &
                                ((req.op == Op.OP_STORE_MISS)
                                 | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):

        sync = m.d.sync
        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid,
                               d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack,
                               wb_in.stall, r1.wb.adr[3:6]))

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valid_bits = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags = TLBTagsArray()
        dtlb_ptes = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0()
        r0_full = Signal()

        r1 = RegStage1()

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out = CacheRamOut()

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = Signal()
        tlb_hit_way = Signal(TLB_WAY_BITS)
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr()
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra,
                        perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set,
                            cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valid_bits, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_valid_way, tlb_hit_way, tlb_hit,
                            tlb_pte_way, cache_tag_set, req_same_tag,
                            cancel_store, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valid_bits, r0, replace_way, req_hit_way,
                         req_same_tag, r0_valid, req_op, cache_tags,
                         req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out,
        #                req_op)

        return m


# dcache_tb.vhdl
#
# entity dcache_tb is
# end dcache_tb;
#
# architecture behave of dcache_tb is
#     signal clk          : std_ulogic;
#     signal rst          : std_ulogic;
#
#     signal d_in         : Loadstore1ToDcacheType;
#     signal d_out        : DcacheToLoadstore1Type;
#
#     signal m_in         : MmuToDcacheType;
#     signal m_out        : DcacheToMmuType;
#
#     signal wb_bram_in   : wishbone_master_out;
#     signal wb_bram_out  : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     dcache0: entity work.dcache
#         generic map(
#
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             d_in => d_in,
#             d_out => d_out,
#             m_in => m_in,
#             m_out => m_out,
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#             );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE   => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#             );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         -- Clear stuff
#         d_in.valid <= '0';
#         d_in.load <= '0';
#         d_in.nc <= '0';
#         d_in.addr <= (others => '0');
#         d_in.data <= (others => '0');
#         m_in.valid <= '0';
#         m_in.addr <= (others => '0');
#         m_in.pte <= (others => '0');
#
#         wait for 4*clk_period;
#         wait until rising_edge(clk);
#
#         -- Cacheable read of address 4
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000004";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000100000000"
#             report "data @" & to_hstring(d_in.addr) &
#             "=" & to_hstring(d_out.data) &
#             " expected 0000000100000000"
#             severity failure;
# --      wait for clk_period;
#
#         -- Cacheable read of address 30
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000030";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000D0000000C"
#             report "data @" & to_hstring(d_in.addr) &
#             "=" & to_hstring(d_out.data) &
#             " expected 0000000D0000000C"
#             severity failure;
#
#         -- Non-cacheable read of address 100
#         d_in.load <= '1';
#         d_in.nc <= '1';
#         d_in.addr <= x"0000000000000100";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000004100000040"
#             report "data @" & to_hstring(d_in.addr) &
#             "=" & to_hstring(d_out.data) &
#             " expected 0000004100000040"
#             severity failure;
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         std.env.finish;
#     end process;
# end;


def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield
    # wait_until rising_edge(clk)
    yield
    # Cacheable read of address 4
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
    yield dut.d_in.valid.eq(1)
    # wait-until rising_edge(clk)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000000100000000, \
        f"data @{dut.d_in.addr}={data:x} expected 0000000100000000" \
        " -!- severity failure"

    # Cacheable read of address 30
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000000D0000000C, \
        f"data @{dut.d_in.addr}={data:x} expected 0000000D0000000C" \
        " -!- severity failure"

    # Non-cacheable read of address 100
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(1)
    yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    assert data == 0x0000004100000040, \
        f"data @{dut.d_in.addr}={data:x} expected 0000004100000040" \
        " -!- severity failure"

    yield
    yield
    yield
    yield


def test_dcache():
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    run_simulation(dut, dcache_sim(dut), vcd_name='test_dcache.vcd')

if __name__ == '__main__':
    test_dcache()