1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import Module, Signal, Elaboratable,
10 Cat, Repl
11 from nmigen.cli import main
12 from nmigen.iocontrol import RecordObject
13 from nmigen.util import log2_int
14
15 from experiment.mem_types import LoadStore1ToDCacheType,
16 DCacheToLoadStore1Type,
17 MMUToDCacheType,
18 DCacheToMMUType
19
20 from experiment.wb_types import WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
21 WBAddrType, WBDataType, WBSelType,
22 WbMasterOut, WBSlaveOut,
23 WBMasterOutVector, WBSlaveOutVector,
24 WBIOMasterOut, WBIOSlaveOut
25
26 # TODO: make these parameters of DCache at some point
27 LINE_SIZE = 64 # Line size in bytes
28 NUM_LINES = 32 # Number of lines in a set
29 NUM_WAYS = 4 # Number of ways
30 TLB_SET_SIZE = 64 # L1 DTLB entries per set
31 TLB_NUM_WAYS = 2 # L1 DTLB number of sets
32 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
33 LOG_LENGTH = 0 # Non-zero to enable log data collection
34
# BRAM organisation: We never access more than
# wishbone_data_bits at a time so to save
# resources we make the array only that wide, and
# use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE


# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |---|    | ROW_LINE_BITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (45)

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

def CacheTagArray():
    # all ways concatenated, one entry per cache line
    return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

def CacheValidBitsArray():
    # one valid bit per way, per cache line
    return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal() for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

def ispow2(n):
    return (n & (n - 1)) == 0

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBValidBitsArray():
    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))

def TLBTagsArray():
    return Array(Signal(TLB_TAG_WAY_BITS) for x in range(TLB_SET_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))

def HitWaySet():
    # one cache-way selector per TLB way
    return Array(Signal(WAY_BITS) for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the address of the next row in the current cache line
def next_row_addr(addr):
    # add 1 to the row index, wrapping within the line: this keeps
    # the generated adder down to ROW_LINE_BITS (3 bits by default)
    row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
    return Cat(addr[:ROW_OFF_BITS],
               row_idx[:ROW_LINE_BITS],
               addr[LINE_OFF_BITS:])

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
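
# e.g. with ROW_LINE_BITS=3, next_row(0b10_111) -> 0b10_000: the low
# three bits wrap within the line while the upper (line-select) bits
# are unchanged (illustrative values only)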

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
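
# Sketch of how the slicing helpers above carve up a real address,
# using the default geometry (ROW_OFF_BITS=3, LINE_OFF_BITS=6,
# SET_SIZE_BITS=11, REAL_ADDR_BITS=56):
#   get_row(addr)   -> addr[3:11]   row in the BRAM
#   get_index(addr) -> addr[6:11]   cache line / tag-RAM index
#   get_tag(addr)   -> addr[11:56]  tag to match against the tag RAM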

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset[way * TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    # word_select allows "way" to be either a constant or a Signal
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self):
        super().__init__()
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    pa.reference = pte[8]
    pa.changed = pte[7]
    pa.nocache = pte[5]
    pa.priv = pte[3]
    pa.rd_perm = pte[2]
    pa.wr_perm = pte[1]
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cachable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack

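# Transition sketch (details in dcache_slow below):
#   IDLE -> RELOAD_WAIT_ACK   on a load miss, or a dcbz store
#   IDLE -> STORE_WAIT_ACK    on a store hit or store miss
#   IDLE -> NC_LOAD_WAIT_ACK  on a non-cacheable load
#   any  -> IDLE              once the final wishbone ack arrives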

# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
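#
# Rough load-hit timeline under those assumptions (a sketch, not
# guaranteed cycle-exact):
#
#   cycle:    0           1                2
#   stage 0:  latch req   TLB/tag match    -
#   stage 1:  -           BRAM read        data out / writeback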
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self):
        super().__init__()
        self.req = LoadStore1ToDCacheType()
        self.tlbie = Signal()
        self.doall = Signal()
        self.tlbld = Signal()
        self.mmu_req = Signal() # indicates source of request


class MemAccessRequest(RecordObject):
    def __init__(self):
        super().__init__()
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self):
        super().__init__()
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest()

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = Signal()
        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut()
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64 - LINE_OFF_BITS)


class DCache(Elaboratable):
    """Set associative dcache write-through
    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType()
        self.d_out = DCacheToLoadStore1Type()

        self.m_in = MMUToDCacheType()
        self.m_out = DCacheToMMUType()

        self.stall_out = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(20)

    def stage_0(self, m, d_in, m_in, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync

        r = RegStage0()

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            #sync += Display("request collision loadstore vs MMU")
            pass

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0) # MMU requests are in real mode
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)

    def tlb_read(self, m, m_in, d_in, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS > 1:
            for i in range(TLB_SET_SIZE):
                # TLB PLRU interface
                tlb_plru = PLRU(TLB_WAY_BITS)
                setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)

                # only update the PLRU when this set is the one that hit
                with m.If(r1.tlb_hit_index == i):
                    comb += tlb_plru.acc_en.eq(r1.tlb_hit)
                with m.Else():
                    comb += tlb_plru.acc_en.eq(0)
                comb += tlb_plru.acc.eq(r1.tlb_hit_way)

                comb += tlb_plru_victim[i].eq(tlb_plru.lru)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid, tlb_valid_way,
                   tlb_tag_way, tlb_pte_way, pte, tlb_hit, tlb_hit_way,
                   valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ:TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END:64])
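
        # EA bit split under the default TLB geometry (TLB_LG_PGSZ=12,
        # TLB_SET_BITS=6): addr[0:12] is the page offset, addr[12:18]
        # selects the TLB set, addr[18:64] is the EA tag matched below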

        for i in range(TLB_NUM_WAYS):
            with m.If(tlb_valid_way[i]
                      & (read_tlb_tag(i, tlb_tag_way) == eatag)):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        with m.Else():
            comb += pte.eq(0)
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.eq(extract_perm_attr(pte))
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()
        repl_way = Signal(TLB_WAY_BITS)
        eatag = Signal(TLB_EA_TAG_BITS)
        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid_bits[i].eq(0)

        with m.Elif(tlbie):
            with m.If(tlb_hit):
                sync += dtlb_valid_bits[tlb_req_index] \
                        .bit_select(tlb_hit_way, 1).eq(0)
        with m.Elif(tlbwe):
            with m.If(tlb_hit):
                comb += repl_way.eq(tlb_hit_way)
            with m.Else():
                comb += repl_way.eq(tlb_plru_victim[tlb_req_index])
            comb += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
            comb += tagset.eq(tlb_tag_way)
            comb += write_tlb_tag(repl_way, tagset, eatag)
            sync += dtlb_tags[tlb_req_index].eq(tagset)
            comb += pteset.eq(tlb_pte_way)
            comb += write_tlb_pte(repl_way, pteset, r0.req.data)
            sync += dtlb_ptes[tlb_req_index].eq(pteset)
            sync += dtlb_valid_bits[tlb_req_index] \
                    .bit_select(repl_way, 1).eq(1)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        for i in range(NUM_LINES):
            # PLRU interface
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)

            with m.If(r1.hit_index == i):
                comb += plru.acc_en.eq(r1.cache_hit)

            comb += plru.acc.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru.lru)

    def cache_tag_read(self, m, r0_stall, req_index, m_in, d_in,
                       cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valid_bits, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way, tlb_valid_way, tlb_hit, tlb_hit_way,
                       cache_tag_set, cancel_store, req_same_tag,
                       r0_stall, m_in, early_req_row, d_in):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        sync = m.d.sync

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        s_tag = Signal(TAG_BITS)
        hit_set = Signal(TLB_NUM_WAYS)
        hit_way_set = HitWaySet()
        rel_matches = Signal(TLB_NUM_WAYS)
        rel_match = Signal()

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(r0.req.virt_mode):
            comb += rel_matches.eq(0)
            for j in range(TLB_NUM_WAYS):
                # per-TLB-way copies, so that way j's comparison is
                # not overwritten by way j+1's
                pte_w = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
                ra_w = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
                tag_w = Signal(TAG_BITS, name="s_tag%d" % j)
                hit_w = Signal(name="s_hit%d" % j)

                comb += pte_w.eq(read_tlb_pte(j, tlb_pte_way))
                comb += ra_w.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
                                    pte_w[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += tag_w.eq(get_tag(ra_w))

                for i in range(NUM_WAYS):
                    with m.If(go & cache_valid_bits[req_index][i] &
                              (read_tag(i, cache_tag_set) == tag_w) &
                              tlb_valid_way[j]):
                        comb += hit_way_set[j].eq(i)
                        comb += hit_w.eq(1)
                comb += hit_set[j].eq(hit_w)
                with m.If(tag_w == r1.reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set.bit_select(tlb_hit_way, 1))
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches.bit_select(tlb_hit_way, 1))
        with m.Else():
            comb += s_tag.eq(get_tag(r0.req.addr))
            for i in range(NUM_WAYS):
                with m.If(go & cache_valid_bits[req_index][i] &
                          (read_tag(i, cache_tag_set) == s_tag)):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == r1.reload_tag):
                comb += rel_match.eq(1)
        comb += req_same_tag.eq(rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & rel_match):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            valid = r1.rows_valid[req_row % ROW_PER_LINE]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        comb += use_forward1_next.eq(0)
        with m.If((get_row(r1.req.real_addr) == req_row)
                  & (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        comb += use_forward2_next.eq(0)
        with m.If((r1.forward_row1 == req_row) &
                  (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
                           & (perm_attr.wr_perm
                              | (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
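
        # e.g. a store to a page whose PTE has the C (changed) bit clear
        # fails rc_ok, so access_ok drops and the request is decoded
        # below as OP_BAD rather than performing the store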
        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
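                # opsel decode (Cat packs LSB first):
                #   bit 0 = is_hit, bit 1 = nc, bit 2 = load
                # e.g. 0b101 = cacheable load hit, 0b100 = load miss,
                #      0b001 = store hit, 0b111/0b011 = NC hit -> OP_BAD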
                with m.Switch(opsel):
                    with m.Case(0b101):
                        comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100):
                        comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110):
                        comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001):
                        comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010):
                        comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011):
                        comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111):
                        comb += op.eq(Op.OP_BAD)
                    with m.Default():
                        comb += op.eq(Op.OP_NONE)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & r0.req.reserve):

            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                with m.If(~reservation.valid |
                          (r0.req.addr[LINE_OFF_BITS:64] !=
                           reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out, d_out, m_out):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out[r1.hit_way])

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail " \
                "-!- severity FAILURE"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid " \
                "-!- severity FAILURE"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                #Display(f"completing load hit data={data_out}")
                pass

            # error cases complete without stalling
            with m.If(r1.ls_error):
                # Display("completing ld/st with error")
                pass

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                #Display(f"completing store or load miss data={data_out}")
                pass

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                # Display(f"completing load hit to MMU, data={m_out.data}")
                pass
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                #Display("completing MMU ld with error")
                pass

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                #Display("completing MMU load miss, data={m_out.data}")
                pass

    def rams(self, m, r1, early_req_row, cache_out, replace_way, wb_in):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb

        for i in range(NUM_WAYS):
            do_read = Signal()
            rd_addr = Signal(ROW_BITS)
            do_write = Signal()
            wr_addr = Signal(ROW_BITS)
            wr_data = Signal(WB_DATA_BITS)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS)

            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            comb += cache_out[i].eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wb_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
            #        f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
            #       )
            pass

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #         f"rc_ok={rc_ok} perm_ok={perm_ok}"
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    # * Cache load miss/reload (in conjunction with "rams")
    # * Load hits for non-cachable forms
    # * Stores (the collision case is handled in "rams")
    #
    # All wishbone request generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valid_bits, r0, replace_way, req_hit_way,
                    req_same_tag, r0_valid, req_op, cache_tags,
                    req_go, ra, wb_in):

        comb = m.d.comb
        sync = m.d.sync

        req = MemAccessRequest()
        stbs_done = Signal()
        acks = Signal(3)
        adjust_acks = Signal(3)

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    idx = r1.store_index
                    st = i * TAG_WIDTH
                    sync += cache_tags[idx][st:st + TAG_WIDTH].eq(
                             r1.reload_tag)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)
        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                # the wishbone address is the low bits of the real address
                sync += r1.wb.adr.eq(req.real_addr[0:len(r1.wb.adr)])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(get_index(req.real_addr))
                sync += r1.store_row.eq(get_row(req.real_addr))
                sync += r1.end_row_ix.eq(
                         get_row_of_line(get_row(req.real_addr)))
                sync += r1.reload_tag.eq(get_tag(req.real_addr))
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        #Display(f"cache miss real addr:"
                        #        f"{req.real_addr}"
                        #        f" idx:{get_index(req.real_addr)}"
                        #        f" tag:{get_tag(req.real_addr)}")

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                # Requests are all sent if stb is 0
                comb += stbs_done.eq(~r1.wb.stb)

                with m.If(~wb_in.stall & ~stbs_done):
                    # That was the last word?
                    # We are done sending.
                    # Clear stb and set stbs_done
                    # so we can handle an eventual
                    # last ack on the same cycle.
                    with m.If(is_last_row_addr(
                              r1.wb.adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                    # Calculate the next row address
                    sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    sync += r1.rows_valid[
                             r1.store_row % ROW_PER_LINE].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz &
                                (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(stbs_done & is_last_row(r1.store_row,
                                                      r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = cache_valid_bits[r1.store_index]
                        sync += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                comb += stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(
                                 req.real_addr[0:SET_SIZE_BITS])
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out,
                   req_op, d_out, wb_in, log_out):

        sync = m.d.sync

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid,
                               d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                               r1.wb.adr[3:6]))

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb

        d_in, d_out = self.d_in, self.d_out
        m_in, m_out = self.m_in, self.m_out
        wb_in = self.wb_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valid_bits = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags = TLBTagsArray()
        dtlb_ptes = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0()
        r0_full = Signal()

        r1 = RegStage1()

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out = CacheRamOut()

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = Signal()
        tlb_hit_way = Signal(TLB_WAY_BITS)
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr()
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # wire up the stage functions (a sketch: argument lists follow
        # the signatures above, but this wiring is still work-in-progress)
        self.stage_0(m, d_in, m_in, r0, r1, r0_full)
        self.tlb_read(m, m_in, d_in, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.tlb_search(m, tlb_req_index, r0, r0_valid, tlb_valid_way,
                        tlb_tag_way, tlb_pte_way, pte, tlb_hit,
                        tlb_hit_way, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, m_in, d_in,
                            cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valid_bits, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_pte_way, tlb_valid_way, tlb_hit,
                            tlb_hit_way, cache_tag_set, cancel_store,
                            req_same_tag, r0_stall, m_in, early_req_row,
                            d_in)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out, d_out, m_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way, wb_in)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valid_bits, r0, replace_way, req_hit_way,
                         req_same_tag, r0_valid, req_op, cache_tags,
                         req_go, ra, wb_in)
        if LOG_LENGTH > 0:
            self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out,
                            req_op, d_out, wb_in, self.log_out)

        return m


# dcache_tb.vhdl
#
# entity dcache_tb is
# end dcache_tb;
#
# architecture behave of dcache_tb is
#     signal clk          : std_ulogic;
#     signal rst          : std_ulogic;
#
#     signal d_in         : Loadstore1ToDcacheType;
#     signal d_out        : DcacheToLoadstore1Type;
#
#     signal m_in         : MmuToDcacheType;
#     signal m_out        : DcacheToMmuType;
#
#     signal wb_bram_in   : wishbone_master_out;
#     signal wb_bram_out  : wishbone_slave_out;
#
#     constant clk_period : time := 10 ns;
# begin
#     dcache0: entity work.dcache
#         generic map(
#             LINE_SIZE => 64,
#             NUM_LINES => 4
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             d_in => d_in,
#             d_out => d_out,
#             m_in => m_in,
#             m_out => m_out,
#             wishbone_out => wb_bram_in,
#             wishbone_in => wb_bram_out
#             );
#
#     -- BRAM Memory slave
#     bram0: entity work.wishbone_bram_wrapper
#         generic map(
#             MEMORY_SIZE   => 1024,
#             RAM_INIT_FILE => "icache_test.bin"
#             )
#         port map(
#             clk => clk,
#             rst => rst,
#             wishbone_in => wb_bram_in,
#             wishbone_out => wb_bram_out
#             );
#
#     clk_process: process
#     begin
#         clk <= '0';
#         wait for clk_period/2;
#         clk <= '1';
#         wait for clk_period/2;
#     end process;
#
#     rst_process: process
#     begin
#         rst <= '1';
#         wait for 2*clk_period;
#         rst <= '0';
#         wait;
#     end process;
#
#     stim: process
#     begin
#         -- Clear stuff
#         d_in.valid <= '0';
#         d_in.load <= '0';
#         d_in.nc <= '0';
#         d_in.addr <= (others => '0');
#         d_in.data <= (others => '0');
#         m_in.valid <= '0';
#         m_in.addr <= (others => '0');
#         m_in.pte <= (others => '0');
#
#         wait for 4*clk_period;
#         wait until rising_edge(clk);
#
#         -- Cacheable read of address 4
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000004";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000100000000"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000100000000"
#             severity failure;
#         -- wait for clk_period;
#
#         -- Cacheable read of address 30
#         d_in.load <= '1';
#         d_in.nc <= '0';
#         d_in.addr <= x"0000000000000030";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000000D0000000C"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000000D0000000C"
#             severity failure;
#
#         -- Non-cacheable read of address 100
#         d_in.load <= '1';
#         d_in.nc <= '1';
#         d_in.addr <= x"0000000000000100";
#         d_in.valid <= '1';
#         wait until rising_edge(clk);
#         d_in.valid <= '0';
#         wait until rising_edge(clk) and d_out.valid = '1';
#         assert d_out.data = x"0000004100000040"
#             report "data @" & to_hstring(d_in.addr) &
#                 "=" & to_hstring(d_out.data) &
#                 " expected 0000004100000040"
#             severity failure;
#
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#         wait until rising_edge(clk);
#
#         std.env.finish;
#     end process;
# end;


def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield
    # wait_until rising_edge(clk)
    yield
    # Cacheable read of address 4
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0x0000000000000004)
    yield dut.d_in.valid.eq(1)
    # wait-until rising_edge(clk)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    addr = yield dut.d_in.addr
    data = yield dut.d_out.data
    assert data == 0x0000000100000000, \
        f"data @{addr:x}={data:x} expected 0000000100000000 " \
        "-!- severity failure"

    # Cacheable read of address 30
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0x0000000000000030)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    addr = yield dut.d_in.addr
    data = yield dut.d_out.data
    assert data == 0x0000000D0000000C, \
        f"data @{addr:x}={data:x} expected 0000000D0000000C " \
        "-!- severity failure"

    # Non-cacheable read of address 100
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(1)
    yield dut.d_in.addr.eq(0x0000000000000100)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    addr = yield dut.d_in.addr
    data = yield dut.d_out.data
    assert data == 0x0000004100000040, \
        f"data @{addr:x}={data:x} expected 0000004100000040 " \
        "-!- severity failure"

    yield
    yield
    yield
    yield


def test_dcache():
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    run_simulation(dut, dcache_sim(dut), vcd_name='test_dcache.vcd')


if __name__ == '__main__':
    test_dcache()