1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11
12 Links:
13
14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
16
17 """

import sys

from nmutil.gtkw import write_gtkw

sys.setrecursionlimit(1000000)

from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
                    Record, Memory)
from nmutil.util import Display
from nmigen.lib.coding import Decoder

from copy import deepcopy
from random import randint, seed

from nmigen_soc.wishbone.bus import Interface

from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
#from soc.experiment.plru import PLRU
from nmutil.plru import PLRU, PLRUs

# for test
from soc.bus.sram import SRAM
from nmigen.cli import rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator

from nmutil.util import wrap


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of sets
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
#     WB_DATA_BITS at a time so to save
#     resources we make the array only that wide, and
#     use consecutive indices to make a cache "line"
#
#     ROW_SIZE is the width in bytes of the BRAM
#     (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

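# Worked example with the defaults above (a sanity check, assuming
# WB_DATA_BITS=64): ROW_SIZE = 64//8 = 8 bytes, so one wishbone transfer
# fills exactly one BRAM row; ROW_PER_LINE = 64//8 = 8 transfers per
# cache line; BRAM_ROWS = 16*8 = 128 rows for the whole dcache.
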
print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

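# With the defaults (REAL_ADDR_BITS=56, NUM_LINES=16, LINE_SIZE=64,
# NUM_WAYS=4) these work out as: ROW_BITS=7, ROW_LINE_BITS=3,
# LINE_OFF_BITS=6, ROW_OFF_BITS=3, INDEX_BITS=4, SET_SIZE_BITS=10,
# TAG_BITS=46, TAG_WIDTH=48 (46 rounded up to a whole number of bytes)
# and WAY_BITS=2.  The diagram below instead shows 32 lines, hence
# INDEX_BITS=5 and TAG_BITS=45 there.
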
# Example of layout for 32 lines of 64 bytes:
layout = """\
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS  (3)
  ..         |     |--- - --| LINE_OFF_BITS (6)
  ..         |         |- --| ROW_OFF_BITS  (3)
  ..         |----- ---|    | ROW_BITS      (8)
  ..         |-----|        | INDEX_BITS    (5)
  ..  --------|             | TAG_BITS      (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print ("    TAG_WIDTH", TAG_WIDTH)
print ("     NUM_WAYS", NUM_WAYS)

def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS), # one valid bit per way
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                        for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

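# With the defaults: TLB_SET_BITS=6, TLB_WAY_BITS=1,
# TLB_EA_TAG_BITS = 64-(12+6) = 46 bits of effective-address tag,
# TLB_TAG_WAY_BITS = 2*46 = 92 and TLB_PTE_WAY_BITS = 2*64 = 128
# (all ways of one TLB set, concatenated side-by-side).
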
def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBHit(name):
    return Record([('valid', 1),
                   ('way', TLB_WAY_BITS)], name=name)

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))

def TLBRecord(name):
    tlb_layout = [('valid', TLB_NUM_WAYS),
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBValidArray():
    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
                 for x in range(TLB_SET_SIZE))

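# e.g. with the defaults, TLBValidArray is 64 entries (one per TLB set)
# of 2 bits each: one valid bit per TLB way.
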
def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                 for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                 for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                 for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

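# e.g. with ROW_LINE_BITS=3 only the low 3 bits go through the adder,
# so next_row(0b0101_111) wraps within the line to 0b0101_000.
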
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

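# Decode example with the default geometry (the address is illustrative):
# for addr = 0x12345678, get_index(addr) = addr[6:10] = 9 (line 9),
# get_row(addr) = addr[3:10] = 0x4f (BRAM row within the set), and
# get_tag(addr) = addr[10:56] = 0x48d15.
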
# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

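# e.g. with TAG_WIDTH=48 and TAG_BITS=46, way 1 occupies tagset bits
# [48:96) and the top two bits of each way are padding, hence the
# [:TAG_BITS] slice above.
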
# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)

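# note that the write helpers return nmigen assignments, so callers use
# them as e.g. "comb += write_tlb_pte(way, ptes, newpte)" below.
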

# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


# (currently an unused stub: tlb_search() extracts the PTE bits directly)
def extract_perm_attr(pte):
    pa = PermAttr()
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()   # indicates a tlbie request (from MMU)
        self.doall = Signal()   # with tlbie, indicates flush whole TLB
        self.tlbld = Signal()   # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)

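# note: the reservation is tracked at cache-line granularity, i.e.
# addr holds bits [LINE_OFF_BITS:64] of the effective address.
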

class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.dtlb = TLBValidArray()
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        dtlb, tlb_req_index = self.dtlb, self.tlb_req_index

        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print ("  TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("     TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)

        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)
        updated = Signal()
        v_updated = Signal()
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS)         # tlb_way_valids_t

        comb += dv.eq(dtlb[tlb_req_index])
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb[i].eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        with m.If(updated):
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
        with m.If(v_updated):
            sync += dtlb[tlb_req_index].eq(db_out)

        # select one TLB way
        r_tlb_way = TLBRecord("r_tlb_way")
        r_delay = Signal()
        sync += r_delay.eq(self.tlb_read)
        with m.If(self.tlb_read):
            sync += self.tlb_way.valid.eq(dtlb[self.tlb_read_index])
        with m.If(r_delay):
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_way,
                 cache_i_validdx, cache_tag_set,
                 req_addr,
                 hit_set):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="dcache")

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        # Sample data the cycle after a request comes in from loadstore1.
        # If another request has come in already then the data will get
        # put directly into req.data below.
        with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                  ~r0.mmu_req):
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                r.req.virt_mode, r.req.addr,
                                r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_way, dtlb):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        # Binary-to-Unary one-hot, enabled by tlb_hit valid
        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
        m.submodules.tlb_plrus = tlb_plrus
        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
        comb += tlb_plrus.isel.eq(tlb_req_index)      # select victim
        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
                   tlb_hit, tlb_plru_victim, tlb_way):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        d = self.dtlb_update

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_tag_way.eq(tlb_way.tag)
        comb += d.tlb_pte_way.eq(tlb_way.pte)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim)
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
        comb += plrus.way.eq(r1.hit_way)
        comb += plrus.valid.eq(r1.cache_hit)
        comb += plrus.index.eq(r1.hit_index)
        comb += plrus.isel.eq(r1.store_index)     # select victim
        comb += plru_victim.eq(plrus.o_index)     # selected victim

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index].tag)

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Array(Signal(name="hit_set_%d" % i) \
                        for i in range(TLB_NUM_WAYS))
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr,
                                            hit_set)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                    valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                    valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
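                # opsel is Cat(is_hit, nc, load) read LSB-first, e.g.
                # 0b101 = cacheable load that hit, 0b100 = cacheable
                # load miss; the 0b011/0b111 (NC hit) cases are
                # paradoxes and map to OP_BAD below.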
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # DCache output to LoadStore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
             "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
             "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        # Binary-to-Unary one-hot decoders here. the replace-way one-hot is
        # gated (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                           ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time.  that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals
        do_read = Signal()
        wr_addr = Signal(ROW_BITS)
        wr_data = Signal(WB_DATA_BITS)
        wr_sel = Signal(ROW_SIZE)
        rd_addr = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        req = MemAccessRequest("mreq_ds")

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            replace_way_onehot = Signal(NUM_WAYS)
            comb += replace_way_onehot.eq(1<<replace_way)
            for i in range(NUM_WAYS):
                with m.If(replace_way_onehot[i]):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index].tag)
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].tag.eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word?  We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(req.valid & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS) # per-way valid bits
                        comb += cv.eq(cache_tags[r1.store_index].valid)
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_tags[r1.store_index].valid.eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                acks = Signal(3)
                adjust_acks = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out, req_op):

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        # note: r1.wb.adr is the row address, so its low 3 bits
        # correspond to real-address bits [3:6]
        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               r1.wb.adr[:3]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_way = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = TLBHit("tlb_hit")
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule TLBUpdate
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
        dtlb = self.dtlb_update.dtlb

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way, dtlb)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
                        tlb_hit, tlb_plru_victim,
                        tlb_way)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit, self.stall_out, req_op)

        return m


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)
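
    # Optional smoke-test sketch: elaborate under the simulator and run a
    # few idle cycles.  This only checks that the design elaborates and
    # ticks (no d_in/m_in requests are driven); pass "sim" on the command
    # line to enable it.
    if 'sim' in sys.argv:
        sim = Simulator(DCache())
        sim.add_clock(1e-9)

        def process():
            for _ in range(10):
                yield  # idle cycles: d_in/m_in stay invalid
        sim.add_sync_process(process)
        with sim.write_vcd("test_dcache.vcd"):
            sim.run()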