1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
10 try:
11 from nmigen.hdl.ast import Display
12 except ImportError:
13 def Display(*args):
14 return []
15
16 from nmigen.cli import main
17 from nmutil.iocontrol import RecordObject
18 from nmutil.util import wrap
19 from nmigen.utils import log2_int
20 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
21 DCacheToLoadStore1Type,
22 MMUToDCacheType,
23 DCacheToMMUType)
24
25 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
26 WBAddrType, WBDataType, WBSelType,
27 WBMasterOut, WBSlaveOut,
28 WBMasterOutVector, WBSlaveOutVector,
29 WBIOMasterOut, WBIOSlaveOut)
30
31 from soc.experiment.cache_ram import CacheRam
32 from soc.experiment.plru import PLRU
33
34 # for test
35 from nmigen_soc.wishbone.sram import SRAM
36 from nmigen import Memory
37 from nmigen.cli import rtlil
38 if True:
39 from nmigen.back.pysim import Simulator, Delay, Settle
40 else:
41 from nmigen.sim.cxxsim import Simulator, Delay, Settle
42
43
44 # TODO: make these parameters of DCache at some point
45 LINE_SIZE = 64 # Line size in bytes
46 NUM_LINES = 16 # Number of lines in a set
47 NUM_WAYS = 4 # Number of ways
48 TLB_SET_SIZE = 64 # L1 DTLB entries per set
49 TLB_NUM_WAYS = 2 # L1 DTLB number of sets
50 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
51 LOG_LENGTH = 0 # Non-zero to enable log data collection
52
53 # BRAM organisation: We never access more than
54 # WB_DATA_BITS at a time so, to save resources,
55 # we make the array only that wide, and use
56 # consecutive indices to make a cache "line"
57 #
58 # ROW_SIZE is the width in bytes of the BRAM
59 # (based on WB, so 64-bits)
60 ROW_SIZE = WB_DATA_BITS // 8
61
62 # ROW_PER_LINE is the number of rows (wishbone
63 # transactions) in a line
64 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
65
66 # BRAM_ROWS is the number of rows in BRAM needed
67 # to represent the full dcache
68 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
69
70
71 # Bit fields counts in the address
72
73 # REAL_ADDR_BITS is the number of real address
74 # bits that we store
75 REAL_ADDR_BITS = 56
76
77 # ROW_BITS is the number of bits to select a row
78 ROW_BITS = log2_int(BRAM_ROWS)
79
80 # ROW_LINE_BITS is the number of bits to select
81 # a row within a line
82 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
83
84 # LINE_OFF_BITS is the number of bits for
85 # the offset in a cache line
86 LINE_OFF_BITS = log2_int(LINE_SIZE)
87
88 # ROW_OFF_BITS is the number of bits for
89 # the offset in a row
90 ROW_OFF_BITS = log2_int(ROW_SIZE)
91
92 # INDEX_BITS is the number of bits to
93 # select a cache line
94 INDEX_BITS = log2_int(NUM_LINES)
95
96 # SET_SIZE_BITS is the log base 2 of the set size
97 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
98
99 # TAG_BITS is the number of bits of
100 # the tag part of the address
101 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
102
103 # TAG_WIDTH is the width in bits of each way of the tag RAM
104 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
105
106 # WAY_BITS is the number of bits to select a way
107 WAY_BITS = log2_int(NUM_WAYS)
108
109 # Example of layout for 32 lines of 64 bytes:
110 #
111 # .. tag |index| line |
112 # .. | row | |
113 # .. | |---| | ROW_LINE_BITS (3)
114 # .. | |--- - --| LINE_OFF_BITS (6)
115 # .. | |- --| ROW_OFF_BITS (3)
116 # .. |----- ---| | ROW_BITS (8)
117 # .. |-----| | INDEX_BITS (5)
118 # .. --------| | TAG_BITS (45)
119
120 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
121
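# Worked example (illustrative note, not in the original microwatt source):
# with the default geometry above (LINE_SIZE=64, NUM_LINES=16, NUM_WAYS=4,
# WB_DATA_BITS=64, REAL_ADDR_BITS=56) the derived constants come out as:
#   ROW_SIZE = 8, ROW_PER_LINE = 8, BRAM_ROWS = 128
#   ROW_BITS = 7, ROW_LINE_BITS = 3, ROW_OFF_BITS = 3
#   LINE_OFF_BITS = 6, INDEX_BITS = 4, SET_SIZE_BITS = 10
#   TAG_BITS = 46, TAG_WIDTH = 48 (rounded up to a whole byte)
#   WAY_BITS = 2, TAG_RAM_WIDTH = 192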
122 def CacheTagArray():
123 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
124 for x in range(NUM_LINES))
125
126 def CacheValidBitsArray(): # one valid bit per way, for each line
127 return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
128 for x in range(NUM_LINES))
129
130 def RowPerLineValidArray():
131 return Array(Signal(name="rows_valid%d" % x) \
132 for x in range(ROW_PER_LINE))
133
134 # L1 TLB
135 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
136 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
137 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
138 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
139 TLB_PTE_BITS = 64
140 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
141
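# With the TLB defaults above (TLB_SET_SIZE=64, TLB_NUM_WAYS=2,
# TLB_LG_PGSZ=12) these work out to: TLB_SET_BITS=6, TLB_WAY_BITS=1,
# TLB_EA_TAG_BITS=46, TLB_TAG_WAY_BITS=92, TLB_PTE_WAY_BITS=128.
# (purely illustrative note, computed from the constants above)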
142 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
143 assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, "LINE_SIZE not power of 2"
144 assert (NUM_LINES & (NUM_LINES - 1)) == 0, "NUM_LINES not power of 2"
145 assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, "ROW_PER_LINE not power of 2"
146 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
147 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
148 "geometry bits don't add up"
149 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
150 "geometry bits don't add up"
151 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
152 "geometry bits don't add up"
153 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
154 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
155
156
157 def TLBValidBitsArray():
158 return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
159
160 def TLBTagEAArray():
161 return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
162
163 def TLBTagsArray():
164 return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
165
166 def TLBPtesArray():
167 return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
168
169 def HitWaySet():
170 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
171 for x in range(TLB_NUM_WAYS))
172
173 # Cache RAM interface
174 def CacheRamOut():
175 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
176 for x in range(NUM_WAYS))
177
178 # PLRU output interface
179 def PLRUOut():
180 return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
181
182 # TLB PLRU output interface
183 def TLBPLRUOut():
184 return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
185
186 # Helper functions to decode incoming requests
187 #
188 # Return the cache line index (tag index) for an address
189 def get_index(addr):
190 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
191
192 # Return the cache row index (data memory) for an address
193 def get_row(addr):
194 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
195
196 # Return the index of a row within a line
197 def get_row_of_line(row):
198 return row[:ROW_LINE_BITS]
199
200 # Returns whether this is the last row of a line
201 def is_last_row_addr(addr, last):
202 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
203
204 # Returns whether this is the last row of a line
205 def is_last_row(row, last):
206 return get_row_of_line(row) == last
207
208 # Return the next row in the current cache line. We use a
209 # dedicated function in order to limit the size of the
210 # generated adder to be only the bits within a cache line
211 # (3 bits with default settings)
212 def next_row(row):
213 row_v = row[0:ROW_LINE_BITS] + 1
214 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
215
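# Illustrative example (assuming ROW_LINE_BITS=3): next_row(0b0100111)
# wraps the low 3 bits from 7 back to 0 and leaves the upper row bits
# alone, returning 0b0100000, i.e. the increment never carries out of
# the cache line.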
216 # Get the tag value from the address
217 def get_tag(addr):
218 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
219
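# Illustrative decode (default geometry, address 0x530 as used by the
# testbench below): get_index(0x530) = 4, get_row(0x530) = 38,
# get_row_of_line(38) = 6, get_tag(0x530) = 1.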
220 # Read a tag from a tag memory row
221 def read_tag(way, tagset):
222 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
223
224 # Read a TLB tag from a TLB tag memory row
225 def read_tlb_tag(way, tags):
226 return tags.word_select(way, TLB_EA_TAG_BITS)
227
228 # Write a TLB tag to a TLB tag memory row
229 def write_tlb_tag(way, tags, tag):
230 return read_tlb_tag(way, tags).eq(tag)
231
232 # Read a PTE from a TLB PTE memory row
233 def read_tlb_pte(way, ptes):
234 return ptes.word_select(way, TLB_PTE_BITS)
235
236 def write_tlb_pte(way, ptes, newpte):
237 return read_tlb_pte(way, ptes).eq(newpte)
238
239
240 # Record for storing permission, attribute, etc. bits from a PTE
241 class PermAttr(RecordObject):
242 def __init__(self, name=None):
243 super().__init__(name=name)
244 self.reference = Signal()
245 self.changed = Signal()
246 self.nocache = Signal()
247 self.priv = Signal()
248 self.rd_perm = Signal()
249 self.wr_perm = Signal()
250
251
252 def extract_perm_attr(pte):
253 pa = PermAttr()
254 pa.reference = pte[8]
255 pa.changed = pte[7]
256 pa.nocache = pte[5]
257 pa.priv = pte[3]
258 pa.rd_perm = pte[2]
259 pa.wr_perm = pte[1]
260 return pa
261
262
263 # Type of operation on a "valid" input
264 @unique
265 class Op(Enum):
266 OP_NONE = 0
267 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
268 OP_STCX_FAIL = 2 # conditional store w/o reservation
269 OP_LOAD_HIT = 3 # Cache hit on load
270 OP_LOAD_MISS = 4 # Load missing cache
271 OP_LOAD_NC = 5 # Non-cachable load
272 OP_STORE_HIT = 6 # Store hitting cache
273 OP_STORE_MISS = 7 # Store missing cache
274
275
276 # Cache state machine
277 @unique
278 class State(Enum):
279 IDLE = 0 # Normal load hit processing
280 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
281 STORE_WAIT_ACK = 2 # Store wait ack
282 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
283
284
285 # Dcache operations:
286 #
287 # In order to make timing, we use the BRAMs with
288 # an output buffer, which means that the BRAM
289 # output is delayed by an extra cycle.
290 #
291 # Thus, the dcache has a 2-stage internal pipeline
292 # for cache hits with no stalls.
293 #
294 # All other operations are handled via stalling
295 # in the first stage.
296 #
297 # The second stage can thus complete a hit at the same
298 # time as the first stage emits a stall for a complex op.
299 #
300 # Stage 0 register, basically contains just the latched request
301
302 class RegStage0(RecordObject):
303 def __init__(self, name=None):
304 super().__init__(name=name)
305 self.req = LoadStore1ToDCacheType(name="lsmem")
306 self.tlbie = Signal()
307 self.doall = Signal()
308 self.tlbld = Signal()
309 self.mmu_req = Signal() # indicates source of request
310
311
312 class MemAccessRequest(RecordObject):
313 def __init__(self, name=None):
314 super().__init__(name=name)
315 self.op = Signal(Op)
316 self.valid = Signal()
317 self.dcbz = Signal()
318 self.real_addr = Signal(REAL_ADDR_BITS)
319 self.data = Signal(64)
320 self.byte_sel = Signal(8)
321 self.hit_way = Signal(WAY_BITS)
322 self.same_tag = Signal()
323 self.mmu_req = Signal()
324
325
326 # First stage register, contains state for stage 1 of load hits
327 # and for the state machine used by all other operations
328 class RegStage1(RecordObject):
329 def __init__(self, name=None):
330 super().__init__(name=name)
331 # Info about the request
332 self.full = Signal() # have uncompleted request
333 self.mmu_req = Signal() # request is from MMU
334 self.req = MemAccessRequest(name="reqmem")
335
336 # Cache hit state
337 self.hit_way = Signal(WAY_BITS)
338 self.hit_load_valid = Signal()
339 self.hit_index = Signal(INDEX_BITS)
340 self.cache_hit = Signal()
341
342 # TLB hit state
343 self.tlb_hit = Signal()
344 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
345 self.tlb_hit_index = Signal(TLB_WAY_BITS)
346
347 # 2-stage data buffer for data forwarded from writes to reads
348 self.forward_data1 = Signal(64)
349 self.forward_data2 = Signal(64)
350 self.forward_sel1 = Signal(8)
351 self.forward_valid1 = Signal()
352 self.forward_way1 = Signal(WAY_BITS)
353 self.forward_row1 = Signal(ROW_BITS)
354 self.use_forward1 = Signal()
355 self.forward_sel = Signal(8)
356
357 # Cache miss state (reload state machine)
358 self.state = Signal(State)
359 self.dcbz = Signal()
360 self.write_bram = Signal()
361 self.write_tag = Signal()
362 self.slow_valid = Signal()
363 self.wb = WBMasterOut()
364 self.reload_tag = Signal(TAG_BITS)
365 self.store_way = Signal(WAY_BITS)
366 self.store_row = Signal(ROW_BITS)
367 self.store_index = Signal(INDEX_BITS)
368 self.end_row_ix = Signal(ROW_LINE_BITS)
369 self.rows_valid = RowPerLineValidArray()
370 self.acks_pending = Signal(3)
371 self.inc_acks = Signal()
372 self.dec_acks = Signal()
373
374 # Signals to complete (possibly with error)
375 self.ls_valid = Signal()
376 self.ls_error = Signal()
377 self.mmu_done = Signal()
378 self.mmu_error = Signal()
379 self.cache_paradox = Signal()
380
381 # Signal to complete a failed stcx.
382 self.stcx_fail = Signal()
383
384
385 # Reservation information
386 class Reservation(RecordObject):
387 def __init__(self):
388 super().__init__()
389 self.valid = Signal()
390 self.addr = Signal(64-LINE_OFF_BITS)
391
392
393 class DTLBUpdate(Elaboratable):
394 def __init__(self):
395 self.tlbie = Signal()
396 self.tlbwe = Signal()
397 self.doall = Signal()
398 self.updated = Signal()
399 self.v_updated = Signal()
400 self.tlb_hit = Signal()
401 self.tlb_req_index = Signal(TLB_SET_BITS)
402
403 self.tlb_hit_way = Signal(TLB_WAY_BITS)
404 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
405 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
406 self.repl_way = Signal(TLB_WAY_BITS)
407 self.eatag = Signal(TLB_EA_TAG_BITS)
408 self.pte_data = Signal(TLB_PTE_BITS)
409
410 self.dv = Signal(TLB_NUM_WAYS) # current valid bits for the set
411 
412 self.tb_out = Signal(TLB_TAG_WAY_BITS) # tag write data
413 self.pb_out = Signal(TLB_PTE_WAY_BITS) # pte write data
414 self.db_out = Signal(TLB_NUM_WAYS) # valid-bit write data
415
416 def elaborate(self, platform):
417 m = Module()
418 comb = m.d.comb
419 sync = m.d.sync
420
421 tagset = Signal(TLB_TAG_WAY_BITS)
422 pteset = Signal(TLB_PTE_WAY_BITS)
423
424 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
425
426 with m.If(self.tlbie & self.doall):
427 pass # clear all back in parent
428 with m.Elif(self.tlbie):
429 with m.If(self.tlb_hit):
430 comb += db_out.eq(self.dv)
431 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0) # invalidate hit way
432 comb += self.v_updated.eq(1)
433
434 with m.Elif(self.tlbwe):
435
436 comb += tagset.eq(self.tlb_tag_way)
437 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
438 comb += tb_out.eq(tagset)
439
440 comb += pteset.eq(self.tlb_pte_way)
441 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
442 comb += pb_out.eq(pteset)
443
444 comb += [db_out.eq(self.dv), db_out.bit_select(self.repl_way, 1).eq(1)] # keep other ways valid
445
446 comb += self.updated.eq(1)
447 comb += self.v_updated.eq(1)
448
449 return m
450
461
462 class DCachePendingHit(Elaboratable):
463
464 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
465 cache_valid_idx, cache_tag_set,
466 req_addr,
467 hit_set):
468
469 self.go = Signal()
470 self.virt_mode = Signal()
471 self.is_hit = Signal()
472 self.tlb_hit = Signal()
473 self.hit_way = Signal(WAY_BITS)
474 self.rel_match = Signal()
475 self.req_index = Signal(INDEX_BITS)
476 self.reload_tag = Signal(TAG_BITS)
477
478 self.tlb_hit_way = tlb_hit_way
479 self.tlb_pte_way = tlb_pte_way
480 self.tlb_valid_way = tlb_valid_way
481 self.cache_valid_idx = cache_valid_idx
482 self.cache_tag_set = cache_tag_set
483 self.req_addr = req_addr
484 self.hit_set = hit_set
485
486 def elaborate(self, platform):
487 m = Module()
488 comb = m.d.comb
489 sync = m.d.sync
490
491 go = self.go
492 virt_mode = self.virt_mode
493 is_hit = self.is_hit
494 tlb_pte_way = self.tlb_pte_way
495 tlb_valid_way = self.tlb_valid_way
496 cache_valid_idx = self.cache_valid_idx
497 cache_tag_set = self.cache_tag_set
498 req_addr = self.req_addr
499 tlb_hit_way = self.tlb_hit_way
500 tlb_hit = self.tlb_hit
501 hit_set = self.hit_set
502 hit_way = self.hit_way
503 rel_match = self.rel_match
504 req_index = self.req_index
505 reload_tag = self.reload_tag
506
507 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
508 for i in range(TLB_NUM_WAYS))
509 hit_way_set = HitWaySet()
510
511 # Test if pending request is a hit on any way
512 # In order to make timing in virtual mode,
513 # when we are using the TLB, we compare each
514 # way with each of the real addresses from each way of
515 # the TLB, and then decide later which match to use.
516
517 with m.If(virt_mode):
518 for j in range(TLB_NUM_WAYS):
519 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
520 s_hit = Signal()
521 s_pte = Signal(TLB_PTE_BITS)
522 s_ra = Signal(REAL_ADDR_BITS)
523 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
524 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
525 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
526 comb += s_tag.eq(get_tag(s_ra))
527
528 for i in range(NUM_WAYS):
529 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
530 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
531 (read_tag(i, cache_tag_set) == s_tag)
532 & tlb_valid_way[j])
533 with m.If(is_tag_hit):
534 comb += hit_way_set[j].eq(i)
535 comb += s_hit.eq(1)
536 comb += hit_set[j].eq(s_hit)
537 with m.If(s_tag == reload_tag):
538 comb += rel_matches[j].eq(1)
539 with m.If(tlb_hit):
540 comb += is_hit.eq(hit_set[tlb_hit_way])
541 comb += hit_way.eq(hit_way_set[tlb_hit_way])
542 comb += rel_match.eq(rel_matches[tlb_hit_way])
543 with m.Else():
544 s_tag = Signal(TAG_BITS)
545 comb += s_tag.eq(get_tag(req_addr))
546 for i in range(NUM_WAYS):
547 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
548 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
549 (read_tag(i, cache_tag_set) == s_tag))
550 with m.If(is_tag_hit):
551 comb += hit_way.eq(i)
552 comb += is_hit.eq(1)
553 with m.If(s_tag == reload_tag):
554 comb += rel_match.eq(1)
555
556 return m
557
558
559 class DCache(Elaboratable):
560 """Set associative dcache write-through
561 TODO (in no specific order):
562 * See list in icache.vhdl
563 * Complete load misses on the cycle when WB data comes instead of
564 at the end of line (this requires dealing with requests coming in
565 while not idle...)
566 """
567 def __init__(self):
568 self.d_in = LoadStore1ToDCacheType("d_in")
569 self.d_out = DCacheToLoadStore1Type("d_out")
570
571 self.m_in = MMUToDCacheType("m_in")
572 self.m_out = DCacheToMMUType("m_out")
573
574 self.stall_out = Signal()
575
576 self.wb_out = WBMasterOut()
577 self.wb_in = WBSlaveOut()
578
579 self.log_out = Signal(20)
580
581 def stage_0(self, m, r0, r1, r0_full):
582 """Latch the request in r0.req as long as we're not stalling
583 """
584 comb = m.d.comb
585 sync = m.d.sync
586 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
587
588 r = RegStage0("stage0")
589
590 # TODO, this goes in unit tests and formal proofs
591 with m.If(~(d_in.valid & m_in.valid)):
592 #sync += Display("request collision loadstore vs MMU")
593 pass
594
595 with m.If(m_in.valid):
596 sync += r.req.valid.eq(1)
597 sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
598 sync += r.req.dcbz.eq(0)
599 sync += r.req.nc.eq(0)
600 sync += r.req.reserve.eq(0)
601 sync += r.req.virt_mode.eq(1)
602 sync += r.req.priv_mode.eq(1)
603 sync += r.req.addr.eq(m_in.addr)
604 sync += r.req.data.eq(m_in.pte)
605 sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
606 sync += r.tlbie.eq(m_in.tlbie)
607 sync += r.doall.eq(m_in.doall)
608 sync += r.tlbld.eq(m_in.tlbld)
609 sync += r.mmu_req.eq(1)
610 with m.Else():
611 sync += r.req.eq(d_in)
612 sync += r.tlbie.eq(0)
613 sync += r.doall.eq(0)
614 sync += r.tlbld.eq(0)
615 sync += r.mmu_req.eq(0)
616 with m.If(~(r1.full & r0_full)):
617 sync += r0.eq(r)
618 sync += r0_full.eq(r.req.valid)
619
620 def tlb_read(self, m, r0_stall, tlb_valid_way,
621 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
622 dtlb_tags, dtlb_ptes):
623 """TLB
624 Operates in the second cycle on the request latched in r0.req.
625 TLB updates write the entry at the end of the second cycle.
626 """
627 comb = m.d.comb
628 sync = m.d.sync
629 m_in, d_in = self.m_in, self.d_in
630
631 index = Signal(TLB_SET_BITS)
632 addrbits = Signal(TLB_SET_BITS)
633
634 amin = TLB_LG_PGSZ
635 amax = TLB_LG_PGSZ + TLB_SET_BITS
636
637 with m.If(m_in.valid):
638 comb += addrbits.eq(m_in.addr[amin : amax])
639 with m.Else():
640 comb += addrbits.eq(d_in.addr[amin : amax])
641 comb += index.eq(addrbits)
642
643 # If we have any op and the previous op isn't finished,
644 # then keep the same output for next cycle.
645 with m.If(~r0_stall):
646 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
647 sync += tlb_tag_way.eq(dtlb_tags[index])
648 sync += tlb_pte_way.eq(dtlb_ptes[index])
649
650 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
651 """Generate TLB PLRUs
652 """
653 comb = m.d.comb
654 sync = m.d.sync
655
656 if TLB_NUM_WAYS == 0:
657 return
658 for i in range(TLB_SET_SIZE):
659 # TLB PLRU interface
660 tlb_plru = PLRU(TLB_WAY_BITS)
661 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
662 tlb_plru_acc_en = Signal()
663
664 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
665 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
666 comb += tlb_plru.acc.eq(r1.tlb_hit_way)
667 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
668
669 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
670 tlb_valid_way, tlb_tag_way, tlb_hit_way,
671 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
672
673 comb = m.d.comb
674 sync = m.d.sync
675
676 hitway = Signal(TLB_WAY_BITS)
677 hit = Signal()
678 eatag = Signal(TLB_EA_TAG_BITS)
679
680 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
681 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
682 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
683
684 for i in range(TLB_NUM_WAYS):
685 is_tag_hit = Signal()
686 comb += is_tag_hit.eq(tlb_valid_way[i]
687 & (read_tlb_tag(i, tlb_tag_way) == eatag))
688 with m.If(is_tag_hit):
689 comb += hitway.eq(i)
690 comb += hit.eq(1)
691
692 comb += tlb_hit.eq(hit & r0_valid)
693 comb += tlb_hit_way.eq(hitway)
694
695 with m.If(tlb_hit):
696 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
697 with m.Else():
698 comb += pte.eq(0)
699 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
700 with m.If(r0.req.virt_mode):
701 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
702 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
703 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
704 comb += perm_attr.eq(extract_perm_attr(pte))
705 with m.Else():
706 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
707 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
708
709 comb += perm_attr.reference.eq(1)
710 comb += perm_attr.changed.eq(1)
711 comb += perm_attr.nocache.eq(0)
712 comb += perm_attr.priv.eq(1)
713 comb += perm_attr.rd_perm.eq(1)
714 comb += perm_attr.wr_perm.eq(1)
715
716 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
717 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
718 dtlb_tags, tlb_pte_way, dtlb_ptes):
719
720 comb = m.d.comb
721 sync = m.d.sync
722
723 tlbie = Signal()
724 tlbwe = Signal()
725
726 comb += tlbie.eq(r0_valid & r0.tlbie)
727 comb += tlbwe.eq(r0_valid & r0.tlbld)
728
729 m.submodules.tlb_update = d = DTLBUpdate()
730 with m.If(tlbie & r0.doall):
731 # clear all valid bits at once
732 for i in range(TLB_SET_SIZE):
733 sync += dtlb_valid_bits[i].eq(0)
734 with m.If(d.updated):
735 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
736 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
737 with m.If(d.v_updated):
738 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
739
740 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
741
742 comb += d.tlbie.eq(tlbie)
743 comb += d.tlbwe.eq(tlbwe)
744 comb += d.doall.eq(r0.doall)
745 comb += d.tlb_hit.eq(tlb_hit)
746 comb += d.tlb_hit_way.eq(tlb_hit_way)
747 comb += d.tlb_tag_way.eq(tlb_tag_way)
748 comb += d.tlb_pte_way.eq(tlb_pte_way)
749 comb += d.tlb_req_index.eq(tlb_req_index)
750
751 with m.If(tlb_hit):
752 comb += d.repl_way.eq(tlb_hit_way)
753 with m.Else():
754 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
755 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
756 comb += d.pte_data.eq(r0.req.data)
757
758 def maybe_plrus(self, m, r1, plru_victim):
759 """Generate PLRUs
760 """
761 comb = m.d.comb
762 sync = m.d.sync
763
764 if NUM_WAYS == 0:
765 return
766
767 for i in range(NUM_LINES):
768 # PLRU interface
769 plru = PLRU(WAY_BITS)
770 setattr(m.submodules, "plru%d" % i, plru)
771 plru_acc_en = Signal()
772
773 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
774 comb += plru.acc_en.eq(plru_acc_en)
775 comb += plru.acc.eq(r1.hit_way)
776 comb += plru_victim[i].eq(plru.lru_o)
777
778 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
779 """Cache tag RAM read port
780 """
781 comb = m.d.comb
782 sync = m.d.sync
783 m_in, d_in = self.m_in, self.d_in
784
785 index = Signal(INDEX_BITS)
786
787 with m.If(r0_stall):
788 comb += index.eq(req_index)
789 with m.Elif(m_in.valid):
790 comb += index.eq(get_index(m_in.addr))
791 with m.Else():
792 comb += index.eq(get_index(d_in.addr))
793 sync += cache_tag_set.eq(cache_tags[index])
794
795 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
796 r0_valid, r1, cache_valid_bits, replace_way,
797 use_forward1_next, use_forward2_next,
798 req_hit_way, plru_victim, rc_ok, perm_attr,
799 valid_ra, perm_ok, access_ok, req_op, req_go,
800 tlb_pte_way,
801 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
802 cancel_store, req_same_tag, r0_stall, early_req_row):
803 """Cache request parsing and hit detection
804 """
805
806 comb = m.d.comb
807 sync = m.d.sync
808 m_in, d_in = self.m_in, self.d_in
809
810 is_hit = Signal()
811 hit_way = Signal(WAY_BITS)
812 op = Signal(Op)
813 opsel = Signal(3)
814 go = Signal()
815 nc = Signal()
816 hit_set = Array(Signal(name="hit_set_%d" % i) \
817 for i in range(TLB_NUM_WAYS))
818 cache_valid_idx = Signal(NUM_WAYS) # one valid bit per way
819
820 # Extract line, row and tag from request
821 comb += req_index.eq(get_index(r0.req.addr))
822 comb += req_row.eq(get_row(r0.req.addr))
823 comb += req_tag.eq(get_tag(ra))
824
825 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
826 comb += cache_valid_idx.eq(cache_valid_bits[req_index])
827
828 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
829 tlb_valid_way, tlb_hit_way,
830 cache_valid_idx, cache_tag_set,
831 r0.req.addr,
832 hit_set)
833
834 comb += dc.tlb_hit.eq(tlb_hit)
835 comb += dc.reload_tag.eq(r1.reload_tag)
836 comb += dc.virt_mode.eq(r0.req.virt_mode)
837 comb += dc.go.eq(go)
838 comb += dc.req_index.eq(req_index)
839 comb += is_hit.eq(dc.is_hit)
840 comb += hit_way.eq(dc.hit_way)
841 comb += req_same_tag.eq(dc.rel_match)
842
843 # See if the request matches the line currently being reloaded
844 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
845 (req_index == r1.store_index) & req_same_tag):
846 # For a store, consider this a hit even if the row isn't
847 # valid since it will be by the time we perform the store.
848 # For a load, check the appropriate row valid bit.
849 valid = r1.rows_valid[req_row % ROW_PER_LINE]
850 comb += is_hit.eq(~r0.req.load | valid)
851 comb += hit_way.eq(replace_way)
852
853 # Whether to use forwarded data for a load or not
854 with m.If((get_row(r1.req.real_addr) == req_row) &
855 (r1.req.hit_way == hit_way)):
856 # Only need to consider r1.write_bram here, since if we
857 # are writing refill data here, then we don't have a
858 # cache hit this cycle on the line being refilled.
859 # (There is the possibility that the load following the
860 # load miss that started the refill could be to the old
861 # contents of the victim line, since it is a couple of
862 # cycles after the refill starts before we see the updated
863 # cache tag. In that case we don't use the bypass.)
864 comb += use_forward1_next.eq(r1.write_bram)
865 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
866 comb += use_forward2_next.eq(r1.forward_valid1)
867
868 # The way that matched on a hit
869 comb += req_hit_way.eq(hit_way)
870
871 # The way to replace on a miss
872 with m.If(r1.write_tag):
873 comb += replace_way.eq(plru_victim[r1.store_index])
874 with m.Else():
875 comb += replace_way.eq(r1.store_way)
876
877 # work out whether we have permission for this access
878 # NB we don't yet implement AMR, thus no KUAP
879 comb += rc_ok.eq(perm_attr.reference
880 & (r0.req.load | perm_attr.changed)
881 )
882 comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
883 (perm_attr.wr_perm |
884 (r0.req.load & perm_attr.rd_perm)))
885 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
886 # Combine the request and cache hit status to decide what
887 # operation needs to be done
888 comb += nc.eq(r0.req.nc | perm_attr.nocache)
889 comb += op.eq(Op.OP_NONE)
890 with m.If(go):
891 with m.If(~access_ok):
892 comb += op.eq(Op.OP_BAD)
893 with m.Elif(cancel_store):
894 comb += op.eq(Op.OP_STCX_FAIL)
895 with m.Else():
896 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
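# opsel encoding (from the Cat() above): bit 0 = is_hit, bit 1 = nc,
# bit 2 = load; e.g. 0b101 is a cacheable load hit and 0b100 a
# cacheable load miss (explanatory comment, not in the original)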
897 with m.Switch(opsel):
898 with m.Case(0b101):
899 comb += op.eq(Op.OP_LOAD_HIT)
900 with m.Case(0b100):
901 comb += op.eq(Op.OP_LOAD_MISS)
902 with m.Case(0b110):
903 comb += op.eq(Op.OP_LOAD_NC)
904 with m.Case(0b001):
905 comb += op.eq(Op.OP_STORE_HIT)
906 with m.Case(0b000):
907 comb += op.eq(Op.OP_STORE_MISS)
908 with m.Case(0b010):
909 comb += op.eq(Op.OP_STORE_MISS)
910 with m.Case(0b011):
911 comb += op.eq(Op.OP_BAD)
912 with m.Case(0b111):
913 comb += op.eq(Op.OP_BAD)
914 with m.Default():
915 comb += op.eq(Op.OP_NONE)
916 comb += req_op.eq(op)
917 comb += req_go.eq(go)
918
919 # Version of the row number that is valid one cycle earlier
920 # in the cases where we need to read the cache data BRAM.
921 # If we're stalling then we need to keep reading the last
922 # row requested.
923 with m.If(~r0_stall):
924 with m.If(m_in.valid):
925 comb += early_req_row.eq(get_row(m_in.addr))
926 with m.Else():
927 comb += early_req_row.eq(get_row(d_in.addr))
928 with m.Else():
929 comb += early_req_row.eq(req_row)
930
931 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
932 r0_valid, r0, reservation):
933 """Handle load-with-reservation and store-conditional instructions
934 """
935 comb = m.d.comb
936 sync = m.d.sync
937
938 with m.If(r0_valid & r0.req.reserve):
939
940 # XXX generate alignment interrupt if address
941 # is not aligned XXX or if r0.req.nc = '1'
942 with m.If(r0.req.load):
943 comb += set_rsrv.eq(1) # load with reservation
944 with m.Else():
945 comb += clear_rsrv.eq(1) # store conditional
946 with m.If(~reservation.valid | (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
947 comb += cancel_store.eq(1)
948
949 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
950 reservation, r0):
951
952 comb = m.d.comb
953 sync = m.d.sync
954
955 with m.If(r0_valid & access_ok):
956 with m.If(clear_rsrv):
957 sync += reservation.valid.eq(0)
958 with m.Elif(set_rsrv):
959 sync += reservation.valid.eq(1)
960 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
961
962 def writeback_control(self, m, r1, cache_out):
963 """Return data for loads & completion control logic
964 """
965 comb = m.d.comb
966 sync = m.d.sync
967 d_out, m_out = self.d_out, self.m_out
968
969 data_out = Signal(64)
970 data_fwd = Signal(64)
971
972 # Use the bypass if are reading the row that was
973 # written 1 or 2 cycles ago, including for the
974 # slow_valid = 1 case (i.e. completing a load
975 # miss or a non-cacheable load).
976 with m.If(r1.use_forward1):
977 comb += data_fwd.eq(r1.forward_data1)
978 with m.Else():
979 comb += data_fwd.eq(r1.forward_data2)
980
981 comb += data_out.eq(cache_out[r1.hit_way])
982
983 for i in range(8):
984 with m.If(r1.forward_sel[i]):
985 dsel = data_fwd.word_select(i, 8)
986 comb += data_out.word_select(i, 8).eq(dsel)
987
988 comb += d_out.valid.eq(r1.ls_valid)
989 comb += d_out.data.eq(data_out)
990 comb += d_out.store_done.eq(~r1.stcx_fail)
991 comb += d_out.error.eq(r1.ls_error)
992 comb += d_out.cache_paradox.eq(r1.cache_paradox)
993
994 # Outputs to MMU
995 comb += m_out.done.eq(r1.mmu_done)
996 comb += m_out.err.eq(r1.mmu_error)
997 comb += m_out.data.eq(data_out)
998
999 # We have a valid load or store hit or we just completed
1000 # a slow op such as a load miss, a NC load or a store
1001 #
1002 # Note: the load hit is delayed by one cycle. However it
1003 # can still not collide with r.slow_valid (well unless I
1004 # miscalculated) because slow_valid can only be set on a
1005 # subsequent request and not on its first cycle (the state
1006 # machine must have advanced), which makes slow_valid
1007 # at least 2 cycles from the previous hit_load_valid.
1008
1009 # Sanity: Only one of these must be set in any given cycle
1010
1011 if False: # TODO: need Display to get this to work
1012 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1013 "unexpected slow_valid collision with stcx_fail"
1014
1015 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1016 "unexpected hit_load_delayed collision with slow_valid"
1017
1018 with m.If(~r1.mmu_req):
1019 # Request came from loadstore1...
1020 # Load hit case is the standard path
1021 with m.If(r1.hit_load_valid):
1022 sync += Display("completing load hit data=%x", data_out)
1023
1024 # error cases complete without stalling
1025 with m.If(r1.ls_error):
1026 sync += Display("completing ld/st with error")
1027
1028 # Slow ops (load miss, NC, stores)
1029 with m.If(r1.slow_valid):
1030 sync += Display("completing store or load miss data=%x",
1031 data_out)
1032
1033 with m.Else():
1034 # Request came from MMU
1035 with m.If(r1.hit_load_valid):
1036 sync += Display("completing load hit to MMU, data=%x",
1037 m_out.data)
1038 # error cases complete without stalling
1039 with m.If(r1.mmu_error):
1040 sync += Display("completing MMU ld with error")
1041
1042 # Slow ops (i.e. load miss)
1043 with m.If(r1.slow_valid):
1044 sync += Display("completing MMU load miss, data=%x",
1045 m_out.data)
1046
1047 def rams(self, m, r1, early_req_row, cache_out, replace_way):
1048 """rams
1049 Generate a cache RAM for each way. This handles the normal
1050 reads, writes from reloads and the special store-hit update
1051 path as well.
1052
1053 Note: the BRAMs have an extra read buffer, meaning the output
1054 is pipelined an extra cycle. This differs from the
1055 icache. The writeback logic needs to take that into
1056 account by using 1-cycle delayed signals for load hits.
1057 """
1058 comb = m.d.comb
1059 wb_in = self.wb_in
1060
1061 for i in range(NUM_WAYS):
1062 do_read = Signal(name="do_rd%d" % i)
1063 rd_addr = Signal(ROW_BITS)
1064 do_write = Signal(name="do_wr%d" % i)
1065 wr_addr = Signal(ROW_BITS)
1066 wr_data = Signal(WB_DATA_BITS)
1067 wr_sel = Signal(ROW_SIZE)
1068 wr_sel_m = Signal(ROW_SIZE)
1069 _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i)
1070
1071 way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
1072 setattr(m.submodules, "cacheram_%d" % i, way)
1073
1074 comb += way.rd_en.eq(do_read)
1075 comb += way.rd_addr.eq(rd_addr)
1076 comb += _d_out.eq(way.rd_data_o)
1077 comb += way.wr_sel.eq(wr_sel_m)
1078 comb += way.wr_addr.eq(wr_addr)
1079 comb += way.wr_data.eq(wr_data)
1080
1081 # Cache hit reads
1082 comb += do_read.eq(1)
1083 comb += rd_addr.eq(early_req_row[:ROW_BITS])
1084 comb += cache_out[i].eq(_d_out)
1085
1086 # Write mux:
1087 #
1088 # Defaults to wishbone read responses (cache refill)
1089 #
1090 # For timing, the mux on wr_data/sel/addr is not
1091 # dependent on anything other than the current state.
1092
1093 with m.If(r1.write_bram):
1094 # Write store data to BRAM. This happens one
1095 # cycle after the store is in r0.
1096 comb += wr_data.eq(r1.req.data)
1097 comb += wr_sel.eq(r1.req.byte_sel)
1098 comb += wr_addr.eq(get_row(r1.req.real_addr))
1099
1100 with m.If(i == r1.req.hit_way):
1101 comb += do_write.eq(1)
1102 with m.Else():
1103 # Otherwise, we might be doing a reload or a DCBZ
1104 with m.If(r1.dcbz):
1105 comb += wr_data.eq(0)
1106 with m.Else():
1107 comb += wr_data.eq(wb_in.dat)
1108 comb += wr_addr.eq(r1.store_row)
1109 comb += wr_sel.eq(~0) # all 1s
1110
1111 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1112 & wb_in.ack & (replace_way == i)):
1113 comb += do_write.eq(1)
1114
1115 # Mask write selects with do_write since BRAM
1116 # doesn't have a global write-enable
1117 with m.If(do_write):
1118 comb += wr_sel_m.eq(wr_sel)
1119
1120 # Cache hit synchronous machine for the easy case.
1121 # This handles load hits.
1122 # It also handles error cases (TLB miss, cache paradox)
1123 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1124 req_hit_way, req_index, req_tag, access_ok,
1125 tlb_hit, tlb_hit_way, tlb_req_index):
1126
1127 comb = m.d.comb
1128 sync = m.d.sync
1129
1130 with m.If(req_op != Op.OP_NONE):
1131 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1132 req_op, r0.req.addr, r0.req.nc,
1133 req_index, req_tag, req_hit_way)
1134
1135 with m.If(r0_valid):
1136 sync += r1.mmu_req.eq(r0.mmu_req)
1137
1138 # Fast path for load/store hits.
1139 # Set signals for the writeback controls.
1140 sync += r1.hit_way.eq(req_hit_way)
1141 sync += r1.hit_index.eq(req_index)
1142
1143 with m.If(req_op == Op.OP_LOAD_HIT):
1144 sync += r1.hit_load_valid.eq(1)
1145 with m.Else():
1146 sync += r1.hit_load_valid.eq(0)
1147
1148 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1149 sync += r1.cache_hit.eq(1)
1150 with m.Else():
1151 sync += r1.cache_hit.eq(0)
1152
1153 with m.If(req_op == Op.OP_BAD):
1154 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1155 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1156 sync += r1.ls_error.eq(~r0.mmu_req)
1157 sync += r1.mmu_error.eq(r0.mmu_req)
1158 sync += r1.cache_paradox.eq(access_ok)
1159
1160 with m.Else():
1161 sync += r1.ls_error.eq(0)
1162 sync += r1.mmu_error.eq(0)
1163 sync += r1.cache_paradox.eq(0)
1164
1165 with m.If(req_op == Op.OP_STCX_FAIL):
1166 sync += r1.stcx_fail.eq(1)
1167 with m.Else():
1168 sync += r1.stcx_fail.eq(0)
1169
1170 # Record TLB hit information for updating TLB PLRU
1171 sync += r1.tlb_hit.eq(tlb_hit)
1172 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1173 sync += r1.tlb_hit_index.eq(tlb_req_index)
1174
1175 # Memory accesses are handled by this state machine:
1176 #
1177 # * Cache load miss/reload (in conjunction with "rams")
1178 # * Load hits for non-cachable forms
1179 # * Stores (the collision case is handled in "rams")
1180 #
1181 # All wishbone requests generation is done here.
1182 # This machine operates at stage 1.
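# Summary of the state transitions implemented below:
#   IDLE -> RELOAD_WAIT_ACK on a load miss or a dcbz store
#   IDLE -> STORE_WAIT_ACK on a (non-dcbz) store hit or miss
#   IDLE -> NC_LOAD_WAIT_ACK on a non-cacheable load
# each returns to IDLE once the corresponding wishbone cycle completes.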
1183 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1184 cache_valid_bits, r0, replace_way,
1185 req_hit_way, req_same_tag,
1186 r0_valid, req_op, cache_tags, req_go, ra):
1187
1188 comb = m.d.comb
1189 sync = m.d.sync
1190 wb_in = self.wb_in
1191
1192 req = MemAccessRequest("mreq_ds")
1193 acks = Signal(3)
1194 adjust_acks = Signal(3)
1195
1196 sync += r1.use_forward1.eq(use_forward1_next)
1197 sync += r1.forward_sel.eq(0)
1198
1199 with m.If(use_forward1_next):
1200 sync += r1.forward_sel.eq(r1.req.byte_sel)
1201 with m.Elif(use_forward2_next):
1202 sync += r1.forward_sel.eq(r1.forward_sel1)
1203
1204 sync += r1.forward_data2.eq(r1.forward_data1)
1205 with m.If(r1.write_bram):
1206 sync += r1.forward_data1.eq(r1.req.data)
1207 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1208 sync += r1.forward_way1.eq(r1.req.hit_way)
1209 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1210 sync += r1.forward_valid1.eq(1)
1211 with m.Else():
1212 with m.If(r1.dcbz):
1213 sync += r1.forward_data1.eq(0)
1214 with m.Else():
1215 sync += r1.forward_data1.eq(wb_in.dat)
1216 sync += r1.forward_sel1.eq(~0) # all 1s
1217 sync += r1.forward_way1.eq(replace_way)
1218 sync += r1.forward_row1.eq(r1.store_row)
1219 sync += r1.forward_valid1.eq(0)
1220
1221 # One cycle pulses reset
1222 sync += r1.slow_valid.eq(0)
1223 sync += r1.write_bram.eq(0)
1224 sync += r1.inc_acks.eq(0)
1225 sync += r1.dec_acks.eq(0)
1226
1227 sync += r1.ls_valid.eq(0)
1228 # complete tlbies and TLB loads in the third cycle
1229 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1230
1231 with m.If((req_op == Op.OP_LOAD_HIT)
1232 | (req_op == Op.OP_STCX_FAIL)):
1233 with m.If(~r0.mmu_req):
1234 sync += r1.ls_valid.eq(1)
1235 with m.Else():
1236 sync += r1.mmu_done.eq(1)
1237
1238 with m.If(r1.write_tag):
1239 # Store new tag in selected way
1240 for i in range(NUM_WAYS):
1241 with m.If(i == replace_way):
1242 ct = Signal(TAG_RAM_WIDTH)
1243 comb += ct.eq(cache_tags[r1.store_index])
1244 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1245 sync += cache_tags[r1.store_index].eq(ct)
1246 sync += r1.store_way.eq(replace_way)
1247 sync += r1.write_tag.eq(0)
1248
1249 # Take request from r1.req if there is one there,
1250 # else from req_op, ra, etc.
1251 with m.If(r1.full):
1252 comb += req.eq(r1.req)
1253 with m.Else():
1254 comb += req.op.eq(req_op)
1255 comb += req.valid.eq(req_go)
1256 comb += req.mmu_req.eq(r0.mmu_req)
1257 comb += req.dcbz.eq(r0.req.dcbz)
1258 comb += req.real_addr.eq(ra)
1259
1260 with m.If(~r0.req.dcbz):
1261 comb += req.data.eq(r0.req.data)
1262 with m.Else():
1263 comb += req.data.eq(0)
1264
1265 # Select all bytes for dcbz
1266 # and for cacheable loads
1267 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1268 comb += req.byte_sel.eq(~0) # all 1s
1269 with m.Else():
1270 comb += req.byte_sel.eq(r0.req.byte_sel)
1271 comb += req.hit_way.eq(req_hit_way)
1272 comb += req.same_tag.eq(req_same_tag)
1273
1274 # Store the incoming request from r0,
1275 # if it is a slow request
1276 # Note that r1.full = 1 implies req_op = OP_NONE
1277 with m.If((req_op == Op.OP_LOAD_MISS)
1278 | (req_op == Op.OP_LOAD_NC)
1279 | (req_op == Op.OP_STORE_MISS)
1280 | (req_op == Op.OP_STORE_HIT)):
1281 sync += r1.req.eq(req)
1282 sync += r1.full.eq(1)
1283
1284 # Main state machine
1285 with m.Switch(r1.state):
1286
1287 with m.Case(State.IDLE):
1288 # XXX check 'left downto. probably means len(r1.wb.adr)
1289 # r1.wb.adr <= req.real_addr(
1290 # r1.wb.adr'left downto 0
1291 # );
1292 sync += r1.wb.adr.eq(req.real_addr)
1293 sync += r1.wb.sel.eq(req.byte_sel)
1294 sync += r1.wb.dat.eq(req.data)
1295 sync += r1.dcbz.eq(req.dcbz)
1296
1297 # Keep track of our index and way
1298 # for subsequent stores.
1299 sync += r1.store_index.eq(get_index(req.real_addr))
1300 sync += r1.store_row.eq(get_row(req.real_addr))
1301 sync += r1.end_row_ix.eq(
1302 get_row_of_line(get_row(req.real_addr))
1303 )
1304 sync += r1.reload_tag.eq(get_tag(req.real_addr))
1305 sync += r1.req.same_tag.eq(1)
1306
1307 with m.If(req.op == Op.OP_STORE_HIT):
1308 sync += r1.store_way.eq(req.hit_way)
1309
1310 # Reset per-row valid bits,
1311 # ready for handling OP_LOAD_MISS
1312 for i in range(ROW_PER_LINE):
1313 sync += r1.rows_valid[i].eq(0)
1314
1315 with m.If(req_op != Op.OP_NONE):
1316 sync += Display("cache op %d", req.op)
1317
1318 with m.Switch(req.op):
1319 with m.Case(Op.OP_LOAD_HIT):
1320 # stay in IDLE state
1321 pass
1322
1323 with m.Case(Op.OP_LOAD_MISS):
1324 #Display(f"cache miss real addr:" \
1325 # f"{req_real_addr}" \
1326 # f" idx:{get_index(req_real_addr)}" \
1327 # f" tag:{get_tag(req.real_addr)}")
1328 pass
1329
1330 # Start the wishbone cycle
1331 sync += r1.wb.we.eq(0)
1332 sync += r1.wb.cyc.eq(1)
1333 sync += r1.wb.stb.eq(1)
1334
1335 # Track that we had one request sent
1336 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1337 sync += r1.write_tag.eq(1)
1338
1339 with m.Case(Op.OP_LOAD_NC):
1340 sync += r1.wb.cyc.eq(1)
1341 sync += r1.wb.stb.eq(1)
1342 sync += r1.wb.we.eq(0)
1343 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1344
1345 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1346 with m.If(~req.dcbz):
1347 sync += r1.state.eq(State.STORE_WAIT_ACK)
1348 sync += r1.acks_pending.eq(1)
1349 sync += r1.full.eq(0)
1350 sync += r1.slow_valid.eq(1)
1351
1352 with m.If(~req.mmu_req):
1353 sync += r1.ls_valid.eq(1)
1354 with m.Else():
1355 sync += r1.mmu_done.eq(1)
1356
1357 with m.If(req.op == Op.OP_STORE_HIT):
1358 sync += r1.write_bram.eq(1)
1359 with m.Else():
1360 # dcbz is handled much like a load miss except
1361 # that we are writing to memory instead of reading
1362 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1363
1364 with m.If(req.op == Op.OP_STORE_MISS):
1365 sync += r1.write_tag.eq(1)
1366
1367 sync += r1.wb.we.eq(1)
1368 sync += r1.wb.cyc.eq(1)
1369 sync += r1.wb.stb.eq(1)
1370
1371 # OP_NONE and OP_BAD do nothing
1372 # OP_BAD & OP_STCX_FAIL were
1373 # handled above already
1374 with m.Case(Op.OP_NONE):
1375 pass
1376 with m.Case(Op.OP_BAD):
1377 pass
1378 with m.Case(Op.OP_STCX_FAIL):
1379 pass
1380
1381 with m.Case(State.RELOAD_WAIT_ACK):
1382 ld_stbs_done = Signal()
1383 # Requests are all sent if stb is 0
1384 comb += ld_stbs_done.eq(~r1.wb.stb)
1385
1386 with m.If((~wb_in.stall) & r1.wb.stb):
1387 # That was the last word?
1388 # We are done sending.
1389 # Clear stb and set ld_stbs_done
1390 # so we can handle an eventual
1391 # last ack on the same cycle.
1392 with m.If(is_last_row_addr(r1.wb.adr, r1.end_row_ix)):
1393 sync += r1.wb.stb.eq(0)
1394 comb += ld_stbs_done.eq(1)
1395
1396 # Calculate the next row address in the current cache line
1397 rarange = r1.wb.adr[ROW_OFF_BITS : LINE_OFF_BITS]
1398 sync += rarange.eq(rarange + 1)
1399
1400 # Incoming acks processing
1401 sync += r1.forward_valid1.eq(wb_in.ack)
1402 with m.If(wb_in.ack):
1403 sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
1404
1405 # If this is the data we were looking for,
1406 # we can complete the request next cycle.
1407 # Compare the whole address in case the
1408 # request in r1.req is not the one that
1409 # started this refill.
1410 with m.If(r1.full & r1.req.same_tag &
1411 ((r1.dcbz & r1.req.dcbz) |
1412 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1413 (r1.store_row == get_row(r1.req.real_addr))):
1414 sync += r1.full.eq(0)
1415 sync += r1.slow_valid.eq(1)
1416 with m.If(~r1.mmu_req):
1417 sync += r1.ls_valid.eq(1)
1418 with m.Else():
1419 sync += r1.mmu_done.eq(1)
1420 sync += r1.forward_sel.eq(~0) # all 1s
1421 sync += r1.use_forward1.eq(1)
1422
1423 # Check for completion
1424 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1425 r1.end_row_ix)):
1426 # Complete wishbone cycle
1427 sync += r1.wb.cyc.eq(0)
1428
1429 # Cache line is now valid
1430 cv = Signal(NUM_WAYS) # per-way valid bits for this line
1431 comb += cv.eq(cache_valid_bits[r1.store_index])
1432 comb += cv.bit_select(r1.store_way, 1).eq(1)
1433 sync += cache_valid_bits[r1.store_index].eq(cv)
1434 sync += r1.state.eq(State.IDLE)
1435
1436 # Increment store row counter
1437 sync += r1.store_row.eq(next_row(r1.store_row))
1438
1439 with m.Case(State.STORE_WAIT_ACK):
1440 st_stbs_done = Signal()
1441 comb += st_stbs_done.eq(~r1.wb.stb)
1442 comb += acks.eq(r1.acks_pending)
1443
1444 with m.If(r1.inc_acks != r1.dec_acks):
1445 with m.If(r1.inc_acks):
1446 comb += adjust_acks.eq(acks + 1)
1447 with m.Else():
1448 comb += adjust_acks.eq(acks - 1)
1449 with m.Else():
1450 comb += adjust_acks.eq(acks)
1451
1452 sync += r1.acks_pending.eq(adjust_acks)
1453
1454 # Clear stb when slave accepted request
1455 with m.If(~wb_in.stall):
1456 # See if there is another store waiting
1457 # to be done which is in the same real page.
1458 with m.If(req.valid):
1459 ra = req.real_addr[0:SET_SIZE_BITS]
1460 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
1461 sync += r1.wb.dat.eq(req.data)
1462 sync += r1.wb.sel.eq(req.byte_sel)
1463
1464 with m.Elif((adjust_acks < 7) & req.same_tag &
1465 ((req.op == Op.OP_STORE_MISS)
1466 | (req.op == Op.OP_STORE_HIT))):
1467 sync += r1.wb.stb.eq(1)
1468 comb += st_stbs_done.eq(0)
1469
1470 with m.If(req.op == Op.OP_STORE_HIT):
1471 sync += r1.write_bram.eq(1)
1472 sync += r1.full.eq(0)
1473 sync += r1.slow_valid.eq(1)
1474
1475 # Store requests never come from the MMU
1476 sync += r1.ls_valid.eq(1)
1477 comb += st_stbs_done.eq(0)
1478 sync += r1.inc_acks.eq(1)
1479 with m.Else():
1480 sync += r1.wb.stb.eq(0)
1481 comb += st_stbs_done.eq(1)
1482
1483 # Got ack ? See if complete.
1484 with m.If(wb_in.ack):
1485 with m.If(st_stbs_done & (adjust_acks == 1)):
1486 sync += r1.state.eq(State.IDLE)
1487 sync += r1.wb.cyc.eq(0)
1488 sync += r1.wb.stb.eq(0)
1489 sync += r1.dec_acks.eq(1)
1490
1491 with m.Case(State.NC_LOAD_WAIT_ACK):
1492 # Clear stb when slave accepted request
1493 with m.If(~wb_in.stall):
1494 sync += r1.wb.stb.eq(0)
1495
1496 # Got ack ? complete.
1497 with m.If(wb_in.ack):
1498 sync += r1.state.eq(State.IDLE)
1499 sync += r1.full.eq(0)
1500 sync += r1.slow_valid.eq(1)
1501
1502 with m.If(~r1.mmu_req):
1503 sync += r1.ls_valid.eq(1)
1504 with m.Else():
1505 sync += r1.mmu_done.eq(1)
1506
1507 sync += r1.forward_sel.eq(~0) # all 1s
1508 sync += r1.use_forward1.eq(1)
1509 sync += r1.wb.cyc.eq(0)
1510 sync += r1.wb.stb.eq(0)
1511
1512 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1513
1514 sync = m.d.sync
1515 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1516
1517 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1518 stall_out, req_op[:3], d_out.valid, d_out.error,
1519 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1520 r1.wb.adr[3:6]))
1521
1522 def elaborate(self, platform):
1523
1524 m = Module()
1525 comb = m.d.comb
1526
1527 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1528 cache_tags = CacheTagArray()
1529 cache_tag_set = Signal(TAG_RAM_WIDTH)
1530 cache_valid_bits = CacheValidBitsArray()
1531
1532 # TODO attribute ram_style : string;
1533 # TODO attribute ram_style of cache_tags : signal is "distributed";
1534
1535 """note: these are passed to nmigen.hdl.Memory as "attributes".
1536 don't know how, just that they are.
1537 """
1538 dtlb_valid_bits = TLBValidBitsArray()
1539 dtlb_tags = TLBTagsArray()
1540 dtlb_ptes = TLBPtesArray()
1541 # TODO attribute ram_style of
1542 # dtlb_tags : signal is "distributed";
1543 # TODO attribute ram_style of
1544 # dtlb_ptes : signal is "distributed";
1545
1546 r0 = RegStage0("r0")
1547 r0_full = Signal()
1548
1549 r1 = RegStage1("r1")
1550
1551 reservation = Reservation()
1552
1553 # Async signals on incoming request
1554 req_index = Signal(INDEX_BITS)
1555 req_row = Signal(ROW_BITS)
1556 req_hit_way = Signal(WAY_BITS)
1557 req_tag = Signal(TAG_BITS)
1558 req_op = Signal(Op)
1559 req_data = Signal(64)
1560 req_same_tag = Signal()
1561 req_go = Signal()
1562
1563 early_req_row = Signal(ROW_BITS)
1564
1565 cancel_store = Signal()
1566 set_rsrv = Signal()
1567 clear_rsrv = Signal()
1568
1569 r0_valid = Signal()
1570 r0_stall = Signal()
1571
1572 use_forward1_next = Signal()
1573 use_forward2_next = Signal()
1574
1575 cache_out = CacheRamOut()
1576
1577 plru_victim = PLRUOut()
1578 replace_way = Signal(WAY_BITS)
1579
1580 # Wishbone read/write/cache write formatting signals
1581 bus_sel = Signal(8)
1582
1583 # TLB signals
1584 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1585 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1586 tlb_valid_way = Signal(TLB_NUM_WAYS)
1587 tlb_req_index = Signal(TLB_SET_BITS)
1588 tlb_hit = Signal()
1589 tlb_hit_way = Signal(TLB_WAY_BITS)
1590 pte = Signal(TLB_PTE_BITS)
1591 ra = Signal(REAL_ADDR_BITS)
1592 valid_ra = Signal()
1593 perm_attr = PermAttr("dc_perms")
1594 rc_ok = Signal()
1595 perm_ok = Signal()
1596 access_ok = Signal()
1597
1598 tlb_plru_victim = TLBPLRUOut()
1599
1600 # we don't yet handle collisions between loadstore1 requests
1601 # and MMU requests
1602 comb += self.m_out.stall.eq(0)
1603
1604 # Hold off the request in r0 when r1 has an uncompleted request
1605 comb += r0_stall.eq(r0_full & r1.full)
1606 comb += r0_valid.eq(r0_full & ~r1.full)
1607 comb += self.stall_out.eq(r0_stall)
1608
1609 # Wire up wishbone request latch out of stage 1
1610 comb += self.wb_out.eq(r1.wb)
1611
1612 # call sub-functions putting everything together, using shared
1613 # signals established above
1614 self.stage_0(m, r0, r1, r0_full)
1615 self.tlb_read(m, r0_stall, tlb_valid_way,
1616 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1617 dtlb_tags, dtlb_ptes)
1618 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1619 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1620 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1621 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1622 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1623 dtlb_tags, tlb_pte_way, dtlb_ptes)
1624 self.maybe_plrus(m, r1, plru_victim)
1625 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1626 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1627 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1628 r0_valid, r1, cache_valid_bits, replace_way,
1629 use_forward1_next, use_forward2_next,
1630 req_hit_way, plru_victim, rc_ok, perm_attr,
1631 valid_ra, perm_ok, access_ok, req_op, req_go,
1632 tlb_pte_way,
1633 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1634 cancel_store, req_same_tag, r0_stall, early_req_row)
1635 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1636 r0_valid, r0, reservation)
1637 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1638 reservation, r0)
1639 self.writeback_control(m, r1, cache_out)
1640 self.rams(m, r1, early_req_row, cache_out, replace_way)
1641 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1642 req_hit_way, req_index, req_tag, access_ok,
1643 tlb_hit, tlb_hit_way, tlb_req_index)
1644 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1645 cache_valid_bits, r0, replace_way,
1646 req_hit_way, req_same_tag,
1647 r0_valid, req_op, cache_tags, req_go, ra)
1648 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)
1649
1650 return m
1651
1652 def dcache_load(dut, addr, nc=0):
1653 yield dut.d_in.load.eq(1)
1654 yield dut.d_in.nc.eq(nc)
1655 yield dut.d_in.addr.eq(addr)
1656 yield dut.d_in.valid.eq(1)
1657 yield
1658 yield dut.d_in.valid.eq(0)
1659 yield
1660 while not (yield dut.d_out.valid):
1661 yield
1662 data = yield dut.d_out.data
1663 return data
1664
1665
1666 def dcache_store(dut, addr, data, nc=0):
1667 yield dut.d_in.load.eq(0)
1668 yield dut.d_in.nc.eq(nc)
1669 yield dut.d_in.data.eq(data)
1670 yield dut.d_in.byte_sel.eq(~0)
1671 yield dut.d_in.addr.eq(addr)
1672 yield dut.d_in.valid.eq(1)
1673 yield
1674 yield dut.d_in.valid.eq(0)
1675 yield dut.d_in.byte_sel.eq(0)
1676 yield
1677 while not (yield dut.d_out.valid):
1678 yield
1679
1680
1681 def dcache_sim(dut):
1682 # clear stuff
1683 yield dut.d_in.valid.eq(0)
1684 yield dut.d_in.load.eq(0)
1685 yield dut.d_in.priv_mode.eq(1)
1686 yield dut.d_in.nc.eq(0)
1687 yield dut.d_in.addr.eq(0)
1688 yield dut.d_in.data.eq(0)
1689 yield dut.m_in.valid.eq(0)
1690 yield dut.m_in.addr.eq(0)
1691 yield dut.m_in.pte.eq(0)
1692 # wait 4 * clk_period
1693 yield
1694 yield
1695 yield
1696 yield
1697
1698 # Cacheable read of address 4
1699 data = yield from dcache_load(dut, 0x4)
1700 addr = yield dut.d_in.addr
1701 assert data == 0x0000000100000000, \
1702 "data @%x=%x expected 0x0000000100000000" % (addr, data)
1703 
1704 # Cacheable read of address 0x530
1705 data = yield from dcache_load(dut, 0x530)
1706 addr = yield dut.d_in.addr
1707 assert data == 0x0000004D0000004C, \
1708 "data @%x=%x expected 0x0000004D0000004C" % (addr, data)
1709 
1710 # 2nd Cacheable read of address 0x530
1711 data = yield from dcache_load(dut, 0x530)
1712 addr = yield dut.d_in.addr
1713 assert data == 0x0000004D0000004C, \
1714 "data @%x=%x expected 0x0000004D0000004C" % (addr, data)
1715 
1716 # Non-cacheable read of address 0x100
1717 data = yield from dcache_load(dut, 0x100, nc=1)
1718 addr = yield dut.d_in.addr
1719 assert data == 0x0000004100000040, \
1720 "data @%x=%x expected 0x0000004100000040" % (addr, data)
1721 
1722 # Store at address 0x530
1723 yield from dcache_store(dut, 0x530, 0x121)
1724 
1725 # Second store at address 0x530
1726 yield from dcache_store(dut, 0x530, 0x12345678)
1727 
1728 # 3rd Cacheable read of address 0x530
1729 data = yield from dcache_load(dut, 0x530)
1730 addr = yield dut.d_in.addr
1731 assert data == 0x12345678, \
1732 "data @%x=%x expected 0x12345678" % (addr, data)
1733
1734 yield
1735 yield
1736 yield
1737 yield
1738
1739
1740 def test_dcache():
1741 dut = DCache()
1742 vl = rtlil.convert(dut, ports=[])
1743 with open("test_dcache.il", "w") as f:
1744 f.write(vl)
1745
1746 mem = []
1747 for i in range(0,128):
1748 mem.append((i*2)| ((i*2+1)<<32))
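# each 64-bit word i holds 2*i in the low half and 2*i+1 in the high
# half; the 7-bit word address on the SRAM bus truncates (wraps) the
# 128-word (1KiB) memory, so e.g. address 0x530 maps to word 38 and
# reads back 0x0000004D0000004C as checked in dcache_sim above.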
1749 memory = Memory(width=64, depth=16*8, init=mem)
1750 sram = SRAM(memory=memory, granularity=8)
1751
1752 m = Module()
1753 m.submodules.dcache = dut
1754 m.submodules.sram = sram
1755
1756 m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1757 m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1758 m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1759 m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1760 m.d.comb += sram.bus.adr.eq(dut.wb_out.adr[3:])
1761 m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1762
1763 m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1764 m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1765
1766 # nmigen Simulation
1767 sim = Simulator(m)
1768 sim.add_clock(1e-6)
1769
1770 sim.add_sync_process(wrap(dcache_sim(dut)))
1771 with sim.write_vcd('test_dcache.vcd'):
1772 sim.run()
1773
1774 if __name__ == '__main__':
1775 test_dcache()
1776