src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 """
   6
   7 from enum import Enum, unique
   8
   9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  10 from nmutil.util import Display
  11
  12 from random import randint
  13
  14 from nmigen.cli import main
  15 from nmutil.iocontrol import RecordObject
  16 from nmigen.utils import log2_int
  17 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  18                                      DCacheToLoadStore1Type,
  19                                      MMUToDCacheType,
  20                                      DCacheToMMUType)
  21
  22 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  23                                 WBAddrType, WBDataType, WBSelType,
  24                                 WBMasterOut, WBSlaveOut,
  25                                 WBMasterOutVector, WBSlaveOutVector,
  26                                 WBIOMasterOut, WBIOSlaveOut)
  27
  28 from soc.experiment.cache_ram import CacheRam
  29 #from soc.experiment.plru import PLRU
  30 from nmutil.plru import PLRU
  31
  32 # for test
  33 from nmigen_soc.wishbone.sram import SRAM
  34 from nmigen import Memory
  35 from nmigen.cli import rtlil
  36 if True:
  37     from nmigen.back.pysim import Simulator, Delay, Settle
  38 else:
  39     from nmigen.sim.cxxsim import Simulator, Delay, Settle
  40 from nmutil.util import wrap
  41
  42
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines per way (tag/valid arrays have this many rows)
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB number of sets (rows selected by the EA index bits)
TLB_NUM_WAYS = 4  # L1 DTLB number of ways per set
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection
  51
  52 # BRAM organisation: We never access more than
  53 #     -- WB_DATA_BITS at a time so to save
  54 #     -- resources we make the array only that wide, and
  55 #     -- use consecutive indices for to make a cache "line"
  56 #     --
  57 #     -- ROW_SIZE is the width in bytes of the BRAM
  58 #     -- (based on WB, so 64-bits)
  59 ROW_SIZE = WB_DATA_BITS // 8;
  60
  61 # ROW_PER_LINE is the number of row (wishbone
  62 # transactions) in a line
  63 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  64
  65 # BRAM_ROWS is the number of rows in BRAM needed
  66 # to represent the full dcache
  67 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  68
  69 print ("ROW_SIZE", ROW_SIZE)
  70 print ("ROW_PER_LINE", ROW_PER_LINE)
  71 print ("BRAM_ROWS", BRAM_ROWS)
  72 print ("NUM_WAYS", NUM_WAYS)
  73
  74 # Bit fields counts in the address
  75
  76 # REAL_ADDR_BITS is the number of real address
  77 # bits that we store
  78 REAL_ADDR_BITS = 56
  79
  80 # ROW_BITS is the number of bits to select a row
  81 ROW_BITS = log2_int(BRAM_ROWS)
  82
  83 # ROW_LINE_BITS is the number of bits to select
  84 # a row within a line
  85 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
  86
  87 # LINE_OFF_BITS is the number of bits for
  88 # the offset in a cache line
  89 LINE_OFF_BITS = log2_int(LINE_SIZE)
  90
  91 # ROW_OFF_BITS is the number of bits for
  92 # the offset in a row
  93 ROW_OFF_BITS = log2_int(ROW_SIZE)
  94
  95 # INDEX_BITS is the number if bits to
  96 # select a cache line
  97 INDEX_BITS = log2_int(NUM_LINES)
  98
  99 # SET_SIZE_BITS is the log base 2 of the set size
 100 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 101
 102 # TAG_BITS is the number of bits of
 103 # the tag part of the address
 104 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 105
 106 # TAG_WIDTH is the width in bits of each way of the tag RAM
 107 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 108
 109 # WAY_BITS is the number of bits to select a way
 110 WAY_BITS = log2_int(NUM_WAYS)
 111
 112 # Example of layout for 32 lines of 64 bytes:
 113 layout = """\
 114   ..  tag    |index|  line  |
 115   ..         |   row   |    |
 116   ..         |     |---|    | ROW_LINE_BITS  (3)
 117   ..         |     |--- - --| LINE_OFF_BITS (6)
 118   ..         |         |- --| ROW_OFF_BITS  (3)
 119   ..         |----- ---|    | ROW_BITS      (8)
 120   ..         |-----|        | INDEX_BITS    (5)
 121   .. --------|              | TAG_BITS      (45)
 122 """
 123 print (layout)
 124 print ("Dcache TAG %d IDX %d ROW %d ROFF %d LOFF %d RLB %d" % \
 125             (TAG_BITS, INDEX_BITS, ROW_BITS,
 126              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 127 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 128 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 129 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 130
 131 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 132
 133 def CacheTagArray():
 134     return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
 135                         for x in range(NUM_LINES))
 136
 137 def CacheValidBitsArray():
 138     return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
 139                         for x in range(NUM_LINES))
 140
 141 def RowPerLineValidArray():
 142     return Array(Signal(name="rows_valid%d" % x) \
 143                         for x in range(ROW_PER_LINE))
 144
 145 # L1 TLB
 146 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 147 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 148 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 149 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 150 TLB_PTE_BITS     = 64
 151 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 152
 153 def ispow2(x):
 154     return (1<<log2_int(x, False)) == x
 155
# Sanity checks on the cache geometry, evaluated once at import time.

# a line must be made of a whole number of rows, and the sizes that
# feed log2_int-derived bit counts must be powers of two
assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
# the different ways of splitting an address into fields must agree
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
         "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
# the line index is taken from the (untranslated) request address, so
# the index and line-offset bits must all lie within the page offset
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 169
 170
 171 def TLBValidBitsArray():
 172     return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
 173                 for x in range(TLB_SET_SIZE))
 174
 175 def TLBTagEAArray():
 176     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 177                 for x in range (TLB_NUM_WAYS))
 178
 179 def TLBTagsArray():
 180     return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
 181                 for x in range (TLB_SET_SIZE))
 182
 183 def TLBPtesArray():
 184     return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
 185                 for x in range(TLB_SET_SIZE))
 186
 187 def HitWaySet():
 188     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 189                         for x in range(TLB_NUM_WAYS))
 190
 191 # Cache RAM interface
 192 def CacheRamOut():
 193     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 194                  for x in range(NUM_WAYS))
 195
 196 # PLRU output interface
 197 def PLRUOut():
 198     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 199                 for x in range(NUM_LINES))
 200
 201 # TLB PLRU output interface
 202 def TLBPLRUOut():
 203     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 204                 for x in range(TLB_SET_SIZE))
 205
 206 # Helper functions to decode incoming requests
 207 #
 208 # Return the cache line index (tag index) for an address
 209 def get_index(addr):
 210     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 211
 212 # Return the cache row index (data memory) for an address
 213 def get_row(addr):
 214     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 215
 216 # Return the index of a row within a line
 217 def get_row_of_line(row):
 218     return row[:ROW_BITS][:ROW_LINE_BITS]
 219
 220 # Returns whether this is the last row of a line
 221 def is_last_row_addr(addr, last):
 222     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 223
 224 # Returns whether this is the last row of a line
 225 def is_last_row(row, last):
 226     return get_row_of_line(row) == last
 227
 228 # Return the next row in the current cache line. We use a
 229 # dedicated function in order to limit the size of the
 230 # generated adder to be only the bits within a cache line
 231 # (3 bits with default settings)
 232 def next_row(row):
 233     row_v = row[0:ROW_LINE_BITS] + 1
 234     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 235
 236 # Get the tag value from the address
 237 def get_tag(addr):
 238     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 239
 240 # Read a tag from a tag memory row
 241 def read_tag(way, tagset):
 242     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 243
 244 # Read a TLB tag from a TLB tag memory row
 245 def read_tlb_tag(way, tags):
 246     return tags.word_select(way, TLB_EA_TAG_BITS)
 247
 248 # Write a TLB tag to a TLB tag memory row
 249 def write_tlb_tag(way, tags, tag):
 250     return read_tlb_tag(way, tags).eq(tag)
 251
 252 # Read a PTE from a TLB PTE memory row
 253 def read_tlb_pte(way, ptes):
 254     return ptes.word_select(way, TLB_PTE_BITS)
 255
 256 def write_tlb_pte(way, ptes, newpte):
 257     return read_tlb_pte(way, ptes).eq(newpte)
 258
 259
 260 # Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission and attribute bits of one translation.

    DCache.tlb_search drives every field: from the listed PTE bit in
    virtual mode, and with the noted constant in real mode.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal() # pte[8]; 1 in real mode
        self.changed   = Signal() # pte[7]; 1 in real mode
        self.nocache   = Signal() # pte[5]; 0 in real mode
        self.priv      = Signal() # pte[3]; 1 in real mode
        self.rd_perm   = Signal() # pte[2]; 1 in real mode
        self.wr_perm   = Signal() # pte[1]; 1 in real mode
 270
 271
def extract_perm_attr(pte):
    """Return a newly created PermAttr record.

    NOTE(review): `pte` is not used here and no field of the returned
    record is driven, so this is effectively a stub.  The PTE bits are
    instead decoded inline in DCache.tlb_search -- confirm whether this
    helper is still needed.
    """
    pa = PermAttr()
    return pa;
 275
 276
 277 # Type of operation on a "valid" input
# Classification of a request presented on a "valid" input.
# Used as the shape of MemAccessRequest.op, so the numeric values
# determine the signal encoding.
@unique
class Op(Enum):
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache
 288
 289
 290 # Cache state machine
# States of the cache state machine.
# Used as the shape of RegStage1.state, so the numeric values
# determine the signal encoding.
@unique
class State(Enum):
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 297
 298
 299 # Dcache operations:
 300 #
 301 # In order to make timing, we use the BRAMs with
 302 # an output buffer, which means that the BRAM
 303 # output is delayed by an extra cycle.
 304 #
 305 # Thus, the dcache has a 2-stage internal pipeline
 306 # for cache hits with no stalls.
 307 #
 308 # All other operations are handled via stalling
 309 # in the first stage.
 310 #
 311 # The second stage can thus complete a hit at the same
 312 # time as the first stage emits a stall for a complex op.
 313 #
 314 # Stage 0 register, basically contains just the latched request
 315
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus flags recording
    whether it came from the MMU and which TLB operation (if any) it
    carries.  Filled in by DCache.stage_0.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req     = LoadStore1ToDCacheType(name="lsmem") # the request
        self.tlbie   = Signal() # copied from m_in.tlbie for MMU requests
        self.doall   = Signal() # copied from m_in.doall for MMU requests
        self.tlbld   = Signal() # copied from m_in.tlbld for MMU requests
        self.mmu_req = Signal() # indicates source of request
 324
 325
class MemAccessRequest(RecordObject):
    """Record describing one memory access; RegStage1 holds one of
    these in its `req` field.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)             # request classification
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS) # real (physical) address
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)              # one bit per byte of data
        self.hit_way   = Signal(WAY_BITS)       # a cache way number
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 338
 339
 340 # First stage register, contains state for stage 1 of load hits
 341 # and for the state machine used by all other operations
 342 class RegStage1(RecordObject):
 343     def __init__(self, name=None):
 344         super().__init__(name=name)
 345         # Info about the request
 346         self.full             = Signal() # have uncompleted request
 347         self.mmu_req          = Signal() # request is from MMU
 348         self.req              = MemAccessRequest(name="reqmem")
 349
 350         # Cache hit state
 351         self.hit_way          = Signal(WAY_BITS)
 352         self.hit_load_valid   = Signal()
 353         self.hit_index        = Signal(INDEX_BITS)
 354         self.cache_hit        = Signal()
 355
 356         # TLB hit state
 357         self.tlb_hit          = Signal()
 358         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 359         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 360
 361         # 2-stage data buffer for data forwarded from writes to reads
 362         self.forward_data1    = Signal(64)
 363         self.forward_data2    = Signal(64)
 364         self.forward_sel1     = Signal(8)
 365         self.forward_valid1   = Signal()
 366         self.forward_way1     = Signal(WAY_BITS)
 367         self.forward_row1     = Signal(ROW_BITS)
 368         self.use_forward1     = Signal()
 369         self.forward_sel      = Signal(8)
 370
 371         # Cache miss state (reload state machine)
 372         self.state            = Signal(State)
 373         self.dcbz             = Signal()
 374         self.write_bram       = Signal()
 375         self.write_tag        = Signal()
 376         self.slow_valid       = Signal()
 377         self.real_adr         = Signal(REAL_ADDR_BITS)
 378         self.wb               = WBMasterOut("wb")
 379         self.reload_tag       = Signal(TAG_BITS)
 380         self.store_way        = Signal(WAY_BITS)
 381         self.store_row        = Signal(ROW_BITS)
 382         self.store_index      = Signal(INDEX_BITS)
 383         self.end_row_ix       = Signal(ROW_LINE_BITS)
 384         self.rows_valid       = RowPerLineValidArray()
 385         self.acks_pending     = Signal(3)
 386         self.inc_acks         = Signal()
 387         self.dec_acks         = Signal()
 388
 389         # Signals to complete (possibly with error)
 390         self.ls_valid         = Signal()
 391         self.ls_error         = Signal()
 392         self.mmu_done         = Signal()
 393         self.mmu_error        = Signal()
 394         self.cache_paradox    = Signal()
 395
 396         # Signal to complete a failed stcx.
 397         self.stcx_fail        = Signal()
 398
 399
 400 # Reservation information
class Reservation(RecordObject):
    """Reservation information: a valid flag plus an address.

    Unlike the other records in this file the constructor takes no
    `name` argument.
    """
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        # 64-bit address with the LINE_OFF_BITS low bits dropped
        self.addr  = Signal(64-LINE_OFF_BITS)
 406
 407
 408 class DTLBUpdate(Elaboratable):
 409     def __init__(self):
 410         self.tlbie    = Signal()
 411         self.tlbwe    = Signal()
 412         self.doall    = Signal()
 413         self.updated  = Signal()
 414         self.v_updated  = Signal()
 415         self.tlb_hit    = Signal()
 416         self.tlb_req_index = Signal(TLB_SET_BITS)
 417
 418         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
 419         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 420         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 421         self.repl_way        = Signal(TLB_WAY_BITS)
 422         self.eatag           = Signal(TLB_EA_TAG_BITS)
 423         self.pte_data        = Signal(TLB_PTE_BITS)
 424
 425         self.dv = Signal(TLB_PTE_WAY_BITS)
 426
 427         self.tb_out = Signal(TLB_TAG_WAY_BITS)
 428         self.pb_out = Signal(TLB_NUM_WAYS)
 429         self.db_out = Signal(TLB_PTE_WAY_BITS)
 430
 431     def elaborate(self, platform):
 432         m = Module()
 433         comb = m.d.comb
 434         sync = m.d.sync
 435
 436         tagset   = Signal(TLB_TAG_WAY_BITS)
 437         pteset   = Signal(TLB_PTE_WAY_BITS)
 438
 439         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 440
 441         with m.If(self.tlbie & self.doall):
 442             pass # clear all back in parent
 443         with m.Elif(self.tlbie):
 444             with m.If(self.tlb_hit):
 445                 comb += db_out.eq(self.dv)
 446                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
 447                 comb += self.v_updated.eq(1)
 448
 449         with m.Elif(self.tlbwe):
 450
 451             comb += tagset.eq(self.tlb_tag_way)
 452             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 453             comb += tb_out.eq(tagset)
 454
 455             comb += pteset.eq(self.tlb_pte_way)
 456             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 457             comb += pb_out.eq(pteset)
 458
 459             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 460
 461             comb += self.updated.eq(1)
 462             comb += self.v_updated.eq(1)
 463
 464         return m
 465
 466
class DCachePendingHit(Elaboratable):
    """Combinatorial cache hit detection for the pending request.

    Signals passed to the constructor (owned by the caller):
    * tlb_pte_way, tlb_valid_way: PTEs and valid bits of the TLB set
    * tlb_hit_way: TLB way that hit
    * cache_valid_idx: per-way valid bits of the addressed cache line
    * cache_tag_set: tag RAM row of the addressed cache line
    * req_addr: address of the request
    * hit_set: driven here, one hit flag per TLB way

    Signals created here: go, virt_mode, tlb_hit, reload_tag and
    req_index are inputs; is_hit, hit_way and rel_match are outputs.
    """

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                      cache_valid_idx, cache_tag_set,
                    req_addr,
                    hit_set):

        self.go          = Signal()
        self.virt_mode   = Signal()
        self.is_hit      = Signal()
        self.tlb_hit     = Signal()
        self.hit_way     = Signal(WAY_BITS)
        self.rel_match   = Signal()
        self.req_index   = Signal(INDEX_BITS)
        self.reload_tag  = Signal(TAG_BITS)

        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        # note: req_index is not referenced again in this method
        req_index = self.req_index
        reload_tag = self.reload_tag

        # per-TLB-way results, selected by tlb_hit_way further down
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS):
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal()
                s_pte       = Signal(TLB_PTE_BITS)
                s_ra        = Signal(REAL_ADDR_BITS)
                # candidate real address for TLB way j: page offset
                # from the request, upper bits from that way's PTE
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                # compare the candidate tag with every cache way; when
                # several ways match, the highest-numbered one is the
                # one recorded (later assignments take priority)
                for i in range(NUM_WAYS):
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                # does this candidate tag equal reload_tag?
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            # pick the results of the TLB way that actually hit; with
            # no TLB hit the outputs stay at their default of 0
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            # not in virtual mode: the tag comes straight from req_addr
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS):
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
 562
 563
 564 class DCache(Elaboratable):
 565     """Set associative dcache write-through
 566     TODO (in no specific order):
 567     * See list in icache.vhdl
 568     * Complete load misses on the cycle when WB data comes instead of
 569       at the end of line (this requires dealing with requests coming in
 570       while not idle...)
 571     """
    def __init__(self):
        """Create the external ports of the dcache."""
        # request/response ports towards the load-store unit
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        # request/response ports towards the MMU
        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # wishbone bus: master-out and slave-out records
        self.wb_out    = WBMasterOut()
        self.wb_in     = WBSlaveOut()

        self.log_out   = Signal(20)

 585
 586     def stage_0(self, m, r0, r1, r0_full):
 587         """Latch the request in r0.req as long as we're not stalling
 588         """
 589         comb = m.d.comb
 590         sync = m.d.sync
 591         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 592
 593         r = RegStage0("stage0")
 594
 595         # TODO, this goes in unit tests and formal proofs
 596         with m.If(d_in.valid & m_in.valid):
 597             sync += Display("request collision loadstore vs MMU")
 598
 599         with m.If(m_in.valid):
 600             sync += r.req.valid.eq(1)
 601             sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
 602             sync += r.req.dcbz.eq(0)
 603             sync += r.req.nc.eq(0)
 604             sync += r.req.reserve.eq(0)
 605             sync += r.req.virt_mode.eq(0)
 606             sync += r.req.priv_mode.eq(1)
 607             sync += r.req.addr.eq(m_in.addr)
 608             sync += r.req.data.eq(m_in.pte)
 609             sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 610             sync += r.tlbie.eq(m_in.tlbie)
 611             sync += r.doall.eq(m_in.doall)
 612             sync += r.tlbld.eq(m_in.tlbld)
 613             sync += r.mmu_req.eq(1)
 614         with m.Else():
 615             sync += r.req.eq(d_in)
 616             sync += r.tlbie.eq(0)
 617             sync += r.doall.eq(0)
 618             sync += r.tlbld.eq(0)
 619             sync += r.mmu_req.eq(0)
 620             with m.If(~(r1.full & r0_full)):
 621                 sync += r0.eq(r)
 622                 sync += r0_full.eq(r.req.valid)
 623
 624     def tlb_read(self, m, r0_stall, tlb_valid_way,
 625                  tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
 626                  dtlb_tags, dtlb_ptes):
 627         """TLB
 628         Operates in the second cycle on the request latched in r0.req.
 629         TLB updates write the entry at the end of the second cycle.
 630         """
 631         comb = m.d.comb
 632         sync = m.d.sync
 633         m_in, d_in = self.m_in, self.d_in
 634
 635         index    = Signal(TLB_SET_BITS)
 636         addrbits = Signal(TLB_SET_BITS)
 637
 638         amin = TLB_LG_PGSZ
 639         amax = TLB_LG_PGSZ + TLB_SET_BITS
 640
 641         with m.If(m_in.valid):
 642             comb += addrbits.eq(m_in.addr[amin : amax])
 643         with m.Else():
 644             comb += addrbits.eq(d_in.addr[amin : amax])
 645         comb += index.eq(addrbits)
 646
 647         # If we have any op and the previous op isn't finished,
 648         # then keep the same output for next cycle.
 649         with m.If(~r0_stall):
 650             sync += tlb_valid_way.eq(dtlb_valid_bits[index])
 651             sync += tlb_tag_way.eq(dtlb_tags[index])
 652             sync += tlb_pte_way.eq(dtlb_ptes[index])
 653
 654     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 655         """Generate TLB PLRUs
 656         """
 657         comb = m.d.comb
 658         sync = m.d.sync
 659
 660         if TLB_NUM_WAYS == 0:
 661             return
 662         for i in range(TLB_SET_SIZE):
 663             # TLB PLRU interface
 664             tlb_plru        = PLRU(TLB_WAY_BITS)
 665             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 666             tlb_plru_acc_en = Signal()
 667
 668             comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
 669             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 670             comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
 671             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 672
 673     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 674                    tlb_valid_way, tlb_tag_way, tlb_hit_way,
 675                    tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
 676
 677         comb = m.d.comb
 678         sync = m.d.sync
 679
 680         hitway = Signal(TLB_WAY_BITS)
 681         hit    = Signal()
 682         eatag  = Signal(TLB_EA_TAG_BITS)
 683
 684         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 685         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 686         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 687
 688         for i in range(TLB_NUM_WAYS):
 689             is_tag_hit = Signal()
 690             comb += is_tag_hit.eq(tlb_valid_way[i]
 691                                   & (read_tlb_tag(i, tlb_tag_way) == eatag))
 692             with m.If(is_tag_hit):
 693                 comb += hitway.eq(i)
 694                 comb += hit.eq(1)
 695
 696         comb += tlb_hit.eq(hit & r0_valid)
 697         comb += tlb_hit_way.eq(hitway)
 698
 699         with m.If(tlb_hit):
 700             comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
 701         with m.Else():
 702             comb += pte.eq(0)
 703         comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
 704         with m.If(r0.req.virt_mode):
 705             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 706                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 707                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 708             comb += perm_attr.reference.eq(pte[8])
 709             comb += perm_attr.changed.eq(pte[7])
 710             comb += perm_attr.nocache.eq(pte[5])
 711             comb += perm_attr.priv.eq(pte[3])
 712             comb += perm_attr.rd_perm.eq(pte[2])
 713             comb += perm_attr.wr_perm.eq(pte[1])
 714         with m.Else():
 715             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 716                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 717
 718             comb += perm_attr.reference.eq(1)
 719             comb += perm_attr.changed.eq(1)
 720             comb += perm_attr.nocache.eq(0)
 721             comb += perm_attr.priv.eq(1)
 722             comb += perm_attr.rd_perm.eq(1)
 723             comb += perm_attr.wr_perm.eq(1)
 724
 725     def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
 726                     tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
 727                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 728
 729         comb = m.d.comb
 730         sync = m.d.sync
 731
 732         tlbie    = Signal()
 733         tlbwe    = Signal()
 734
 735         comb += tlbie.eq(r0_valid & r0.tlbie)
 736         comb += tlbwe.eq(r0_valid & r0.tlbld)
 737
 738         m.submodules.tlb_update = d = DTLBUpdate()
 739         with m.If(tlbie & r0.doall):
 740             # clear all valid bits at once
 741             for i in range(TLB_SET_SIZE):
 742                 sync += dtlb_valid_bits[i].eq(0)
 743         with m.If(d.updated):
 744             sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
 745             sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
 746         with m.If(d.v_updated):
 747             sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
 748
 749         comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
 750
 751         comb += d.tlbie.eq(tlbie)
 752         comb += d.tlbwe.eq(tlbwe)
 753         comb += d.doall.eq(r0.doall)
 754         comb += d.tlb_hit.eq(tlb_hit)
 755         comb += d.tlb_hit_way.eq(tlb_hit_way)
 756         comb += d.tlb_tag_way.eq(tlb_tag_way)
 757         comb += d.tlb_pte_way.eq(tlb_pte_way)
 758         comb += d.tlb_req_index.eq(tlb_req_index)
 759
 760         with m.If(tlb_hit):
 761             comb += d.repl_way.eq(tlb_hit_way)
 762         with m.Else():
 763             comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
 764         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 765         comb += d.pte_data.eq(r0.req.data)
 766
 767     def maybe_plrus(self, m, r1, plru_victim):
 768         """Generate PLRUs
 769         """
 770         comb = m.d.comb
 771         sync = m.d.sync
 772
 773         if TLB_NUM_WAYS == 0:
 774             return
 775
 776         for i in range(NUM_LINES):
 777             # PLRU interface
 778             plru        = PLRU(WAY_BITS)
 779             setattr(m.submodules, "plru%d" % i, plru)
 780             plru_acc_en = Signal()
 781
 782             comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
 783             comb += plru.acc_en.eq(plru_acc_en)
 784             comb += plru.acc_i.eq(r1.hit_way)
 785             comb += plru_victim[i].eq(plru.lru_o)
 786
 787     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 788         """Cache tag RAM read port
 789         """
 790         comb = m.d.comb
 791         sync = m.d.sync
 792         m_in, d_in = self.m_in, self.d_in
 793
 794         index = Signal(INDEX_BITS)
 795
 796         with m.If(r0_stall):
 797             comb += index.eq(req_index)
 798         with m.Elif(m_in.valid):
 799             comb += index.eq(get_index(m_in.addr))
 800         with m.Else():
 801             comb += index.eq(get_index(d_in.addr))
 802         sync += cache_tag_set.eq(cache_tags[index])
 803
    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valids, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection

        Purely combinatorial (nothing is added to the sync domain here).
        From the request held in r0 and the translated address ra this
        drives:

        * req_index / req_row / req_tag -- line, row and tag of the request
        * req_hit_way                   -- way that matched
        * replace_way                   -- way to replace on a miss
        * use_forward1_next/2_next      -- whether a load must take data
                                           from the store/refill bypass
        * rc_ok / perm_ok / access_ok   -- permission checks from perm_attr
        * req_op / req_go               -- decoded operation and "go" flag
        * req_same_tag                  -- rel_match output of the
                                           DCachePendingHit submodule
        * early_req_row                 -- row number for the BRAM read,
                                           available one cycle earlier

        Statement order matters: several signals (is_hit, hit_way, op)
        get a default assignment first and are then overridden inside
        later m.If blocks.
        """

        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit      = Signal()
        hit_way     = Signal(WAY_BITS)
        op          = Signal(Op)
        opsel       = Signal(3)
        go          = Signal()
        nc          = Signal()
        hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        cache_valid_idx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        # (index and row come from the address in r0, the tag from ra)
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                    r0.req.addr, ra, req_index, req_tag, req_row)

        # a request proceeds only if r0 is valid, it is not a TLB
        # operation (tlbie / tlbld) and r1 is not flagging an error
        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        # valid bits (one per way) of the line being accessed
        comb += cache_valid_idx.eq(cache_valids[req_index])

        # hit detection is delegated to the DCachePendingHit submodule
        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                tlb_valid_way, tlb_hit_way,
                                cache_valid_idx, cache_tag_set,
                                r0.req.addr,
                                hit_set)

        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)
        # defaults for is_hit / hit_way; may be overridden just below
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        # second bypass stage: row/way written one cycle earlier
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        # (taken from the PLRU while the tag is being written,
        #  afterwards from the way remembered in r1.store_way)
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed)
                )
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
                           (perm_attr.wr_perm |
                              (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                # opsel bit 0 = is_hit, bit 1 = nc, bit 2 = load
                # (Cat places its first argument in the LSB)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)
 935
 936     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 937                          r0_valid, r0, reservation):
 938         """Handle load-with-reservation and store-conditional instructions
 939         """
 940         comb = m.d.comb
 941         sync = m.d.sync
 942
 943         with m.If(r0_valid & r0.req.reserve):
 944             # XXX generate alignment interrupt if address
 945             # is not aligned XXX or if r0.req.nc = '1'
 946             with m.If(r0.req.load):
 947                 comb += set_rsrv.eq(1) # load with reservation
 948             with m.Else():
 949                 comb += clear_rsrv.eq(1) # store conditional
 950                 with m.If(~reservation.valid |
 951                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
 952                     comb += cancel_store.eq(1)
 953
 954     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
 955                         reservation, r0):
 956
 957         comb = m.d.comb
 958         sync = m.d.sync
 959
 960         with m.If(r0_valid & access_ok):
 961             with m.If(clear_rsrv):
 962                 sync += reservation.valid.eq(0)
 963             with m.Elif(set_rsrv):
 964                 sync += reservation.valid.eq(1)
 965                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 966
 967     def writeback_control(self, m, r1, cache_out):
 968         """Return data for loads & completion control logic
 969         """
 970         comb = m.d.comb
 971         sync = m.d.sync
 972         d_out, m_out = self.d_out, self.m_out
 973
 974         data_out = Signal(64)
 975         data_fwd = Signal(64)
 976
 977         # Use the bypass if are reading the row that was
 978         # written 1 or 2 cycles ago, including for the
 979         # slow_valid = 1 case (i.e. completing a load
 980         # miss or a non-cacheable load).
 981         with m.If(r1.use_forward1):
 982             comb += data_fwd.eq(r1.forward_data1)
 983         with m.Else():
 984             comb += data_fwd.eq(r1.forward_data2)
 985
 986         comb += data_out.eq(cache_out[r1.hit_way])
 987
 988         for i in range(8):
 989             with m.If(r1.forward_sel[i]):
 990                 dsel = data_fwd.word_select(i, 8)
 991                 comb += data_out.word_select(i, 8).eq(dsel)
 992
 993         comb += d_out.valid.eq(r1.ls_valid)
 994         comb += d_out.data.eq(data_out)
 995         comb += d_out.store_done.eq(~r1.stcx_fail)
 996         comb += d_out.error.eq(r1.ls_error)
 997         comb += d_out.cache_paradox.eq(r1.cache_paradox)
 998
 999         # Outputs to MMU
1000         comb += m_out.done.eq(r1.mmu_done)
1001         comb += m_out.err.eq(r1.mmu_error)
1002         comb += m_out.data.eq(data_out)
1003
1004         # We have a valid load or store hit or we just completed
1005         # a slow op such as a load miss, a NC load or a store
1006         #
1007         # Note: the load hit is delayed by one cycle. However it
1008         # can still not collide with r.slow_valid (well unless I
1009         # miscalculated) because slow_valid can only be set on a
1010         # subsequent request and not on its first cycle (the state
1011         # machine must have advanced), which makes slow_valid
1012         # at least 2 cycles from the previous hit_load_valid.
1013
1014         # Sanity: Only one of these must be set in any given cycle
1015
1016         if False: # TODO: need Display to get this to work
1017             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1018             "unexpected slow_valid collision with stcx_fail"
1019
1020             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1021              "unexpected hit_load_delayed collision with slow_valid"
1022
1023         with m.If(~r1.mmu_req):
1024             # Request came from loadstore1...
1025             # Load hit case is the standard path
1026             with m.If(r1.hit_load_valid):
1027                 sync += Display("completing load hit data=%x", data_out)
1028
1029             # error cases complete without stalling
1030             with m.If(r1.ls_error):
1031                 sync += Display("completing ld/st with error")
1032
1033             # Slow ops (load miss, NC, stores)
1034             with m.If(r1.slow_valid):
1035                 sync += Display("completing store or load miss data=%x",
1036                                 data_out)
1037
1038         with m.Else():
1039             # Request came from MMU
1040             with m.If(r1.hit_load_valid):
1041                 sync += Display("completing load hit to MMU, data=%x",
1042                                 m_out.data)
1043             # error cases complete without stalling
1044             with m.If(r1.mmu_error):
1045                 sync += Display("combpleting MMU ld with error")
1046
1047             # Slow ops (i.e. load miss)
1048             with m.If(r1.slow_valid):
1049                 sync += Display("completing MMU load miss, data=%x",
1050                                 m_out.data)
1051
1052     def rams(self, m, r1, early_req_row, cache_out, replace_way):
1053         """rams
1054         Generate a cache RAM for each way. This handles the normal
1055         reads, writes from reloads and the special store-hit update
1056         path as well.
1057
1058         Note: the BRAMs have an extra read buffer, meaning the output
1059         is pipelined an extra cycle. This differs from the
1060         icache. The writeback logic needs to take that into
1061         account by using 1-cycle delayed signals for load hits.
1062         """
1063         comb = m.d.comb
1064         wb_in = self.wb_in
1065
1066         for i in range(NUM_WAYS):
1067             do_read  = Signal(name="do_rd%d" % i)
1068             rd_addr  = Signal(ROW_BITS)
1069             do_write = Signal(name="do_wr%d" % i)
1070             wr_addr  = Signal(ROW_BITS)
1071             wr_data  = Signal(WB_DATA_BITS)
1072             wr_sel   = Signal(ROW_SIZE)
1073             wr_sel_m = Signal(ROW_SIZE)
1074             _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i)
1075
1076             way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
1077             setattr(m.submodules, "cacheram_%d" % i, way)
1078
1079             comb += way.rd_en.eq(do_read)
1080             comb += way.rd_addr.eq(rd_addr)
1081             comb += _d_out.eq(way.rd_data_o)
1082             comb += way.wr_sel.eq(wr_sel_m)
1083             comb += way.wr_addr.eq(wr_addr)
1084             comb += way.wr_data.eq(wr_data)
1085
1086             # Cache hit reads
1087             comb += do_read.eq(1)
1088             comb += rd_addr.eq(early_req_row[:ROW_BITS])
1089             comb += cache_out[i].eq(_d_out)
1090
1091             # Write mux:
1092             #
1093             # Defaults to wishbone read responses (cache refill)
1094             #
1095             # For timing, the mux on wr_data/sel/addr is not
1096             # dependent on anything other than the current state.
1097
1098             with m.If(r1.write_bram):
1099                 # Write store data to BRAM.  This happens one
1100                 # cycle after the store is in r0.
1101                 comb += wr_data.eq(r1.req.data)
1102                 comb += wr_sel.eq(r1.req.byte_sel)
1103                 comb += wr_addr.eq(get_row(r1.req.real_addr))
1104
1105                 with m.If(i == r1.req.hit_way):
1106                     comb += do_write.eq(1)
1107             with m.Else():
1108                 # Otherwise, we might be doing a reload or a DCBZ
1109                 with m.If(r1.dcbz):
1110                     comb += wr_data.eq(0)
1111                 with m.Else():
1112                     comb += wr_data.eq(wb_in.dat)
1113                 comb += wr_addr.eq(r1.store_row)
1114                 comb += wr_sel.eq(~0) # all 1s
1115
1116             with m.If((r1.state == State.RELOAD_WAIT_ACK)
1117                       & wb_in.ack & (replace_way == i)):
1118                 comb += do_write.eq(1)
1119
1120             # Mask write selects with do_write since BRAM
1121             # doesn't have a global write-enable
1122             with m.If(do_write):
1123                 comb += wr_sel_m.eq(wr_sel)
1124
1125     # Cache hit synchronous machine for the easy case.
1126     # This handles load hits.
1127     # It also handles error cases (TLB miss, cache paradox)
1128     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1129                         req_hit_way, req_index, req_tag, access_ok,
1130                         tlb_hit, tlb_hit_way, tlb_req_index):
1131
1132         comb = m.d.comb
1133         sync = m.d.sync
1134
1135         with m.If(req_op != Op.OP_NONE):
1136             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1137                     req_op, r0.req.addr, r0.req.nc,
1138                     req_index, req_tag, req_hit_way)
1139
1140         with m.If(r0_valid):
1141             sync += r1.mmu_req.eq(r0.mmu_req)
1142
1143         # Fast path for load/store hits.
1144         # Set signals for the writeback controls.
1145         sync += r1.hit_way.eq(req_hit_way)
1146         sync += r1.hit_index.eq(req_index)
1147
1148         with m.If(req_op == Op.OP_LOAD_HIT):
1149             sync += r1.hit_load_valid.eq(1)
1150         with m.Else():
1151             sync += r1.hit_load_valid.eq(0)
1152
1153         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1154             sync += r1.cache_hit.eq(1)
1155         with m.Else():
1156             sync += r1.cache_hit.eq(0)
1157
1158         with m.If(req_op == Op.OP_BAD):
1159             # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1160             #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
1161             sync += r1.ls_error.eq(~r0.mmu_req)
1162             sync += r1.mmu_error.eq(r0.mmu_req)
1163             sync += r1.cache_paradox.eq(access_ok)
1164
1165             with m.Else():
1166                 sync += r1.ls_error.eq(0)
1167                 sync += r1.mmu_error.eq(0)
1168                 sync += r1.cache_paradox.eq(0)
1169
1170         with m.If(req_op == Op.OP_STCX_FAIL):
1171             r1.stcx_fail.eq(1)
1172         with m.Else():
1173             sync += r1.stcx_fail.eq(0)
1174
1175         # Record TLB hit information for updating TLB PLRU
1176         sync += r1.tlb_hit.eq(tlb_hit)
1177         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1178         sync += r1.tlb_hit_index.eq(tlb_req_index)
1179
1180     # Memory accesses are handled by this state machine:
1181     #
1182     #   * Cache load miss/reload (in conjunction with "rams")
1183     #   * Load hits for non-cachable forms
1184     #   * Stores (the collision case is handled in "rams")
1185     #
    # All wishbone request generation is done here.
1187     # This machine operates at stage 1.
1188     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1189                     cache_valids, r0, replace_way,
1190                     req_hit_way, req_same_tag,
1191                     r0_valid, req_op, cache_tags, req_go, ra):
1192
1193         comb = m.d.comb
1194         sync = m.d.sync
1195         wb_in = self.wb_in
1196
1197         req         = MemAccessRequest("mreq_ds")
1198         acks        = Signal(3)
1199         adjust_acks = Signal(3)
1200
1201         req_row = Signal(ROW_BITS)
1202         req_idx = Signal(INDEX_BITS)
1203         req_tag = Signal(TAG_BITS)
1204         comb += req_idx.eq(get_index(req.real_addr))
1205         comb += req_row.eq(get_row(req.real_addr))
1206         comb += req_tag.eq(get_tag(req.real_addr))
1207
1208         sync += r1.use_forward1.eq(use_forward1_next)
1209         sync += r1.forward_sel.eq(0)
1210
1211         with m.If(use_forward1_next):
1212             sync += r1.forward_sel.eq(r1.req.byte_sel)
1213         with m.Elif(use_forward2_next):
1214             sync += r1.forward_sel.eq(r1.forward_sel1)
1215
1216         sync += r1.forward_data2.eq(r1.forward_data1)
1217         with m.If(r1.write_bram):
1218             sync += r1.forward_data1.eq(r1.req.data)
1219             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1220             sync += r1.forward_way1.eq(r1.req.hit_way)
1221             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1222             sync += r1.forward_valid1.eq(1)
1223         with m.Else():
1224             with m.If(r1.dcbz):
1225                 sync += r1.forward_data1.eq(0)
1226             with m.Else():
1227                 sync += r1.forward_data1.eq(wb_in.dat)
1228             sync += r1.forward_sel1.eq(~0) # all 1s
1229             sync += r1.forward_way1.eq(replace_way)
1230             sync += r1.forward_row1.eq(r1.store_row)
1231             sync += r1.forward_valid1.eq(0)
1232
1233         # One cycle pulses reset
1234         sync += r1.slow_valid.eq(0)
1235         sync += r1.write_bram.eq(0)
1236         sync += r1.inc_acks.eq(0)
1237         sync += r1.dec_acks.eq(0)
1238
1239         sync += r1.ls_valid.eq(0)
1240         # complete tlbies and TLB loads in the third cycle
1241         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1242
1243         with m.If((req_op == Op.OP_LOAD_HIT)
1244                   | (req_op == Op.OP_STCX_FAIL)):
1245             with m.If(~r0.mmu_req):
1246                 sync += r1.ls_valid.eq(1)
1247             with m.Else():
1248                 sync += r1.mmu_done.eq(1)
1249
1250         with m.If(r1.write_tag):
1251             # Store new tag in selected way
1252             for i in range(NUM_WAYS):
1253                 with m.If(i == replace_way):
1254                     ct = Signal(TAG_RAM_WIDTH)
1255                     comb += ct.eq(cache_tags[r1.store_index])
1256                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1257                     sync += cache_tags[r1.store_index].eq(ct)
1258             sync += r1.store_way.eq(replace_way)
1259             sync += r1.write_tag.eq(0)
1260
1261         # Take request from r1.req if there is one there,
1262         # else from req_op, ra, etc.
1263         with m.If(r1.full):
1264             comb += req.eq(r1.req)
1265         with m.Else():
1266             comb += req.op.eq(req_op)
1267             comb += req.valid.eq(req_go)
1268             comb += req.mmu_req.eq(r0.mmu_req)
1269             comb += req.dcbz.eq(r0.req.dcbz)
1270             comb += req.real_addr.eq(ra)
1271
1272             with m.If(~r0.req.dcbz):
1273                 comb += req.data.eq(r0.req.data)
1274             with m.Else():
1275                 comb += req.data.eq(0)
1276
1277             # Select all bytes for dcbz
1278             # and for cacheable loads
1279             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1280                 comb += req.byte_sel.eq(~0) # all 1s
1281             with m.Else():
1282                 comb += req.byte_sel.eq(r0.req.byte_sel)
1283             comb += req.hit_way.eq(req_hit_way)
1284             comb += req.same_tag.eq(req_same_tag)
1285
1286             # Store the incoming request from r0,
1287             # if it is a slow request
1288             # Note that r1.full = 1 implies req_op = OP_NONE
1289             with m.If((req_op == Op.OP_LOAD_MISS)
1290                       | (req_op == Op.OP_LOAD_NC)
1291                       | (req_op == Op.OP_STORE_MISS)
1292                       | (req_op == Op.OP_STORE_HIT)):
1293                 sync += r1.req.eq(req)
1294                 sync += r1.full.eq(1)
1295
1296         # Main state machine
1297         with m.Switch(r1.state):
1298
1299             with m.Case(State.IDLE):
1300                 sync += r1.real_adr.eq(req.real_addr)
1301                 sync += r1.wb.sel.eq(req.byte_sel)
1302                 sync += r1.wb.dat.eq(req.data)
1303                 sync += r1.dcbz.eq(req.dcbz)
1304
1305                 # Keep track of our index and way
1306                 # for subsequent stores.
1307                 sync += r1.store_index.eq(req_idx)
1308                 sync += r1.store_row.eq(req_row)
1309                 sync += r1.end_row_ix.eq(get_row_of_line(req_row))
1310                 sync += r1.reload_tag.eq(req_tag)
1311                 sync += r1.req.same_tag.eq(1)
1312
1313                 with m.If(req.op == Op.OP_STORE_HIT):
1314                     sync += r1.store_way.eq(req.hit_way)
1315
1316                 # Reset per-row valid bits,
1317                 # ready for handling OP_LOAD_MISS
1318                 for i in range(ROW_PER_LINE):
1319                     sync += r1.rows_valid[i].eq(0)
1320
1321                 with m.If(req_op != Op.OP_NONE):
1322                     sync += Display("cache op %d", req.op)
1323
1324                 with m.Switch(req.op):
1325                     with m.Case(Op.OP_LOAD_HIT):
1326                         # stay in IDLE state
1327                         pass
1328
1329                     with m.Case(Op.OP_LOAD_MISS):
1330                         sync += Display("cache miss real addr: %x " \
1331                                 "idx: %x tag: %x",
1332                                 req.real_addr, req_row, req_tag)
1333
1334                         # Start the wishbone cycle
1335                         sync += r1.wb.we.eq(0)
1336                         sync += r1.wb.cyc.eq(1)
1337                         sync += r1.wb.stb.eq(1)
1338
1339                         # Track that we had one request sent
1340                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1341                         sync += r1.write_tag.eq(1)
1342
1343                     with m.Case(Op.OP_LOAD_NC):
1344                         sync += r1.wb.cyc.eq(1)
1345                         sync += r1.wb.stb.eq(1)
1346                         sync += r1.wb.we.eq(0)
1347                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1348
1349                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1350                         with m.If(~req.dcbz):
1351                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1352                             sync += r1.acks_pending.eq(1)
1353                             sync += r1.full.eq(0)
1354                             sync += r1.slow_valid.eq(1)
1355
1356                             with m.If(~req.mmu_req):
1357                                 sync += r1.ls_valid.eq(1)
1358                             with m.Else():
1359                                 sync += r1.mmu_done.eq(1)
1360
1361                             with m.If(req.op == Op.OP_STORE_HIT):
1362                                 sync += r1.write_bram.eq(1)
1363                         with m.Else():
1364                             # dcbz is handled much like a load miss except
1365                             # that we are writing to memory instead of reading
1366                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1367
1368                             with m.If(req.op == Op.OP_STORE_MISS):
1369                                 sync += r1.write_tag.eq(1)
1370
1371                         sync += r1.wb.we.eq(1)
1372                         sync += r1.wb.cyc.eq(1)
1373                         sync += r1.wb.stb.eq(1)
1374
1375                     # OP_NONE and OP_BAD do nothing
1376                     # OP_BAD & OP_STCX_FAIL were
1377                     # handled above already
1378                     with m.Case(Op.OP_NONE):
1379                         pass
1380                     with m.Case(Op.OP_BAD):
1381                         pass
1382                     with m.Case(Op.OP_STCX_FAIL):
1383                         pass
1384
1385             with m.Case(State.RELOAD_WAIT_ACK):
1386                 ld_stbs_done = Signal()
1387                 # Requests are all sent if stb is 0
1388                 comb += ld_stbs_done.eq(~r1.wb.stb)
1389
1390                 with m.If((~wb_in.stall) & r1.wb.stb):
1391                     # That was the last word?
1392                     # We are done sending.
1393                     # Clear stb and set ld_stbs_done
1394                     # so we can handle an eventual
1395                     # last ack on the same cycle.
1396                     with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
1397                         sync += r1.wb.stb.eq(0)
1398                         comb += ld_stbs_done.eq(1)
1399
1400                     # Calculate the next row address in the current cache line
1401                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1402                     comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
1403                     sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
1404
1405                 # Incoming acks processing
1406                 sync += r1.forward_valid1.eq(wb_in.ack)
1407                 with m.If(wb_in.ack):
1408                     srow = Signal(ROW_LINE_BITS)
1409                     comb += srow.eq(r1.store_row)
1410                     sync += r1.rows_valid[srow].eq(1)
1411
1412                     # If this is the data we were looking for,
1413                     # we can complete the request next cycle.
1414                     # Compare the whole address in case the
1415                     # request in r1.req is not the one that
1416                     # started this refill.
1417                     with m.If(r1.full & r1.req.same_tag &
1418                               ((r1.dcbz & r1.req.dcbz) |
1419                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1420                                 (r1.store_row == get_row(r1.req.real_addr))):
1421                         sync += r1.full.eq(0)
1422                         sync += r1.slow_valid.eq(1)
1423                         with m.If(~r1.mmu_req):
1424                             sync += r1.ls_valid.eq(1)
1425                         with m.Else():
1426                             sync += r1.mmu_done.eq(1)
1427                         sync += r1.forward_sel.eq(~0) # all 1s
1428                         sync += r1.use_forward1.eq(1)
1429
1430                     # Check for completion
1431                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1432                                                       r1.end_row_ix)):
1433                         # Complete wishbone cycle
1434                         sync += r1.wb.cyc.eq(0)
1435
1436                         # Cache line is now valid
1437                         cv = Signal(INDEX_BITS)
1438                         comb += cv.eq(cache_valids[r1.store_index])
1439                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1440                         sync += cache_valids[r1.store_index].eq(cv)
1441                         sync += r1.state.eq(State.IDLE)
1442
1443                     # Increment store row counter
1444                     sync += r1.store_row.eq(next_row(r1.store_row))
1445
1446             with m.Case(State.STORE_WAIT_ACK):
1447                 st_stbs_done = Signal()
1448                 comb += st_stbs_done.eq(~r1.wb.stb)
1449                 comb += acks.eq(r1.acks_pending)
1450
1451                 with m.If(r1.inc_acks != r1.dec_acks):
1452                     with m.If(r1.inc_acks):
1453                         comb += adjust_acks.eq(acks + 1)
1454                     with m.Else():
1455                         comb += adjust_acks.eq(acks - 1)
1456                 with m.Else():
1457                     comb += adjust_acks.eq(acks)
1458
1459                 sync += r1.acks_pending.eq(adjust_acks)
1460
1461                 # Clear stb when slave accepted request
1462                 with m.If(~wb_in.stall):
1463                     # See if there is another store waiting
1464                     # to be done which is in the same real page.
1465                     with m.If(req.valid):
1466                         ra = req.real_addr[0:SET_SIZE_BITS]
1467                         sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
1468                         sync += r1.wb.dat.eq(req.data)
1469                         sync += r1.wb.sel.eq(req.byte_sel)
1470
1471                     with m.Elif((adjust_acks < 7) & req.same_tag &
1472                                 ((req.op == Op.OP_STORE_MISS)
1473                                  | (req.op == Op.OP_STORE_HIT))):
1474                         sync += r1.wb.stb.eq(1)
1475                         comb += st_stbs_done.eq(0)
1476
1477                         with m.If(req.op == Op.OP_STORE_HIT):
1478                             sync += r1.write_bram.eq(1)
1479                         sync += r1.full.eq(0)
1480                         sync += r1.slow_valid.eq(1)
1481
1482                         # Store requests never come from the MMU
1483                         sync += r1.ls_valid.eq(1)
1484                         comb += st_stbs_done.eq(0)
1485                         sync += r1.inc_acks.eq(1)
1486                     with m.Else():
1487                         sync += r1.wb.stb.eq(0)
1488                         comb += st_stbs_done.eq(1)
1489
1490                 # Got ack ? See if complete.
1491                 with m.If(wb_in.ack):
1492                     with m.If(st_stbs_done & (adjust_acks == 1)):
1493                         sync += r1.state.eq(State.IDLE)
1494                         sync += r1.wb.cyc.eq(0)
1495                         sync += r1.wb.stb.eq(0)
1496                     sync += r1.dec_acks.eq(1)
1497
1498             with m.Case(State.NC_LOAD_WAIT_ACK):
1499                 # Clear stb when slave accepted request
1500                 with m.If(~wb_in.stall):
1501                     sync += r1.wb.stb.eq(0)
1502
1503                 # Got ack ? complete.
1504                 with m.If(wb_in.ack):
1505                     sync += r1.state.eq(State.IDLE)
1506                     sync += r1.full.eq(0)
1507                     sync += r1.slow_valid.eq(1)
1508
1509                     with m.If(~r1.mmu_req):
1510                         sync += r1.ls_valid.eq(1)
1511                     with m.Else():
1512                         sync += r1.mmu_done.eq(1)
1513
1514                     sync += r1.forward_sel.eq(~0) # all 1s
1515                     sync += r1.use_forward1.eq(1)
1516                     sync += r1.wb.cyc.eq(0)
1517                     sync += r1.wb.stb.eq(0)
1518
1519     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1520
1521         sync = m.d.sync
1522         d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1523
1524         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1525                                stall_out, req_op[:3], d_out.valid, d_out.error,
1526                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1527                                r1.real_adr[3:6]))
1528
    def elaborate(self, platform):
        """Build and return the Module that makes up the data cache.

        Creates the storage arrays, the two pipeline register stages
        (r0, r1), the reservation record and all the signals shared
        between the helper methods, then calls each helper in turn so
        that it adds its logic to the same Module.

        platform -- not used in this method

        NOTE(review): nmigen derives a default Signal name from the
        local variable it is assigned to, so renaming the locals below
        is expected to rename nets in generated RTLIL/VCD -- confirm
        before refactoring.
        """

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)
        cache_valids = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags       = TLBTagsArray()
        dtlb_ptes       = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        # Pipeline registers: stage 0 plus its "full" flag, then stage 1
        r0      = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        # NOTE(review): req_data is created here but is not passed to any
        # of the helper calls below.
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row     = Signal(ROW_BITS)

        # Reservation control, shared by reservation_comb/reservation_reg
        cancel_store      = Signal()
        set_rsrv          = Signal()
        clear_rsrv        = Signal()

        # Driven further down from r0_full and r1.full
        r0_valid          = Signal()
        r0_stall          = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out         = CacheRamOut()

        plru_victim       = PLRUOut()
        replace_way       = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        # NOTE(review): bus_sel is created here but is not passed to any
        # of the helper calls below.
        bus_sel           = Signal(8)

        # TLB signals
        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = Signal()
        tlb_hit_way   = Signal(TLB_WAY_BITS)
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += r1.wb.adr.eq(r1.real_adr[ROW_OFF_BITS:]) # truncate LSBs
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                           r0_valid, r1, cache_valids, replace_way,
                           use_forward1_next, use_forward2_next,
                           req_hit_way, plru_victim, rc_ok, perm_attr,
                           valid_ra, perm_ok, access_ok, req_op, req_go,
                           tlb_pte_way,
                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                           cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                           r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                           reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                    cache_valids, r0, replace_way,
                    req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # Logging is left disabled.
        # NOTE(review): `stall_out` is not a local name in this method (the
        # output used above is self.stall_out), and the body of dcache_log
        # refers to req_op, which this call does not supply -- both need
        # sorting out before the call is re-enabled.
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m
1659
def dcache_load(dut, addr, nc=0):
    """Simulation helper: issue one load to the dcache, return its data.

    Drives a load request onto ``dut.d_in`` for one clock, removes it
    again, then polls ``dut.d_out.valid`` (one clock between polls)
    until it reads as true.

    dut  -- object exposing the ``d_in`` and ``d_out`` ports
    addr -- value driven onto ``d_in.addr``
    nc   -- value driven onto ``d_in.nc`` (default 0)

    Returns the value read from ``dut.d_out.data``.
    """
    request, response = dut.d_in, dut.d_out

    # present the request; byte_sel is driven with all ones
    for field, value in (("load", 1), ("nc", nc), ("addr", addr),
                         ("byte_sel", ~0), ("valid", 1)):
        yield getattr(request, field).eq(value)
    yield

    # withdraw the request again
    for field in ("valid", "byte_sel"):
        yield getattr(request, field).eq(0)
    yield

    # wait for the response to be flagged valid
    while True:
        ready = yield response.valid
        if ready:
            break
        yield

    result = yield response.data
    return result
1674
1675
def dcache_store(dut, addr, data, nc=0):
    """Simulation helper: issue one store to the dcache and wait for it.

    Drives a store request onto ``dut.d_in`` for one clock, removes it
    again, then polls ``dut.d_out.valid`` (one clock between polls)
    until it reads as true.

    dut  -- object exposing the ``d_in`` and ``d_out`` ports
    addr -- value driven onto ``d_in.addr``
    data -- value driven onto ``d_in.data``
    nc   -- value driven onto ``d_in.nc`` (default 0)
    """
    request, response = dut.d_in, dut.d_out

    # present the request; byte_sel is driven with all ones
    for field, value in (("load", 0), ("nc", nc), ("data", data),
                         ("byte_sel", ~0), ("addr", addr), ("valid", 1)):
        yield getattr(request, field).eq(value)
    yield

    # withdraw the request again
    for field in ("valid", "byte_sel"):
        yield getattr(request, field).eq(0)
    yield

    # wait for the store to be flagged as done
    while True:
        done = yield response.valid
        if done:
            break
        yield
1689
1690
def dcache_random_sim(dut):
    """Randomised store/load check of the dcache against a software model.

    Keeps a Python list as the reference memory (one entry per 8-byte
    word, initially zero).  For 256 rounds it loads then stores a random
    value at a random word, then loads another random word and asserts
    that it matches the reference.  Finally words 0..255 are all read
    back and compared.

    dut -- dcache under test (``d_in``/``m_in`` ports are driven here)
    """
    # reference memory, all zeros to start with
    model = [0] * 512

    d_in, m_in = dut.d_in, dut.m_in

    # put every input into a known idle state
    yield d_in.valid.eq(0)
    yield d_in.load.eq(0)
    yield d_in.priv_mode.eq(1)
    yield d_in.nc.eq(0)
    yield d_in.addr.eq(0)
    yield d_in.data.eq(0)
    yield m_in.valid.eq(0)
    yield m_in.addr.eq(0)
    yield m_in.pte.eq(0)
    # let four clocks pass
    for _ in range(4):
        yield

    print()

    for _ in range(256):
        # choose a word and a new 64-bit value, and record it in the model
        word = randint(0, 255)
        value = randint(0, (1 << 64) - 1)
        model[word] = value
        byte_addr = word * 8

        print("testing %x data %x" % (byte_addr, value))

        # load the location first (result discarded), then overwrite it
        yield from dcache_load(dut, byte_addr)
        yield from dcache_store(dut, byte_addr, value)

        # read some other random word and compare it with the model
        word = randint(0, 255)
        expected = model[word]
        byte_addr = word * 8

        got = yield from dcache_load(dut, byte_addr)
        assert got == expected, \
            "check %x data %x != %x" % (byte_addr, got, expected)

    # final sweep over every word the rounds above could have touched
    for word, expected in enumerate(model[:256]):
        got = yield from dcache_load(dut, word * 8)
        assert got == expected, \
            "final check %x data %x != %x" % (word * 8, got, expected)
1737
def dcache_sim(dut):
    """Directed load/store sequence checked against fixed expected data.

    After idling the inputs, performs five loads (the last one
    non-cacheable), two stores to 0x530, and two further loads, and
    asserts the data returned by every load.

    dut -- dcache under test (``d_in``/``m_in`` ports are driven here)
    """
    d_in, m_in = dut.d_in, dut.m_in

    # put every input into a known idle state
    yield d_in.valid.eq(0)
    yield d_in.load.eq(0)
    yield d_in.priv_mode.eq(1)
    yield d_in.nc.eq(0)
    yield d_in.addr.eq(0)
    yield d_in.data.eq(0)
    yield m_in.valid.eq(0)
    yield m_in.addr.eq(0)
    yield m_in.pte.eq(0)
    # let four clocks pass
    for _ in range(4):
        yield

    # each entry: (load address, nc flag, expected data, failure format)
    reads_before_stores = (
        # cacheable read of 0x58
        (0x58, 0, 0x0000001700000016,
         "data @%x=%x expected 0x0000001700000016"),
        # cacheable read of 0x20
        (0x20, 0, 0x0000000900000008,
         "data @%x=%x expected 0x0000000900000008"),
        # cacheable read of 0x530
        (0x530, 0, 0x0000014D0000014C,
         "data @%x=%x expected 0000014D0000014C"),
        # second cacheable read of 0x530
        (0x530, 0, 0x0000014D0000014C,
         "data @%x=%x expected 0000014D0000014C"),
        # non-cacheable read of 0x100
        (0x100, 1, 0x0000004100000040,
         "data @%x=%x expected 0000004100000040"),
    )
    for load_addr, nc, expected, fmt in reads_before_stores:
        got = yield from dcache_load(dut, load_addr, nc=nc)
        seen_addr = yield d_in.addr
        assert got == expected, fmt % (seen_addr, got)

    # two back-to-back stores to 0x530; the second value must win
    yield from dcache_store(dut, 0x530, 0x121)
    yield from dcache_store(dut, 0x530, 0x12345678)

    reads_after_stores = (
        # third cacheable read of 0x530 sees the last store
        (0x530, 0x12345678,
         "data @%x=%x expected 0x12345678"),
        # 0x20 was never stored to, so it still holds the initial data
        (0x20, 0x0000000900000008,
         "data @%x=%x expected 0x0000000900000008"),
    )
    for load_addr, expected, fmt in reads_after_stores:
        got = yield from dcache_load(dut, load_addr)
        seen_addr = yield d_in.addr
        assert got == expected, fmt % (seen_addr, got)

    # a few trailing clocks before the process ends
    for _ in range(4):
        yield
1807
1808
def test_dcache(mem, test_fn, test_name):
    """Simulate a DCache wired to an SRAM and run one test process on it.

    mem       -- initial contents handed to the backing Memory (``init=``)
    test_fn   -- generator function taking the dut; run as a sync process
    test_name -- suffix inserted into the VCD file name

    Writes the waveform to 'test_dcache<test_name>.vcd'.
    """
    m = Module()

    # device under test
    dut = DCache()
    m.submodules.dcache = dut

    # backing store behind the cache's wishbone port
    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)
    m.submodules.sram = sram

    comb = m.d.comb
    bus = sram.bus
    request, reply = dut.wb_out, dut.wb_in

    # cache -> memory
    comb += bus.cyc.eq(request.cyc)
    comb += bus.stb.eq(request.stb)
    comb += bus.we.eq(request.we)
    comb += bus.sel.eq(request.sel)
    comb += bus.adr.eq(request.adr)
    comb += bus.dat_w.eq(request.dat)

    # memory -> cache
    comb += reply.ack.eq(bus.ack)
    comb += reply.dat.eq(bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(test_fn(dut)))

    vcd_name = 'test_dcache%s.vcd' % test_name
    with sim.write_vcd(vcd_name):
        sim.run()
1836
if __name__ == '__main__':
    # dump the bare dcache as RTLIL
    dut = DCache()
    il_text = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as il_file:
        il_file.write(il_text)

    # memory image for the directed test: 64-bit word i holds 2*i in the
    # low 32 bits and 2*i+1 in the high 32 bits
    mem = [(i * 2) | ((i * 2 + 1) << 32) for i in range(512)]

    test_dcache(mem, dcache_sim, "")
    # the random test starts with no initial memory contents given
    test_dcache(None, dcache_random_sim, "random")
1849