src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 """
   6
   7 from enum import Enum, unique
   8
   9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  10 from nmutil.util import Display
  11
  12 from random import randint
  13
  14 from nmigen.cli import main
  15 from nmutil.iocontrol import RecordObject
  16 from nmigen.utils import log2_int
  17 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  18                                      DCacheToLoadStore1Type,
  19                                      MMUToDCacheType,
  20                                      DCacheToMMUType)
  21
  22 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  23                                 WBAddrType, WBDataType, WBSelType,
  24                                 WBMasterOut, WBSlaveOut,
  25                                 WBMasterOutVector, WBSlaveOutVector,
  26                                 WBIOMasterOut, WBIOSlaveOut)
  27
  28 from soc.experiment.cache_ram import CacheRam
  29 from soc.experiment.plru import PLRU
  30
  31 # for test
  32 from nmigen_soc.wishbone.sram import SRAM
  33 from nmigen import Memory
  34 from nmigen.cli import rtlil
  35 if True:
  36     from nmigen.back.pysim import Simulator, Delay, Settle
  37 else:
  38     from nmigen.sim.cxxsim import Simulator, Delay, Settle
  39 from nmutil.util import wrap
  40
  41
# TODO: make these parameters of DCache at some point
# Tunable cache / TLB geometry.  Everything below is derived from these.
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection
  50
# BRAM organisation: we never access more than WB_DATA_BITS at a
# time, so to save resources we make the array only that wide and
# use consecutive indices to make up a cache "line".
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8;

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

# Report the derived geometry (executed at import time)
print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)
  72
# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to the next multiple of 8)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
 110
# Example of layout for 32 lines of 64 bytes:
# (the bit counts shown in the picture are for that example only; the
# actual values for the current settings are printed just below)
layout = """\
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS  (3)
  ..         |     |--- - --| LINE_OFF_BITS (6)
  ..         |         |- --| ROW_OFF_BITS  (3)
  ..         |----- ---|    | ROW_BITS      (8)
  ..         |-----|        | INDEX_BITS    (5)
  .. --------|              | TAG_BITS      (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
# start-end bit positions, matching the slices used by get_index / get_tag
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
# NOTE(review): get_row() slices [ROW_OFF_BITS:SET_SIZE_BITS]; this prints
# LINE_OFF_BITS-ROW_OFF_BITS, which does not match that slice -- confirm
# whether the row-within-line range (printed high-low) was intended.
print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

# Width of one tag RAM row: one TAG_WIDTH-bit slot per way
TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 131
 132 def CacheTagArray():
 133     return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
 134                         for x in range(NUM_LINES))
 135
 136 def CacheValidBitsArray():
 137     return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
 138                         for x in range(NUM_LINES))
 139
 140 def RowPerLineValidArray():
 141     return Array(Signal(name="rows_valid%d" % x) \
 142                         for x in range(ROW_PER_LINE))
 143
# L1 TLB
# Derived DTLB geometry: bits to select a set / a way, the width of the
# effective-address tag (what is left of a 64-bit address above the page
# offset and set index), and the widths of one full set of tags / PTEs.
TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS     = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 151
 152 def ispow2(x):
 153     return (1<<log2_int(x, False)) == x
 154
# Sanity checks on the configured geometry (run at import time)
assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
# the address bit fields must tile the real address exactly
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
         "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
# the line index is sliced from the untranslated request address, so all
# index bits must lie inside the page offset
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 168
 169
 170 def TLBValidBitsArray():
 171     return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
 172
 173 def TLBTagEAArray():
 174     return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
 175
 176 def TLBTagsArray():
 177     return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
 178
 179 def TLBPtesArray():
 180     return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
 181
 182 def HitWaySet():
 183     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 184                         for x in range(TLB_NUM_WAYS))
 185
 186 # Cache RAM interface
 187 def CacheRamOut():
 188     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 189                  for x in range(NUM_WAYS))
 190
 191 # PLRU output interface
 192 def PLRUOut():
 193     return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
 194
 195 # TLB PLRU output interface
 196 def TLBPLRUOut():
 197     return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
 198
 199 # Helper functions to decode incoming requests
 200 #
 201 # Return the cache line index (tag index) for an address
 202 def get_index(addr):
 203     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 204
 205 # Return the cache row index (data memory) for an address
 206 def get_row(addr):
 207     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 208
 209 # Return the index of a row within a line
 210 def get_row_of_line(row):
 211     return row[:ROW_BITS][:ROW_LINE_BITS]
 212
 213 # Returns whether this is the last row of a line
 214 def is_last_row_addr(addr, last):
 215     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 216
 217 # Returns whether this is the last row of a line
 218 def is_last_row(row, last):
 219     return get_row_of_line(row) == last
 220
 221 # Return the next row in the current cache line. We use a
 222 # dedicated function in order to limit the size of the
 223 # generated adder to be only the bits within a cache line
 224 # (3 bits with default settings)
 225 def next_row(row):
 226     row_v = row[0:ROW_LINE_BITS] + 1
 227     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 228
 229 # Get the tag value from the address
 230 def get_tag(addr):
 231     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 232
 233 # Read a tag from a tag memory row
 234 def read_tag(way, tagset):
 235     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 236
 237 # Read a TLB tag from a TLB tag memory row
 238 def read_tlb_tag(way, tags):
 239     return tags.word_select(way, TLB_EA_TAG_BITS)
 240
 241 # Write a TLB tag to a TLB tag memory row
 242 def write_tlb_tag(way, tags, tag):
 243     return read_tlb_tag(way, tags).eq(tag)
 244
 245 # Read a PTE from a TLB PTE memory row
 246 def read_tlb_pte(way, ptes):
 247     return ptes.word_select(way, TLB_PTE_BITS)
 248
 249 def write_tlb_pte(way, ptes, newpte):
 250     return read_tlb_pte(way, ptes).eq(newpte)
 251
 252
 253 # Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission and attribute bits taken from a PTE.

    Six single-bit fields; see extract_perm_attr() for the PTE bit each
    one is read from.
    NOTE(review): field declaration order presumably defines the record
    layout -- confirm against RecordObject before reordering.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()
 263
 264
 265 def extract_perm_attr(pte):
 266     pa = PermAttr()
 267     pa.reference = pte[8]
 268     pa.changed   = pte[7]
 269     pa.nocache   = pte[5]
 270     pa.priv      = pte[3]
 271     pa.rd_perm   = pte[2]
 272     pa.wr_perm   = pte[1]
 273     return pa;
 274
 275
 276 # Type of operation on a "valid" input
 277 @unique
 278 class Op(Enum):
 279     OP_NONE       = 0
 280     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 281     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 282     OP_LOAD_HIT   = 3 # Cache hit on load
 283     OP_LOAD_MISS  = 4 # Load missing cache
 284     OP_LOAD_NC    = 5 # Non-cachable load
 285     OP_STORE_HIT  = 6 # Store hitting cache
 286     OP_STORE_MISS = 7 # Store missing cache
 287
 288
 289 # Cache state machine
 290 @unique
 291 class State(Enum):
 292     IDLE             = 0 # Normal load hit processing
 293     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 294     STORE_WAIT_ACK   = 2 # Store wait ack
 295     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 296
 297
 298 # Dcache operations:
 299 #
 300 # In order to make timing, we use the BRAMs with
 301 # an output buffer, which means that the BRAM
 302 # output is delayed by an extra cycle.
 303 #
 304 # Thus, the dcache has a 2-stage internal pipeline
 305 # for cache hits with no stalls.
 306 #
 307 # All other operations are handled via stalling
 308 # in the first stage.
 309 #
 310 # The second stage can thus complete a hit at the same
 311 # time as the first stage emits a stall for a complex op.
 312 #
 313 # Stage 0 register, basically contains just the latched request
 314
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus TLB-operation flags.

    NOTE(review): field declaration order presumably defines the record
    layout -- confirm against RecordObject before reordering.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # the request itself, in load/store-unit format
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        # TLB operation flags; stage_0 copies these from the MMU port and
        # clears them for load/store requests
        self.tlbie   = Signal()
        self.doall   = Signal()
        self.tlbld   = Signal()
        self.mmu_req = Signal() # indicates source of request
 323
 324
class MemAccessRequest(RecordObject):
    """A decoded memory access request (held in RegStage1.req).

    NOTE(review): field declaration order presumably defines the record
    layout -- confirm against RecordObject before reordering.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)             # operation, see Op
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)
        self.hit_way   = Signal(WAY_BITS)
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 337
 338
 339 # First stage register, contains state for stage 1 of load hits
 340 # and for the state machine used by all other operations
 341 class RegStage1(RecordObject):
 342     def __init__(self, name=None):
 343         super().__init__(name=name)
 344         # Info about the request
 345         self.full             = Signal() # have uncompleted request
 346         self.mmu_req          = Signal() # request is from MMU
 347         self.req              = MemAccessRequest(name="reqmem")
 348
 349         # Cache hit state
 350         self.hit_way          = Signal(WAY_BITS)
 351         self.hit_load_valid   = Signal()
 352         self.hit_index        = Signal(INDEX_BITS)
 353         self.cache_hit        = Signal()
 354
 355         # TLB hit state
 356         self.tlb_hit          = Signal()
 357         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 358         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 359
 360         # 2-stage data buffer for data forwarded from writes to reads
 361         self.forward_data1    = Signal(64)
 362         self.forward_data2    = Signal(64)
 363         self.forward_sel1     = Signal(8)
 364         self.forward_valid1   = Signal()
 365         self.forward_way1     = Signal(WAY_BITS)
 366         self.forward_row1     = Signal(ROW_BITS)
 367         self.use_forward1     = Signal()
 368         self.forward_sel      = Signal(8)
 369
 370         # Cache miss state (reload state machine)
 371         self.state            = Signal(State)
 372         self.dcbz             = Signal()
 373         self.write_bram       = Signal()
 374         self.write_tag        = Signal()
 375         self.slow_valid       = Signal()
 376         self.real_adr         = Signal(REAL_ADDR_BITS)
 377         self.wb               = WBMasterOut("wb")
 378         self.reload_tag       = Signal(TAG_BITS)
 379         self.store_way        = Signal(WAY_BITS)
 380         self.store_row        = Signal(ROW_BITS)
 381         self.store_index      = Signal(INDEX_BITS)
 382         self.end_row_ix       = Signal(ROW_LINE_BITS)
 383         self.rows_valid       = RowPerLineValidArray()
 384         self.acks_pending     = Signal(3)
 385         self.inc_acks         = Signal()
 386         self.dec_acks         = Signal()
 387
 388         # Signals to complete (possibly with error)
 389         self.ls_valid         = Signal()
 390         self.ls_error         = Signal()
 391         self.mmu_done         = Signal()
 392         self.mmu_error        = Signal()
 393         self.cache_paradox    = Signal()
 394
 395         # Signal to complete a failed stcx.
 396         self.stcx_fail        = Signal()
 397
 398
 399 # Reservation information
class Reservation(RecordObject):
    """Reservation information: a valid flag and an address.

    NOTE(review): addr is 64 - LINE_OFF_BITS wide, presumably the
    address with the in-line offset bits dropped -- confirm at the
    point of use.
    """
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr  = Signal(64-LINE_OFF_BITS)
 405
 406
 407 class DTLBUpdate(Elaboratable):
 408     def __init__(self):
 409         self.tlbie    = Signal()
 410         self.tlbwe    = Signal()
 411         self.doall    = Signal()
 412         self.updated  = Signal()
 413         self.v_updated  = Signal()
 414         self.tlb_hit    = Signal()
 415         self.tlb_req_index = Signal(TLB_SET_BITS)
 416
 417         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
 418         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 419         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 420         self.repl_way        = Signal(TLB_WAY_BITS)
 421         self.eatag           = Signal(TLB_EA_TAG_BITS)
 422         self.pte_data        = Signal(TLB_PTE_BITS)
 423
 424         self.dv = Signal(TLB_PTE_WAY_BITS)
 425
 426         self.tb_out = Signal(TLB_TAG_WAY_BITS)
 427         self.pb_out = Signal(TLB_NUM_WAYS)
 428         self.db_out = Signal(TLB_PTE_WAY_BITS)
 429
 430     def elaborate(self, platform):
 431         m = Module()
 432         comb = m.d.comb
 433         sync = m.d.sync
 434
 435         tagset   = Signal(TLB_TAG_WAY_BITS)
 436         pteset   = Signal(TLB_PTE_WAY_BITS)
 437
 438         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 439
 440         with m.If(self.tlbie & self.doall):
 441             pass # clear all back in parent
 442         with m.Elif(self.tlbie):
 443             with m.If(self.tlb_hit):
 444                 comb += db_out.eq(self.dv)
 445                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
 446                 comb += self.v_updated.eq(1)
 447
 448         with m.Elif(self.tlbwe):
 449
 450             comb += tagset.eq(self.tlb_tag_way)
 451             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 452             comb += tb_out.eq(tagset)
 453
 454             comb += pteset.eq(self.tlb_pte_way)
 455             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 456             comb += pb_out.eq(pteset)
 457
 458             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 459
 460             comb += self.updated.eq(1)
 461             comb += self.v_updated.eq(1)
 462
 463         return m
 464
 465
class DCachePendingHit(Elaboratable):
    """Combinatorial cache hit detection for the pending request.

    Signals created here and meant to be driven by the parent:
    go, virt_mode, tlb_hit, reload_tag (req_index is declared but not
    used inside this module).
    Signals driven here: is_hit, hit_way, rel_match, and the entries of
    the externally supplied hit_set array (virtual mode only).

    Constructor arguments are signals owned by the parent and are only
    read here, except hit_set which is driven.
    """

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                      cache_valid_idx, cache_tag_set,
                    req_addr,
                    hit_set):

        self.go          = Signal()
        self.virt_mode   = Signal()
        self.is_hit      = Signal()
        self.tlb_hit     = Signal()
        self.hit_way     = Signal(WAY_BITS)
        self.rel_match   = Signal()
        self.req_index   = Signal(INDEX_BITS)
        self.reload_tag  = Signal(TAG_BITS)

        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        # per-TLB-way intermediate results, selected by tlb_hit_way below
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS):
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal()
                s_pte       = Signal(TLB_PTE_BITS)
                s_ra        = Signal(REAL_ADDR_BITS)
                # candidate real address for TLB way j: page offset from
                # the request, upper bits from that way's PTE
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                # compare the candidate tag against every cache way
                for i in range(NUM_WAYS):
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                # does this candidate match the line being reloaded?
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            # outputs are only driven when the TLB actually hit; pick the
            # results belonging to the TLB way that hit
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            # real mode: the tag comes straight from the request address
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS):
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
 561
 562
 563 class DCache(Elaboratable):
 564     """Set associative dcache write-through
 565     TODO (in no specific order):
 566     * See list in icache.vhdl
 567     * Complete load misses on the cycle when WB data comes instead of
 568       at the end of line (this requires dealing with requests coming in
 569       while not idle...)
 570     """
 571     def __init__(self):
 572         self.d_in      = LoadStore1ToDCacheType("d_in")
 573         self.d_out     = DCacheToLoadStore1Type("d_out")
 574
 575         self.m_in      = MMUToDCacheType("m_in")
 576         self.m_out     = DCacheToMMUType("m_out")
 577
 578         self.stall_out = Signal()
 579
 580         self.wb_out    = WBMasterOut()
 581         self.wb_in     = WBSlaveOut()
 582
 583         self.log_out   = Signal(20)
 584
 585     def stage_0(self, m, r0, r1, r0_full):
 586         """Latch the request in r0.req as long as we're not stalling
 587         """
 588         comb = m.d.comb
 589         sync = m.d.sync
 590         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 591
 592         r = RegStage0("stage0")
 593
 594         # TODO, this goes in unit tests and formal proofs
 595         with m.If(d_in.valid & m_in.valid):
 596             sync += Display("request collision loadstore vs MMU")
 597
 598         with m.If(m_in.valid):
 599             sync += r.req.valid.eq(1)
 600             sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
 601             sync += r.req.dcbz.eq(0)
 602             sync += r.req.nc.eq(0)
 603             sync += r.req.reserve.eq(0)
 604             sync += r.req.virt_mode.eq(1)
 605             sync += r.req.priv_mode.eq(1)
 606             sync += r.req.addr.eq(m_in.addr)
 607             sync += r.req.data.eq(m_in.pte)
 608             sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 609             sync += r.tlbie.eq(m_in.tlbie)
 610             sync += r.doall.eq(m_in.doall)
 611             sync += r.tlbld.eq(m_in.tlbld)
 612             sync += r.mmu_req.eq(1)
 613         with m.Else():
 614             sync += r.req.eq(d_in)
 615             sync += r.tlbie.eq(0)
 616             sync += r.doall.eq(0)
 617             sync += r.tlbld.eq(0)
 618             sync += r.mmu_req.eq(0)
 619             with m.If(~(r1.full & r0_full)):
 620                 sync += r0.eq(r)
 621                 sync += r0_full.eq(r.req.valid)
 622
    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.

        The set index is taken directly from the incoming address
        (m_in.addr when m_in.valid, otherwise d_in.addr), bits
        TLB_LG_PGSZ up to TLB_LG_PGSZ + TLB_SET_BITS.  The valid bits,
        tags and PTEs of that set are registered into tlb_valid_way,
        tlb_tag_way and tlb_pte_way, unless r0_stall is set.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index    = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        # bit range of the TLB set index within the address
        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])
 652
 653     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 654         """Generate TLB PLRUs
 655         """
 656         comb = m.d.comb
 657         sync = m.d.sync
 658
 659         if TLB_NUM_WAYS == 0:
 660             return
 661         for i in range(TLB_SET_SIZE):
 662             # TLB PLRU interface
 663             tlb_plru        = PLRU(WAY_BITS)
 664             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 665             tlb_plru_acc_en = Signal()
 666
 667             comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
 668             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 669             comb += tlb_plru.acc.eq(r1.tlb_hit_way)
 670             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 671
 672     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 673                    tlb_valid_way, tlb_tag_way, tlb_hit_way,
 674                    tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
 675
 676         comb = m.d.comb
 677         sync = m.d.sync
 678
 679         hitway = Signal(TLB_WAY_BITS)
 680         hit    = Signal()
 681         eatag  = Signal(TLB_EA_TAG_BITS)
 682
 683         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 684         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 685         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 686
 687         for i in range(TLB_NUM_WAYS):
 688             is_tag_hit = Signal()
 689             comb += is_tag_hit.eq(tlb_valid_way[i]
 690                                   & read_tlb_tag(i, tlb_tag_way) == eatag)
 691             with m.If(is_tag_hit):
 692                 comb += hitway.eq(i)
 693                 comb += hit.eq(1)
 694
 695         comb += tlb_hit.eq(hit & r0_valid)
 696         comb += tlb_hit_way.eq(hitway)
 697
 698         with m.If(tlb_hit):
 699             comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
 700         with m.Else():
 701             comb += pte.eq(0)
 702         comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
 703         with m.If(r0.req.virt_mode):
 704             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 705                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 706                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 707             comb += perm_attr.eq(extract_perm_attr(pte))
 708         with m.Else():
 709             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 710                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 711
 712             comb += perm_attr.reference.eq(1)
 713             comb += perm_attr.changed.eq(1)
 714             comb += perm_attr.nocache.eq(0)
 715             comb += perm_attr.priv.eq(1)
 716             comb += perm_attr.rd_perm.eq(1)
 717             comb += perm_attr.wr_perm.eq(1)
 718
    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                    tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                    dtlb_tags, tlb_pte_way, dtlb_ptes):
        """Apply TLB invalidate / load operations to the DTLB arrays.

        A DTLBUpdate submodule ("tlb_update") computes the new contents
        of the addressed set; this method feeds it and registers its
        outputs into dtlb_tags, dtlb_ptes and dtlb_valid_bits at
        tlb_req_index.  An invalidate-all (tlbie with r0.doall) clears
        every entry of dtlb_valid_bits here instead.
        """

        comb = m.d.comb
        sync = m.d.sync

        tlbie    = Signal()
        tlbwe    = Signal()

        # the operations only take effect for a valid request
        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        m.submodules.tlb_update = d = DTLBUpdate()
        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid_bits[i].eq(0)
        # write back whatever the submodule flags as changed
        with m.If(d.updated):
            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
        with m.If(d.v_updated):
            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)

        # current valid bits of the addressed set
        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_hit_way.eq(tlb_hit_way)
        comb += d.tlb_tag_way.eq(tlb_tag_way)
        comb += d.tlb_pte_way.eq(tlb_pte_way)
        comb += d.tlb_req_index.eq(tlb_req_index)

        # way to write: the hit way if present, else the PLRU victim
        with m.If(tlb_hit):
            comb += d.repl_way.eq(tlb_hit_way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)
 760
 761     def maybe_plrus(self, m, r1, plru_victim):
 762         """Generate PLRUs
 763         """
 764         comb = m.d.comb
 765         sync = m.d.sync
 766
 767         if TLB_NUM_WAYS == 0:
 768             return
 769
 770         for i in range(NUM_LINES):
 771             # PLRU interface
 772             plru        = PLRU(WAY_BITS)
 773             setattr(m.submodules, "plru%d" % i, plru)
 774             plru_acc_en = Signal()
 775
 776             comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
 777             comb += plru.acc_en.eq(plru_acc_en)
 778             comb += plru.acc.eq(r1.hit_way)
 779             comb += plru_victim[i].eq(plru.lru_o)
 780
    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port

        Registers cache_tags[index] into cache_tag_set every cycle.
        The index is req_index while r0_stall is set; otherwise it is
        the line index of the incoming address (m_in.addr when
        m_in.valid, else d_in.addr).
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            # stalled: keep reading the line of the current request
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])
 797
 798     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 799                        r0_valid, r1, cache_valids, replace_way,
 800                        use_forward1_next, use_forward2_next,
 801                        req_hit_way, plru_victim, rc_ok, perm_attr,
 802                        valid_ra, perm_ok, access_ok, req_op, req_go,
 803                        tlb_pte_way,
 804                        tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
 805                        cancel_store, req_same_tag, r0_stall, early_req_row):
 806         """Cache request parsing and hit detection
 807         """
 808
 809         comb = m.d.comb
 810         sync = m.d.sync
 811         m_in, d_in = self.m_in, self.d_in
 812
 813         is_hit      = Signal()
 814         hit_way     = Signal(WAY_BITS)
 815         op          = Signal(Op)
 816         opsel       = Signal(3)
 817         go          = Signal()
 818         nc          = Signal()
 819         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 820                                   for i in range(TLB_NUM_WAYS))
 821         cache_valid_idx = Signal(NUM_WAYS)
 822
 823         # Extract line, row and tag from request
 824         comb += req_index.eq(get_index(r0.req.addr))
 825         comb += req_row.eq(get_row(r0.req.addr))
 826         comb += req_tag.eq(get_tag(ra))
 827
 828         if False: # display on comb is a bit... busy.
 829             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
 830                     r0.req.addr, ra, req_index, req_tag, req_row)
 831
 832         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 833         comb += cache_valid_idx.eq(cache_valids[req_index])
 834
 835         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
 836                                 tlb_valid_way, tlb_hit_way,
 837                                 cache_valid_idx, cache_tag_set,
 838                                 r0.req.addr,
 839                                 hit_set)
 840
 841         comb += dc.tlb_hit.eq(tlb_hit)
 842         comb += dc.reload_tag.eq(r1.reload_tag)
 843         comb += dc.virt_mode.eq(r0.req.virt_mode)
 844         comb += dc.go.eq(go)
 845         comb += dc.req_index.eq(req_index)
 846         comb += is_hit.eq(dc.is_hit)
 847         comb += hit_way.eq(dc.hit_way)
 848         comb += req_same_tag.eq(dc.rel_match)
 849
 850         # See if the request matches the line currently being reloaded
 851         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 852                   (req_index == r1.store_index) & req_same_tag):
 853             # For a store, consider this a hit even if the row isn't
 854             # valid since it will be by the time we perform the store.
 855             # For a load, check the appropriate row valid bit.
 856             rrow = Signal(ROW_LINE_BITS)
 857             comb += rrow.eq(req_row)
 858             valid = r1.rows_valid[rrow]
 859             comb += is_hit.eq(~r0.req.load | valid)
 860             comb += hit_way.eq(replace_way)
 861
 862         # Whether to use forwarded data for a load or not
 863         with m.If((get_row(r1.req.real_addr) == req_row) &
 864                   (r1.req.hit_way == hit_way)):
 865             # Only need to consider r1.write_bram here, since if we
 866             # are writing refill data here, then we don't have a
 867             # cache hit this cycle on the line being refilled.
 868             # (There is the possibility that the load following the
 869             # load miss that started the refill could be to the old
 870             # contents of the victim line, since it is a couple of
 871             # cycles after the refill starts before we see the updated
 872             # cache tag. In that case we don't use the bypass.)
 873             comb += use_forward1_next.eq(r1.write_bram)
 874         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 875             comb += use_forward2_next.eq(r1.forward_valid1)
 876
 877         # The way that matched on a hit
 878         comb += req_hit_way.eq(hit_way)
 879
 880         # The way to replace on a miss
 881         with m.If(r1.write_tag):
 882             comb += replace_way.eq(plru_victim[r1.store_index])
 883         with m.Else():
 884             comb += replace_way.eq(r1.store_way)
 885
 886         # work out whether we have permission for this access
 887         # NB we don't yet implement AMR, thus no KUAP
 888         comb += rc_ok.eq(perm_attr.reference
 889                          & (r0.req.load | perm_attr.changed)
 890                 )
 891         comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
 892                            (perm_attr.wr_perm |
 893                               (r0.req.load & perm_attr.rd_perm)))
 894         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
 895         # Combine the request and cache hit status to decide what
 896         # operation needs to be done
 897         comb += nc.eq(r0.req.nc | perm_attr.nocache)
 898         comb += op.eq(Op.OP_NONE)
 899         with m.If(go):
 900             with m.If(~access_ok):
 901                 comb += op.eq(Op.OP_BAD)
 902             with m.Elif(cancel_store):
 903                 comb += op.eq(Op.OP_STCX_FAIL)
 904             with m.Else():
 905                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
 906                 with m.Switch(opsel):
 907                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
 908                     with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
 909                     with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
 910                     with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
 911                     with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
 912                     with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
 913                     with m.Case(0b011): comb += op.eq(Op.OP_BAD)
 914                     with m.Case(0b111): comb += op.eq(Op.OP_BAD)
 915         comb += req_op.eq(op)
 916         comb += req_go.eq(go)
 917
 918         # Version of the row number that is valid one cycle earlier
 919         # in the cases where we need to read the cache data BRAM.
 920         # If we're stalling then we need to keep reading the last
 921         # row requested.
 922         with m.If(~r0_stall):
 923             with m.If(m_in.valid):
 924                 comb += early_req_row.eq(get_row(m_in.addr))
 925             with m.Else():
 926                 comb += early_req_row.eq(get_row(d_in.addr))
 927         with m.Else():
 928             comb += early_req_row.eq(req_row)
 929
 930     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 931                          r0_valid, r0, reservation):
 932         """Handle load-with-reservation and store-conditional instructions
 933         """
 934         comb = m.d.comb
 935         sync = m.d.sync
 936
 937         with m.If(r0_valid & r0.req.reserve):
 938             # XXX generate alignment interrupt if address
 939             # is not aligned XXX or if r0.req.nc = '1'
 940             with m.If(r0.req.load):
 941                 comb += set_rsrv.eq(1) # load with reservation
 942             with m.Else():
 943                 comb += clear_rsrv.eq(1) # store conditional
 944                 with m.If(~reservation.valid |
 945                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
 946                     comb += cancel_store.eq(1)
 947
 948     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
 949                         reservation, r0):
 950
 951         comb = m.d.comb
 952         sync = m.d.sync
 953
 954         with m.If(r0_valid & access_ok):
 955             with m.If(clear_rsrv):
 956                 sync += reservation.valid.eq(0)
 957             with m.Elif(set_rsrv):
 958                 sync += reservation.valid.eq(1)
 959                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 960
 961     def writeback_control(self, m, r1, cache_out):
 962         """Return data for loads & completion control logic
 963         """
 964         comb = m.d.comb
 965         sync = m.d.sync
 966         d_out, m_out = self.d_out, self.m_out
 967
 968         data_out = Signal(64)
 969         data_fwd = Signal(64)
 970
 971         # Use the bypass if are reading the row that was
 972         # written 1 or 2 cycles ago, including for the
 973         # slow_valid = 1 case (i.e. completing a load
 974         # miss or a non-cacheable load).
 975         with m.If(r1.use_forward1):
 976             comb += data_fwd.eq(r1.forward_data1)
 977         with m.Else():
 978             comb += data_fwd.eq(r1.forward_data2)
 979
 980         comb += data_out.eq(cache_out[r1.hit_way])
 981
 982         for i in range(8):
 983             with m.If(r1.forward_sel[i]):
 984                 dsel = data_fwd.word_select(i, 8)
 985                 comb += data_out.word_select(i, 8).eq(dsel)
 986
 987         comb += d_out.valid.eq(r1.ls_valid)
 988         comb += d_out.data.eq(data_out)
 989         comb += d_out.store_done.eq(~r1.stcx_fail)
 990         comb += d_out.error.eq(r1.ls_error)
 991         comb += d_out.cache_paradox.eq(r1.cache_paradox)
 992
 993         # Outputs to MMU
 994         comb += m_out.done.eq(r1.mmu_done)
 995         comb += m_out.err.eq(r1.mmu_error)
 996         comb += m_out.data.eq(data_out)
 997
 998         # We have a valid load or store hit or we just completed
 999         # a slow op such as a load miss, a NC load or a store
1000         #
1001         # Note: the load hit is delayed by one cycle. However it
1002         # can still not collide with r.slow_valid (well unless I
1003         # miscalculated) because slow_valid can only be set on a
1004         # subsequent request and not on its first cycle (the state
1005         # machine must have advanced), which makes slow_valid
1006         # at least 2 cycles from the previous hit_load_valid.
1007
1008         # Sanity: Only one of these must be set in any given cycle
1009
1010         if False: # TODO: need Display to get this to work
1011             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1012             "unexpected slow_valid collision with stcx_fail"
1013
1014             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1015              "unexpected hit_load_delayed collision with slow_valid"
1016
1017         with m.If(~r1.mmu_req):
1018             # Request came from loadstore1...
1019             # Load hit case is the standard path
1020             with m.If(r1.hit_load_valid):
1021                 sync += Display("completing load hit data=%x", data_out)
1022
1023             # error cases complete without stalling
1024             with m.If(r1.ls_error):
1025                 sync += Display("completing ld/st with error")
1026
1027             # Slow ops (load miss, NC, stores)
1028             with m.If(r1.slow_valid):
1029                 sync += Display("completing store or load miss data=%x",
1030                                 data_out)
1031
1032         with m.Else():
1033             # Request came from MMU
1034             with m.If(r1.hit_load_valid):
1035                 sync += Display("completing load hit to MMU, data=%x",
1036                                 m_out.data)
1037             # error cases complete without stalling
1038             with m.If(r1.mmu_error):
1039                 sync += Display("combpleting MMU ld with error")
1040
1041             # Slow ops (i.e. load miss)
1042             with m.If(r1.slow_valid):
1043                 sync += Display("completing MMU load miss, data=%x",
1044                                 m_out.data)
1045
1046     def rams(self, m, r1, early_req_row, cache_out, replace_way):
1047         """rams
1048         Generate a cache RAM for each way. This handles the normal
1049         reads, writes from reloads and the special store-hit update
1050         path as well.
1051
1052         Note: the BRAMs have an extra read buffer, meaning the output
1053         is pipelined an extra cycle. This differs from the
1054         icache. The writeback logic needs to take that into
1055         account by using 1-cycle delayed signals for load hits.
1056         """
1057         comb = m.d.comb
1058         wb_in = self.wb_in
1059
1060         for i in range(NUM_WAYS):
1061             do_read  = Signal(name="do_rd%d" % i)
1062             rd_addr  = Signal(ROW_BITS)
1063             do_write = Signal(name="do_wr%d" % i)
1064             wr_addr  = Signal(ROW_BITS)
1065             wr_data  = Signal(WB_DATA_BITS)
1066             wr_sel   = Signal(ROW_SIZE)
1067             wr_sel_m = Signal(ROW_SIZE)
1068             _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i)
1069
1070             way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
1071             setattr(m.submodules, "cacheram_%d" % i, way)
1072
1073             comb += way.rd_en.eq(do_read)
1074             comb += way.rd_addr.eq(rd_addr)
1075             comb += _d_out.eq(way.rd_data_o)
1076             comb += way.wr_sel.eq(wr_sel_m)
1077             comb += way.wr_addr.eq(wr_addr)
1078             comb += way.wr_data.eq(wr_data)
1079
1080             # Cache hit reads
1081             comb += do_read.eq(1)
1082             comb += rd_addr.eq(early_req_row[:ROW_BITS])
1083             comb += cache_out[i].eq(_d_out)
1084
1085             # Write mux:
1086             #
1087             # Defaults to wishbone read responses (cache refill)
1088             #
1089             # For timing, the mux on wr_data/sel/addr is not
1090             # dependent on anything other than the current state.
1091
1092             with m.If(r1.write_bram):
1093                 # Write store data to BRAM.  This happens one
1094                 # cycle after the store is in r0.
1095                 comb += wr_data.eq(r1.req.data)
1096                 comb += wr_sel.eq(r1.req.byte_sel)
1097                 comb += wr_addr.eq(get_row(r1.req.real_addr))
1098
1099                 with m.If(i == r1.req.hit_way):
1100                     comb += do_write.eq(1)
1101             with m.Else():
1102                 # Otherwise, we might be doing a reload or a DCBZ
1103                 with m.If(r1.dcbz):
1104                     comb += wr_data.eq(0)
1105                 with m.Else():
1106                     comb += wr_data.eq(wb_in.dat)
1107                 comb += wr_addr.eq(r1.store_row)
1108                 comb += wr_sel.eq(~0) # all 1s
1109
1110             with m.If((r1.state == State.RELOAD_WAIT_ACK)
1111                       & wb_in.ack & (replace_way == i)):
1112                 comb += do_write.eq(1)
1113
1114             # Mask write selects with do_write since BRAM
1115             # doesn't have a global write-enable
1116             with m.If(do_write):
1117                 comb += wr_sel_m.eq(wr_sel)
1118
1119     # Cache hit synchronous machine for the easy case.
1120     # This handles load hits.
1121     # It also handles error cases (TLB miss, cache paradox)
1122     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1123                         req_hit_way, req_index, req_tag, access_ok,
1124                         tlb_hit, tlb_hit_way, tlb_req_index):
1125
1126         comb = m.d.comb
1127         sync = m.d.sync
1128
1129         with m.If(req_op != Op.OP_NONE):
1130             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1131                     req_op, r0.req.addr, r0.req.nc,
1132                     req_index, req_tag, req_hit_way)
1133
1134         with m.If(r0_valid):
1135             sync += r1.mmu_req.eq(r0.mmu_req)
1136
1137         # Fast path for load/store hits.
1138         # Set signals for the writeback controls.
1139         sync += r1.hit_way.eq(req_hit_way)
1140         sync += r1.hit_index.eq(req_index)
1141
1142         with m.If(req_op == Op.OP_LOAD_HIT):
1143             sync += r1.hit_load_valid.eq(1)
1144         with m.Else():
1145             sync += r1.hit_load_valid.eq(0)
1146
1147         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1148             sync += r1.cache_hit.eq(1)
1149         with m.Else():
1150             sync += r1.cache_hit.eq(0)
1151
1152         with m.If(req_op == Op.OP_BAD):
1153             # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1154             #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
1155             sync += r1.ls_error.eq(~r0.mmu_req)
1156             sync += r1.mmu_error.eq(r0.mmu_req)
1157             sync += r1.cache_paradox.eq(access_ok)
1158
1159             with m.Else():
1160                 sync += r1.ls_error.eq(0)
1161                 sync += r1.mmu_error.eq(0)
1162                 sync += r1.cache_paradox.eq(0)
1163
1164         with m.If(req_op == Op.OP_STCX_FAIL):
1165             r1.stcx_fail.eq(1)
1166         with m.Else():
1167             sync += r1.stcx_fail.eq(0)
1168
1169         # Record TLB hit information for updating TLB PLRU
1170         sync += r1.tlb_hit.eq(tlb_hit)
1171         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1172         sync += r1.tlb_hit_index.eq(tlb_req_index)
1173
1174     # Memory accesses are handled by this state machine:
1175     #
1176     #   * Cache load miss/reload (in conjunction with "rams")
1177     #   * Load hits for non-cachable forms
1178     #   * Stores (the collision case is handled in "rams")
1179     #
1180     # All wishbone requests generation is done here.
1181     # This machine operates at stage 1.
1182     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1183                     cache_valids, r0, replace_way,
1184                     req_hit_way, req_same_tag,
1185                     r0_valid, req_op, cache_tags, req_go, ra):
1186
1187         comb = m.d.comb
1188         sync = m.d.sync
1189         wb_in = self.wb_in
1190
1191         req         = MemAccessRequest("mreq_ds")
1192         acks        = Signal(3)
1193         adjust_acks = Signal(3)
1194
1195         req_row = Signal(ROW_BITS)
1196         req_idx = Signal(INDEX_BITS)
1197         req_tag = Signal(TAG_BITS)
1198         comb += req_idx.eq(get_index(req.real_addr))
1199         comb += req_row.eq(get_row(req.real_addr))
1200         comb += req_tag.eq(get_tag(req.real_addr))
1201
1202         sync += r1.use_forward1.eq(use_forward1_next)
1203         sync += r1.forward_sel.eq(0)
1204
1205         with m.If(use_forward1_next):
1206             sync += r1.forward_sel.eq(r1.req.byte_sel)
1207         with m.Elif(use_forward2_next):
1208             sync += r1.forward_sel.eq(r1.forward_sel1)
1209
1210         sync += r1.forward_data2.eq(r1.forward_data1)
1211         with m.If(r1.write_bram):
1212             sync += r1.forward_data1.eq(r1.req.data)
1213             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1214             sync += r1.forward_way1.eq(r1.req.hit_way)
1215             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1216             sync += r1.forward_valid1.eq(1)
1217         with m.Else():
1218             with m.If(r1.dcbz):
1219                 sync += r1.forward_data1.eq(0)
1220             with m.Else():
1221                 sync += r1.forward_data1.eq(wb_in.dat)
1222             sync += r1.forward_sel1.eq(~0) # all 1s
1223             sync += r1.forward_way1.eq(replace_way)
1224             sync += r1.forward_row1.eq(r1.store_row)
1225             sync += r1.forward_valid1.eq(0)
1226
1227         # One cycle pulses reset
1228         sync += r1.slow_valid.eq(0)
1229         sync += r1.write_bram.eq(0)
1230         sync += r1.inc_acks.eq(0)
1231         sync += r1.dec_acks.eq(0)
1232
1233         sync += r1.ls_valid.eq(0)
1234         # complete tlbies and TLB loads in the third cycle
1235         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1236
1237         with m.If((req_op == Op.OP_LOAD_HIT)
1238                   | (req_op == Op.OP_STCX_FAIL)):
1239             with m.If(~r0.mmu_req):
1240                 sync += r1.ls_valid.eq(1)
1241             with m.Else():
1242                 sync += r1.mmu_done.eq(1)
1243
1244         with m.If(r1.write_tag):
1245             # Store new tag in selected way
1246             for i in range(NUM_WAYS):
1247                 with m.If(i == replace_way):
1248                     ct = Signal(TAG_RAM_WIDTH)
1249                     comb += ct.eq(cache_tags[r1.store_index])
1250                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1251                     sync += cache_tags[r1.store_index].eq(ct)
1252             sync += r1.store_way.eq(replace_way)
1253             sync += r1.write_tag.eq(0)
1254
1255         # Take request from r1.req if there is one there,
1256         # else from req_op, ra, etc.
1257         with m.If(r1.full):
1258             comb += req.eq(r1.req)
1259         with m.Else():
1260             comb += req.op.eq(req_op)
1261             comb += req.valid.eq(req_go)
1262             comb += req.mmu_req.eq(r0.mmu_req)
1263             comb += req.dcbz.eq(r0.req.dcbz)
1264             comb += req.real_addr.eq(ra)
1265
1266             with m.If(~r0.req.dcbz):
1267                 comb += req.data.eq(r0.req.data)
1268             with m.Else():
1269                 comb += req.data.eq(0)
1270
1271             # Select all bytes for dcbz
1272             # and for cacheable loads
1273             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1274                 comb += req.byte_sel.eq(~0) # all 1s
1275             with m.Else():
1276                 comb += req.byte_sel.eq(r0.req.byte_sel)
1277             comb += req.hit_way.eq(req_hit_way)
1278             comb += req.same_tag.eq(req_same_tag)
1279
1280             # Store the incoming request from r0,
1281             # if it is a slow request
1282             # Note that r1.full = 1 implies req_op = OP_NONE
1283             with m.If((req_op == Op.OP_LOAD_MISS)
1284                       | (req_op == Op.OP_LOAD_NC)
1285                       | (req_op == Op.OP_STORE_MISS)
1286                       | (req_op == Op.OP_STORE_HIT)):
1287                 sync += r1.req.eq(req)
1288                 sync += r1.full.eq(1)
1289
1290         # Main state machine
1291         with m.Switch(r1.state):
1292
1293             with m.Case(State.IDLE):
1294                 sync += r1.real_adr.eq(req.real_addr)
1295                 sync += r1.wb.sel.eq(req.byte_sel)
1296                 sync += r1.wb.dat.eq(req.data)
1297                 sync += r1.dcbz.eq(req.dcbz)
1298
1299                 # Keep track of our index and way
1300                 # for subsequent stores.
1301                 sync += r1.store_index.eq(req_idx)
1302                 sync += r1.store_row.eq(req_row)
1303                 sync += r1.end_row_ix.eq(get_row_of_line(req_row))
1304                 sync += r1.reload_tag.eq(req_tag)
1305                 sync += r1.req.same_tag.eq(1)
1306
1307                 with m.If(req.op == Op.OP_STORE_HIT):
1308                     sync += r1.store_way.eq(req.hit_way)
1309
1310                 # Reset per-row valid bits,
1311                 # ready for handling OP_LOAD_MISS
1312                 for i in range(ROW_PER_LINE):
1313                     sync += r1.rows_valid[i].eq(0)
1314
1315                 with m.If(req_op != Op.OP_NONE):
1316                     sync += Display("cache op %d", req.op)
1317
1318                 with m.Switch(req.op):
1319                     with m.Case(Op.OP_LOAD_HIT):
1320                         # stay in IDLE state
1321                         pass
1322
1323                     with m.Case(Op.OP_LOAD_MISS):
1324                         sync += Display("cache miss real addr: %x " \
1325                                 "idx: %x tag: %x",
1326                                 req.real_addr, req_row, req_tag)
1327
1328                         # Start the wishbone cycle
1329                         sync += r1.wb.we.eq(0)
1330                         sync += r1.wb.cyc.eq(1)
1331                         sync += r1.wb.stb.eq(1)
1332
1333                         # Track that we had one request sent
1334                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1335                         sync += r1.write_tag.eq(1)
1336
1337                     with m.Case(Op.OP_LOAD_NC):
1338                         sync += r1.wb.cyc.eq(1)
1339                         sync += r1.wb.stb.eq(1)
1340                         sync += r1.wb.we.eq(0)
1341                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1342
1343                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1344                         with m.If(~req.dcbz):
1345                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1346                             sync += r1.acks_pending.eq(1)
1347                             sync += r1.full.eq(0)
1348                             sync += r1.slow_valid.eq(1)
1349
1350                             with m.If(~req.mmu_req):
1351                                 sync += r1.ls_valid.eq(1)
1352                             with m.Else():
1353                                 sync += r1.mmu_done.eq(1)
1354
1355                             with m.If(req.op == Op.OP_STORE_HIT):
1356                                 sync += r1.write_bram.eq(1)
1357                         with m.Else():
1358                             # dcbz is handled much like a load miss except
1359                             # that we are writing to memory instead of reading
1360                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1361
1362                             with m.If(req.op == Op.OP_STORE_MISS):
1363                                 sync += r1.write_tag.eq(1)
1364
1365                         sync += r1.wb.we.eq(1)
1366                         sync += r1.wb.cyc.eq(1)
1367                         sync += r1.wb.stb.eq(1)
1368
1369                     # OP_NONE and OP_BAD do nothing
1370                     # OP_BAD & OP_STCX_FAIL were
1371                     # handled above already
1372                     with m.Case(Op.OP_NONE):
1373                         pass
1374                     with m.Case(Op.OP_BAD):
1375                         pass
1376                     with m.Case(Op.OP_STCX_FAIL):
1377                         pass
1378
1379             with m.Case(State.RELOAD_WAIT_ACK):
1380                 ld_stbs_done = Signal()
1381                 # Requests are all sent if stb is 0
1382                 comb += ld_stbs_done.eq(~r1.wb.stb)
1383
1384                 with m.If((~wb_in.stall) & r1.wb.stb):
1385                     # That was the last word?
1386                     # We are done sending.
1387                     # Clear stb and set ld_stbs_done
1388                     # so we can handle an eventual
1389                     # last ack on the same cycle.
1390                     with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
1391                         sync += r1.wb.stb.eq(0)
1392                         comb += ld_stbs_done.eq(1)
1393
1394                     # Calculate the next row address in the current cache line
1395                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1396                     comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
1397                     sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
1398
1399                 # Incoming acks processing
1400                 sync += r1.forward_valid1.eq(wb_in.ack)
1401                 with m.If(wb_in.ack):
1402                     srow = Signal(ROW_LINE_BITS)
1403                     comb += srow.eq(r1.store_row)
1404                     sync += r1.rows_valid[srow].eq(1)
1405
1406                     # If this is the data we were looking for,
1407                     # we can complete the request next cycle.
1408                     # Compare the whole address in case the
1409                     # request in r1.req is not the one that
1410                     # started this refill.
1411                     with m.If(r1.full & r1.req.same_tag &
1412                               ((r1.dcbz & r1.req.dcbz) |
1413                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1414                                 (r1.store_row == get_row(r1.req.real_addr))):
1415                         sync += r1.full.eq(0)
1416                         sync += r1.slow_valid.eq(1)
1417                         with m.If(~r1.mmu_req):
1418                             sync += r1.ls_valid.eq(1)
1419                         with m.Else():
1420                             sync += r1.mmu_done.eq(1)
1421                         sync += r1.forward_sel.eq(~0) # all 1s
1422                         sync += r1.use_forward1.eq(1)
1423
1424                     # Check for completion
1425                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1426                                                       r1.end_row_ix)):
1427                         # Complete wishbone cycle
1428                         sync += r1.wb.cyc.eq(0)
1429
1430                         # Cache line is now valid
1431                         cv = Signal(INDEX_BITS)
1432                         comb += cv.eq(cache_valids[r1.store_index])
1433                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1434                         sync += cache_valids[r1.store_index].eq(cv)
1435                         sync += r1.state.eq(State.IDLE)
1436
1437                     # Increment store row counter
1438                     sync += r1.store_row.eq(next_row(r1.store_row))
1439
1440             with m.Case(State.STORE_WAIT_ACK):
1441                 st_stbs_done = Signal()
1442                 comb += st_stbs_done.eq(~r1.wb.stb)
1443                 comb += acks.eq(r1.acks_pending)
1444
1445                 with m.If(r1.inc_acks != r1.dec_acks):
1446                     with m.If(r1.inc_acks):
1447                         comb += adjust_acks.eq(acks + 1)
1448                     with m.Else():
1449                         comb += adjust_acks.eq(acks - 1)
1450                 with m.Else():
1451                     comb += adjust_acks.eq(acks)
1452
1453                 sync += r1.acks_pending.eq(adjust_acks)
1454
1455                 # Clear stb when slave accepted request
1456                 with m.If(~wb_in.stall):
1457                     # See if there is another store waiting
1458                     # to be done which is in the same real page.
1459                     with m.If(req.valid):
1460                         ra = req.real_addr[0:SET_SIZE_BITS]
1461                         sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
1462                         sync += r1.wb.dat.eq(req.data)
1463                         sync += r1.wb.sel.eq(req.byte_sel)
1464
1465                     with m.Elif((adjust_acks < 7) & req.same_tag &
1466                                 ((req.op == Op.OP_STORE_MISS)
1467                                  | (req.op == Op.OP_STORE_HIT))):
1468                         sync += r1.wb.stb.eq(1)
1469                         comb += st_stbs_done.eq(0)
1470
1471                         with m.If(req.op == Op.OP_STORE_HIT):
1472                             sync += r1.write_bram.eq(1)
1473                         sync += r1.full.eq(0)
1474                         sync += r1.slow_valid.eq(1)
1475
1476                         # Store requests never come from the MMU
1477                         sync += r1.ls_valid.eq(1)
1478                         comb += st_stbs_done.eq(0)
1479                         sync += r1.inc_acks.eq(1)
1480                     with m.Else():
1481                         sync += r1.wb.stb.eq(0)
1482                         comb += st_stbs_done.eq(1)
1483
1484                 # Got ack ? See if complete.
1485                 with m.If(wb_in.ack):
1486                     with m.If(st_stbs_done & (adjust_acks == 1)):
1487                         sync += r1.state.eq(State.IDLE)
1488                         sync += r1.wb.cyc.eq(0)
1489                         sync += r1.wb.stb.eq(0)
1490                     sync += r1.dec_acks.eq(1)
1491
1492             with m.Case(State.NC_LOAD_WAIT_ACK):
1493                 # Clear stb when slave accepted request
1494                 with m.If(~wb_in.stall):
1495                     sync += r1.wb.stb.eq(0)
1496
1497                 # Got ack ? complete.
1498                 with m.If(wb_in.ack):
1499                     sync += r1.state.eq(State.IDLE)
1500                     sync += r1.full.eq(0)
1501                     sync += r1.slow_valid.eq(1)
1502
1503                     with m.If(~r1.mmu_req):
1504                         sync += r1.ls_valid.eq(1)
1505                     with m.Else():
1506                         sync += r1.mmu_done.eq(1)
1507
1508                     sync += r1.forward_sel.eq(~0) # all 1s
1509                     sync += r1.use_forward1.eq(1)
1510                     sync += r1.wb.cyc.eq(0)
1511                     sync += r1.wb.stb.eq(0)
1512
1513     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1514
1515         sync = m.d.sync
1516         d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1517
1518         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1519                                stall_out, req_op[:3], d_out.valid, d_out.error,
1520                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1521                                r1.real_adr[3:6]))
1522
    def elaborate(self, platform):
        """Assemble the data cache as an nmigen Module and return it.

        Creates the storage arrays, the r0/r1 stage registers, the
        reservation record and all the signals shared between pipeline
        pieces, wires up the stall/valid handshake and the wishbone
        output, then hands those shared objects to each sub-function so
        that it can add its own logic to the same Module.

        `platform` is not referenced anywhere in this method.
        """

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)
        cache_valids = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # DTLB storage: valid bits, tags and PTEs
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags       = TLBTagsArray()
        dtlb_ptes       = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        # Stage-0 register and its "full" flag
        r0      = RegStage0("r0")
        r0_full = Signal()

        # Stage-1 register (also carries the wishbone request latch, r1.wb)
        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row     = Signal(ROW_BITS)

        # Reservation control signals, shared by reservation_comb/_reg
        # and dcache_request below
        cancel_store      = Signal()
        set_rsrv          = Signal()
        clear_rsrv        = Signal()

        r0_valid          = Signal()
        r0_stall          = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out         = CacheRamOut()

        plru_victim       = PLRUOut()
        replace_way       = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel           = Signal(8)

        # TLB signals
        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = Signal()
        tlb_hit_way   = Signal(TLB_WAY_BITS)
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        # (r0_stall and r0_valid are mutually exclusive: both require
        # r0_full, and differ only in the sense of r1.full)
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += r1.wb.adr.eq(r1.real_adr[ROW_OFF_BITS:]) # truncate LSBs
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                           r0_valid, r1, cache_valids, replace_way,
                           use_forward1_next, use_forward2_next,
                           req_hit_way, plru_victim, rc_ok, perm_attr,
                           valid_ra, perm_ok, access_ok, req_op, req_go,
                           tlb_pte_way,
                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                           cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                           r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                           reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                    cache_valids, r0, replace_way,
                    req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # NOTE(review): logging is disabled.  As written the call below
        # could not simply be re-enabled: no local named `stall_out` is
        # defined in this method (the port is driven above as
        # self.stall_out).
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m
1653
def dcache_load(dut, addr, nc=0):
    """Simulation helper: issue one load on d_in and return the result.

    The request (load=1, the given nc and addr, byte_sel=~0) is held
    with valid=1 across one bare yield, then valid and byte_sel are
    dropped.  After one more bare yield, d_out.valid is sampled before
    each further bare yield until it reads non-zero; d_out.data is then
    sampled and returned.
    """
    request, response = dut.d_in, dut.d_out

    # present the request
    yield request.load.eq(1)
    yield request.nc.eq(nc)
    yield request.addr.eq(addr)
    yield request.byte_sel.eq(~0)
    yield request.valid.eq(1)
    yield

    # withdraw it again
    yield request.valid.eq(0)
    yield request.byte_sel.eq(0)
    yield

    # spin until the cache reports completion
    while True:
        done = yield response.valid
        if done:
            break
        yield

    return (yield response.data)
1668
1669
def dcache_store(dut, addr, data, nc=0):
    """Simulation helper: issue one store on d_in and wait for completion.

    The request (load=0, the given nc, data and addr, byte_sel=~0) is
    held with valid=1 across one bare yield, then valid and byte_sel are
    dropped.  After one more bare yield, d_out.valid is sampled before
    each further bare yield until it reads non-zero.  Nothing is returned.
    """
    request = dut.d_in

    # present the request
    yield request.load.eq(0)
    yield request.nc.eq(nc)
    yield request.data.eq(data)
    yield request.byte_sel.eq(~0)
    yield request.addr.eq(addr)
    yield request.valid.eq(1)
    yield

    # withdraw it again
    yield request.valid.eq(0)
    yield request.byte_sel.eq(0)
    yield

    # spin until the cache reports completion
    while True:
        done = yield dut.d_out.valid
        if done:
            break
        yield
1683
1684
def dcache_random_sim(dut):
    """Randomised load/store test checked against a Python-side model.

    Each of the 256 rounds picks a random word index in 0..255 and a
    random 64-bit value, records the value in the model, then loads and
    stores that word through the cache (byte address = index * 8).  A
    second random word is then loaded and compared with the model.
    After the rounds, words 0..255 are all read back and compared.
    """

    # start with stack of zeros
    model = [0] * 512

    # clear stuff: drive the listed inputs to a known state
    reset_values = (
        (dut.d_in.valid, 0),
        (dut.d_in.load, 0),
        (dut.d_in.priv_mode, 1),
        (dut.d_in.nc, 0),
        (dut.d_in.addr, 0),
        (dut.d_in.data, 0),
        (dut.m_in.valid, 0),
        (dut.m_in.addr, 0),
        (dut.m_in.pte, 0),
    )
    for sig, value in reset_values:
        yield sig.eq(value)
    # wait 4 * clk_period
    for _ in range(4):
        yield

    print()

    for _ in range(256):
        # write phase: update the model, then load+store through the cache
        word = randint(0, 255)
        value = randint(0, (1<<64)-1)
        model[word] = value
        byte_addr = word * 8

        print("testing %x data %x" % (byte_addr, value))

        yield from dcache_load(dut, byte_addr)
        yield from dcache_store(dut, byte_addr, value)

        # read phase: an independently chosen word must match the model
        word = randint(0, 255)
        expected = model[word]
        byte_addr = word * 8

        got = yield from dcache_load(dut, byte_addr)
        assert got == expected, \
            "check %x data %x != %x" % (byte_addr, got, expected)

    # final sweep over every word that could have been written
    for word in range(256):
        got = yield from dcache_load(dut, word*8)
        assert got == model[word], \
            "final check %x data %x != %x" % (word*8, got, model[word])
1731
def dcache_sim(dut):
    """Directed simulation test: a fixed sequence of loads and stores.

    After driving the inputs to a known state and waiting four clocks,
    performs cacheable loads at 0x58, 0x20 and 0x530 (twice), a
    non-cacheable load at 0x100, two stores to 0x530, and finally
    re-reads 0x530 and 0x20.  Every load result is asserted against a
    fixed expected value.

    The expected values for untouched locations match the memory image
    built in this file's ``__main__`` block, where 64-bit word i holds
    2*i in its low half and 2*i+1 in its high half.
    """

    def check_load(load_addr, expected, fail_msg, nc=0):
        # Load, then read back whatever address is still driven on d_in
        # (used only in the failure message), then compare.
        data = yield from dcache_load(dut, load_addr, nc)
        addr = yield dut.d_in.addr
        assert data == expected, fail_msg % (addr, data)

    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    # Cacheable read of address 0x58
    yield from check_load(0x58, 0x0000001700000016,
                          "data @%x=%x expected 0x0000001700000016")

    # Cacheable read of address 0x20
    yield from check_load(0x20, 0x0000000900000008,
                          "data @%x=%x expected 0x0000000900000008")

    # Cacheable read of address 0x530
    yield from check_load(0x530, 0x0000014D0000014C,
                          "data @%x=%x expected 0000014D0000014C")

    # 2nd cacheable read of address 0x530
    yield from check_load(0x530, 0x0000014D0000014C,
                          "data @%x=%x expected 0000014D0000014C")

    # Non-cacheable read of address 0x100
    yield from check_load(0x100, 0x0000004100000040,
                          "data @%x=%x expected 0000004100000040", nc=1)

    # Store at address 0x530
    yield from dcache_store(dut, 0x530, 0x121)

    # 2nd store at address 0x530 (this is the value read back below)
    yield from dcache_store(dut, 0x530, 0x12345678)

    # 3rd cacheable read of address 0x530
    yield from check_load(0x530, 0x12345678,
                          "data @%x=%x expected 0x12345678")

    # 2nd cacheable read of address 0x20
    yield from check_load(0x20, 0x0000000900000008,
                          "data @%x=%x expected 0x0000000900000008")

    yield
    yield
    yield
    yield
1801
1802
def test_dcache(mem, test_fn, test_name):
    """Run one simulation of a DCache wired to a wishbone SRAM.

    mem       -- initial contents passed as `init` to the backing Memory
    test_fn   -- generator function taking the DCache; run as a sync process
    test_name -- suffix inserted into the VCD filename
    """
    dut = DCache()

    # backing store: 64 bits wide, 16*64 entries, on a wishbone SRAM
    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    top = Module()
    top.submodules.dcache = dut
    top.submodules.sram = sram

    bus, wb_out, wb_in = sram.bus, dut.wb_out, dut.wb_in

    # request side: cache drives the SRAM's wishbone inputs
    top.d.comb += [
        bus.cyc.eq(wb_out.cyc),
        bus.stb.eq(wb_out.stb),
        bus.we.eq(wb_out.we),
        bus.sel.eq(wb_out.sel),
        bus.adr.eq(wb_out.adr),
        bus.dat_w.eq(wb_out.dat),
    ]

    # response side: SRAM's ack and read data go back into the cache
    top.d.comb += [
        wb_in.ack.eq(bus.ack),
        wb_in.dat.eq(bus.dat_r),
    ]

    # nmigen Simulation
    sim = Simulator(top)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(test_fn(dut)))

    vcd_name = 'test_dcache%s.vcd' % test_name
    with sim.write_vcd(vcd_name):
        sim.run()
1830
if __name__ == '__main__':
    # dump the bare cache as RTLIL before running any simulation
    rtlil_text = rtlil.convert(DCache(), ports=[])
    with open("test_dcache.il", "w") as il_file:
        il_file.write(rtlil_text)

    # memory image for the directed test: 64-bit word idx holds 2*idx in
    # the low 32 bits and 2*idx+1 in the high 32 bits
    mem = [(idx*2) | ((idx*2+1) << 32) for idx in range(512)]

    test_dcache(mem, dcache_sim, "")
    test_dcache(None, dcache_random_sim, "random")
1843