1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
10 try:
11 from nmigen.hdl.ast import Display
12 except ImportError:
13 def Display(*args):
14 return []
15
16 from random import randint
17
18 from nmigen.cli import main
19 from nmutil.iocontrol import RecordObject
20 from nmutil.util import wrap
21 from nmigen.utils import log2_int
22 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
23 DCacheToLoadStore1Type,
24 MMUToDCacheType,
25 DCacheToMMUType)
26
27 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
28 WBAddrType, WBDataType, WBSelType,
29 WBMasterOut, WBSlaveOut,
30 WBMasterOutVector, WBSlaveOutVector,
31 WBIOMasterOut, WBIOSlaveOut)
32
33 from soc.experiment.cache_ram import CacheRam
34 from soc.experiment.plru import PLRU
35
36 # for test
37 from nmigen_soc.wishbone.sram import SRAM
38 from nmigen import Memory
39 from nmigen.cli import rtlil
40 if True:
41 from nmigen.back.pysim import Simulator, Delay, Settle
42 else:
43 from nmigen.sim.cxxsim import Simulator, Delay, Settle
44
45
46 # TODO: make these parameters of DCache at some point
47 LINE_SIZE = 64 # Line size in bytes
48 NUM_LINES = 16 # Number of lines in a set
49 NUM_WAYS = 4 # Number of ways
50 TLB_SET_SIZE = 64 # L1 DTLB entries per set
51 TLB_NUM_WAYS = 2 # L1 DTLB number of ways
52 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
53 LOG_LENGTH = 0 # Non-zero to enable log data collection
54
55 # BRAM organisation: We never access more than
56 # -- WB_DATA_BITS at a time so to save
57 # -- resources we make the array only that wide, and
58 # -- use consecutive indices to make a cache "line"
59 # --
60 # -- ROW_SIZE is the width in bytes of the BRAM
61 # -- (based on WB, so 64-bits)
62 ROW_SIZE = WB_DATA_BITS // 8
63
64 # ROW_PER_LINE is the number of rows (wishbone
65 # transactions) in a line
66 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
67
68 # BRAM_ROWS is the number of rows in BRAM needed
69 # to represent the full dcache
70 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
71
72 print ("ROW_SIZE", ROW_SIZE)
73 print ("ROW_PER_LINE", ROW_PER_LINE)
74 print ("BRAM_ROWS", BRAM_ROWS)
75
76 # Bit fields counts in the address
77
78 # REAL_ADDR_BITS is the number of real address
79 # bits that we store
80 REAL_ADDR_BITS = 56
81
82 # ROW_BITS is the number of bits to select a row
83 ROW_BITS = log2_int(BRAM_ROWS)
84
85 # ROW_LINE_BITS is the number of bits to select
86 # a row within a line
87 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
88
89 # LINE_OFF_BITS is the number of bits for
90 # the offset in a cache line
91 LINE_OFF_BITS = log2_int(LINE_SIZE)
92
93 # ROW_OFF_BITS is the number of bits for
94 # the offset in a row
95 ROW_OFF_BITS = log2_int(ROW_SIZE)
96
97 # INDEX_BITS is the number of bits to
98 # select a cache line
99 INDEX_BITS = log2_int(NUM_LINES)
100
101 # SET_SIZE_BITS is the log base 2 of the set size
102 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
103
104 # TAG_BITS is the number of bits of
105 # the tag part of the address
106 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
107
108 # TAG_WIDTH is the width in bits of each way of the tag RAM
109 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
110
111 # WAY_BITS is the number of bits to select a way
112 WAY_BITS = log2_int(NUM_WAYS)
113
114 # Example of layout for 32 lines of 64 bytes:
115 layout = """\
116 .. tag |index| line |
117 .. | row | |
118 .. | |---| | ROW_LINE_BITS (3)
119 .. | |--- - --| LINE_OFF_BITS (6)
120 .. | |- --| ROW_OFF_BITS (3)
121 .. |----- ---| | ROW_BITS (8)
122 .. |-----| | INDEX_BITS (5)
123 .. --------| | TAG_BITS (45)
124 """
125 print (layout)
126 print ("Dcache TAG %d IDX %d ROW %d ROFF %d LOFF %d RLB %d" % \
127 (TAG_BITS, INDEX_BITS, ROW_BITS,
128 ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
129 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
130 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
131 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
132
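# Worked example (a sketch, not printed by the code): with the defaults
# above (LINE_SIZE=64, NUM_LINES=16, NUM_WAYS=4, WB_DATA_BITS=64,
# REAL_ADDR_BITS=56) the derived geometry is:
#
#   ROW_SIZE      = 64 // 8   = 8 bytes
#   ROW_PER_LINE  = 64 // 8   = 8 rows per line
#   BRAM_ROWS     = 16 * 8    = 128 rows
#   ROW_OFF_BITS  = log2(8)   = 3
#   ROW_LINE_BITS = log2(8)   = 3
#   LINE_OFF_BITS = log2(64)  = 6
#   ROW_BITS      = log2(128) = 7
#   INDEX_BITS    = log2(16)  = 4
#   SET_SIZE_BITS = 6 + 4     = 10
#   TAG_BITS      = 56 - 10   = 46
#   TAG_WIDTH     = 46 rounded up to a multiple of 8 = 48
#   WAY_BITS      = log2(4)   = 2
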
133 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
134
135 def CacheTagArray():
136 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
137 for x in range(NUM_LINES))
138
139 def CacheValidBitsArray():
140 return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
141 for x in range(NUM_LINES))
142
143 def RowPerLineValidArray():
144 return Array(Signal(name="rows_valid%d" % x) \
145 for x in range(ROW_PER_LINE))
146
147 # L1 TLB
148 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
149 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
150 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
151 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
152 TLB_PTE_BITS = 64
153 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
154
155 def ispow2(x):
156 return (1<<log2_int(x, False)) == x
157
158 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
159 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
160 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
161 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
162 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
163 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
164 "geometry bits don't add up"
165 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
166 "geometry bits don't add up"
167 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
168 "geometry bits don't add up"
169 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
170 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
171
172
173 def TLBValidBitsArray():
174 return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
175
176 def TLBTagEAArray():
177 return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
178
179 def TLBTagsArray():
180 return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
181
182 def TLBPtesArray():
183 return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
184
185 def HitWaySet():
186 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
187 for x in range(TLB_NUM_WAYS))
188
189 # Cache RAM interface
190 def CacheRamOut():
191 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
192 for x in range(NUM_WAYS))
193
194 # PLRU output interface
195 def PLRUOut():
196 return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
197
198 # TLB PLRU output interface
199 def TLBPLRUOut():
200 return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
201
202 # Helper functions to decode incoming requests
203 #
204 # Return the cache line index (tag index) for an address
205 def get_index(addr):
206 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
207
208 # Return the cache row index (data memory) for an address
209 def get_row(addr):
210 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
211
212 # Return the index of a row within a line
213 def get_row_of_line(row):
214 return row[:ROW_BITS][:ROW_LINE_BITS]
215
216 # Returns whether this is the last row of a line
217 def is_last_row_addr(addr, last):
218 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
219
220 # Returns whether this is the last row of a line
221 def is_last_row(row, last):
222 return get_row_of_line(row) == last
223
224 # Return the next row in the current cache line. We use a
225 # dedicated function in order to limit the size of the
226 # generated adder to be only the bits within a cache line
227 # (3 bits with default settings)
228 def next_row(row):
229 row_v = row[0:ROW_LINE_BITS] + 1
230 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
231
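# Illustrative example: with ROW_LINE_BITS = 3 (the default), next_row()
# increments only the low 3 bits and keeps the rest, so row 0b0101_111
# (last row of line 0b0101) wraps to 0b0101_000 instead of carrying into
# the next line.
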
232 # Get the tag value from the address
233 def get_tag(addr):
234 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
235
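# Hypothetical worked split, using the default geometry computed above:
# for a 56-bit real address, addr[0:3] is the byte offset within a row
# (ROW_OFF_BITS), addr[3:6] the row within the line (ROW_LINE_BITS),
# addr[3:10] the BRAM row returned by get_row(), addr[6:10] the line
# index returned by get_index(), and addr[10:56] the tag returned by
# get_tag().
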
236 # Read a tag from a tag memory row
237 def read_tag(way, tagset):
238 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
239
240 # Read a TLB tag from a TLB tag memory row
241 def read_tlb_tag(way, tags):
242 return tags.word_select(way, TLB_EA_TAG_BITS)
243
244 # Write a TLB tag to a TLB tag memory row
245 def write_tlb_tag(way, tags, tag):
246 return read_tlb_tag(way, tags).eq(tag)
247
248 # Read a PTE from a TLB PTE memory row
249 def read_tlb_pte(way, ptes):
250 return ptes.word_select(way, TLB_PTE_BITS)
251
252 def write_tlb_pte(way, ptes, newpte):
253 return read_tlb_pte(way, ptes).eq(newpte)
254
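# Sketch of how the helpers above pack TLB ways into one flat Signal,
# assuming the usual word_select() semantics: word_select(way, W) picks
# bits [way*W : (way+1)*W], so with TLB_PTE_BITS=64 and TLB_NUM_WAYS=2,
# read_tlb_pte(1, ptes) is ptes[64:128] and write_tlb_pte(1, ptes, v)
# returns an assignment to that slice.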
255
256 # Record for storing permission, attribute, etc. bits from a PTE
257 class PermAttr(RecordObject):
258 def __init__(self, name=None):
259 super().__init__(name=name)
260 self.reference = Signal()
261 self.changed = Signal()
262 self.nocache = Signal()
263 self.priv = Signal()
264 self.rd_perm = Signal()
265 self.wr_perm = Signal()
266
267
268 def extract_perm_attr(pte):
269 pa = PermAttr()
270 pa.reference = pte[8]
271 pa.changed = pte[7]
272 pa.nocache = pte[5]
273 pa.priv = pte[3]
274 pa.rd_perm = pte[2]
275 pa.wr_perm = pte[1]
276 return pa
277
278
279 # Type of operation on a "valid" input
280 @unique
281 class Op(Enum):
282 OP_NONE = 0
283 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
284 OP_STCX_FAIL = 2 # conditional store w/o reservation
285 OP_LOAD_HIT = 3 # Cache hit on load
286 OP_LOAD_MISS = 4 # Load missing cache
287 OP_LOAD_NC = 5 # Non-cachable load
288 OP_STORE_HIT = 6 # Store hitting cache
289 OP_STORE_MISS = 7 # Store missing cache
290
291
292 # Cache state machine
293 @unique
294 class State(Enum):
295 IDLE = 0 # Normal load hit processing
296 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
297 STORE_WAIT_ACK = 2 # Store wait ack
298 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
299
300
301 # Dcache operations:
302 #
303 # In order to make timing, we use the BRAMs with
304 # an output buffer, which means that the BRAM
305 # output is delayed by an extra cycle.
306 #
307 # Thus, the dcache has a 2-stage internal pipeline
308 # for cache hits with no stalls.
309 #
310 # All other operations are handled via stalling
311 # in the first stage.
312 #
313 # The second stage can thus complete a hit at the same
314 # time as the first stage emits a stall for a complex op.
315 #
316 # Stage 0 register, basically contains just the latched request
317
318 class RegStage0(RecordObject):
319 def __init__(self, name=None):
320 super().__init__(name=name)
321 self.req = LoadStore1ToDCacheType(name="lsmem")
322 self.tlbie = Signal()
323 self.doall = Signal()
324 self.tlbld = Signal()
325 self.mmu_req = Signal() # indicates source of request
326
327
328 class MemAccessRequest(RecordObject):
329 def __init__(self, name=None):
330 super().__init__(name=name)
331 self.op = Signal(Op)
332 self.valid = Signal()
333 self.dcbz = Signal()
334 self.real_addr = Signal(REAL_ADDR_BITS)
335 self.data = Signal(64)
336 self.byte_sel = Signal(8)
337 self.hit_way = Signal(WAY_BITS)
338 self.same_tag = Signal()
339 self.mmu_req = Signal()
340
341
342 # First stage register, contains state for stage 1 of load hits
343 # and for the state machine used by all other operations
344 class RegStage1(RecordObject):
345 def __init__(self, name=None):
346 super().__init__(name=name)
347 # Info about the request
348 self.full = Signal() # have uncompleted request
349 self.mmu_req = Signal() # request is from MMU
350 self.req = MemAccessRequest(name="reqmem")
351
352 # Cache hit state
353 self.hit_way = Signal(WAY_BITS)
354 self.hit_load_valid = Signal()
355 self.hit_index = Signal(INDEX_BITS)
356 self.cache_hit = Signal()
357
358 # TLB hit state
359 self.tlb_hit = Signal()
360 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
361 self.tlb_hit_index = Signal(TLB_SET_BITS)
362
363 # 2-stage data buffer for data forwarded from writes to reads
364 self.forward_data1 = Signal(64)
365 self.forward_data2 = Signal(64)
366 self.forward_sel1 = Signal(8)
367 self.forward_valid1 = Signal()
368 self.forward_way1 = Signal(WAY_BITS)
369 self.forward_row1 = Signal(ROW_BITS)
370 self.use_forward1 = Signal()
371 self.forward_sel = Signal(8)
372
373 # Cache miss state (reload state machine)
374 self.state = Signal(State)
375 self.dcbz = Signal()
376 self.write_bram = Signal()
377 self.write_tag = Signal()
378 self.slow_valid = Signal()
379 self.wb = WBMasterOut("wb")
380 self.reload_tag = Signal(TAG_BITS)
381 self.store_way = Signal(WAY_BITS)
382 self.store_row = Signal(ROW_BITS)
383 self.store_index = Signal(INDEX_BITS)
384 self.end_row_ix = Signal(ROW_LINE_BITS)
385 self.rows_valid = RowPerLineValidArray()
386 self.acks_pending = Signal(3)
387 self.inc_acks = Signal()
388 self.dec_acks = Signal()
389
390 # Signals to complete (possibly with error)
391 self.ls_valid = Signal()
392 self.ls_error = Signal()
393 self.mmu_done = Signal()
394 self.mmu_error = Signal()
395 self.cache_paradox = Signal()
396
397 # Signal to complete a failed stcx.
398 self.stcx_fail = Signal()
399
400
401 # Reservation information
402 class Reservation(RecordObject):
403 def __init__(self):
404 super().__init__()
405 self.valid = Signal()
406 self.addr = Signal(64-LINE_OFF_BITS)
407
408
409 class DTLBUpdate(Elaboratable):
410 def __init__(self):
411 self.tlbie = Signal()
412 self.tlbwe = Signal()
413 self.doall = Signal()
414 self.updated = Signal()
415 self.v_updated = Signal()
416 self.tlb_hit = Signal()
417 self.tlb_req_index = Signal(TLB_SET_BITS)
418
419 self.tlb_hit_way = Signal(TLB_WAY_BITS)
420 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
421 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
422 self.repl_way = Signal(TLB_WAY_BITS)
423 self.eatag = Signal(TLB_EA_TAG_BITS)
424 self.pte_data = Signal(TLB_PTE_BITS)
425
426 self.dv = Signal(TLB_PTE_WAY_BITS)
427
428 self.tb_out = Signal(TLB_TAG_WAY_BITS)
429 self.pb_out = Signal(TLB_NUM_WAYS)
430 self.db_out = Signal(TLB_PTE_WAY_BITS)
431
432 def elaborate(self, platform):
433 m = Module()
434 comb = m.d.comb
435 sync = m.d.sync
436
437 tagset = Signal(TLB_TAG_WAY_BITS)
438 pteset = Signal(TLB_PTE_WAY_BITS)
439
440 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
441
442 with m.If(self.tlbie & self.doall):
443 pass # clear all back in parent
444 with m.Elif(self.tlbie):
445 with m.If(self.tlb_hit):
446 comb += db_out.eq(self.dv)
447 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
448 comb += self.v_updated.eq(1)
449
450 with m.Elif(self.tlbwe):
451
452 comb += tagset.eq(self.tlb_tag_way)
453 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
454 comb += tb_out.eq(tagset)
455
456 comb += pteset.eq(self.tlb_pte_way)
457 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
458 comb += pb_out.eq(pteset)
459
460 comb += db_out.bit_select(self.repl_way, 1).eq(1)
461
462 comb += self.updated.eq(1)
463 comb += self.v_updated.eq(1)
464
465 return m
466
467
468 class DCachePendingHit(Elaboratable):
469
470 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
471 cache_valid_idx, cache_tag_set,
472 req_addr,
473 hit_set):
474
475 self.go = Signal()
476 self.virt_mode = Signal()
477 self.is_hit = Signal()
478 self.tlb_hit = Signal()
479 self.hit_way = Signal(WAY_BITS)
480 self.rel_match = Signal()
481 self.req_index = Signal(INDEX_BITS)
482 self.reload_tag = Signal(TAG_BITS)
483
484 self.tlb_hit_way = tlb_hit_way
485 self.tlb_pte_way = tlb_pte_way
486 self.tlb_valid_way = tlb_valid_way
487 self.cache_valid_idx = cache_valid_idx
488 self.cache_tag_set = cache_tag_set
489 self.req_addr = req_addr
490 self.hit_set = hit_set
491
492 def elaborate(self, platform):
493 m = Module()
494 comb = m.d.comb
495 sync = m.d.sync
496
497 go = self.go
498 virt_mode = self.virt_mode
499 is_hit = self.is_hit
500 tlb_pte_way = self.tlb_pte_way
501 tlb_valid_way = self.tlb_valid_way
502 cache_valid_idx = self.cache_valid_idx
503 cache_tag_set = self.cache_tag_set
504 req_addr = self.req_addr
505 tlb_hit_way = self.tlb_hit_way
506 tlb_hit = self.tlb_hit
507 hit_set = self.hit_set
508 hit_way = self.hit_way
509 rel_match = self.rel_match
510 req_index = self.req_index
511 reload_tag = self.reload_tag
512
513 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
514 for i in range(TLB_NUM_WAYS))
515 hit_way_set = HitWaySet()
516
517 # Test if pending request is a hit on any way
518 # In order to make timing in virtual mode,
519 # when we are using the TLB, we compare each
520 # way with each of the real addresses from each way of
521 # the TLB, and then decide later which match to use.
522
523 with m.If(virt_mode):
524 for j in range(TLB_NUM_WAYS):
525 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
526 s_hit = Signal()
527 s_pte = Signal(TLB_PTE_BITS)
528 s_ra = Signal(REAL_ADDR_BITS)
529 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
530 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
531 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
532 comb += s_tag.eq(get_tag(s_ra))
533
534 for i in range(NUM_WAYS):
535 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
536 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
537 (read_tag(i, cache_tag_set) == s_tag)
538 & tlb_valid_way[j])
539 with m.If(is_tag_hit):
540 comb += hit_way_set[j].eq(i)
541 comb += s_hit.eq(1)
542 comb += hit_set[j].eq(s_hit)
543 with m.If(s_tag == reload_tag):
544 comb += rel_matches[j].eq(1)
545 with m.If(tlb_hit):
546 comb += is_hit.eq(hit_set[tlb_hit_way])
547 comb += hit_way.eq(hit_way_set[tlb_hit_way])
548 comb += rel_match.eq(rel_matches[tlb_hit_way])
549 with m.Else():
550 s_tag = Signal(TAG_BITS)
551 comb += s_tag.eq(get_tag(req_addr))
552 for i in range(NUM_WAYS):
553 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
554 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
555 (read_tag(i, cache_tag_set) == s_tag))
556 with m.If(is_tag_hit):
557 comb += hit_way.eq(i)
558 comb += is_hit.eq(1)
559 with m.If(s_tag == reload_tag):
560 comb += rel_match.eq(1)
561
562 return m
563
564
565 class DCache(Elaboratable):
566 """Set associative dcache write-through
567 TODO (in no specific order):
568 * See list in icache.vhdl
569 * Complete load misses on the cycle when WB data comes instead of
570 at the end of line (this requires dealing with requests coming in
571 while not idle...)
572 """
573 def __init__(self):
574 self.d_in = LoadStore1ToDCacheType("d_in")
575 self.d_out = DCacheToLoadStore1Type("d_out")
576
577 self.m_in = MMUToDCacheType("m_in")
578 self.m_out = DCacheToMMUType("m_out")
579
580 self.stall_out = Signal()
581
582 self.wb_out = WBMasterOut()
583 self.wb_in = WBSlaveOut()
584
585 self.log_out = Signal(20)
586
587 def stage_0(self, m, r0, r1, r0_full):
588 """Latch the request in r0.req as long as we're not stalling
589 """
590 comb = m.d.comb
591 sync = m.d.sync
592 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
593
594 r = RegStage0("stage0")
595
596 # TODO, this goes in unit tests and formal proofs
597 with m.If(d_in.valid & m_in.valid):
598 sync += Display("request collision loadstore vs MMU")
599
600 with m.If(m_in.valid):
601 sync += r.req.valid.eq(1)
602 sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
603 sync += r.req.dcbz.eq(0)
604 sync += r.req.nc.eq(0)
605 sync += r.req.reserve.eq(0)
606 sync += r.req.virt_mode.eq(1)
607 sync += r.req.priv_mode.eq(1)
608 sync += r.req.addr.eq(m_in.addr)
609 sync += r.req.data.eq(m_in.pte)
610 sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
611 sync += r.tlbie.eq(m_in.tlbie)
612 sync += r.doall.eq(m_in.doall)
613 sync += r.tlbld.eq(m_in.tlbld)
614 sync += r.mmu_req.eq(1)
615 with m.Else():
616 sync += r.req.eq(d_in)
617 sync += r.tlbie.eq(0)
618 sync += r.doall.eq(0)
619 sync += r.tlbld.eq(0)
620 sync += r.mmu_req.eq(0)
621 with m.If(~(r1.full & r0_full)):
622 sync += r0.eq(r)
623 sync += r0_full.eq(r.req.valid)
624
625 def tlb_read(self, m, r0_stall, tlb_valid_way,
626 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
627 dtlb_tags, dtlb_ptes):
628 """TLB
629 Operates in the second cycle on the request latched in r0.req.
630 TLB updates write the entry at the end of the second cycle.
631 """
632 comb = m.d.comb
633 sync = m.d.sync
634 m_in, d_in = self.m_in, self.d_in
635
636 index = Signal(TLB_SET_BITS)
637 addrbits = Signal(TLB_SET_BITS)
638
639 amin = TLB_LG_PGSZ
640 amax = TLB_LG_PGSZ + TLB_SET_BITS
641
642 with m.If(m_in.valid):
643 comb += addrbits.eq(m_in.addr[amin : amax])
644 with m.Else():
645 comb += addrbits.eq(d_in.addr[amin : amax])
646 comb += index.eq(addrbits)
647
648 # If we have any op and the previous op isn't finished,
649 # then keep the same output for next cycle.
650 with m.If(~r0_stall):
651 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
652 sync += tlb_tag_way.eq(dtlb_tags[index])
653 sync += tlb_pte_way.eq(dtlb_ptes[index])
654
655 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
656 """Generate TLB PLRUs
657 """
658 comb = m.d.comb
659 sync = m.d.sync
660
661 if TLB_NUM_WAYS == 0:
662 return
663 for i in range(TLB_SET_SIZE):
664 # TLB PLRU interface
665 tlb_plru = PLRU(TLB_WAY_BITS)
666 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
667 tlb_plru_acc_en = Signal()
668
669 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
670 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
671 comb += tlb_plru.acc.eq(r1.tlb_hit_way)
672 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
673
674 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
675 tlb_valid_way, tlb_tag_way, tlb_hit_way,
676 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
677
678 comb = m.d.comb
679 sync = m.d.sync
680
681 hitway = Signal(TLB_WAY_BITS)
682 hit = Signal()
683 eatag = Signal(TLB_EA_TAG_BITS)
684
685 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
686 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
687 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
688
689 for i in range(TLB_NUM_WAYS):
690 is_tag_hit = Signal()
691 comb += is_tag_hit.eq(tlb_valid_way[i]
692 & (read_tlb_tag(i, tlb_tag_way) == eatag))
693 with m.If(is_tag_hit):
694 comb += hitway.eq(i)
695 comb += hit.eq(1)
696
697 comb += tlb_hit.eq(hit & r0_valid)
698 comb += tlb_hit_way.eq(hitway)
699
700 with m.If(tlb_hit):
701 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
702 with m.Else():
703 comb += pte.eq(0)
704 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
705 with m.If(r0.req.virt_mode):
706 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
707 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
708 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
709 comb += perm_attr.eq(extract_perm_attr(pte))
710 with m.Else():
711 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
712 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
713
714 comb += perm_attr.reference.eq(1)
715 comb += perm_attr.changed.eq(1)
716 comb += perm_attr.nocache.eq(0)
717 comb += perm_attr.priv.eq(1)
718 comb += perm_attr.rd_perm.eq(1)
719 comb += perm_attr.wr_perm.eq(1)
720
721 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
722 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
723 dtlb_tags, tlb_pte_way, dtlb_ptes):
724
725 comb = m.d.comb
726 sync = m.d.sync
727
728 tlbie = Signal()
729 tlbwe = Signal()
730
731 comb += tlbie.eq(r0_valid & r0.tlbie)
732 comb += tlbwe.eq(r0_valid & r0.tlbld)
733
734 m.submodules.tlb_update = d = DTLBUpdate()
735 with m.If(tlbie & r0.doall):
736 # clear all valid bits at once
737 for i in range(TLB_SET_SIZE):
738 sync += dtlb_valid_bits[i].eq(0)
739 with m.If(d.updated):
740 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
741 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
742 with m.If(d.v_updated):
743 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
744
745 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
746
747 comb += d.tlbie.eq(tlbie)
748 comb += d.tlbwe.eq(tlbwe)
749 comb += d.doall.eq(r0.doall)
750 comb += d.tlb_hit.eq(tlb_hit)
751 comb += d.tlb_hit_way.eq(tlb_hit_way)
752 comb += d.tlb_tag_way.eq(tlb_tag_way)
753 comb += d.tlb_pte_way.eq(tlb_pte_way)
754 comb += d.tlb_req_index.eq(tlb_req_index)
755
756 with m.If(tlb_hit):
757 comb += d.repl_way.eq(tlb_hit_way)
758 with m.Else():
759 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
760 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
761 comb += d.pte_data.eq(r0.req.data)
762
763 def maybe_plrus(self, m, r1, plru_victim):
764 """Generate PLRUs
765 """
766 comb = m.d.comb
767 sync = m.d.sync
768
769 if NUM_WAYS == 0:
770 return
771
772 for i in range(NUM_LINES):
773 # PLRU interface
774 plru = PLRU(WAY_BITS)
775 setattr(m.submodules, "plru%d" % i, plru)
776 plru_acc_en = Signal()
777
778 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
779 comb += plru.acc_en.eq(plru_acc_en)
780 comb += plru.acc.eq(r1.hit_way)
781 comb += plru_victim[i].eq(plru.lru_o)
782
783 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
784 """Cache tag RAM read port
785 """
786 comb = m.d.comb
787 sync = m.d.sync
788 m_in, d_in = self.m_in, self.d_in
789
790 index = Signal(INDEX_BITS)
791
792 with m.If(r0_stall):
793 comb += index.eq(req_index)
794 with m.Elif(m_in.valid):
795 comb += index.eq(get_index(m_in.addr))
796 with m.Else():
797 comb += index.eq(get_index(d_in.addr))
798 sync += cache_tag_set.eq(cache_tags[index])
799
800 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
801 r0_valid, r1, cache_valid_bits, replace_way,
802 use_forward1_next, use_forward2_next,
803 req_hit_way, plru_victim, rc_ok, perm_attr,
804 valid_ra, perm_ok, access_ok, req_op, req_go,
805 tlb_pte_way,
806 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
807 cancel_store, req_same_tag, r0_stall, early_req_row):
808 """Cache request parsing and hit detection
809 """
810
811 comb = m.d.comb
812 sync = m.d.sync
813 m_in, d_in = self.m_in, self.d_in
814
815 is_hit = Signal()
816 hit_way = Signal(WAY_BITS)
817 op = Signal(Op)
818 opsel = Signal(3)
819 go = Signal()
820 nc = Signal()
821 hit_set = Array(Signal(name="hit_set_%d" % i) \
822 for i in range(TLB_NUM_WAYS))
823 cache_valid_idx = Signal(NUM_WAYS)
824
825 # Extract line, row and tag from request
826 comb += req_index.eq(get_index(r0.req.addr))
827 comb += req_row.eq(get_row(r0.req.addr))
828 comb += req_tag.eq(get_tag(ra))
829
830 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
831 r0.req.addr, ra, req_index, req_tag, req_row)
832
833 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
834 comb += cache_valid_idx.eq(cache_valid_bits[req_index])
835
836 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
837 tlb_valid_way, tlb_hit_way,
838 cache_valid_idx, cache_tag_set,
839 r0.req.addr,
840 hit_set)
841
842 comb += dc.tlb_hit.eq(tlb_hit)
843 comb += dc.reload_tag.eq(r1.reload_tag)
844 comb += dc.virt_mode.eq(r0.req.virt_mode)
845 comb += dc.go.eq(go)
846 comb += dc.req_index.eq(req_index)
847 comb += is_hit.eq(dc.is_hit)
848 comb += hit_way.eq(dc.hit_way)
849 comb += req_same_tag.eq(dc.rel_match)
850
851 # See if the request matches the line currently being reloaded
852 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
853 (req_index == r1.store_index) & req_same_tag):
854 # For a store, consider this a hit even if the row isn't
855 # valid since it will be by the time we perform the store.
856 # For a load, check the appropriate row valid bit.
857 valid = r1.rows_valid[req_row[:ROW_LINE_BITS]]
858 comb += is_hit.eq(~r0.req.load | valid)
859 comb += hit_way.eq(replace_way)
860
861 # Whether to use forwarded data for a load or not
862 with m.If((get_row(r1.req.real_addr) == req_row) &
863 (r1.req.hit_way == hit_way)):
864 # Only need to consider r1.write_bram here, since if we
865 # are writing refill data here, then we don't have a
866 # cache hit this cycle on the line being refilled.
867 # (There is the possibility that the load following the
868 # load miss that started the refill could be to the old
869 # contents of the victim line, since it is a couple of
870 # cycles after the refill starts before we see the updated
871 # cache tag. In that case we don't use the bypass.)
872 comb += use_forward1_next.eq(r1.write_bram)
873 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
874 comb += use_forward2_next.eq(r1.forward_valid1)
875
876 # The way that matched on a hit
877 comb += req_hit_way.eq(hit_way)
878
879 # The way to replace on a miss
880 with m.If(r1.write_tag):
881 comb += replace_way.eq(plru_victim[r1.store_index])
882 with m.Else():
883 comb += replace_way.eq(r1.store_way)
884
885 # work out whether we have permission for this access
886 # NB we don't yet implement AMR, thus no KUAP
887 comb += rc_ok.eq(perm_attr.reference
888 & (r0.req.load | perm_attr.changed)
889 )
890 comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
891 (perm_attr.wr_perm |
892 (r0.req.load & perm_attr.rd_perm)))
893 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
894 # Combine the request and cache hit status to decide what
895 # operation needs to be done
896 comb += nc.eq(r0.req.nc | perm_attr.nocache)
897 comb += op.eq(Op.OP_NONE)
898 with m.If(go):
899 with m.If(~access_ok):
900 comb += op.eq(Op.OP_BAD)
901 with m.Elif(cancel_store):
902 comb += op.eq(Op.OP_STCX_FAIL)
903 with m.Else():
904 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
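# Cat() packs LSB-first, so opsel[0]=is_hit, opsel[1]=nc,
# opsel[2]=r0.req.load; e.g. 0b101 below is a cacheable load hit.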
905 with m.Switch(opsel):
906 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
907 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
908 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
909 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
910 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
911 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
912 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
913 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
914 comb += req_op.eq(op)
915 comb += req_go.eq(go)
916
917 # Version of the row number that is valid one cycle earlier
918 # in the cases where we need to read the cache data BRAM.
919 # If we're stalling then we need to keep reading the last
920 # row requested.
921 with m.If(~r0_stall):
922 with m.If(m_in.valid):
923 comb += early_req_row.eq(get_row(m_in.addr))
924 with m.Else():
925 comb += early_req_row.eq(get_row(d_in.addr))
926 with m.Else():
927 comb += early_req_row.eq(req_row)
928
929 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
930 r0_valid, r0, reservation):
931 """Handle load-with-reservation and store-conditional instructions
932 """
933 comb = m.d.comb
934 sync = m.d.sync
935
936 with m.If(r0_valid & r0.req.reserve):
937 # XXX generate alignment interrupt if address
938 # is not aligned XXX or if r0.req.nc = '1'
939 with m.If(r0.req.load):
940 comb += set_rsrv.eq(1) # load with reservation
941 with m.Else():
942 comb += clear_rsrv.eq(1) # store conditional
943 with m.If(~reservation.valid |
944 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
945 comb += cancel_store.eq(1)
946
947 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
948 reservation, r0):
949
950 comb = m.d.comb
951 sync = m.d.sync
952
953 with m.If(r0_valid & access_ok):
954 with m.If(clear_rsrv):
955 sync += reservation.valid.eq(0)
956 with m.Elif(set_rsrv):
957 sync += reservation.valid.eq(1)
958 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
959
960 def writeback_control(self, m, r1, cache_out):
961 """Return data for loads & completion control logic
962 """
963 comb = m.d.comb
964 sync = m.d.sync
965 d_out, m_out = self.d_out, self.m_out
966
967 data_out = Signal(64)
968 data_fwd = Signal(64)
969
970 # Use the bypass if are reading the row that was
971 # written 1 or 2 cycles ago, including for the
972 # slow_valid = 1 case (i.e. completing a load
973 # miss or a non-cacheable load).
974 with m.If(r1.use_forward1):
975 comb += data_fwd.eq(r1.forward_data1)
976 with m.Else():
977 comb += data_fwd.eq(r1.forward_data2)
978
979 comb += data_out.eq(cache_out[r1.hit_way])
980
981 for i in range(8):
982 with m.If(r1.forward_sel[i]):
983 dsel = data_fwd.word_select(i, 8)
984 comb += data_out.word_select(i, 8).eq(dsel)
985
986 comb += d_out.valid.eq(r1.ls_valid)
987 comb += d_out.data.eq(data_out)
988 comb += d_out.store_done.eq(~r1.stcx_fail)
989 comb += d_out.error.eq(r1.ls_error)
990 comb += d_out.cache_paradox.eq(r1.cache_paradox)
991
992 # Outputs to MMU
993 comb += m_out.done.eq(r1.mmu_done)
994 comb += m_out.err.eq(r1.mmu_error)
995 comb += m_out.data.eq(data_out)
996
997 # We have a valid load or store hit or we just completed
998 # a slow op such as a load miss, a NC load or a store
999 #
1000 # Note: the load hit is delayed by one cycle. However it
1001 # can still not collide with r.slow_valid (well unless I
1002 # miscalculated) because slow_valid can only be set on a
1003 # subsequent request and not on its first cycle (the state
1004 # machine must have advanced), which makes slow_valid
1005 # at least 2 cycles from the previous hit_load_valid.
1006
1007 # Sanity: Only one of these must be set in any given cycle
1008
1009 if False: # TODO: need Display to get this to work
1010 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1011 "unexpected slow_valid collision with stcx_fail"
1012
1013 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1014 "unexpected hit_load_delayed collision with slow_valid"
1015
1016 with m.If(~r1.mmu_req):
1017 # Request came from loadstore1...
1018 # Load hit case is the standard path
1019 with m.If(r1.hit_load_valid):
1020 sync += Display("completing load hit data=%x", data_out)
1021
1022 # error cases complete without stalling
1023 with m.If(r1.ls_error):
1024 sync += Display("completing ld/st with error")
1025
1026 # Slow ops (load miss, NC, stores)
1027 with m.If(r1.slow_valid):
1028 sync += Display("completing store or load miss data=%x",
1029 data_out)
1030
1031 with m.Else():
1032 # Request came from MMU
1033 with m.If(r1.hit_load_valid):
1034 sync += Display("completing load hit to MMU, data=%x",
1035 m_out.data)
1036 # error cases complete without stalling
1037 with m.If(r1.mmu_error):
1038 sync += Display("combpleting MMU ld with error")
1039
1040 # Slow ops (i.e. load miss)
1041 with m.If(r1.slow_valid):
1042 sync += Display("completing MMU load miss, data=%x",
1043 m_out.data)
1044
1045 def rams(self, m, r1, early_req_row, cache_out, replace_way):
1046 """rams
1047 Generate a cache RAM for each way. This handles the normal
1048 reads, writes from reloads and the special store-hit update
1049 path as well.
1050
1051 Note: the BRAMs have an extra read buffer, meaning the output
1052 is pipelined an extra cycle. This differs from the
1053 icache. The writeback logic needs to take that into
1054 account by using 1-cycle delayed signals for load hits.
1055 """
1056 comb = m.d.comb
1057 wb_in = self.wb_in
1058
1059 for i in range(NUM_WAYS):
1060 do_read = Signal(name="do_rd%d" % i)
1061 rd_addr = Signal(ROW_BITS)
1062 do_write = Signal(name="do_wr%d" % i)
1063 wr_addr = Signal(ROW_BITS)
1064 wr_data = Signal(WB_DATA_BITS)
1065 wr_sel = Signal(ROW_SIZE)
1066 wr_sel_m = Signal(ROW_SIZE)
1067 _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i)
1068
1069 way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
1070 setattr(m.submodules, "cacheram_%d" % i, way)
1071
1072 comb += way.rd_en.eq(do_read)
1073 comb += way.rd_addr.eq(rd_addr)
1074 comb += _d_out.eq(way.rd_data_o)
1075 comb += way.wr_sel.eq(wr_sel_m)
1076 comb += way.wr_addr.eq(wr_addr)
1077 comb += way.wr_data.eq(wr_data)
1078
1079 # Cache hit reads
1080 comb += do_read.eq(1)
1081 comb += rd_addr.eq(early_req_row[:ROW_BITS])
1082 comb += cache_out[i].eq(_d_out)
1083
1084 # Write mux:
1085 #
1086 # Defaults to wishbone read responses (cache refill)
1087 #
1088 # For timing, the mux on wr_data/sel/addr is not
1089 # dependent on anything other than the current state.
1090
1091 with m.If(r1.write_bram):
1092 # Write store data to BRAM. This happens one
1093 # cycle after the store is in r0.
1094 comb += wr_data.eq(r1.req.data)
1095 comb += wr_sel.eq(r1.req.byte_sel)
1096 comb += wr_addr.eq(get_row(r1.req.real_addr))
1097
1098 with m.If(i == r1.req.hit_way):
1099 comb += do_write.eq(1)
1100 with m.Else():
1101 # Otherwise, we might be doing a reload or a DCBZ
1102 with m.If(r1.dcbz):
1103 comb += wr_data.eq(0)
1104 with m.Else():
1105 comb += wr_data.eq(wb_in.dat)
1106 comb += wr_addr.eq(r1.store_row)
1107 comb += wr_sel.eq(~0) # all 1s
1108
1109 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1110 & wb_in.ack & (replace_way == i)):
1111 comb += do_write.eq(1)
1112
1113 # Mask write selects with do_write since BRAM
1114 # doesn't have a global write-enable
1115 with m.If(do_write):
1116 comb += wr_sel_m.eq(wr_sel)
1117
1118 # Cache hit synchronous machine for the easy case.
1119 # This handles load hits.
1120 # It also handles error cases (TLB miss, cache paradox)
1121 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1122 req_hit_way, req_index, req_tag, access_ok,
1123 tlb_hit, tlb_hit_way, tlb_req_index):
1124
1125 comb = m.d.comb
1126 sync = m.d.sync
1127
1128 with m.If(req_op != Op.OP_NONE):
1129 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1130 req_op, r0.req.addr, r0.req.nc,
1131 req_index, req_tag, req_hit_way)
1132
1133 with m.If(r0_valid):
1134 sync += r1.mmu_req.eq(r0.mmu_req)
1135
1136 # Fast path for load/store hits.
1137 # Set signals for the writeback controls.
1138 sync += r1.hit_way.eq(req_hit_way)
1139 sync += r1.hit_index.eq(req_index)
1140
1141 with m.If(req_op == Op.OP_LOAD_HIT):
1142 sync += r1.hit_load_valid.eq(1)
1143 with m.Else():
1144 sync += r1.hit_load_valid.eq(0)
1145
1146 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1147 sync += r1.cache_hit.eq(1)
1148 with m.Else():
1149 sync += r1.cache_hit.eq(0)
1150
1151 with m.If(req_op == Op.OP_BAD):
1152 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1153 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1154 sync += r1.ls_error.eq(~r0.mmu_req)
1155 sync += r1.mmu_error.eq(r0.mmu_req)
1156 sync += r1.cache_paradox.eq(access_ok)
1157
1158 with m.Else():
1159 sync += r1.ls_error.eq(0)
1160 sync += r1.mmu_error.eq(0)
1161 sync += r1.cache_paradox.eq(0)
1162
1163 with m.If(req_op == Op.OP_STCX_FAIL):
1164 sync += r1.stcx_fail.eq(1)
1165 with m.Else():
1166 sync += r1.stcx_fail.eq(0)
1167
1168 # Record TLB hit information for updating TLB PLRU
1169 sync += r1.tlb_hit.eq(tlb_hit)
1170 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1171 sync += r1.tlb_hit_index.eq(tlb_req_index)
1172
1173 # Memory accesses are handled by this state machine:
1174 #
1175 # * Cache load miss/reload (in conjunction with "rams")
1176 # * Load hits for non-cachable forms
1177 # * Stores (the collision case is handled in "rams")
1178 #
1179 # All wishbone requests generation is done here.
1180 # This machine operates at stage 1.
1181 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1182 cache_valid_bits, r0, replace_way,
1183 req_hit_way, req_same_tag,
1184 r0_valid, req_op, cache_tags, req_go, ra):
1185
1186 comb = m.d.comb
1187 sync = m.d.sync
1188 wb_in = self.wb_in
1189
1190 req = MemAccessRequest("mreq_ds")
1191 acks = Signal(3)
1192 adjust_acks = Signal(3)
1193
1194 req_row = Signal(ROW_BITS)
1195 req_idx = Signal(INDEX_BITS)
1196 req_tag = Signal(TAG_BITS)
1197 comb += req_idx.eq(get_index(req.real_addr))
1198 comb += req_row.eq(get_row(req.real_addr))
1199 comb += req_tag.eq(get_tag(req.real_addr))
1200
1201 sync += r1.use_forward1.eq(use_forward1_next)
1202 sync += r1.forward_sel.eq(0)
1203
1204 with m.If(use_forward1_next):
1205 sync += r1.forward_sel.eq(r1.req.byte_sel)
1206 with m.Elif(use_forward2_next):
1207 sync += r1.forward_sel.eq(r1.forward_sel1)
1208
1209 sync += r1.forward_data2.eq(r1.forward_data1)
1210 with m.If(r1.write_bram):
1211 sync += r1.forward_data1.eq(r1.req.data)
1212 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1213 sync += r1.forward_way1.eq(r1.req.hit_way)
1214 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1215 sync += r1.forward_valid1.eq(1)
1216 with m.Else():
1217 with m.If(r1.dcbz):
1218 sync += r1.forward_data1.eq(0)
1219 with m.Else():
1220 sync += r1.forward_data1.eq(wb_in.dat)
1221 sync += r1.forward_sel1.eq(~0) # all 1s
1222 sync += r1.forward_way1.eq(replace_way)
1223 sync += r1.forward_row1.eq(r1.store_row)
1224 sync += r1.forward_valid1.eq(0)
1225
1226 # One cycle pulses reset
1227 sync += r1.slow_valid.eq(0)
1228 sync += r1.write_bram.eq(0)
1229 sync += r1.inc_acks.eq(0)
1230 sync += r1.dec_acks.eq(0)
1231
1232 sync += r1.ls_valid.eq(0)
1233 # complete tlbies and TLB loads in the third cycle
1234 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1235
1236 with m.If((req_op == Op.OP_LOAD_HIT)
1237 | (req_op == Op.OP_STCX_FAIL)):
1238 with m.If(~r0.mmu_req):
1239 sync += r1.ls_valid.eq(1)
1240 with m.Else():
1241 sync += r1.mmu_done.eq(1)
1242
1243 with m.If(r1.write_tag):
1244 # Store new tag in selected way
1245 for i in range(NUM_WAYS):
1246 with m.If(i == replace_way):
1247 ct = Signal(TAG_RAM_WIDTH)
1248 comb += ct.eq(cache_tags[r1.store_index])
1249 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1250 sync += cache_tags[r1.store_index].eq(ct)
1251 sync += r1.store_way.eq(replace_way)
1252 sync += r1.write_tag.eq(0)
1253
1254 # Take request from r1.req if there is one there,
1255 # else from req_op, ra, etc.
1256 with m.If(r1.full):
1257 comb += req.eq(r1.req)
1258 with m.Else():
1259 comb += req.op.eq(req_op)
1260 comb += req.valid.eq(req_go)
1261 comb += req.mmu_req.eq(r0.mmu_req)
1262 comb += req.dcbz.eq(r0.req.dcbz)
1263 comb += req.real_addr.eq(ra)
1264
1265 with m.If(~r0.req.dcbz):
1266 comb += req.data.eq(r0.req.data)
1267 with m.Else():
1268 comb += req.data.eq(0)
1269
1270 # Select all bytes for dcbz
1271 # and for cacheable loads
1272 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1273 comb += req.byte_sel.eq(~0) # all 1s
1274 with m.Else():
1275 comb += req.byte_sel.eq(r0.req.byte_sel)
1276 comb += req.hit_way.eq(req_hit_way)
1277 comb += req.same_tag.eq(req_same_tag)
1278
1279 # Store the incoming request from r0,
1280 # if it is a slow request
1281 # Note that r1.full = 1 implies req_op = OP_NONE
1282 with m.If((req_op == Op.OP_LOAD_MISS)
1283 | (req_op == Op.OP_LOAD_NC)
1284 | (req_op == Op.OP_STORE_MISS)
1285 | (req_op == Op.OP_STORE_HIT)):
1286 sync += r1.req.eq(req)
1287 sync += r1.full.eq(1)
1288
1289 # Main state machine
1290 with m.Switch(r1.state):
1291
1292 with m.Case(State.IDLE):
1293 sync += r1.wb.adr.eq(req.real_addr)
1294 sync += r1.wb.sel.eq(req.byte_sel)
1295 sync += r1.wb.dat.eq(req.data)
1296 sync += r1.dcbz.eq(req.dcbz)
1297
1298 # Keep track of our index and way
1299 # for subsequent stores.
1300 sync += r1.store_index.eq(req_idx)
1301 sync += r1.store_row.eq(req_row)
1302 sync += r1.end_row_ix.eq(get_row_of_line(req_row))
1303 sync += r1.reload_tag.eq(req_tag)
1304 sync += r1.req.same_tag.eq(1)
1305
1306 with m.If(req.op == Op.OP_STORE_HIT):
1307 sync += r1.store_way.eq(req.hit_way)
1308
1309 # Reset per-row valid bits,
1310 # ready for handling OP_LOAD_MISS
1311 for i in range(ROW_PER_LINE):
1312 sync += r1.rows_valid[i].eq(0)
1313
1314 with m.If(req_op != Op.OP_NONE):
1315 sync += Display("cache op %d", req.op)
1316
1317 with m.Switch(req.op):
1318 with m.Case(Op.OP_LOAD_HIT):
1319 # stay in IDLE state
1320 pass
1321
1322 with m.Case(Op.OP_LOAD_MISS):
1323 sync += Display("cache miss real addr: %x " \
1324 "idx: %x tag: %x",
1325 req.real_addr, req_row, req_tag)
1326
1327 # Start the wishbone cycle
1328 sync += r1.wb.we.eq(0)
1329 sync += r1.wb.cyc.eq(1)
1330 sync += r1.wb.stb.eq(1)
1331
1332 # Track that we had one request sent
1333 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1334 sync += r1.write_tag.eq(1)
1335
1336 with m.Case(Op.OP_LOAD_NC):
1337 sync += r1.wb.cyc.eq(1)
1338 sync += r1.wb.stb.eq(1)
1339 sync += r1.wb.we.eq(0)
1340 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1341
1342 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1343 with m.If(~req.dcbz):
1344 sync += r1.state.eq(State.STORE_WAIT_ACK)
1345 sync += r1.acks_pending.eq(1)
1346 sync += r1.full.eq(0)
1347 sync += r1.slow_valid.eq(1)
1348
1349 with m.If(~req.mmu_req):
1350 sync += r1.ls_valid.eq(1)
1351 with m.Else():
1352 sync += r1.mmu_done.eq(1)
1353
1354 with m.If(req.op == Op.OP_STORE_HIT):
1355 sync += r1.write_bram.eq(1)
1356 with m.Else():
1357 # dcbz is handled much like a load miss except
1358 # that we are writing to memory instead of reading
1359 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1360
1361 with m.If(req.op == Op.OP_STORE_MISS):
1362 sync += r1.write_tag.eq(1)
1363
1364 sync += r1.wb.we.eq(1)
1365 sync += r1.wb.cyc.eq(1)
1366 sync += r1.wb.stb.eq(1)
1367
1368 # OP_NONE and OP_BAD do nothing
1369 # OP_BAD & OP_STCX_FAIL were
1370 # handled above already
1371 with m.Case(Op.OP_NONE):
1372 pass
1373 with m.Case(Op.OP_BAD):
1374 pass
1375 with m.Case(Op.OP_STCX_FAIL):
1376 pass
1377
1378 with m.Case(State.RELOAD_WAIT_ACK):
1379 ld_stbs_done = Signal()
1380 # Requests are all sent if stb is 0
1381 comb += ld_stbs_done.eq(~r1.wb.stb)
1382
1383 with m.If((~wb_in.stall) & r1.wb.stb):
1384 # That was the last word?
1385 # We are done sending.
1386 # Clear stb and set ld_stbs_done
1387 # so we can handle an eventual
1388 # last ack on the same cycle.
1389 with m.If(is_last_row_addr(r1.wb.adr, r1.end_row_ix)):
1390 sync += r1.wb.stb.eq(0)
1391 comb += ld_stbs_done.eq(1)
1392
1393 # Calculate the next row address in the current cache line
1394 rarange = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1395 comb += rarange.eq(r1.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS]+1)
1396 sync += r1.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
1397
1398 # Incoming acks processing
1399 sync += r1.forward_valid1.eq(wb_in.ack)
1400 with m.If(wb_in.ack):
1401 sync += r1.rows_valid[r1.store_row[:ROW_LINE_BITS]].eq(1)
1402
1403 # If this is the data we were looking for,
1404 # we can complete the request next cycle.
1405 # Compare the whole address in case the
1406 # request in r1.req is not the one that
1407 # started this refill.
1408 with m.If(r1.full & r1.req.same_tag &
1409 ((r1.dcbz & r1.req.dcbz) |
1410 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1411 (r1.store_row == get_row(r1.req.real_addr))):
1412 sync += r1.full.eq(0)
1413 sync += r1.slow_valid.eq(1)
1414 with m.If(~r1.mmu_req):
1415 sync += r1.ls_valid.eq(1)
1416 with m.Else():
1417 sync += r1.mmu_done.eq(1)
1418 sync += r1.forward_sel.eq(~0) # all 1s
1419 sync += r1.use_forward1.eq(1)
1420
1421 # Check for completion
1422 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1423 r1.end_row_ix)):
1424 # Complete wishbone cycle
1425 sync += r1.wb.cyc.eq(0)
1426
1427 # Cache line is now valid
1428 cv = Signal(NUM_WAYS)
1429 comb += cv.eq(cache_valid_bits[r1.store_index])
1430 comb += cv.bit_select(r1.store_way, 1).eq(1)
1431 sync += cache_valid_bits[r1.store_index].eq(cv)
1432 sync += r1.state.eq(State.IDLE)
1433
1434 # Increment store row counter
1435 sync += r1.store_row.eq(next_row(r1.store_row))
1436
1437 with m.Case(State.STORE_WAIT_ACK):
1438 st_stbs_done = Signal()
1439 comb += st_stbs_done.eq(~r1.wb.stb)
1440 comb += acks.eq(r1.acks_pending)
1441
1442 with m.If(r1.inc_acks != r1.dec_acks):
1443 with m.If(r1.inc_acks):
1444 comb += adjust_acks.eq(acks + 1)
1445 with m.Else():
1446 comb += adjust_acks.eq(acks - 1)
1447 with m.Else():
1448 comb += adjust_acks.eq(acks)
1449
1450 sync += r1.acks_pending.eq(adjust_acks)
1451
1452 # Clear stb when slave accepted request
1453 with m.If(~wb_in.stall):
1454 # See if there is another store waiting
1455 # to be done which is in the same real page.
1456 with m.If(req.valid):
1457 st_adr = req.real_addr[0:SET_SIZE_BITS]
1458 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(st_adr)
1459 sync += r1.wb.dat.eq(req.data)
1460 sync += r1.wb.sel.eq(req.byte_sel)
1461
1462 with m.Elif((adjust_acks < 7) & req.same_tag &
1463 ((req.op == Op.OP_STORE_MISS)
1464 | (req.op == Op.OP_STORE_HIT))):
1465 sync += r1.wb.stb.eq(1)
1466 comb += st_stbs_done.eq(0)
1467
1468 with m.If(req.op == Op.OP_STORE_HIT):
1469 sync += r1.write_bram.eq(1)
1470 sync += r1.full.eq(0)
1471 sync += r1.slow_valid.eq(1)
1472
1473 # Store requests never come from the MMU
1474 sync += r1.ls_valid.eq(1)
1475 comb += st_stbs_done.eq(0)
1476 sync += r1.inc_acks.eq(1)
1477 with m.Else():
1478 sync += r1.wb.stb.eq(0)
1479 comb += st_stbs_done.eq(1)
1480
1481 # Got ack ? See if complete.
1482 with m.If(wb_in.ack):
1483 with m.If(st_stbs_done & (adjust_acks == 1)):
1484 sync += r1.state.eq(State.IDLE)
1485 sync += r1.wb.cyc.eq(0)
1486 sync += r1.wb.stb.eq(0)
1487 sync += r1.dec_acks.eq(1)
1488
1489 with m.Case(State.NC_LOAD_WAIT_ACK):
1490 # Clear stb when slave accepted request
1491 with m.If(~wb_in.stall):
1492 sync += r1.wb.stb.eq(0)
1493
1494 # Got ack ? complete.
1495 with m.If(wb_in.ack):
1496 sync += r1.state.eq(State.IDLE)
1497 sync += r1.full.eq(0)
1498 sync += r1.slow_valid.eq(1)
1499
1500 with m.If(~r1.mmu_req):
1501 sync += r1.ls_valid.eq(1)
1502 with m.Else():
1503 sync += r1.mmu_done.eq(1)
1504
1505 sync += r1.forward_sel.eq(~0) # all 1s
1506 sync += r1.use_forward1.eq(1)
1507 sync += r1.wb.cyc.eq(0)
1508 sync += r1.wb.stb.eq(0)
1509
1510 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, req_op, stall_out):
1511
1512 sync = m.d.sync
1513 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1514
1515 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1516 stall_out, req_op[:3], d_out.valid, d_out.error,
1517 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1518 r1.wb.adr[3:6]))
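# Bit map of log_out as assembled by the Cat() above (LSB first, 20 bits):
# [0:3] r1.state, [3] valid_ra, [4:7] tlb_hit_way, [7] stall_out,
# [8:11] req_op, [11] d_out.valid, [12] d_out.error, [13] r1.wb.cyc,
# [14] r1.wb.stb, [15] wb_in.ack, [16] wb_in.stall, [17:20] r1.wb.adr[3:6].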
1519
1520 def elaborate(self, platform):
1521
1522 m = Module()
1523 comb = m.d.comb
1524
1525 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1526 cache_tags = CacheTagArray()
1527 cache_tag_set = Signal(TAG_RAM_WIDTH)
1528 cache_valid_bits = CacheValidBitsArray()
1529
1530 # TODO attribute ram_style : string;
1531 # TODO attribute ram_style of cache_tags : signal is "distributed";
1532
1533 """note: these are passed to nmigen.hdl.Memory as "attributes".
1534 don't know how, just that they are.
1535 """
1536 dtlb_valid_bits = TLBValidBitsArray()
1537 dtlb_tags = TLBTagsArray()
1538 dtlb_ptes = TLBPtesArray()
1539 # TODO attribute ram_style of
1540 # dtlb_tags : signal is "distributed";
1541 # TODO attribute ram_style of
1542 # dtlb_ptes : signal is "distributed";
1543
1544 r0 = RegStage0("r0")
1545 r0_full = Signal()
1546
1547 r1 = RegStage1("r1")
1548
1549 reservation = Reservation()
1550
1551 # Async signals on incoming request
1552 req_index = Signal(INDEX_BITS)
1553 req_row = Signal(ROW_BITS)
1554 req_hit_way = Signal(WAY_BITS)
1555 req_tag = Signal(TAG_BITS)
1556 req_op = Signal(Op)
1557 req_data = Signal(64)
1558 req_same_tag = Signal()
1559 req_go = Signal()
1560
1561 early_req_row = Signal(ROW_BITS)
1562
1563 cancel_store = Signal()
1564 set_rsrv = Signal()
1565 clear_rsrv = Signal()
1566
1567 r0_valid = Signal()
1568 r0_stall = Signal()
1569
1570 use_forward1_next = Signal()
1571 use_forward2_next = Signal()
1572
1573 cache_out = CacheRamOut()
1574
1575 plru_victim = PLRUOut()
1576 replace_way = Signal(WAY_BITS)
1577
1578 # Wishbone read/write/cache write formatting signals
1579 bus_sel = Signal(8)
1580
1581 # TLB signals
1582 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1583 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1584 tlb_valid_way = Signal(TLB_NUM_WAYS)
1585 tlb_req_index = Signal(TLB_SET_BITS)
1586 tlb_hit = Signal()
1587 tlb_hit_way = Signal(TLB_WAY_BITS)
1588 pte = Signal(TLB_PTE_BITS)
1589 ra = Signal(REAL_ADDR_BITS)
1590 valid_ra = Signal()
1591 perm_attr = PermAttr("dc_perms")
1592 rc_ok = Signal()
1593 perm_ok = Signal()
1594 access_ok = Signal()
1595
1596 tlb_plru_victim = TLBPLRUOut()
1597
1598 # we don't yet handle collisions between loadstore1 requests
1599 # and MMU requests
1600 comb += self.m_out.stall.eq(0)
1601
1602 # Hold off the request in r0 when r1 has an uncompleted request
1603 comb += r0_stall.eq(r0_full & r1.full)
1604 comb += r0_valid.eq(r0_full & ~r1.full)
1605 comb += self.stall_out.eq(r0_stall)
1606
1607 # Wire up wishbone request latch out of stage 1
1608 comb += self.wb_out.eq(r1.wb)
1609
1610 # call sub-functions putting everything together, using shared
1611 # signals established above
1612 self.stage_0(m, r0, r1, r0_full)
1613 self.tlb_read(m, r0_stall, tlb_valid_way,
1614 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1615 dtlb_tags, dtlb_ptes)
1616 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1617 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1618 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1619 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1620 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1621 dtlb_tags, tlb_pte_way, dtlb_ptes)
1622 self.maybe_plrus(m, r1, plru_victim)
1623 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1624 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1625 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1626 r0_valid, r1, cache_valid_bits, replace_way,
1627 use_forward1_next, use_forward2_next,
1628 req_hit_way, plru_victim, rc_ok, perm_attr,
1629 valid_ra, perm_ok, access_ok, req_op, req_go,
1630 tlb_pte_way,
1631 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1632 cancel_store, req_same_tag, r0_stall, early_req_row)
1633 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1634 r0_valid, r0, reservation)
1635 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1636 reservation, r0)
1637 self.writeback_control(m, r1, cache_out)
1638 self.rams(m, r1, early_req_row, cache_out, replace_way)
1639 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1640 req_hit_way, req_index, req_tag, access_ok,
1641 tlb_hit, tlb_hit_way, tlb_req_index)
1642 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1643 cache_valid_bits, r0, replace_way,
1644 req_hit_way, req_same_tag,
1645 r0_valid, req_op, cache_tags, req_go, ra)
1646 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, req_op, self.stall_out)
1647
1648 return m
1649
1650 def dcache_load(dut, addr, nc=0):
1651 yield dut.d_in.load.eq(1)
1652 yield dut.d_in.nc.eq(nc)
1653 yield dut.d_in.addr.eq(addr)
1654 yield dut.d_in.byte_sel.eq(~0)
1655 yield dut.d_in.valid.eq(1)
1656 yield
1657 yield dut.d_in.valid.eq(0)
1658 yield dut.d_in.byte_sel.eq(0)
1659 yield
1660 while not (yield dut.d_out.valid):
1661 yield
1662 data = yield dut.d_out.data
1663 return data
1664
1665
1666 def dcache_store(dut, addr, data, nc=0):
1667 yield dut.d_in.load.eq(0)
1668 yield dut.d_in.nc.eq(nc)
1669 yield dut.d_in.data.eq(data)
1670 yield dut.d_in.byte_sel.eq(~0)
1671 yield dut.d_in.addr.eq(addr)
1672 yield dut.d_in.valid.eq(1)
1673 yield
1674 yield dut.d_in.valid.eq(0)
1675 yield dut.d_in.byte_sel.eq(0)
1676 yield
1677 while not (yield dut.d_out.valid):
1678 yield
1679
1680
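# Both helpers above are nmigen simulation coroutines: "yield sig.eq(v)"
# schedules a write, a bare "yield" advances one clock, and "x = yield sig"
# samples a value, so they must run inside a process added with
# sim.add_sync_process() (as test_dcache() does below). A hypothetical
# composite helper would follow the same pattern:
#
#   def dcache_swap(dut, addr, data):
#       old = yield from dcache_load(dut, addr)
#       yield from dcache_store(dut, addr, data)
#       return old
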
1681 def dcache_random_sim(dut):
1682
1683 # start with stack of zeros
1684 sim_mem = [0] * 512
1685
1686 # clear stuff
1687 yield dut.d_in.valid.eq(0)
1688 yield dut.d_in.load.eq(0)
1689 yield dut.d_in.priv_mode.eq(1)
1690 yield dut.d_in.nc.eq(0)
1691 yield dut.d_in.addr.eq(0)
1692 yield dut.d_in.data.eq(0)
1693 yield dut.m_in.valid.eq(0)
1694 yield dut.m_in.addr.eq(0)
1695 yield dut.m_in.pte.eq(0)
1696 # wait 4 * clk_period
1697 yield
1698 yield
1699 yield
1700 yield
1701
1702 print ()
1703
1704 for i in range(256):
1705 addr = randint(0, 255)
1706 data = randint(0, (1<<64)-1)
1707 sim_mem[addr] = data
1708 addr *= 8
1709
1710 print ("testing %x data %x" % (addr, data))
1711
1712 yield from dcache_load(dut, addr)
1713 yield from dcache_store(dut, addr, data)
1714
1715 addr = randint(0, 255)
1716 sim_data = sim_mem[addr]
1717 addr *= 8
1718
1719 data = yield from dcache_load(dut, addr)
1720 assert data == sim_data, \
1721 "check %x data %x != %x" % (addr, data, sim_data)
1722
1723 for addr in range(8):
1724 data = yield from dcache_load(dut, addr*8)
1725 assert data == sim_mem[addr], \
1726 "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
1727
1728 def dcache_sim(dut):
1729 # clear stuff
1730 yield dut.d_in.valid.eq(0)
1731 yield dut.d_in.load.eq(0)
1732 yield dut.d_in.priv_mode.eq(1)
1733 yield dut.d_in.nc.eq(0)
1734 yield dut.d_in.addr.eq(0)
1735 yield dut.d_in.data.eq(0)
1736 yield dut.m_in.valid.eq(0)
1737 yield dut.m_in.addr.eq(0)
1738 yield dut.m_in.pte.eq(0)
1739 # wait 4 * clk_period
1740 yield
1741 yield
1742 yield
1743 yield
1744
1745 # Cacheable read of address 4
1746 data = yield from dcache_load(dut, 0x4)
1747 addr = yield dut.d_in.addr
1748 assert data == 0x0000000100000000, \
1749 f"data @%x=%x expected 0x0000000100000000" % (addr, data)
1750
1751 # Cacheable read of address 20
1752 data = yield from dcache_load(dut, 0x20)
1753 addr = yield dut.d_in.addr
1754 assert data == 0x0000000100000000, \
1755 f"data @%x=%x expected 0x0000000100000000" % (addr, data)
1756
1757 # Cacheable read of address 0x530
1758 data = yield from dcache_load(dut, 0x530)
1759 addr = yield dut.d_in.addr
1760 assert data == 0x0000014D0000014C, \
1761 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1762
1763 # 2nd Cacheable read of address 0x530
1764 data = yield from dcache_load(dut, 0x530)
1765 addr = yield dut.d_in.addr
1766 assert data == 0x0000014D0000014C, \
1767 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1768
1769 # Non-cacheable read of address 100
1770 data = yield from dcache_load(dut, 0x100, nc=1)
1771 addr = yield dut.d_in.addr
1772 assert data == 0x0000004100000040, \
1773 f"data @%x=%x expected 0000004100000040" % (addr, data)
1774
1775 # Store at address 530
1776 yield from dcache_store(dut, 0x530, 0x121)
1777
1778 # Second store at address 0x530, overwriting the value above
1779 yield from dcache_store(dut, 0x530, 0x12345678)
1780
1781 # 3rd Cacheable read of address 0x530
1782 data = yield from dcache_load(dut, 0x530)
1783 addr = yield dut.d_in.addr
1784 assert data == 0x12345678, \
1785 "data @%x=%x expected 0x12345678" % (addr, data)
1786
1787 # 4th Cacheable read of address 30
1788 data = yield from dcache_load(dut, 0x20)
1789 addr = yield dut.d_in.addr
1790 assert data == 0x12345678, \
1791 f"data @%x=%x expected 0x12345678" % (addr, data)
1792
1793 yield
1794 yield
1795 yield
1796 yield
1797
1798
1799 def test_dcache(mem, test_fn, test_name):
1800 dut = DCache()
1801
1802 memory = Memory(width=64, depth=16*64, init=mem)
1803 sram = SRAM(memory=memory, granularity=8)
1804
1805 m = Module()
1806 m.submodules.dcache = dut
1807 m.submodules.sram = sram
1808
1809 m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1810 m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1811 m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1812 m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1813 m.d.comb += sram.bus.adr.eq(dut.wb_out.adr[3:])
1814 m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1815
1816 m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1817 m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1818
1819 # nmigen Simulation
1820 sim = Simulator(m)
1821 sim.add_clock(1e-6)
1822
1823 sim.add_sync_process(wrap(test_fn(dut)))
1824 with sim.write_vcd('test_dcache%s.vcd' % test_name):
1825 sim.run()
1826
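# Usage sketch (assuming nmigen, nmigen-soc and nmutil are installed):
#
#   python3 dcache.py
#
# writes the RTLIL netlist to test_dcache.il, then runs dcache_sim()
# against an SRAM-backed wishbone slave, dumping a trace to
# test_dcache.vcd.
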
1827 if __name__ == '__main__':
1828 dut = DCache()
1829 vl = rtlil.convert(dut, ports=[])
1830 with open("test_dcache.il", "w") as f:
1831 f.write(vl)
1832
1833 mem = []
1834 for i in range(0,512):
1835 mem.append((i*2)| ((i*2+1)<<32))
1836
1837 test_dcache(mem, dcache_sim, "")
1838 #test_dcache(None, dcache_random_sim, "random")
1839