1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8 """
9
10 import sys
11 sys.setrecursionlimit(1000000)
12
13 from enum import Enum, unique
14
15 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
16 from nmutil.util import Display
17
18 from copy import deepcopy
19 from random import randint, seed
20
21 from nmigen.cli import main
22 from nmutil.iocontrol import RecordObject
23 from nmigen.utils import log2_int
24 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
25 DCacheToLoadStore1Type,
26 MMUToDCacheType,
27 DCacheToMMUType)
28
29 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
30 WBAddrType, WBDataType, WBSelType,
31 WBMasterOut, WBSlaveOut,
32 WBMasterOutVector, WBSlaveOutVector,
33 WBIOMasterOut, WBIOSlaveOut)
34
35 from soc.experiment.cache_ram import CacheRam
36 #from soc.experiment.plru import PLRU
37 from nmutil.plru import PLRU
38
39 # for test
40 from soc.bus.sram import SRAM
41 from nmigen import Memory
42 from nmigen.cli import rtlil
43
44 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
45 # Also, check out the cxxsim nmigen branch, and latest yosys from git
46 from nmutil.sim_tmp_alternative import Simulator
47
48 from nmutil.util import wrap
49
50
51 # TODO: make these parameters of DCache at some point
52 LINE_SIZE = 64 # Line size in bytes
53 NUM_LINES = 16 # Number of lines in a set
54 NUM_WAYS = 4 # Number of ways
55 TLB_SET_SIZE = 64 # L1 DTLB sets (entries per way)
56 TLB_NUM_WAYS = 2 # L1 DTLB associativity (ways per set)
57 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
58 LOG_LENGTH = 0 # Non-zero to enable log data collection
59
60 # BRAM organisation: We never access more than
61 # -- WB_DATA_BITS at a time so to save
62 # -- resources we make the array only that wide, and
64 # -- use consecutive indices to make a cache "line"
64 # --
65 # -- ROW_SIZE is the width in bytes of the BRAM
66 # -- (based on WB, so 64-bits)
67 ROW_SIZE = WB_DATA_BITS // 8
68
69 # ROW_PER_LINE is the number of rows (wishbone
70 # transactions) in a line
71 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
72
73 # BRAM_ROWS is the number of rows in BRAM needed
74 # to represent the full dcache
75 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
76
77 print ("ROW_SIZE", ROW_SIZE)
78 print ("ROW_PER_LINE", ROW_PER_LINE)
79 print ("BRAM_ROWS", BRAM_ROWS)
80 print ("NUM_WAYS", NUM_WAYS)
81
82 # Bit fields counts in the address
83
84 # REAL_ADDR_BITS is the number of real address
85 # bits that we store
86 REAL_ADDR_BITS = 56
87
88 # ROW_BITS is the number of bits to select a row
89 ROW_BITS = log2_int(BRAM_ROWS)
90
91 # ROW_LINE_BITS is the number of bits to select
92 # a row within a line
93 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
94
95 # LINE_OFF_BITS is the number of bits for
96 # the offset in a cache line
97 LINE_OFF_BITS = log2_int(LINE_SIZE)
98
99 # ROW_OFF_BITS is the number of bits for
100 # the offset in a row
101 ROW_OFF_BITS = log2_int(ROW_SIZE)
102
103 # INDEX_BITS is the number of bits to
104 # select a cache line
105 INDEX_BITS = log2_int(NUM_LINES)
106
107 # SET_SIZE_BITS is the log base 2 of the set size
108 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
109
110 # TAG_BITS is the number of bits of
111 # the tag part of the address
112 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
113
114 # TAG_WIDTH is the width in bits of each way of the tag RAM
115 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
116
117 # WAY_BITS is the number of bits to select a way
118 WAY_BITS = log2_int(NUM_WAYS)
119
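# Worked example (a sketch, using the default parameters declared above;
# the layout diagram below illustrates a different, 32-line configuration):
#   ROW_SIZE      = 64 // 8       = 8   bytes per BRAM row
#   ROW_PER_LINE  = 64 // 8       = 8   rows per cache line
#   BRAM_ROWS     = 16 * 8        = 128 rows in total
#   ROW_BITS      = log2(128)     = 7
#   ROW_LINE_BITS = log2(8)       = 3
#   LINE_OFF_BITS = log2(64)      = 6
#   ROW_OFF_BITS  = log2(8)       = 3
#   INDEX_BITS    = log2(16)      = 4
#   SET_SIZE_BITS = 6 + 4         = 10
#   TAG_BITS      = 56 - 10       = 46
#   TAG_WIDTH     = 46 rounded up to a byte boundary = 48
#   WAY_BITS      = log2(4)       = 2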
120 # Example of layout for 32 lines of 64 bytes:
121 layout = """\
122 .. tag |index| line |
123 .. | row | |
124 .. | |---| | ROW_LINE_BITS (3)
125 .. | |--- - --| LINE_OFF_BITS (6)
126 .. | |- --| ROW_OFF_BITS (3)
127 .. |----- ---| | ROW_BITS (8)
128 .. |-----| | INDEX_BITS (5)
129 .. --------| | TAG_BITS (45)
130 """
131 print (layout)
132 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
133 (TAG_BITS, INDEX_BITS, ROW_BITS,
134 ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
135 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
136 print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
137 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
138
139 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
140
141 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
142
143 def CacheTagArray():
144 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
145 for x in range(NUM_LINES))
146
147 def CacheValidBitsArray():
148 return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
149 for x in range(NUM_LINES))
150
151 def RowPerLineValidArray():
152 return Array(Signal(name="rows_valid%d" % x) \
153 for x in range(ROW_PER_LINE))
154
155 # L1 TLB
156 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
157 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
158 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
159 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
160 TLB_PTE_BITS = 64
161 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
162
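# Worked example (a sketch, using the default TLB parameters above):
#   TLB_SET_BITS     = log2(64)      = 6
#   TLB_WAY_BITS     = log2(2)       = 1
#   TLB_EA_TAG_BITS  = 64 - (12 + 6) = 46
#   TLB_TAG_WAY_BITS = 2 * 46        = 92
#   TLB_PTE_WAY_BITS = 2 * 64        = 128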
163 def ispow2(x):
164 return (1<<log2_int(x, False)) == x
165
166 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
167 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
168 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
169 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
170 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
171 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
172 "geometry bits don't add up"
173 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
174 "geometry bits don't add up"
175 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
176 "geometry bits don't add up"
177 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
178 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
179
180
181 def TLBValidBitsArray():
182 return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
183 for x in range(TLB_SET_SIZE))
184
185 def TLBTagEAArray():
186 return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
187 for x in range (TLB_NUM_WAYS))
188
189 def TLBTagsArray():
190 return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
191 for x in range (TLB_SET_SIZE))
192
193 def TLBPtesArray():
194 return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
195 for x in range(TLB_SET_SIZE))
196
197 def HitWaySet():
198 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
199 for x in range(TLB_NUM_WAYS))
200
201 # Cache RAM interface
202 def CacheRamOut():
203 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
204 for x in range(NUM_WAYS))
205
206 # PLRU output interface
207 def PLRUOut():
208 return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
209 for x in range(NUM_LINES))
210
211 # TLB PLRU output interface
212 def TLBPLRUOut():
213 return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
214 for x in range(TLB_SET_SIZE))
215
216 # Helper functions to decode incoming requests
217 #
218 # Return the cache line index (tag index) for an address
219 def get_index(addr):
220 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
221
222 # Return the cache row index (data memory) for an address
223 def get_row(addr):
224 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
225
226 # Return the index of a row within a line
227 def get_row_of_line(row):
228 return row[:ROW_BITS][:ROW_LINE_BITS]
229
230 # Return whether the given address is in the last row of a line
231 def is_last_row_addr(addr, last):
232 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
233
234 # Returns whether this is the last row of a line
235 def is_last_row(row, last):
236 return get_row_of_line(row) == last
237
238 # Return the next row in the current cache line. We use a
239 # dedicated function in order to limit the size of the
240 # generated adder to be only the bits within a cache line
241 # (3 bits with default settings)
242 def next_row(row):
243 row_v = row[0:ROW_LINE_BITS] + 1
244 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
245
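# For example (a sketch, with ROW_LINE_BITS=3): next_row(0b0100111)
# yields 0b0100000 -- the low 3 bits wrap from 7 back to 0 while the
# upper (line-index) bits are left unchanged.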
246 # Get the tag value from the address
247 def get_tag(addr):
248 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
249
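# Worked example (a sketch, using the default geometry and a
# hypothetical real address 0x1234):
#   get_index(0x1234)   -> bits [6:10]  = 8    (cache line index)
#   get_row(0x1234)     -> bits [3:10]  = 70   (BRAM row)
#   get_row_of_line(70) -> low 3 bits   = 6    (row within the line)
#   get_tag(0x1234)     -> bits [10:56] = 0x4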
250 # Read a tag from a tag memory row
251 def read_tag(way, tagset):
252 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
253
254 # Read a TLB tag from a TLB tag memory row
255 def read_tlb_tag(way, tags):
256 return tags.word_select(way, TLB_EA_TAG_BITS)
257
258 # Write a TLB tag to a TLB tag memory row
259 def write_tlb_tag(way, tags, tag):
260 return read_tlb_tag(way, tags).eq(tag)
261
262 # Read a PTE from a TLB PTE memory row
263 def read_tlb_pte(way, ptes):
264 return ptes.word_select(way, TLB_PTE_BITS)
265
266 def write_tlb_pte(way, ptes, newpte):
267 return read_tlb_pte(way, ptes).eq(newpte)
268
269
270 # Record for storing permission, attribute, etc. bits from a PTE
271 class PermAttr(RecordObject):
272 def __init__(self, name=None):
273 super().__init__(name=name)
274 self.reference = Signal()
275 self.changed = Signal()
276 self.nocache = Signal()
277 self.priv = Signal()
278 self.rd_perm = Signal()
279 self.wr_perm = Signal()
280
281
282 def extract_perm_attr(pte):
283 pa = PermAttr()
284 return pa
285
286
287 # Type of operation on a "valid" input
288 @unique
289 class Op(Enum):
290 OP_NONE = 0
291 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
292 OP_STCX_FAIL = 2 # conditional store w/o reservation
293 OP_LOAD_HIT = 3 # Cache hit on load
294 OP_LOAD_MISS = 4 # Load missing cache
295 OP_LOAD_NC = 5 # Non-cachable load
296 OP_STORE_HIT = 6 # Store hitting cache
297 OP_STORE_MISS = 7 # Store missing cache
298
299
300 # Cache state machine
301 @unique
302 class State(Enum):
303 IDLE = 0 # Normal load hit processing
304 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
305 STORE_WAIT_ACK = 2 # Store wait ack
306 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
307
308
309 # Dcache operations:
310 #
311 # In order to make timing, we use the BRAMs with
312 # an output buffer, which means that the BRAM
313 # output is delayed by an extra cycle.
314 #
315 # Thus, the dcache has a 2-stage internal pipeline
316 # for cache hits with no stalls.
317 #
318 # All other operations are handled via stalling
319 # in the first stage.
320 #
321 # The second stage can thus complete a hit at the same
322 # time as the first stage emits a stall for a complex op.
323 #
324 # Stage 0 register, basically contains just the latched request
325
326 class RegStage0(RecordObject):
327 def __init__(self, name=None):
328 super().__init__(name=name)
329 self.req = LoadStore1ToDCacheType(name="lsmem")
330 self.tlbie = Signal()
331 self.doall = Signal()
332 self.tlbld = Signal()
333 self.mmu_req = Signal() # indicates source of request
334
335
336 class MemAccessRequest(RecordObject):
337 def __init__(self, name=None):
338 super().__init__(name=name)
339 self.op = Signal(Op)
340 self.valid = Signal()
341 self.dcbz = Signal()
342 self.real_addr = Signal(REAL_ADDR_BITS)
343 self.data = Signal(64)
344 self.byte_sel = Signal(8)
345 self.hit_way = Signal(WAY_BITS)
346 self.same_tag = Signal()
347 self.mmu_req = Signal()
348
349
350 # First stage register, contains state for stage 1 of load hits
351 # and for the state machine used by all other operations
352 class RegStage1(RecordObject):
353 def __init__(self, name=None):
354 super().__init__(name=name)
355 # Info about the request
356 self.full = Signal() # have uncompleted request
357 self.mmu_req = Signal() # request is from MMU
358 self.req = MemAccessRequest(name="reqmem")
359
360 # Cache hit state
361 self.hit_way = Signal(WAY_BITS)
362 self.hit_load_valid = Signal()
363 self.hit_index = Signal(INDEX_BITS)
364 self.cache_hit = Signal()
365
366 # TLB hit state
367 self.tlb_hit = Signal()
368 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
369 self.tlb_hit_index = Signal(TLB_WAY_BITS)
370
371 # 2-stage data buffer for data forwarded from writes to reads
372 self.forward_data1 = Signal(64)
373 self.forward_data2 = Signal(64)
374 self.forward_sel1 = Signal(8)
375 self.forward_valid1 = Signal()
376 self.forward_way1 = Signal(WAY_BITS)
377 self.forward_row1 = Signal(ROW_BITS)
378 self.use_forward1 = Signal()
379 self.forward_sel = Signal(8)
380
381 # Cache miss state (reload state machine)
382 self.state = Signal(State)
383 self.dcbz = Signal()
384 self.write_bram = Signal()
385 self.write_tag = Signal()
386 self.slow_valid = Signal()
387 self.real_adr = Signal(REAL_ADDR_BITS)
388 self.wb = WBMasterOut("wb")
389 self.reload_tag = Signal(TAG_BITS)
390 self.store_way = Signal(WAY_BITS)
391 self.store_row = Signal(ROW_BITS)
392 self.store_index = Signal(INDEX_BITS)
393 self.end_row_ix = Signal(ROW_LINE_BITS)
394 self.rows_valid = RowPerLineValidArray()
395 self.acks_pending = Signal(3)
396 self.inc_acks = Signal()
397 self.dec_acks = Signal()
398
399 # Signals to complete (possibly with error)
400 self.ls_valid = Signal()
401 self.ls_error = Signal()
402 self.mmu_done = Signal()
403 self.mmu_error = Signal()
404 self.cache_paradox = Signal()
405
406 # Signal to complete a failed stcx.
407 self.stcx_fail = Signal()
408
409
410 # Reservation information
411 class Reservation(RecordObject):
412 def __init__(self):
413 super().__init__()
414 self.valid = Signal()
415 self.addr = Signal(64-LINE_OFF_BITS)
416
417
418 class DTLBUpdate(Elaboratable):
419 def __init__(self):
420 self.tlbie = Signal()
421 self.tlbwe = Signal()
422 self.doall = Signal()
423 self.updated = Signal()
424 self.v_updated = Signal()
425 self.tlb_hit = Signal()
426 self.tlb_req_index = Signal(TLB_SET_BITS)
427
428 self.tlb_hit_way = Signal(TLB_WAY_BITS)
429 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
430 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
431 self.repl_way = Signal(TLB_WAY_BITS)
432 self.eatag = Signal(TLB_EA_TAG_BITS)
433 self.pte_data = Signal(TLB_PTE_BITS)
434
435 self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
436
437 self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
438 self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
439 self.db_out = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
440
441 def elaborate(self, platform):
442 m = Module()
443 comb = m.d.comb
444 sync = m.d.sync
445
446 tagset = Signal(TLB_TAG_WAY_BITS)
447 pteset = Signal(TLB_PTE_WAY_BITS)
448
449 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
450 comb += db_out.eq(self.dv)
451
452 with m.If(self.tlbie & self.doall):
453 pass # clear all back in parent
454 with m.Elif(self.tlbie):
455 with m.If(self.tlb_hit):
456 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0) # invalidate hit way
457 comb += self.v_updated.eq(1)
458
459 with m.Elif(self.tlbwe):
460
461 comb += tagset.eq(self.tlb_tag_way)
462 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
463 comb += tb_out.eq(tagset)
464
465 comb += pteset.eq(self.tlb_pte_way)
466 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
467 comb += pb_out.eq(pteset)
468
469 comb += db_out.bit_select(self.repl_way, 1).eq(1)
470
471 comb += self.updated.eq(1)
472 comb += self.v_updated.eq(1)
473
474 return m
475
476
477 class DCachePendingHit(Elaboratable):
478
479 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
480 cache_valid_idx, cache_tag_set,
481 req_addr,
482 hit_set):
483
484 self.go = Signal()
485 self.virt_mode = Signal()
486 self.is_hit = Signal()
487 self.tlb_hit = Signal()
488 self.hit_way = Signal(WAY_BITS)
489 self.rel_match = Signal()
490 self.req_index = Signal(INDEX_BITS)
491 self.reload_tag = Signal(TAG_BITS)
492
493 self.tlb_hit_way = tlb_hit_way
494 self.tlb_pte_way = tlb_pte_way
495 self.tlb_valid_way = tlb_valid_way
496 self.cache_valid_idx = cache_valid_idx
497 self.cache_tag_set = cache_tag_set
498 self.req_addr = req_addr
499 self.hit_set = hit_set
500
501 def elaborate(self, platform):
502 m = Module()
503 comb = m.d.comb
504 sync = m.d.sync
505
506 go = self.go
507 virt_mode = self.virt_mode
508 is_hit = self.is_hit
509 tlb_pte_way = self.tlb_pte_way
510 tlb_valid_way = self.tlb_valid_way
511 cache_valid_idx = self.cache_valid_idx
512 cache_tag_set = self.cache_tag_set
513 req_addr = self.req_addr
514 tlb_hit_way = self.tlb_hit_way
515 tlb_hit = self.tlb_hit
516 hit_set = self.hit_set
517 hit_way = self.hit_way
518 rel_match = self.rel_match
519 req_index = self.req_index
520 reload_tag = self.reload_tag
521
522 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
523 for i in range(TLB_NUM_WAYS))
524 hit_way_set = HitWaySet()
525
526 # Test if pending request is a hit on any way
527 # In order to make timing in virtual mode,
528 # when we are using the TLB, we compare each
529 # way with each of the real addresses from each way of
530 # the TLB, and then decide later which match to use.
531
532 with m.If(virt_mode):
533 for j in range(TLB_NUM_WAYS): # tlb_num_way_t
534 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
535 s_hit = Signal()
536 s_pte = Signal(TLB_PTE_BITS)
537 s_ra = Signal(REAL_ADDR_BITS)
538 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
539 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
540 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
541 comb += s_tag.eq(get_tag(s_ra))
542
543 for i in range(NUM_WAYS): # way_t
544 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
545 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
546 (read_tag(i, cache_tag_set) == s_tag)
547 & tlb_valid_way[j])
548 with m.If(is_tag_hit):
549 comb += hit_way_set[j].eq(i)
550 comb += s_hit.eq(1)
551 comb += hit_set[j].eq(s_hit)
552 with m.If(s_tag == reload_tag):
553 comb += rel_matches[j].eq(1)
554 with m.If(tlb_hit):
555 comb += is_hit.eq(hit_set[tlb_hit_way])
556 comb += hit_way.eq(hit_way_set[tlb_hit_way])
557 comb += rel_match.eq(rel_matches[tlb_hit_way])
558 with m.Else():
559 s_tag = Signal(TAG_BITS)
560 comb += s_tag.eq(get_tag(req_addr))
561 for i in range(NUM_WAYS): # way_t
562 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
563 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
564 (read_tag(i, cache_tag_set) == s_tag))
565 with m.If(is_tag_hit):
566 comb += hit_way.eq(i)
567 comb += is_hit.eq(1)
568 with m.If(s_tag == reload_tag):
569 comb += rel_match.eq(1)
570
571 return m
572
573
574 class DCache(Elaboratable):
575 """Set associative dcache write-through
576 TODO (in no specific order):
577 * See list in icache.vhdl
578 * Complete load misses on the cycle when WB data comes instead of
579 at the end of line (this requires dealing with requests coming in
580 while not idle...)
581 """
582 def __init__(self):
583 self.d_in = LoadStore1ToDCacheType("d_in")
584 self.d_out = DCacheToLoadStore1Type("d_out")
585
586 self.m_in = MMUToDCacheType("m_in")
587 self.m_out = DCacheToMMUType("m_out")
588
589 self.stall_out = Signal()
590
591 self.wb_out = WBMasterOut()
592 self.wb_in = WBSlaveOut()
593
594 self.log_out = Signal(20)
595
596 def stage_0(self, m, r0, r1, r0_full):
597 """Latch the request in r0.req as long as we're not stalling
598 """
599 comb = m.d.comb
600 sync = m.d.sync
601 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
602
603 r = RegStage0("stage0")
604
605 # TODO, this goes in unit tests and formal proofs
606 with m.If(d_in.valid & m_in.valid):
607 sync += Display("request collision loadstore vs MMU")
608
609 with m.If(m_in.valid):
610 comb += r.req.valid.eq(1)
611 comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
612 comb += r.req.dcbz.eq(0)
613 comb += r.req.nc.eq(0)
614 comb += r.req.reserve.eq(0)
615 comb += r.req.virt_mode.eq(0)
616 comb += r.req.priv_mode.eq(1)
617 comb += r.req.addr.eq(m_in.addr)
618 comb += r.req.data.eq(m_in.pte)
619 comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
620 comb += r.tlbie.eq(m_in.tlbie)
621 comb += r.doall.eq(m_in.doall)
622 comb += r.tlbld.eq(m_in.tlbld)
623 comb += r.mmu_req.eq(1)
624 with m.Else():
625 comb += r.req.eq(d_in)
626 comb += r.tlbie.eq(0)
627 comb += r.doall.eq(0)
628 comb += r.tlbld.eq(0)
629 comb += r.mmu_req.eq(0)
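# Advance the stage-0 latch unless both r0 and r1 already hold
# uncompleted requests (the same condition as r0_stall in elaborate).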
630 with m.If(~(r1.full & r0_full)):
631 sync += r0.eq(r)
632 sync += r0_full.eq(r.req.valid)
633
634 def tlb_read(self, m, r0_stall, tlb_valid_way,
635 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
636 dtlb_tags, dtlb_ptes):
637 """TLB
638 Operates in the second cycle on the request latched in r0.req.
639 TLB updates write the entry at the end of the second cycle.
640 """
641 comb = m.d.comb
642 sync = m.d.sync
643 m_in, d_in = self.m_in, self.d_in
644
645 index = Signal(TLB_SET_BITS)
646 addrbits = Signal(TLB_SET_BITS)
647
648 amin = TLB_LG_PGSZ
649 amax = TLB_LG_PGSZ + TLB_SET_BITS
650
651 with m.If(m_in.valid):
652 comb += addrbits.eq(m_in.addr[amin : amax])
653 with m.Else():
654 comb += addrbits.eq(d_in.addr[amin : amax])
655 comb += index.eq(addrbits)
656
657 # If we have any op and the previous op isn't finished,
658 # then keep the same output for next cycle.
659 with m.If(~r0_stall):
660 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
661 sync += tlb_tag_way.eq(dtlb_tags[index])
662 sync += tlb_pte_way.eq(dtlb_ptes[index])
663
664 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
665 """Generate TLB PLRUs
666 """
667 comb = m.d.comb
668 sync = m.d.sync
669
670 if TLB_NUM_WAYS == 0:
671 return
672 for i in range(TLB_SET_SIZE):
673 # TLB PLRU interface
674 tlb_plru = PLRU(TLB_WAY_BITS)
675 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
676 tlb_plru_acc_en = Signal()
677
678 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
679 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
680 comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
681 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
682
683 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
684 tlb_valid_way, tlb_tag_way, tlb_hit_way,
685 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
686
687 comb = m.d.comb
688
689 hitway = Signal(TLB_WAY_BITS)
690 hit = Signal()
691 eatag = Signal(TLB_EA_TAG_BITS)
692
693 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
694 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
695 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
696
697 for i in range(TLB_NUM_WAYS):
698 is_tag_hit = Signal()
699 comb += is_tag_hit.eq(tlb_valid_way[i]
700 & (read_tlb_tag(i, tlb_tag_way) == eatag))
701 with m.If(is_tag_hit):
702 comb += hitway.eq(i)
703 comb += hit.eq(1)
704
705 comb += tlb_hit.eq(hit & r0_valid)
706 comb += tlb_hit_way.eq(hitway)
707
708 with m.If(tlb_hit):
709 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
710 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
711
712 with m.If(r0.req.virt_mode):
713 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
714 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
715 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
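# i.e. the real address keeps the page offset from the effective
# address (low TLB_LG_PGSZ bits, rounded down to a row boundary) and
# takes its upper bits from the real page number in the PTE.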
716 comb += perm_attr.reference.eq(pte[8])
717 comb += perm_attr.changed.eq(pte[7])
718 comb += perm_attr.nocache.eq(pte[5])
719 comb += perm_attr.priv.eq(pte[3])
720 comb += perm_attr.rd_perm.eq(pte[2])
721 comb += perm_attr.wr_perm.eq(pte[1])
722 with m.Else():
723 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
724 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
725 comb += perm_attr.reference.eq(1)
726 comb += perm_attr.changed.eq(1)
727 comb += perm_attr.nocache.eq(0)
728 comb += perm_attr.priv.eq(1)
729 comb += perm_attr.rd_perm.eq(1)
730 comb += perm_attr.wr_perm.eq(1)
731
732 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
733 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
734 dtlb_tags, tlb_pte_way, dtlb_ptes):
735
736 dtlb_valids = TLBValidBitsArray()
737
738 comb = m.d.comb
739 sync = m.d.sync
740
741 tlbie = Signal()
742 tlbwe = Signal()
743
744 comb += tlbie.eq(r0_valid & r0.tlbie)
745 comb += tlbwe.eq(r0_valid & r0.tlbld)
746
747 m.submodules.tlb_update = d = DTLBUpdate()
748 with m.If(tlbie & r0.doall):
749 # clear all valid bits at once
750 for i in range(TLB_SET_SIZE):
751 sync += dtlb_valid_bits[i].eq(0)
752 with m.If(d.updated):
753 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
754 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
755 with m.If(d.v_updated):
756 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
757
758 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
759
760 comb += d.tlbie.eq(tlbie)
761 comb += d.tlbwe.eq(tlbwe)
762 comb += d.doall.eq(r0.doall)
763 comb += d.tlb_hit.eq(tlb_hit)
764 comb += d.tlb_hit_way.eq(tlb_hit_way)
765 comb += d.tlb_tag_way.eq(tlb_tag_way)
766 comb += d.tlb_pte_way.eq(tlb_pte_way)
767 comb += d.tlb_req_index.eq(tlb_req_index)
768
769 with m.If(tlb_hit):
770 comb += d.repl_way.eq(tlb_hit_way)
771 with m.Else():
772 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
773 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
774 comb += d.pte_data.eq(r0.req.data)
775
776 def maybe_plrus(self, m, r1, plru_victim):
777 """Generate PLRUs
778 """
779 comb = m.d.comb
780 sync = m.d.sync
781
782 if NUM_WAYS == 0:
783 return
784
785 for i in range(NUM_LINES):
786 # PLRU interface
787 plru = PLRU(WAY_BITS)
788 setattr(m.submodules, "plru%d" % i, plru)
789 plru_acc_en = Signal()
790
791 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
792 comb += plru.acc_en.eq(plru_acc_en)
793 comb += plru.acc_i.eq(r1.hit_way)
794 comb += plru_victim[i].eq(plru.lru_o)
795
796 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
797 """Cache tag RAM read port
798 """
799 comb = m.d.comb
800 sync = m.d.sync
801 m_in, d_in = self.m_in, self.d_in
802
803 index = Signal(INDEX_BITS)
804
805 with m.If(r0_stall):
806 comb += index.eq(req_index)
807 with m.Elif(m_in.valid):
808 comb += index.eq(get_index(m_in.addr))
809 with m.Else():
810 comb += index.eq(get_index(d_in.addr))
811 sync += cache_tag_set.eq(cache_tags[index])
812
813 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
814 r0_valid, r1, cache_valids, replace_way,
815 use_forward1_next, use_forward2_next,
816 req_hit_way, plru_victim, rc_ok, perm_attr,
817 valid_ra, perm_ok, access_ok, req_op, req_go,
818 tlb_pte_way,
819 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
820 cancel_store, req_same_tag, r0_stall, early_req_row):
821 """Cache request parsing and hit detection
822 """
823
824 comb = m.d.comb
825 sync = m.d.sync
826 m_in, d_in = self.m_in, self.d_in
827
828 is_hit = Signal()
829 hit_way = Signal(WAY_BITS)
830 op = Signal(Op)
831 opsel = Signal(3)
832 go = Signal()
833 nc = Signal()
834 hit_set = Array(Signal(name="hit_set_%d" % i) \
835 for i in range(TLB_NUM_WAYS))
836 cache_valid_idx = Signal(NUM_WAYS)
837
838 # Extract line, row and tag from request
839 comb += req_index.eq(get_index(r0.req.addr))
840 comb += req_row.eq(get_row(r0.req.addr))
841 comb += req_tag.eq(get_tag(ra))
842
843 if False: # display on comb is a bit... busy.
844 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
845 r0.req.addr, ra, req_index, req_tag, req_row)
846
847 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
848 comb += cache_valid_idx.eq(cache_valids[req_index])
849
850 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
851 tlb_valid_way, tlb_hit_way,
852 cache_valid_idx, cache_tag_set,
853 r0.req.addr,
854 hit_set)
855
856 comb += dc.tlb_hit.eq(tlb_hit)
857 comb += dc.reload_tag.eq(r1.reload_tag)
858 comb += dc.virt_mode.eq(r0.req.virt_mode)
859 comb += dc.go.eq(go)
860 comb += dc.req_index.eq(req_index)
861 comb += is_hit.eq(dc.is_hit)
862 comb += hit_way.eq(dc.hit_way)
863 comb += req_same_tag.eq(dc.rel_match)
864
865 # See if the request matches the line currently being reloaded
866 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
867 (req_index == r1.store_index) & req_same_tag):
868 # For a store, consider this a hit even if the row isn't
869 # valid since it will be by the time we perform the store.
870 # For a load, check the appropriate row valid bit.
871 rrow = Signal(ROW_LINE_BITS)
872 comb += rrow.eq(req_row)
873 valid = r1.rows_valid[rrow]
874 comb += is_hit.eq((~r0.req.load) | valid)
875 comb += hit_way.eq(replace_way)
876
877 # Whether to use forwarded data for a load or not
878 with m.If((get_row(r1.req.real_addr) == req_row) &
879 (r1.req.hit_way == hit_way)):
880 # Only need to consider r1.write_bram here, since if we
881 # are writing refill data here, then we don't have a
882 # cache hit this cycle on the line being refilled.
883 # (There is the possibility that the load following the
884 # load miss that started the refill could be to the old
885 # contents of the victim line, since it is a couple of
886 # cycles after the refill starts before we see the updated
887 # cache tag. In that case we don't use the bypass.)
888 comb += use_forward1_next.eq(r1.write_bram)
889 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
890 comb += use_forward2_next.eq(r1.forward_valid1)
891
892 # The way that matched on a hit
893 comb += req_hit_way.eq(hit_way)
894
895 # The way to replace on a miss
896 with m.If(r1.write_tag):
897 comb += replace_way.eq(plru_victim[r1.store_index])
898 with m.Else():
899 comb += replace_way.eq(r1.store_way)
900
901 # work out whether we have permission for this access
902 # NB we don't yet implement AMR, thus no KUAP
903 comb += rc_ok.eq(perm_attr.reference
904 & (r0.req.load | perm_attr.changed))
905 comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
906 (perm_attr.wr_perm |
907 (r0.req.load & perm_attr.rd_perm)))
908 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
909 # Combine the request and cache hit status to decide what
910 # operation needs to be done
911 comb += nc.eq(r0.req.nc | perm_attr.nocache)
912 comb += op.eq(Op.OP_NONE)
913 with m.If(go):
914 with m.If(~access_ok):
915 comb += op.eq(Op.OP_BAD)
916 with m.Elif(cancel_store):
917 comb += op.eq(Op.OP_STCX_FAIL)
918 with m.Else():
919 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
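# opsel is Cat(is_hit, nc, load): bit 0 = cache hit, bit 1 =
# non-cacheable, bit 2 = load (0 means store). A non-cacheable
# access that nonetheless hits in the cache (0b011, 0b111) is a
# cache paradox and is treated as OP_BAD.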
920 with m.Switch(opsel):
921 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
922 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
923 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
924 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
925 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
926 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
927 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
928 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
929 comb += req_op.eq(op)
930 comb += req_go.eq(go)
931
932 # Version of the row number that is valid one cycle earlier
933 # in the cases where we need to read the cache data BRAM.
934 # If we're stalling then we need to keep reading the last
935 # row requested.
936 with m.If(~r0_stall):
937 with m.If(m_in.valid):
938 comb += early_req_row.eq(get_row(m_in.addr))
939 with m.Else():
940 comb += early_req_row.eq(get_row(d_in.addr))
941 with m.Else():
942 comb += early_req_row.eq(req_row)
943
944 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
945 r0_valid, r0, reservation):
946 """Handle load-with-reservation and store-conditional instructions
947 """
948 comb = m.d.comb
949
950 with m.If(r0_valid & r0.req.reserve):
951 # XXX generate alignment interrupt if address
952 # is not aligned XXX or if r0.req.nc = '1'
953 with m.If(r0.req.load):
954 comb += set_rsrv.eq(1) # load with reservation
955 with m.Else():
956 comb += clear_rsrv.eq(1) # store conditional
957 with m.If((~reservation.valid) |
958 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
959 comb += cancel_store.eq(1)
960
961 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
962 reservation, r0):
963
964 comb = m.d.comb
965 sync = m.d.sync
966
967 with m.If(r0_valid & access_ok):
968 with m.If(clear_rsrv):
969 sync += reservation.valid.eq(0)
970 with m.Elif(set_rsrv):
971 sync += reservation.valid.eq(1)
972 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
973
974 def writeback_control(self, m, r1, cache_out_row):
975 """Return data for loads & completion control logic
976 """
977 comb = m.d.comb
978 sync = m.d.sync
979 d_out, m_out = self.d_out, self.m_out
980
981 data_out = Signal(64)
982 data_fwd = Signal(64)
983
984 # Use the bypass if are reading the row that was
985 # written 1 or 2 cycles ago, including for the
986 # slow_valid = 1 case (i.e. completing a load
987 # miss or a non-cacheable load).
988 with m.If(r1.use_forward1):
989 comb += data_fwd.eq(r1.forward_data1)
990 with m.Else():
991 comb += data_fwd.eq(r1.forward_data2)
992
993 comb += data_out.eq(cache_out_row)
994
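# Byte-by-byte merge: wherever forward_sel[i] is set, take that byte
# from the forwarded store data instead of the (older) BRAM output.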
995 for i in range(8):
996 with m.If(r1.forward_sel[i]):
997 dsel = data_fwd.word_select(i, 8)
998 comb += data_out.word_select(i, 8).eq(dsel)
999
1000 comb += d_out.valid.eq(r1.ls_valid)
1001 comb += d_out.data.eq(data_out)
1002 comb += d_out.store_done.eq(~r1.stcx_fail)
1003 comb += d_out.error.eq(r1.ls_error)
1004 comb += d_out.cache_paradox.eq(r1.cache_paradox)
1005
1006 # Outputs to MMU
1007 comb += m_out.done.eq(r1.mmu_done)
1008 comb += m_out.err.eq(r1.mmu_error)
1009 comb += m_out.data.eq(data_out)
1010
1011 # We have a valid load or store hit or we just completed
1012 # a slow op such as a load miss, a NC load or a store
1013 #
1014 # Note: the load hit is delayed by one cycle. However it
1015 # can still not collide with r.slow_valid (well unless I
1016 # miscalculated) because slow_valid can only be set on a
1017 # subsequent request and not on its first cycle (the state
1018 # machine must have advanced), which makes slow_valid
1019 # at least 2 cycles from the previous hit_load_valid.
1020
1021 # Sanity: Only one of these must be set in any given cycle
1022
1023 if False: # TODO: need Display to get this to work
1024 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1025 "unexpected slow_valid collision with stcx_fail"
1026
1027 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1028 "unexpected hit_load_delayed collision with slow_valid"
1029
1030 with m.If(~r1.mmu_req):
1031 # Request came from loadstore1...
1032 # Load hit case is the standard path
1033 with m.If(r1.hit_load_valid):
1034 sync += Display("completing load hit data=%x", data_out)
1035
1036 # error cases complete without stalling
1037 with m.If(r1.ls_error):
1038 sync += Display("completing ld/st with error")
1039
1040 # Slow ops (load miss, NC, stores)
1041 with m.If(r1.slow_valid):
1042 sync += Display("completing store or load miss adr=%x data=%x",
1043 r1.req.real_addr, data_out)
1044
1045 with m.Else():
1046 # Request came from MMU
1047 with m.If(r1.hit_load_valid):
1048 sync += Display("completing load hit to MMU, data=%x",
1049 m_out.data)
1050 # error cases complete without stalling
1051 with m.If(r1.mmu_error):
1052 sync += Display("completing MMU ld with error")
1053
1054 # Slow ops (i.e. load miss)
1055 with m.If(r1.slow_valid):
1056 sync += Display("completing MMU load miss, data=%x",
1057 m_out.data)
1058
1059 def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1060 """rams
1061 Generate a cache RAM for each way. This handles the normal
1062 reads, writes from reloads and the special store-hit update
1063 path as well.
1064
1065 Note: the BRAMs have an extra read buffer, meaning the output
1066 is pipelined an extra cycle. This differs from the
1067 icache. The writeback logic needs to take that into
1068 account by using 1-cycle delayed signals for load hits.
1069 """
1070 comb = m.d.comb
1071 wb_in = self.wb_in
1072
1073 for i in range(NUM_WAYS):
1074 do_read = Signal(name="do_rd%d" % i)
1075 rd_addr = Signal(ROW_BITS)
1076 do_write = Signal(name="do_wr%d" % i)
1077 wr_addr = Signal(ROW_BITS)
1078 wr_data = Signal(WB_DATA_BITS)
1079 wr_sel = Signal(ROW_SIZE)
1080 wr_sel_m = Signal(ROW_SIZE)
1081 _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1082
1083 way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
1084 setattr(m.submodules, "cacheram_%d" % i, way)
1085
1086 comb += way.rd_en.eq(do_read)
1087 comb += way.rd_addr.eq(rd_addr)
1088 comb += _d_out.eq(way.rd_data_o)
1089 comb += way.wr_sel.eq(wr_sel_m)
1090 comb += way.wr_addr.eq(wr_addr)
1091 comb += way.wr_data.eq(wr_data)
1092
1093 # Cache hit reads
1094 comb += do_read.eq(1)
1095 comb += rd_addr.eq(early_req_row)
1096 with m.If(r1.hit_way == i):
1097 comb += cache_out_row.eq(_d_out)
1098
1099 # Write mux:
1100 #
1101 # Defaults to wishbone read responses (cache refill)
1102 #
1103 # For timing, the mux on wr_data/sel/addr is not
1104 # dependent on anything other than the current state.
1105
1106 with m.If(r1.write_bram):
1107 # Write store data to BRAM. This happens one
1108 # cycle after the store is in r0.
1109 comb += wr_data.eq(r1.req.data)
1110 comb += wr_sel.eq(r1.req.byte_sel)
1111 comb += wr_addr.eq(get_row(r1.req.real_addr))
1112
1113 with m.If(i == r1.req.hit_way):
1114 comb += do_write.eq(1)
1115 with m.Else():
1116 # Otherwise, we might be doing a reload or a DCBZ
1117 with m.If(r1.dcbz):
1118 comb += wr_data.eq(0)
1119 with m.Else():
1120 comb += wr_data.eq(wb_in.dat)
1121 comb += wr_addr.eq(r1.store_row)
1122 comb += wr_sel.eq(~0) # all 1s
1123
1124 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1125 & wb_in.ack & (replace_way == i)):
1126 comb += do_write.eq(1)
1127
1128 # Mask write selects with do_write since BRAM
1129 # doesn't have a global write-enable
1130 with m.If(do_write):
1131 comb += wr_sel_m.eq(wr_sel)
1132
1133 # Cache hit synchronous machine for the easy case.
1134 # This handles load hits.
1135 # It also handles error cases (TLB miss, cache paradox)
1136 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1137 req_hit_way, req_index, req_tag, access_ok,
1138 tlb_hit, tlb_hit_way, tlb_req_index):
1139
1140 comb = m.d.comb
1141 sync = m.d.sync
1142
1143 with m.If(req_op != Op.OP_NONE):
1144 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1145 req_op, r0.req.addr, r0.req.nc,
1146 req_index, req_tag, req_hit_way)
1147
1148 with m.If(r0_valid):
1149 sync += r1.mmu_req.eq(r0.mmu_req)
1150
1151 # Fast path for load/store hits.
1152 # Set signals for the writeback controls.
1153 sync += r1.hit_way.eq(req_hit_way)
1154 sync += r1.hit_index.eq(req_index)
1155
1156 with m.If(req_op == Op.OP_LOAD_HIT):
1157 sync += r1.hit_load_valid.eq(1)
1158 with m.Else():
1159 sync += r1.hit_load_valid.eq(0)
1160
1161 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1162 sync += r1.cache_hit.eq(1)
1163 with m.Else():
1164 sync += r1.cache_hit.eq(0)
1165
1166 with m.If(req_op == Op.OP_BAD):
1167 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1168 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1169 sync += r1.ls_error.eq(~r0.mmu_req)
1170 sync += r1.mmu_error.eq(r0.mmu_req)
1171 sync += r1.cache_paradox.eq(access_ok)
1172
1173 with m.Else():
1174 sync += r1.ls_error.eq(0)
1175 sync += r1.mmu_error.eq(0)
1176 sync += r1.cache_paradox.eq(0)
1177
1178 with m.If(req_op == Op.OP_STCX_FAIL):
1179 sync += r1.stcx_fail.eq(1)
1180 with m.Else():
1181 sync += r1.stcx_fail.eq(0)
1182
1183 # Record TLB hit information for updating TLB PLRU
1184 sync += r1.tlb_hit.eq(tlb_hit)
1185 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1186 sync += r1.tlb_hit_index.eq(tlb_req_index)
1187
1188 # Memory accesses are handled by this state machine:
1189 #
1190 # * Cache load miss/reload (in conjunction with "rams")
1191 # * Load hits for non-cachable forms
1192 # * Stores (the collision case is handled in "rams")
1193 #
1194 # All wishbone requests generation is done here.
1195 # This machine operates at stage 1.
1196 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1197 cache_valids, r0, replace_way,
1198 req_hit_way, req_same_tag,
1199 r0_valid, req_op, cache_tags, req_go, ra):
1200
1201 comb = m.d.comb
1202 sync = m.d.sync
1203 wb_in = self.wb_in
1204
1205 req = MemAccessRequest("mreq_ds")
1206
1207 req_row = Signal(ROW_BITS)
1208 req_idx = Signal(INDEX_BITS)
1209 req_tag = Signal(TAG_BITS)
1210 comb += req_idx.eq(get_index(req.real_addr))
1211 comb += req_row.eq(get_row(req.real_addr))
1212 comb += req_tag.eq(get_tag(req.real_addr))
1213
1214 sync += r1.use_forward1.eq(use_forward1_next)
1215 sync += r1.forward_sel.eq(0)
1216
1217 with m.If(use_forward1_next):
1218 sync += r1.forward_sel.eq(r1.req.byte_sel)
1219 with m.Elif(use_forward2_next):
1220 sync += r1.forward_sel.eq(r1.forward_sel1)
1221
1222 sync += r1.forward_data2.eq(r1.forward_data1)
1223 with m.If(r1.write_bram):
1224 sync += r1.forward_data1.eq(r1.req.data)
1225 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1226 sync += r1.forward_way1.eq(r1.req.hit_way)
1227 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1228 sync += r1.forward_valid1.eq(1)
1229 with m.Else():
1230 with m.If(r1.dcbz):
1231 sync += r1.forward_data1.eq(0)
1232 with m.Else():
1233 sync += r1.forward_data1.eq(wb_in.dat)
1234 sync += r1.forward_sel1.eq(~0) # all 1s
1235 sync += r1.forward_way1.eq(replace_way)
1236 sync += r1.forward_row1.eq(r1.store_row)
1237 sync += r1.forward_valid1.eq(0)
1238
1239 # One cycle pulses reset
1240 sync += r1.slow_valid.eq(0)
1241 sync += r1.write_bram.eq(0)
1242 sync += r1.inc_acks.eq(0)
1243 sync += r1.dec_acks.eq(0)
1244
1245 sync += r1.ls_valid.eq(0)
1246 # complete tlbies and TLB loads in the third cycle
1247 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1248
1249 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1250 with m.If(~r0.mmu_req):
1251 sync += r1.ls_valid.eq(1)
1252 with m.Else():
1253 sync += r1.mmu_done.eq(1)
1254
1255 with m.If(r1.write_tag):
1256 # Store new tag in selected way
1257 for i in range(NUM_WAYS):
1258 with m.If(i == replace_way):
1259 ct = Signal(TAG_RAM_WIDTH)
1260 comb += ct.eq(cache_tags[r1.store_index])
1261 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1262 sync += cache_tags[r1.store_index].eq(ct)
1263 sync += r1.store_way.eq(replace_way)
1264 sync += r1.write_tag.eq(0)
1265
1266 # Take request from r1.req if there is one there,
1267 # else from req_op, ra, etc.
1268 with m.If(r1.full):
1269 comb += req.eq(r1.req)
1270 with m.Else():
1271 comb += req.op.eq(req_op)
1272 comb += req.valid.eq(req_go)
1273 comb += req.mmu_req.eq(r0.mmu_req)
1274 comb += req.dcbz.eq(r0.req.dcbz)
1275 comb += req.real_addr.eq(ra)
1276
1277 with m.If(~r0.req.dcbz):
1278 comb += req.data.eq(r0.req.data)
1279 with m.Else():
1280 comb += req.data.eq(0)
1281
1282 # Select all bytes for dcbz
1283 # and for cacheable loads
1284 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1285 comb += req.byte_sel.eq(~0) # all 1s
1286 with m.Else():
1287 comb += req.byte_sel.eq(r0.req.byte_sel)
1288 comb += req.hit_way.eq(req_hit_way)
1289 comb += req.same_tag.eq(req_same_tag)
1290
1291 # Store the incoming request from r0,
1292 # if it is a slow request
1293 # Note that r1.full = 1 implies req_op = OP_NONE
1294 with m.If((req_op == Op.OP_LOAD_MISS)
1295 | (req_op == Op.OP_LOAD_NC)
1296 | (req_op == Op.OP_STORE_MISS)
1297 | (req_op == Op.OP_STORE_HIT)):
1298 sync += r1.req.eq(req)
1299 sync += r1.full.eq(1)
1300
1301 # Main state machine
1302 with m.Switch(r1.state):
1303
1304 with m.Case(State.IDLE):
1305 sync += r1.real_adr.eq(req.real_addr)
1306 sync += r1.wb.sel.eq(req.byte_sel)
1307 sync += r1.wb.dat.eq(req.data)
1308 sync += r1.dcbz.eq(req.dcbz)
1309
1310 # Keep track of our index and way
1311 # for subsequent stores.
1312 sync += r1.store_index.eq(req_idx)
1313 sync += r1.store_row.eq(req_row)
1314 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1315 sync += r1.reload_tag.eq(req_tag)
1316 sync += r1.req.same_tag.eq(1)
1317
1318 with m.If(req.op == Op.OP_STORE_HIT):
1319 sync += r1.store_way.eq(req.hit_way)
1320
1321 # Reset per-row valid bits,
1322 # ready for handling OP_LOAD_MISS
1323 for i in range(ROW_PER_LINE):
1324 sync += r1.rows_valid[i].eq(0)
1325
1326 with m.If(req_op != Op.OP_NONE):
1327 sync += Display("cache op %d", req.op)
1328
1329 with m.Switch(req.op):
1330 with m.Case(Op.OP_LOAD_HIT):
1331 # stay in IDLE state
1332 pass
1333
1334 with m.Case(Op.OP_LOAD_MISS):
1335 sync += Display("cache miss real addr: %x " \
1336 "idx: %x tag: %x",
1337 req.real_addr, req_row, req_tag)
1338
1339 # Start the wishbone cycle
1340 sync += r1.wb.we.eq(0)
1341 sync += r1.wb.cyc.eq(1)
1342 sync += r1.wb.stb.eq(1)
1343
1344 # Track that we had one request sent
1345 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1346 sync += r1.write_tag.eq(1)
1347
1348 with m.Case(Op.OP_LOAD_NC):
1349 sync += r1.wb.cyc.eq(1)
1350 sync += r1.wb.stb.eq(1)
1351 sync += r1.wb.we.eq(0)
1352 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1353
1354 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1355 with m.If(~req.dcbz):
1356 sync += r1.state.eq(State.STORE_WAIT_ACK)
1357 sync += r1.acks_pending.eq(1)
1358 sync += r1.full.eq(0)
1359 sync += r1.slow_valid.eq(1)
1360
1361 with m.If(~req.mmu_req):
1362 sync += r1.ls_valid.eq(1)
1363 with m.Else():
1364 sync += r1.mmu_done.eq(1)
1365
1366 with m.If(req.op == Op.OP_STORE_HIT):
1367 sync += r1.write_bram.eq(1)
1368 with m.Else():
1369 # dcbz is handled much like a load miss except
1370 # that we are writing to memory instead of reading
1371 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1372
1373 with m.If(req.op == Op.OP_STORE_MISS):
1374 sync += r1.write_tag.eq(1)
1375
1376 sync += r1.wb.we.eq(1)
1377 sync += r1.wb.cyc.eq(1)
1378 sync += r1.wb.stb.eq(1)
1379
1380 # OP_NONE and OP_BAD do nothing
1381 # OP_BAD & OP_STCX_FAIL were
1382 # handled above already
1383 with m.Case(Op.OP_NONE):
1384 pass
1385 with m.Case(Op.OP_BAD):
1386 pass
1387 with m.Case(Op.OP_STCX_FAIL):
1388 pass
1389
1390 with m.Case(State.RELOAD_WAIT_ACK):
1391 ld_stbs_done = Signal()
1392 # Requests are all sent if stb is 0
1393 comb += ld_stbs_done.eq(~r1.wb.stb)
1394
1395 with m.If((~wb_in.stall) & r1.wb.stb):
1396 # That was the last word? We are done sending.
1397 # Clear stb and set ld_stbs_done so we can handle an
1398 # eventual last ack on the same cycle.
1399 with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
1400 sync += r1.wb.stb.eq(0)
1401 comb += ld_stbs_done.eq(1)
1402
1403 # Calculate the next row address in the current cache line
1404 row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1405 comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
1406 sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
1407
1408 # Incoming acks processing
1409 sync += r1.forward_valid1.eq(wb_in.ack)
1410 with m.If(wb_in.ack):
1411 srow = Signal(ROW_LINE_BITS)
1412 comb += srow.eq(r1.store_row)
1413 sync += r1.rows_valid[srow].eq(1)
1414
1415 # If this is the data we were looking for,
1416 # we can complete the request next cycle.
1417 # Compare the whole address in case the
1418 # request in r1.req is not the one that
1419 # started this refill.
1420 with m.If(r1.full & r1.req.same_tag &
1421 ((r1.dcbz & r1.req.dcbz) |
1422 ((~r1.dcbz) & (r1.req.op == Op.OP_LOAD_MISS))) &
1423 (r1.store_row == get_row(r1.req.real_addr))):
1424 sync += r1.full.eq(0)
1425 sync += r1.slow_valid.eq(1)
1426 with m.If(~r1.mmu_req):
1427 sync += r1.ls_valid.eq(1)
1428 with m.Else():
1429 sync += r1.mmu_done.eq(1)
1430 sync += r1.forward_sel.eq(~0) # all 1s
1431 sync += r1.use_forward1.eq(1)
1432
1433 # Check for completion
1434 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1435 r1.end_row_ix)):
1436 # Complete wishbone cycle
1437 sync += r1.wb.cyc.eq(0)
1438
1439 # Cache line is now valid
1440 cv = Signal(NUM_WAYS)
1441 comb += cv.eq(cache_valids[r1.store_index])
1442 comb += cv.bit_select(r1.store_way, 1).eq(1)
1443 sync += cache_valids[r1.store_index].eq(cv)
1444
1445 sync += r1.state.eq(State.IDLE)
1446
1447 # Increment store row counter
1448 sync += r1.store_row.eq(next_row(r1.store_row))
1449
1450 with m.Case(State.STORE_WAIT_ACK):
1451 st_stbs_done = Signal()
1452 acks = Signal(3)
1453 adjust_acks = Signal(3)
1454
1455 comb += st_stbs_done.eq(~r1.wb.stb)
1456 comb += acks.eq(r1.acks_pending)
1457
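# acks_pending counts stores issued on the wishbone but not yet
# acknowledged: it is bumped when another store is merged into the
# open cycle (inc_acks) and dropped on each ack (dec_acks); the
# cycle is closed once the final ack arrives with no strobe pending.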
1458 with m.If(r1.inc_acks != r1.dec_acks):
1459 with m.If(r1.inc_acks):
1460 comb += adjust_acks.eq(acks + 1)
1461 with m.Else():
1462 comb += adjust_acks.eq(acks - 1)
1463 with m.Else():
1464 comb += adjust_acks.eq(acks)
1465
1466 sync += r1.acks_pending.eq(adjust_acks)
1467
1468 # Clear stb when slave accepted request
1469 with m.If(~wb_in.stall):
1470 # See if there is another store waiting
1471 # to be done which is in the same real page.
1472 with m.If(req.valid):
1473 ra = req.real_addr[0:SET_SIZE_BITS]
1474 sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
1475 sync += r1.wb.dat.eq(req.data)
1476 sync += r1.wb.sel.eq(req.byte_sel)
1477
1478 with m.If((adjust_acks < 7) & req.same_tag &
1479 ((req.op == Op.OP_STORE_MISS)
1480 | (req.op == Op.OP_STORE_HIT))):
1481 sync += r1.wb.stb.eq(1)
1482 comb += st_stbs_done.eq(0)
1483
1484 with m.If(req.op == Op.OP_STORE_HIT):
1485 sync += r1.write_bram.eq(1)
1486 sync += r1.full.eq(0)
1487 sync += r1.slow_valid.eq(1)
1488
1489 # Store requests never come from the MMU
1490 sync += r1.ls_valid.eq(1)
1491 comb += st_stbs_done.eq(0)
1492 sync += r1.inc_acks.eq(1)
1493 with m.Else():
1494 sync += r1.wb.stb.eq(0)
1495 comb += st_stbs_done.eq(1)
1496
1497 # Got ack ? See if complete.
1498 with m.If(wb_in.ack):
1499 with m.If(st_stbs_done & (adjust_acks == 1)):
1500 sync += r1.state.eq(State.IDLE)
1501 sync += r1.wb.cyc.eq(0)
1502 sync += r1.wb.stb.eq(0)
1503 sync += r1.dec_acks.eq(1)
1504
1505 with m.Case(State.NC_LOAD_WAIT_ACK):
1506 # Clear stb when slave accepted request
1507 with m.If(~wb_in.stall):
1508 sync += r1.wb.stb.eq(0)
1509
1510 # Got ack ? complete.
1511 with m.If(wb_in.ack):
1512 sync += r1.state.eq(State.IDLE)
1513 sync += r1.full.eq(0)
1514 sync += r1.slow_valid.eq(1)
1515
1516 with m.If(~r1.mmu_req):
1517 sync += r1.ls_valid.eq(1)
1518 with m.Else():
1519 sync += r1.mmu_done.eq(1)
1520
1521 sync += r1.forward_sel.eq(~0) # all 1s
1522 sync += r1.use_forward1.eq(1)
1523 sync += r1.wb.cyc.eq(0)
1524 sync += r1.wb.stb.eq(0)
1525
1526 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, req_op, stall_out):
1527
1528 sync = m.d.sync
1529 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1530
1531 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1532 stall_out, req_op[:3], d_out.valid, d_out.error,
1533 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1534 r1.real_adr[3:6]))
1535
1536 def elaborate(self, platform):
1537
1538 m = Module()
1539 comb = m.d.comb
1540
1541 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1542 cache_tags = CacheTagArray()
1543 cache_tag_set = Signal(TAG_RAM_WIDTH)
1544 cache_valids = CacheValidBitsArray()
1545
1546 # TODO attribute ram_style : string;
1547 # TODO attribute ram_style of cache_tags : signal is "distributed";
1548
1549 """note: these are passed to nmigen.hdl.Memory as "attributes".
1550 don't know how, just that they are.
1551 """
1552 dtlb_valid_bits = TLBValidBitsArray()
1553 dtlb_tags = TLBTagsArray()
1554 dtlb_ptes = TLBPtesArray()
1555 # TODO attribute ram_style of
1556 # dtlb_tags : signal is "distributed";
1557 # TODO attribute ram_style of
1558 # dtlb_ptes : signal is "distributed";
1559
1560 r0 = RegStage0("r0")
1561 r0_full = Signal()
1562
1563 r1 = RegStage1("r1")
1564
1565 reservation = Reservation()
1566
1567 # Async signals on incoming request
1568 req_index = Signal(INDEX_BITS)
1569 req_row = Signal(ROW_BITS)
1570 req_hit_way = Signal(WAY_BITS)
1571 req_tag = Signal(TAG_BITS)
1572 req_op = Signal(Op)
1573 req_data = Signal(64)
1574 req_same_tag = Signal()
1575 req_go = Signal()
1576
1577 early_req_row = Signal(ROW_BITS)
1578
1579 cancel_store = Signal()
1580 set_rsrv = Signal()
1581 clear_rsrv = Signal()
1582
1583 r0_valid = Signal()
1584 r0_stall = Signal()
1585
1586 use_forward1_next = Signal()
1587 use_forward2_next = Signal()
1588
1589 cache_out_row = Signal(WB_DATA_BITS)
1590
1591 plru_victim = PLRUOut()
1592 replace_way = Signal(WAY_BITS)
1593
1594 # Wishbone read/write/cache write formatting signals
1595 bus_sel = Signal(8)
1596
1597 # TLB signals
1598 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1599 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1600 tlb_valid_way = Signal(TLB_NUM_WAYS)
1601 tlb_req_index = Signal(TLB_SET_BITS)
1602 tlb_hit = Signal()
1603 tlb_hit_way = Signal(TLB_WAY_BITS)
1604 pte = Signal(TLB_PTE_BITS)
1605 ra = Signal(REAL_ADDR_BITS)
1606 valid_ra = Signal()
1607 perm_attr = PermAttr("dc_perms")
1608 rc_ok = Signal()
1609 perm_ok = Signal()
1610 access_ok = Signal()
1611
1612 tlb_plru_victim = TLBPLRUOut()
1613
1614 # we don't yet handle collisions between loadstore1 requests
1615 # and MMU requests
1616 comb += self.m_out.stall.eq(0)
1617
1618 # Hold off the request in r0 when r1 has an uncompleted request
1619 comb += r0_stall.eq(r0_full & r1.full)
1620 comb += r0_valid.eq(r0_full & ~r1.full)
1621 comb += self.stall_out.eq(r0_stall)
1622
1623 # Wire up wishbone request latch out of stage 1
1624 comb += r1.wb.adr.eq(r1.real_adr)
1625 comb += self.wb_out.eq(r1.wb)
1626 comb += self.wb_out.adr.eq(r1.wb.adr[3:]) # truncate LSBs
1627
1628 # deal with litex not doing wishbone pipeline mode
1629 comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
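# (A sketch of the reasoning: asserting stall whenever a cycle is in
# flight and not yet acked limits the cache to one outstanding wishbone
# request, which is the simplification the module docstring refers to
# for slaves that do not implement pipeline mode.)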
1630
1631 # call sub-functions putting everything together, using shared
1632 # signals established above
1633 self.stage_0(m, r0, r1, r0_full)
1634 self.tlb_read(m, r0_stall, tlb_valid_way,
1635 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1636 dtlb_tags, dtlb_ptes)
1637 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1638 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1639 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1640 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1641 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1642 dtlb_tags, tlb_pte_way, dtlb_ptes)
1643 self.maybe_plrus(m, r1, plru_victim)
1644 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1645 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1646 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1647 r0_valid, r1, cache_valids, replace_way,
1648 use_forward1_next, use_forward2_next,
1649 req_hit_way, plru_victim, rc_ok, perm_attr,
1650 valid_ra, perm_ok, access_ok, req_op, req_go,
1651 tlb_pte_way,
1652 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1653 cancel_store, req_same_tag, r0_stall, early_req_row)
1654 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1655 r0_valid, r0, reservation)
1656 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1657 reservation, r0)
1658 self.writeback_control(m, r1, cache_out_row)
1659 self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1660 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1661 req_hit_way, req_index, req_tag, access_ok,
1662 tlb_hit, tlb_hit_way, tlb_req_index)
1663 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1664 cache_valids, r0, replace_way,
1665 req_hit_way, req_same_tag,
1666 r0_valid, req_op, cache_tags, req_go, ra)
1667 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, req_op, self.stall_out)
1668
1669 return m
1670
1671 def dcache_load(dut, addr, nc=0):
1672 yield dut.d_in.load.eq(1)
1673 yield dut.d_in.nc.eq(nc)
1674 yield dut.d_in.addr.eq(addr)
1675 yield dut.d_in.byte_sel.eq(~0)
1676 yield dut.d_in.valid.eq(1)
1677 yield
1678 yield dut.d_in.valid.eq(0)
1679 yield dut.d_in.byte_sel.eq(0)
1680 while not (yield dut.d_out.valid):
1681 yield
1682 data = yield dut.d_out.data
1683 return data
1684
1685
1686 def dcache_store(dut, addr, data, nc=0):
1687 yield dut.d_in.load.eq(0)
1688 yield dut.d_in.nc.eq(nc)
1689 yield dut.d_in.data.eq(data)
1690 yield dut.d_in.byte_sel.eq(~0)
1691 yield dut.d_in.addr.eq(addr)
1692 yield dut.d_in.valid.eq(1)
1693 yield
1694 yield dut.d_in.valid.eq(0)
1695 yield dut.d_in.byte_sel.eq(0)
1696 while not (yield dut.d_out.valid):
1697 yield
1698
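# A minimal usage sketch (hypothetical helper, not used by the tests
# below): store a value through the dcache and read it back, checking
# that the two round-trip.
def dcache_store_load_check(dut, addr, data):
    yield from dcache_store(dut, addr, data)
    result = yield from dcache_load(dut, addr)
    assert result == data, \
        "readback at 0x%x: 0x%x != 0x%x" % (addr, result, data)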
1699
1700 def dcache_random_sim(dut, mem):
1701
1702 # start copy of mem
1703 sim_mem = deepcopy(mem)
1704 memsize = len(sim_mem)
1705 print ("mem len", memsize)
1706
1707 # clear stuff
1708 yield dut.d_in.valid.eq(0)
1709 yield dut.d_in.load.eq(0)
1710 yield dut.d_in.priv_mode.eq(1)
1711 yield dut.d_in.nc.eq(0)
1712 yield dut.d_in.addr.eq(0)
1713 yield dut.d_in.data.eq(0)
1714 yield dut.m_in.valid.eq(0)
1715 yield dut.m_in.addr.eq(0)
1716 yield dut.m_in.pte.eq(0)
1717 # wait 4 * clk_period
1718 yield
1719 yield
1720 yield
1721 yield
1722
1723 print ()
1724
1725 #for i in range(1024):
1726 # sim_mem[i] = i
1727
1728 for i in range(1024):
1729 addr = randint(0, memsize-1)
1730 data = randint(0, (1<<64)-1)
1731 sim_mem[addr] = data
1732 row = addr
1733 addr *= 8
1734
1735 print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))
1736
1737 yield from dcache_load(dut, addr)
1738 yield from dcache_store(dut, addr, data)
1739
1740 addr = randint(0, memsize-1)
1741 sim_data = sim_mem[addr]
1742 row = addr
1743 addr *= 8
1744
1745 print (" load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
1746 data = yield from dcache_load(dut, addr)
1747 assert data == sim_data, \
1748 "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1749
1750 for addr in range(memsize):
1751 data = yield from dcache_load(dut, addr*8)
1752 assert data == sim_mem[addr], \
1753 "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
1754
1755
1756 def dcache_sim(dut, mem):
1757 # clear stuff
1758 yield dut.d_in.valid.eq(0)
1759 yield dut.d_in.load.eq(0)
1760 yield dut.d_in.priv_mode.eq(1)
1761 yield dut.d_in.nc.eq(0)
1762 yield dut.d_in.addr.eq(0)
1763 yield dut.d_in.data.eq(0)
1764 yield dut.m_in.valid.eq(0)
1765 yield dut.m_in.addr.eq(0)
1766 yield dut.m_in.pte.eq(0)
1767 # wait 4 * clk_period
1768 yield
1769 yield
1770 yield
1771 yield
1772
1773 # Cacheable read of address 0x58
1774 data = yield from dcache_load(dut, 0x58)
1775 addr = yield dut.d_in.addr
1776 assert data == 0x0000001700000016, \
1777 f"data @%x=%x expected 0x0000001700000016" % (addr, data)
1778
1779 # Cacheable read of address 0x20
1780 data = yield from dcache_load(dut, 0x20)
1781 addr = yield dut.d_in.addr
1782 assert data == 0x0000000900000008, \
1783 f"data @%x=%x expected 0x0000000900000008" % (addr, data)
1784
1785 # Cacheable read of address 0x530
1786 data = yield from dcache_load(dut, 0x530)
1787 addr = yield dut.d_in.addr
1788 assert data == 0x0000014D0000014C, \
1789 f"data @%x=%x expected 0000014D0000014C" % (addr, data)
1790
1791 # 2nd Cacheable read of address 0x530
1792 data = yield from dcache_load(dut, 0x530)
1793 addr = yield dut.d_in.addr
1794 assert data == 0x0000014D0000014C, \
1795 f"data @%x=%x expected 0000014D0000014C" % (addr, data)
1796
1797 # Non-cacheable read of address 0x100
1798 data = yield from dcache_load(dut, 0x100, nc=1)
1799 addr = yield dut.d_in.addr
1800 assert data == 0x0000004100000040, \
1801 f"data @%x=%x expected 0000004100000040" % (addr, data)
1802
1803 # Store at address 0x530
1804 yield from dcache_store(dut, 0x530, 0x121)
1805
1806 # Second store at address 0x530
1807 yield from dcache_store(dut, 0x530, 0x12345678)
1808
1809 # 3rd Cacheable read of address 0x530
1810 data = yield from dcache_load(dut, 0x530)
1811 addr = yield dut.d_in.addr
1812 assert data == 0x12345678, \
1813 f"data @%x=%x expected 0x12345678" % (addr, data)
1814
1815 # 4th Cacheable read of address 20
1816 data = yield from dcache_load(dut, 0x20)
1817 addr = yield dut.d_in.addr
1818 assert data == 0x0000000900000008, \
1819 f"data @%x=%x expected 0x0000000900000008" % (addr, data)
1820
1821 yield
1822 yield
1823 yield
1824 yield
1825
1826
1827 def test_dcache(mem, test_fn, test_name):
1828 dut = DCache()
1829
1830 memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
1831 sram = SRAM(memory=memory, granularity=8)
1832
1833 m = Module()
1834 m.submodules.dcache = dut
1835 m.submodules.sram = sram
1836
1837 m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1838 m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1839 m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1840 m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1841 m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
1842 m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1843
1844 m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1845 m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1846
1847 # nmigen Simulation
1848 sim = Simulator(m)
1849 sim.add_clock(1e-6)
1850
1851 sim.add_sync_process(wrap(test_fn(dut, mem)))
1852 with sim.write_vcd('test_dcache%s.vcd' % test_name):
1853 sim.run()
1854
1855 if __name__ == '__main__':
1856 seed(0)
1857 dut = DCache()
1858 vl = rtlil.convert(dut, ports=[])
1859 with open("test_dcache.il", "w") as f:
1860 f.write(vl)
1861
1862 mem = []
1863 for i in range(1024):
1864 mem.append((i*2)| ((i*2+1)<<32))
1865
1866 test_dcache(mem, dcache_sim, "")
1867
1868 mem = []
1869 memsize = 256
1870 for i in range(memsize):
1871 mem.append(i)
1872
1873 test_dcache(mem, dcache_random_sim, "random")
1874