src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 """
   6
   7 from enum import Enum, unique
   8
   9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  10 from nmutil.util import Display
  11
  12 from random import randint, seed
  13
  14 from nmigen.cli import main
  15 from nmutil.iocontrol import RecordObject
  16 from nmigen.utils import log2_int
  17 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  18                                      DCacheToLoadStore1Type,
  19                                      MMUToDCacheType,
  20                                      DCacheToMMUType)
  21
  22 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  23                                 WBAddrType, WBDataType, WBSelType,
  24                                 WBMasterOut, WBSlaveOut,
  25                                 WBMasterOutVector, WBSlaveOutVector,
  26                                 WBIOMasterOut, WBIOSlaveOut)
  27
  28 from soc.experiment.cache_ram import CacheRam
  29 #from soc.experiment.plru import PLRU
  30 from nmutil.plru import PLRU
  31
  32 # for test
  33 from soc.bus.sram import SRAM
  34 from nmigen import Memory
  35 from nmigen.cli import rtlil
  36
  37 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  38 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  39 from nmutil.sim_tmp_alternative import Simulator
  40
  41 from nmutil.util import wrap
  42
  43
# Cache and L1 DTLB geometry.  Everything below in this file is derived
# from these values.
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per way (rows in the DTLB arrays)
TLB_NUM_WAYS = 2  # L1 DTLB number of ways (entries packed in each row)
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection
  52
# BRAM organisation: we never access more than WB_DATA_BITS at a
# time, so to save resources we make the array only that wide and
# use consecutive indices to make up a cache "line".
#
# ROW_SIZE is the width in bytes of the BRAM
# (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8;

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

# Diagnostic output, printed once when the module is imported
print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)
  73 print ("NUM_WAYS", NUM_WAYS)
  74
# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM:
# TAG_BITS rounded up to the next multiple of 8
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
 112
 113 # Example of layout for 32 lines of 64 bytes:
 114 layout = """\
 115   ..  tag    |index|  line  |
 116   ..         |   row   |    |
 117   ..         |     |---|    | ROW_LINE_BITS  (3)
 118   ..         |     |--- - --| LINE_OFF_BITS (6)
 119   ..         |         |- --| ROW_OFF_BITS  (3)
 120   ..         |----- ---|    | ROW_BITS      (8)
 121   ..         |-----|        | INDEX_BITS    (5)
 122   .. --------|              | TAG_BITS      (45)
 123 """
 124 print (layout)
 125 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 126             (TAG_BITS, INDEX_BITS, ROW_BITS,
 127              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 128 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 129 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 130 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 131
 132 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 133
 134 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 135
 136 def CacheTagArray():
 137     return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
 138                         for x in range(NUM_LINES))
 139
 140 def CacheValidBitsArray():
 141     return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
 142                         for x in range(NUM_LINES))
 143
 144 def RowPerLineValidArray():
 145     return Array(Signal(name="rows_valid%d" % x) \
 146                         for x in range(ROW_PER_LINE))
 147
# L1 TLB
# Bits needed to select a TLB row / a way within a row
TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
# Effective-address tag: what is left of a 64-bit address above the
# page offset and the row index
TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
# Width of one row of the TLB tag array (all ways side by side)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS     = 64
# Width of one row of the TLB PTE array (all ways side by side)
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 155
 156 def ispow2(x):
 157     return (1<<log2_int(x, False)) == x
 158
# Import-time sanity checks on the geometry constants above.
assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
# The address must split cleanly as tag | index | line offset and,
# equivalently, as tag | row | row offset
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
         "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
# The index and line offset bits must all lie within the page offset
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 172
 173
 174 def TLBValidBitsArray():
 175     return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
 176                 for x in range(TLB_SET_SIZE))
 177
 178 def TLBTagEAArray():
 179     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 180                 for x in range (TLB_NUM_WAYS))
 181
 182 def TLBTagsArray():
 183     return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
 184                 for x in range (TLB_SET_SIZE))
 185
 186 def TLBPtesArray():
 187     return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
 188                 for x in range(TLB_SET_SIZE))
 189
 190 def HitWaySet():
 191     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 192                         for x in range(TLB_NUM_WAYS))
 193
 194 # Cache RAM interface
 195 def CacheRamOut():
 196     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 197                  for x in range(NUM_WAYS))
 198
 199 # PLRU output interface
 200 def PLRUOut():
 201     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 202                 for x in range(NUM_LINES))
 203
 204 # TLB PLRU output interface
 205 def TLBPLRUOut():
 206     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 207                 for x in range(TLB_SET_SIZE))
 208
 209 # Helper functions to decode incoming requests
 210 #
 211 # Return the cache line index (tag index) for an address
 212 def get_index(addr):
 213     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 214
 215 # Return the cache row index (data memory) for an address
 216 def get_row(addr):
 217     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 218
 219 # Return the index of a row within a line
 220 def get_row_of_line(row):
 221     return row[:ROW_BITS][:ROW_LINE_BITS]
 222
# Returns whether an address selects row `last` within its cache line
 224 def is_last_row_addr(addr, last):
 225     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 226
 227 # Returns whether this is the last row of a line
 228 def is_last_row(row, last):
 229     return get_row_of_line(row) == last
 230
 231 # Return the next row in the current cache line. We use a
 232 # dedicated function in order to limit the size of the
 233 # generated adder to be only the bits within a cache line
 234 # (3 bits with default settings)
 235 def next_row(row):
 236     row_v = row[0:ROW_LINE_BITS] + 1
 237     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 238
 239 # Get the tag value from the address
 240 def get_tag(addr):
 241     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 242
 243 # Read a tag from a tag memory row
 244 def read_tag(way, tagset):
 245     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 246
 247 # Read a TLB tag from a TLB tag memory row
 248 def read_tlb_tag(way, tags):
 249     return tags.word_select(way, TLB_EA_TAG_BITS)
 250
 251 # Write a TLB tag to a TLB tag memory row
 252 def write_tlb_tag(way, tags, tag):
 253     return read_tlb_tag(way, tags).eq(tag)
 254
 255 # Read a PTE from a TLB PTE memory row
 256 def read_tlb_pte(way, ptes):
 257     return ptes.word_select(way, TLB_PTE_BITS)
 258
 259 def write_tlb_pte(way, ptes, newpte):
 260     return read_tlb_pte(way, ptes).eq(newpte)
 261
 262
 263 # Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Single-bit permission and attribute flags taken from a PTE.

    In this file the flags are driven by DCache.tlb_search, either
    from PTE bits (virtual mode) or with fixed values (real mode).
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal() # PTE bit 8 in virtual mode
        self.changed   = Signal() # PTE bit 7 in virtual mode
        self.nocache   = Signal() # PTE bit 5 in virtual mode
        self.priv      = Signal() # PTE bit 3 in virtual mode
        self.rd_perm   = Signal() # PTE bit 2 in virtual mode
        self.wr_perm   = Signal() # PTE bit 1 in virtual mode
 273
 274
 275 def extract_perm_attr(pte):
 276     pa = PermAttr()
 277     return pa;
 278
 279
 280 # Type of operation on a "valid" input
# Type of operation on a "valid" input
@unique
class Op(Enum):
    """Classification of a request.  Used as the shape of the op
    signals, so the numeric values are the hardware encoding.
    """
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache
 291
 292
 293 # Cache state machine
# Cache state machine
@unique
class State(Enum):
    """States of the cache state machine.  Used as the shape of
    RegStage1.state, so the numeric values are the hardware encoding.
    """
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 300
 301
 302 # Dcache operations:
 303 #
 304 # In order to make timing, we use the BRAMs with
 305 # an output buffer, which means that the BRAM
 306 # output is delayed by an extra cycle.
 307 #
 308 # Thus, the dcache has a 2-stage internal pipeline
 309 # for cache hits with no stalls.
 310 #
 311 # All other operations are handled via stalling
 312 # in the first stage.
 313 #
 314 # The second stage can thus complete a hit at the same
 315 # time as the first stage emits a stall for a complex op.
 316 #
 317 # Stage 0 register, basically contains just the latched request
 318
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus the MMU-only flags.

    Filled in by DCache.stage_0 from either the MMU port or the
    load/store port.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req     = LoadStore1ToDCacheType(name="lsmem") # the request
        self.tlbie   = Signal() # copied from m_in.tlbie, 0 for d_in
        self.doall   = Signal() # copied from m_in.doall, 0 for d_in
        self.tlbld   = Signal() # copied from m_in.tlbld, 0 for d_in
        self.mmu_req = Signal() # indicates source of request
 327
 328
class MemAccessRequest(RecordObject):
    """A memory access request as held in RegStage1.req.

    NOTE(review): the per-field notes below are inferred from names
    and widths only; the code that fills the record is not in view.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)             # one of the Op values
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS) # real address
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)              # presumably 1 bit/byte
        self.hit_way   = Signal(WAY_BITS)       # a cache way number
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 341
 342
 343 # First stage register, contains state for stage 1 of load hits
 344 # and for the state machine used by all other operations
 345 class RegStage1(RecordObject):
 346     def __init__(self, name=None):
 347         super().__init__(name=name)
 348         # Info about the request
 349         self.full             = Signal() # have uncompleted request
 350         self.mmu_req          = Signal() # request is from MMU
 351         self.req              = MemAccessRequest(name="reqmem")
 352
 353         # Cache hit state
 354         self.hit_way          = Signal(WAY_BITS)
 355         self.hit_load_valid   = Signal()
 356         self.hit_index        = Signal(INDEX_BITS)
 357         self.cache_hit        = Signal()
 358
 359         # TLB hit state
 360         self.tlb_hit          = Signal()
 361         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 362         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 363
 364         # 2-stage data buffer for data forwarded from writes to reads
 365         self.forward_data1    = Signal(64)
 366         self.forward_data2    = Signal(64)
 367         self.forward_sel1     = Signal(8)
 368         self.forward_valid1   = Signal()
 369         self.forward_way1     = Signal(WAY_BITS)
 370         self.forward_row1     = Signal(ROW_BITS)
 371         self.use_forward1     = Signal()
 372         self.forward_sel      = Signal(8)
 373
 374         # Cache miss state (reload state machine)
 375         self.state            = Signal(State)
 376         self.dcbz             = Signal()
 377         self.write_bram       = Signal()
 378         self.write_tag        = Signal()
 379         self.slow_valid       = Signal()
 380         self.real_adr         = Signal(REAL_ADDR_BITS)
 381         self.wb               = WBMasterOut("wb")
 382         self.reload_tag       = Signal(TAG_BITS)
 383         self.store_way        = Signal(WAY_BITS)
 384         self.store_row        = Signal(ROW_BITS)
 385         self.store_index      = Signal(INDEX_BITS)
 386         self.end_row_ix       = Signal(ROW_LINE_BITS)
 387         self.rows_valid       = RowPerLineValidArray()
 388         self.acks_pending     = Signal(3)
 389         self.inc_acks         = Signal()
 390         self.dec_acks         = Signal()
 391
 392         # Signals to complete (possibly with error)
 393         self.ls_valid         = Signal()
 394         self.ls_error         = Signal()
 395         self.mmu_done         = Signal()
 396         self.mmu_error        = Signal()
 397         self.cache_paradox    = Signal()
 398
 399         # Signal to complete a failed stcx.
 400         self.stcx_fail        = Signal()
 401
 402
 403 # Reservation information
# Reservation information
class Reservation(RecordObject):
    """Reservation record: a valid flag and an address field."""
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        # 64 - LINE_OFF_BITS wide; presumably a 64-bit address with
        # the in-line offset bits dropped - confirm against the users
        self.addr  = Signal(64-LINE_OFF_BITS)
 409
 410
 411 class DTLBUpdate(Elaboratable):
 412     def __init__(self):
 413         self.tlbie    = Signal()
 414         self.tlbwe    = Signal()
 415         self.doall    = Signal()
 416         self.updated  = Signal()
 417         self.v_updated  = Signal()
 418         self.tlb_hit    = Signal()
 419         self.tlb_req_index = Signal(TLB_SET_BITS)
 420
 421         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
 422         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 423         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 424         self.repl_way        = Signal(TLB_WAY_BITS)
 425         self.eatag           = Signal(TLB_EA_TAG_BITS)
 426         self.pte_data        = Signal(TLB_PTE_BITS)
 427
 428         self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
 429
 430         self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
 431         self.pb_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
 432         self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
 433
 434     def elaborate(self, platform):
 435         m = Module()
 436         comb = m.d.comb
 437         sync = m.d.sync
 438
 439         tagset   = Signal(TLB_TAG_WAY_BITS)
 440         pteset   = Signal(TLB_PTE_WAY_BITS)
 441
 442         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 443         comb += db_out.eq(self.dv)
 444
 445         with m.If(self.tlbie & self.doall):
 446             pass # clear all back in parent
 447         with m.Elif(self.tlbie):
 448             with m.If(self.tlb_hit):
 449                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
 450                 comb += self.v_updated.eq(1)
 451
 452         with m.Elif(self.tlbwe):
 453
 454             comb += tagset.eq(self.tlb_tag_way)
 455             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 456             comb += tb_out.eq(tagset)
 457
 458             comb += pteset.eq(self.tlb_pte_way)
 459             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 460             comb += pb_out.eq(pteset)
 461
 462             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 463
 464             comb += self.updated.eq(1)
 465             comb += self.v_updated.eq(1)
 466
 467         return m
 468
 469
class DCachePendingHit(Elaboratable):
    """Combinatorial cache hit detection for the pending request.

    Signals created here and driven by the parent: go, virt_mode,
    tlb_hit, req_index, reload_tag.  Signals driven here: is_hit,
    hit_way, rel_match, and the entries of hit_set.  The constructor
    arguments are signals owned by the parent and are only read,
    except hit_set, whose entries are driven in virtual mode.
    """

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                      cache_valid_idx, cache_tag_set,
                    req_addr,
                    hit_set):

        self.go          = Signal()
        self.virt_mode   = Signal()
        self.is_hit      = Signal()
        self.tlb_hit     = Signal()
        self.hit_way     = Signal(WAY_BITS)
        self.rel_match   = Signal()
        self.req_index   = Signal(INDEX_BITS) # not read in elaborate()
        self.reload_tag  = Signal(TAG_BITS)

        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        # per-TLB-way results, selected by tlb_hit_way further down
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal()
                s_pte       = Signal(TLB_PTE_BITS)
                s_ra        = Signal(REAL_ADDR_BITS)
                # candidate real address for TLB way j: page offset
                # from the request, upper bits from that way's PTE
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & tlb_valid_way[j])
                    # if several cache ways match, the highest i wins
                    # (later assignments take priority)
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            # only the results of the TLB way that actually hit count
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            # real mode: the tag comes straight from the request address
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
 565
 566
 567 class DCache(Elaboratable):
 568     """Set associative dcache write-through
 569     TODO (in no specific order):
 570     * See list in icache.vhdl
 571     * Complete load misses on the cycle when WB data comes instead of
 572       at the end of line (this requires dealing with requests coming in
 573       while not idle...)
 574     """
    def __init__(self):
        # Create the external ports of the cache.
        # load/store unit request (d_in) and response (d_out) records
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        # MMU request (m_in) and response (m_out) records
        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # wishbone records: WBMasterOut / WBSlaveOut
        self.wb_out    = WBMasterOut()
        self.wb_in     = WBSlaveOut()

        # 20-bit signal; presumably log data (see LOG_LENGTH) - confirm
        self.log_out   = Signal(20)
 588
    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling

        A candidate stage-0 record is built combinatorially from the
        MMU port (m_in) when m_in.valid is set, otherwise from the
        load/store port (d_in).  It is copied into r0, and r0_full is
        loaded with its valid bit, on every clock where r1.full and
        r0_full are not both set.

        m       -- Module the logic is added to
        r0      -- stage 0 register, written here (sync)
        r1      -- stage 1 register; only r1.full is read
        r0_full -- flag written here (sync)
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        # candidate request, selected below
        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            # MMU request: a load unless it is a tlbie or tlbld
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
        with m.Else():
            # load/store request: taken as-is, MMU-only flags cleared
            comb += r.req.eq(d_in)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        # hold r0 only while both r1 and r0 are occupied
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
 626
    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.

        The row index is taken from the incoming address (m_in.addr
        when m_in.valid is set, otherwise d_in.addr), bits TLB_LG_PGSZ
        up to TLB_LG_PGSZ + TLB_SET_BITS.  Unless r0_stall is set, the
        selected rows of dtlb_valid_bits, dtlb_tags and dtlb_ptes are
        registered into tlb_valid_way, tlb_tag_way and tlb_pte_way.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index    = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        # address bit range holding the TLB row index
        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])
 656
 657     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 658         """Generate TLB PLRUs
 659         """
 660         comb = m.d.comb
 661         sync = m.d.sync
 662
 663         if TLB_NUM_WAYS == 0:
 664             return
 665         for i in range(TLB_SET_SIZE):
 666             # TLB PLRU interface
 667             tlb_plru        = PLRU(TLB_WAY_BITS)
 668             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 669             tlb_plru_acc_en = Signal()
 670
 671             comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
 672             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 673             comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
 674             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 675
    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
        """Look up the request in r0 in the TLB row that was read.

        Drives (all combinatorial):
        tlb_req_index -- TLB row index bits of r0.req.addr
        tlb_hit       -- a valid way's tag matches and r0_valid is set
        tlb_hit_way   -- number of the matching way
        pte           -- PTE of the matching way (only on tlb_hit)
        valid_ra      -- tlb_hit, or always set in real mode
        ra            -- real address, low ROW_OFF_BITS forced to 0
        perm_attr     -- from PTE bits in virtual mode, fixed values
                         (everything permitted, cacheable) otherwise
        """

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit    = Signal()
        eatag  = Signal(TLB_EA_TAG_BITS)

        # split the effective address into row index and tag
        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        # if more than one way matches, the highest-numbered one wins
        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal()
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            # translated: page offset from the request, rest from PTE
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            # untranslated: the request address is the real address
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)
 724
 725     def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
 726                     tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
 727                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 728
 729         dtlb_valids = TLBValidBitsArray()
 730
 731         comb = m.d.comb
 732         sync = m.d.sync
 733
 734         tlbie    = Signal()
 735         tlbwe    = Signal()
 736
 737         comb += tlbie.eq(r0_valid & r0.tlbie)
 738         comb += tlbwe.eq(r0_valid & r0.tlbld)
 739
 740         m.submodules.tlb_update = d = DTLBUpdate()
 741         with m.If(tlbie & r0.doall):
 742             # clear all valid bits at once
 743             for i in range(TLB_SET_SIZE):
 744                 sync += dtlb_valid_bits[i].eq(0)
 745         with m.If(d.updated):
 746             sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
 747             sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
 748         with m.If(d.v_updated):
 749             sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
 750
 751         comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
 752
 753         comb += d.tlbie.eq(tlbie)
 754         comb += d.tlbwe.eq(tlbwe)
 755         comb += d.doall.eq(r0.doall)
 756         comb += d.tlb_hit.eq(tlb_hit)
 757         comb += d.tlb_hit_way.eq(tlb_hit_way)
 758         comb += d.tlb_tag_way.eq(tlb_tag_way)
 759         comb += d.tlb_pte_way.eq(tlb_pte_way)
 760         comb += d.tlb_req_index.eq(tlb_req_index)
 761
 762         with m.If(tlb_hit):
 763             comb += d.repl_way.eq(tlb_hit_way)
 764         with m.Else():
 765             comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
 766         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 767         comb += d.pte_data.eq(r0.req.data)
 768
 769     def maybe_plrus(self, m, r1, plru_victim):
 770         """Generate PLRUs
 771         """
 772         comb = m.d.comb
 773         sync = m.d.sync
 774
 775         if TLB_NUM_WAYS == 0:
 776             return
 777
 778         for i in range(NUM_LINES):
 779             # PLRU interface
 780             plru        = PLRU(WAY_BITS)
 781             setattr(m.submodules, "plru%d" % i, plru)
 782             plru_acc_en = Signal()
 783
 784             comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
 785             comb += plru.acc_en.eq(plru_acc_en)
 786             comb += plru.acc_i.eq(r1.hit_way)
 787             comb += plru_victim[i].eq(plru.lru_o)
 788
    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port

        Registers one row of cache_tags into cache_tag_set on every
        clock.  The row is req_index while r0_stall is set; otherwise
        it is the line index of the incoming address (m_in.addr when
        m_in.valid is set, else d_in.addr).
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            # stalled: keep reading the row of the held request
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])
 805
 806     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 807                        r0_valid, r1, cache_valids, replace_way,
 808                        use_forward1_next, use_forward2_next,
 809                        req_hit_way, plru_victim, rc_ok, perm_attr,
 810                        valid_ra, perm_ok, access_ok, req_op, req_go,
 811                        tlb_pte_way,
 812                        tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
 813                        cancel_store, req_same_tag, r0_stall, early_req_row):
 814         """Cache request parsing and hit detection
 815         """
 816
 817         comb = m.d.comb
 818         sync = m.d.sync
 819         m_in, d_in = self.m_in, self.d_in
 820
 821         is_hit      = Signal()
 822         hit_way     = Signal(WAY_BITS)
 823         op          = Signal(Op)
 824         opsel       = Signal(3)
 825         go          = Signal()
 826         nc          = Signal()
 827         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 828                                   for i in range(TLB_NUM_WAYS))
 829         cache_valid_idx = Signal(NUM_WAYS)
 830
 831         # Extract line, row and tag from request
 832         comb += req_index.eq(get_index(r0.req.addr))
 833         comb += req_row.eq(get_row(r0.req.addr))
 834         comb += req_tag.eq(get_tag(ra))
 835
 836         if False: # display on comb is a bit... busy.
 837             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
 838                     r0.req.addr, ra, req_index, req_tag, req_row)
 839
 840         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 841         comb += cache_valid_idx.eq(cache_valids[req_index])
 842
 843         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
 844                                 tlb_valid_way, tlb_hit_way,
 845                                 cache_valid_idx, cache_tag_set,
 846                                 r0.req.addr,
 847                                 hit_set)
 848
 849         comb += dc.tlb_hit.eq(tlb_hit)
 850         comb += dc.reload_tag.eq(r1.reload_tag)
 851         comb += dc.virt_mode.eq(r0.req.virt_mode)
 852         comb += dc.go.eq(go)
 853         comb += dc.req_index.eq(req_index)
 854         comb += is_hit.eq(dc.is_hit)
 855         comb += hit_way.eq(dc.hit_way)
 856         comb += req_same_tag.eq(dc.rel_match)
 857
 858         # See if the request matches the line currently being reloaded
 859         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 860                   (req_index == r1.store_index) & req_same_tag):
 861             # For a store, consider this a hit even if the row isn't
 862             # valid since it will be by the time we perform the store.
 863             # For a load, check the appropriate row valid bit.
 864             rrow = Signal(ROW_LINE_BITS)
 865             comb += rrow.eq(req_row)
 866             valid = r1.rows_valid[rrow]
 867             comb += is_hit.eq((~r0.req.load) | valid)
 868             comb += hit_way.eq(replace_way)
 869
 870         # Whether to use forwarded data for a load or not
 871         with m.If((get_row(r1.req.real_addr) == req_row) &
 872                   (r1.req.hit_way == hit_way)):
 873             # Only need to consider r1.write_bram here, since if we
 874             # are writing refill data here, then we don't have a
 875             # cache hit this cycle on the line being refilled.
 876             # (There is the possibility that the load following the
 877             # load miss that started the refill could be to the old
 878             # contents of the victim line, since it is a couple of
 879             # cycles after the refill starts before we see the updated
 880             # cache tag. In that case we don't use the bypass.)
 881             comb += use_forward1_next.eq(r1.write_bram)
 882         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 883             comb += use_forward2_next.eq(r1.forward_valid1)
 884
 885         # The way that matched on a hit
 886         comb += req_hit_way.eq(hit_way)
 887
 888         # The way to replace on a miss
 889         with m.If(r1.write_tag):
 890             comb += replace_way.eq(plru_victim[r1.store_index])
 891         with m.Else():
 892             comb += replace_way.eq(r1.store_way)
 893
 894         # work out whether we have permission for this access
 895         # NB we don't yet implement AMR, thus no KUAP
 896         comb += rc_ok.eq(perm_attr.reference
 897                          & (r0.req.load | perm_attr.changed))
 898         comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
 899                            (perm_attr.wr_perm |
 900                               (r0.req.load & perm_attr.rd_perm)))
 901         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
 902         # Combine the request and cache hit status to decide what
 903         # operation needs to be done
 904         comb += nc.eq(r0.req.nc | perm_attr.nocache)
 905         comb += op.eq(Op.OP_NONE)
 906         with m.If(go):
 907             with m.If(~access_ok):
 908                 comb += op.eq(Op.OP_BAD)
 909             with m.Elif(cancel_store):
 910                 comb += op.eq(Op.OP_STCX_FAIL)
 911             with m.Else():
 912                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
 913                 with m.Switch(opsel):
 914                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
 915                     with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
 916                     with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
 917                     with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
 918                     with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
 919                     with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
 920                     with m.Case(0b011): comb += op.eq(Op.OP_BAD)
 921                     with m.Case(0b111): comb += op.eq(Op.OP_BAD)
 922         comb += req_op.eq(op)
 923         comb += req_go.eq(go)
 924
 925         # Version of the row number that is valid one cycle earlier
 926         # in the cases where we need to read the cache data BRAM.
 927         # If we're stalling then we need to keep reading the last
 928         # row requested.
 929         with m.If(~r0_stall):
 930             with m.If(m_in.valid):
 931                 comb += early_req_row.eq(get_row(m_in.addr))
 932             with m.Else():
 933                 comb += early_req_row.eq(get_row(d_in.addr))
 934         with m.Else():
 935             comb += early_req_row.eq(req_row)
 936
 937     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 938                          r0_valid, r0, reservation):
 939         """Handle load-with-reservation and store-conditional instructions
 940         """
 941         comb = m.d.comb
 942
 943         with m.If(r0_valid & r0.req.reserve):
 944             # XXX generate alignment interrupt if address
 945             # is not aligned XXX or if r0.req.nc = '1'
 946             with m.If(r0.req.load):
 947                 comb += set_rsrv.eq(1) # load with reservation
 948             with m.Else():
 949                 comb += clear_rsrv.eq(1) # store conditional
 950                 with m.If((~reservation.valid) |
 951                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
 952                     comb += cancel_store.eq(1)
 953
 954     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
 955                         reservation, r0):
 956
 957         comb = m.d.comb
 958         sync = m.d.sync
 959
 960         with m.If(r0_valid & access_ok):
 961             with m.If(clear_rsrv):
 962                 sync += reservation.valid.eq(0)
 963             with m.Elif(set_rsrv):
 964                 sync += reservation.valid.eq(1)
 965                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 966
 967     def writeback_control(self, m, r1, cache_out_row):
 968         """Return data for loads & completion control logic
 969         """
 970         comb = m.d.comb
 971         sync = m.d.sync
 972         d_out, m_out = self.d_out, self.m_out
 973
 974         data_out = Signal(64)
 975         data_fwd = Signal(64)
 976
 977         # Use the bypass if are reading the row that was
 978         # written 1 or 2 cycles ago, including for the
 979         # slow_valid = 1 case (i.e. completing a load
 980         # miss or a non-cacheable load).
 981         with m.If(r1.use_forward1):
 982             comb += data_fwd.eq(r1.forward_data1)
 983         with m.Else():
 984             comb += data_fwd.eq(r1.forward_data2)
 985
 986         comb += data_out.eq(cache_out_row)
 987
 988         for i in range(8):
 989             with m.If(r1.forward_sel[i]):
 990                 dsel = data_fwd.word_select(i, 8)
 991                 comb += data_out.word_select(i, 8).eq(dsel)
 992
 993         comb += d_out.valid.eq(r1.ls_valid)
 994         comb += d_out.data.eq(data_out)
 995         comb += d_out.store_done.eq(~r1.stcx_fail)
 996         comb += d_out.error.eq(r1.ls_error)
 997         comb += d_out.cache_paradox.eq(r1.cache_paradox)
 998
 999         # Outputs to MMU
1000         comb += m_out.done.eq(r1.mmu_done)
1001         comb += m_out.err.eq(r1.mmu_error)
1002         comb += m_out.data.eq(data_out)
1003
1004         # We have a valid load or store hit or we just completed
1005         # a slow op such as a load miss, a NC load or a store
1006         #
1007         # Note: the load hit is delayed by one cycle. However it
1008         # can still not collide with r.slow_valid (well unless I
1009         # miscalculated) because slow_valid can only be set on a
1010         # subsequent request and not on its first cycle (the state
1011         # machine must have advanced), which makes slow_valid
1012         # at least 2 cycles from the previous hit_load_valid.
1013
1014         # Sanity: Only one of these must be set in any given cycle
1015
1016         if False: # TODO: need Display to get this to work
1017             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1018             "unexpected slow_valid collision with stcx_fail"
1019
1020             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1021              "unexpected hit_load_delayed collision with slow_valid"
1022
1023         with m.If(~r1.mmu_req):
1024             # Request came from loadstore1...
1025             # Load hit case is the standard path
1026             with m.If(r1.hit_load_valid):
1027                 sync += Display("completing load hit data=%x", data_out)
1028
1029             # error cases complete without stalling
1030             with m.If(r1.ls_error):
1031                 sync += Display("completing ld/st with error")
1032
1033             # Slow ops (load miss, NC, stores)
1034             with m.If(r1.slow_valid):
1035                 sync += Display("completing store or load miss data=%x",
1036                                 data_out)
1037
1038         with m.Else():
1039             # Request came from MMU
1040             with m.If(r1.hit_load_valid):
1041                 sync += Display("completing load hit to MMU, data=%x",
1042                                 m_out.data)
1043             # error cases complete without stalling
1044             with m.If(r1.mmu_error):
1045                 sync += Display("combpleting MMU ld with error")
1046
1047             # Slow ops (i.e. load miss)
1048             with m.If(r1.slow_valid):
1049                 sync += Display("completing MMU load miss, data=%x",
1050                                 m_out.data)
1051
1052     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1053         """rams
1054         Generate a cache RAM for each way. This handles the normal
1055         reads, writes from reloads and the special store-hit update
1056         path as well.
1057
1058         Note: the BRAMs have an extra read buffer, meaning the output
1059         is pipelined an extra cycle. This differs from the
1060         icache. The writeback logic needs to take that into
1061         account by using 1-cycle delayed signals for load hits.
1062         """
1063         comb = m.d.comb
1064         wb_in = self.wb_in
1065
1066         for i in range(NUM_WAYS):
1067             do_read  = Signal(name="do_rd%d" % i)
1068             rd_addr  = Signal(ROW_BITS)
1069             do_write = Signal(name="do_wr%d" % i)
1070             wr_addr  = Signal(ROW_BITS)
1071             wr_data  = Signal(WB_DATA_BITS)
1072             wr_sel   = Signal(ROW_SIZE)
1073             wr_sel_m = Signal(ROW_SIZE)
1074             _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1075
1076             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
1077             setattr(m.submodules, "cacheram_%d" % i, way)
1078
1079             comb += way.rd_en.eq(do_read)
1080             comb += way.rd_addr.eq(rd_addr)
1081             comb += _d_out.eq(way.rd_data_o)
1082             comb += way.wr_sel.eq(wr_sel_m)
1083             comb += way.wr_addr.eq(wr_addr)
1084             comb += way.wr_data.eq(wr_data)
1085
1086             # Cache hit reads
1087             comb += do_read.eq(1)
1088             comb += rd_addr.eq(early_req_row)
1089             with m.If(r1.hit_way == i):
1090                 comb += cache_out_row.eq(_d_out)
1091
1092             # Write mux:
1093             #
1094             # Defaults to wishbone read responses (cache refill)
1095             #
1096             # For timing, the mux on wr_data/sel/addr is not
1097             # dependent on anything other than the current state.
1098
1099             with m.If(r1.write_bram):
1100                 # Write store data to BRAM.  This happens one
1101                 # cycle after the store is in r0.
1102                 comb += wr_data.eq(r1.req.data)
1103                 comb += wr_sel.eq(r1.req.byte_sel)
1104                 comb += wr_addr.eq(get_row(r1.req.real_addr))
1105
1106                 with m.If(i == r1.req.hit_way):
1107                     comb += do_write.eq(1)
1108             with m.Else():
1109                 # Otherwise, we might be doing a reload or a DCBZ
1110                 with m.If(r1.dcbz):
1111                     comb += wr_data.eq(0)
1112                 with m.Else():
1113                     comb += wr_data.eq(wb_in.dat)
1114                 comb += wr_addr.eq(r1.store_row)
1115                 comb += wr_sel.eq(~0) # all 1s
1116
1117             with m.If((r1.state == State.RELOAD_WAIT_ACK)
1118                       & wb_in.ack & (replace_way == i)):
1119                 comb += do_write.eq(1)
1120
1121             # Mask write selects with do_write since BRAM
1122             # doesn't have a global write-enable
1123             with m.If(do_write):
1124                 comb += wr_sel_m.eq(wr_sel)
1125
1126     # Cache hit synchronous machine for the easy case.
1127     # This handles load hits.
1128     # It also handles error cases (TLB miss, cache paradox)
1129     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1130                         req_hit_way, req_index, req_tag, access_ok,
1131                         tlb_hit, tlb_hit_way, tlb_req_index):
1132
1133         comb = m.d.comb
1134         sync = m.d.sync
1135
1136         with m.If(req_op != Op.OP_NONE):
1137             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1138                     req_op, r0.req.addr, r0.req.nc,
1139                     req_index, req_tag, req_hit_way)
1140
1141         with m.If(r0_valid):
1142             sync += r1.mmu_req.eq(r0.mmu_req)
1143
1144         # Fast path for load/store hits.
1145         # Set signals for the writeback controls.
1146         sync += r1.hit_way.eq(req_hit_way)
1147         sync += r1.hit_index.eq(req_index)
1148
1149         with m.If(req_op == Op.OP_LOAD_HIT):
1150             sync += r1.hit_load_valid.eq(1)
1151         with m.Else():
1152             sync += r1.hit_load_valid.eq(0)
1153
1154         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1155             sync += r1.cache_hit.eq(1)
1156         with m.Else():
1157             sync += r1.cache_hit.eq(0)
1158
1159         with m.If(req_op == Op.OP_BAD):
1160             # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1161             #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
1162             sync += r1.ls_error.eq(~r0.mmu_req)
1163             sync += r1.mmu_error.eq(r0.mmu_req)
1164             sync += r1.cache_paradox.eq(access_ok)
1165
1166             with m.Else():
1167                 sync += r1.ls_error.eq(0)
1168                 sync += r1.mmu_error.eq(0)
1169                 sync += r1.cache_paradox.eq(0)
1170
1171         with m.If(req_op == Op.OP_STCX_FAIL):
1172             sync += r1.stcx_fail.eq(1)
1173         with m.Else():
1174             sync += r1.stcx_fail.eq(0)
1175
1176         # Record TLB hit information for updating TLB PLRU
1177         sync += r1.tlb_hit.eq(tlb_hit)
1178         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1179         sync += r1.tlb_hit_index.eq(tlb_req_index)
1180
1181     # Memory accesses are handled by this state machine:
1182     #
1183     #   * Cache load miss/reload (in conjunction with "rams")
1184     #   * Load hits for non-cachable forms
1185     #   * Stores (the collision case is handled in "rams")
1186     #
1187     # All wishbone requests generation is done here.
1188     # This machine operates at stage 1.
1189     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1190                     cache_valids, r0, replace_way,
1191                     req_hit_way, req_same_tag,
1192                     r0_valid, req_op, cache_tags, req_go, ra):
1193
1194         comb = m.d.comb
1195         sync = m.d.sync
1196         wb_in = self.wb_in
1197
1198         req         = MemAccessRequest("mreq_ds")
1199         acks        = Signal(3)
1200         adjust_acks = Signal(3)
1201
1202         req_row = Signal(ROW_BITS)
1203         req_idx = Signal(INDEX_BITS)
1204         req_tag = Signal(TAG_BITS)
1205         comb += req_idx.eq(get_index(req.real_addr))
1206         comb += req_row.eq(get_row(req.real_addr))
1207         comb += req_tag.eq(get_tag(req.real_addr))
1208
1209         sync += r1.use_forward1.eq(use_forward1_next)
1210         sync += r1.forward_sel.eq(0)
1211
1212         with m.If(use_forward1_next):
1213             sync += r1.forward_sel.eq(r1.req.byte_sel)
1214         with m.Elif(use_forward2_next):
1215             sync += r1.forward_sel.eq(r1.forward_sel1)
1216
1217         sync += r1.forward_data2.eq(r1.forward_data1)
1218         with m.If(r1.write_bram):
1219             sync += r1.forward_data1.eq(r1.req.data)
1220             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1221             sync += r1.forward_way1.eq(r1.req.hit_way)
1222             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1223             sync += r1.forward_valid1.eq(1)
1224         with m.Else():
1225             with m.If(r1.dcbz):
1226                 sync += r1.forward_data1.eq(0)
1227             with m.Else():
1228                 sync += r1.forward_data1.eq(wb_in.dat)
1229             sync += r1.forward_sel1.eq(~0) # all 1s
1230             sync += r1.forward_way1.eq(replace_way)
1231             sync += r1.forward_row1.eq(r1.store_row)
1232             sync += r1.forward_valid1.eq(0)
1233
1234         # One cycle pulses reset
1235         sync += r1.slow_valid.eq(0)
1236         sync += r1.write_bram.eq(0)
1237         sync += r1.inc_acks.eq(0)
1238         sync += r1.dec_acks.eq(0)
1239
1240         sync += r1.ls_valid.eq(0)
1241         # complete tlbies and TLB loads in the third cycle
1242         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1243
1244         with m.If((req_op == Op.OP_LOAD_HIT)
1245                   | (req_op == Op.OP_STCX_FAIL)):
1246             with m.If(~r0.mmu_req):
1247                 sync += r1.ls_valid.eq(1)
1248             with m.Else():
1249                 sync += r1.mmu_done.eq(1)
1250
1251         with m.If(r1.write_tag):
1252             # Store new tag in selected way
1253             for i in range(NUM_WAYS):
1254                 with m.If(i == replace_way):
1255                     ct = Signal(TAG_RAM_WIDTH)
1256                     comb += ct.eq(cache_tags[r1.store_index])
1257                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1258                     sync += cache_tags[r1.store_index].eq(ct)
1259             sync += r1.store_way.eq(replace_way)
1260             sync += r1.write_tag.eq(0)
1261
1262         # Take request from r1.req if there is one there,
1263         # else from req_op, ra, etc.
1264         with m.If(r1.full):
1265             comb += req.eq(r1.req)
1266         with m.Else():
1267             comb += req.op.eq(req_op)
1268             comb += req.valid.eq(req_go)
1269             comb += req.mmu_req.eq(r0.mmu_req)
1270             comb += req.dcbz.eq(r0.req.dcbz)
1271             comb += req.real_addr.eq(ra)
1272
1273             with m.If(~r0.req.dcbz):
1274                 comb += req.data.eq(r0.req.data)
1275             with m.Else():
1276                 comb += req.data.eq(0)
1277
1278             # Select all bytes for dcbz
1279             # and for cacheable loads
1280             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1281                 comb += req.byte_sel.eq(~0) # all 1s
1282             with m.Else():
1283                 comb += req.byte_sel.eq(r0.req.byte_sel)
1284             comb += req.hit_way.eq(req_hit_way)
1285             comb += req.same_tag.eq(req_same_tag)
1286
1287             # Store the incoming request from r0,
1288             # if it is a slow request
1289             # Note that r1.full = 1 implies req_op = OP_NONE
1290             with m.If((req_op == Op.OP_LOAD_MISS)
1291                       | (req_op == Op.OP_LOAD_NC)
1292                       | (req_op == Op.OP_STORE_MISS)
1293                       | (req_op == Op.OP_STORE_HIT)):
1294                 sync += r1.req.eq(req)
1295                 sync += r1.full.eq(1)
1296
1297         # Main state machine
1298         with m.Switch(r1.state):
1299
1300             with m.Case(State.IDLE):
1301                 sync += r1.real_adr.eq(req.real_addr)
1302                 sync += r1.wb.sel.eq(req.byte_sel)
1303                 sync += r1.wb.dat.eq(req.data)
1304                 sync += r1.dcbz.eq(req.dcbz)
1305
1306                 # Keep track of our index and way
1307                 # for subsequent stores.
1308                 sync += r1.store_index.eq(req_idx)
1309                 sync += r1.store_row.eq(req_row)
1310                 sync += r1.end_row_ix.eq(get_row_of_line(req_row))
1311                 sync += r1.reload_tag.eq(req_tag)
1312                 sync += r1.req.same_tag.eq(1)
1313
1314                 with m.If(req.op == Op.OP_STORE_HIT):
1315                     sync += r1.store_way.eq(req.hit_way)
1316
1317                 # Reset per-row valid bits,
1318                 # ready for handling OP_LOAD_MISS
1319                 for i in range(ROW_PER_LINE):
1320                     sync += r1.rows_valid[i].eq(0)
1321
1322                 with m.If(req_op != Op.OP_NONE):
1323                     sync += Display("cache op %d", req.op)
1324
1325                 with m.Switch(req.op):
1326                     with m.Case(Op.OP_LOAD_HIT):
1327                         # stay in IDLE state
1328                         pass
1329
1330                     with m.Case(Op.OP_LOAD_MISS):
1331                         sync += Display("cache miss real addr: %x " \
1332                                 "idx: %x tag: %x",
1333                                 req.real_addr, req_row, req_tag)
1334
1335                         # Start the wishbone cycle
1336                         sync += r1.wb.we.eq(0)
1337                         sync += r1.wb.cyc.eq(1)
1338                         sync += r1.wb.stb.eq(1)
1339
1340                         # Track that we had one request sent
1341                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1342                         sync += r1.write_tag.eq(1)
1343
1344                     with m.Case(Op.OP_LOAD_NC):
1345                         sync += r1.wb.cyc.eq(1)
1346                         sync += r1.wb.stb.eq(1)
1347                         sync += r1.wb.we.eq(0)
1348                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1349
1350                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1351                         with m.If(~req.dcbz):
1352                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1353                             sync += r1.acks_pending.eq(1)
1354                             sync += r1.full.eq(0)
1355                             sync += r1.slow_valid.eq(1)
1356
1357                             with m.If(~req.mmu_req):
1358                                 sync += r1.ls_valid.eq(1)
1359                             with m.Else():
1360                                 sync += r1.mmu_done.eq(1)
1361
1362                             with m.If(req.op == Op.OP_STORE_HIT):
1363                                 sync += r1.write_bram.eq(1)
1364                         with m.Else():
1365                             # dcbz is handled much like a load miss except
1366                             # that we are writing to memory instead of reading
1367                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1368
1369                             with m.If(req.op == Op.OP_STORE_MISS):
1370                                 sync += r1.write_tag.eq(1)
1371
1372                         sync += r1.wb.we.eq(1)
1373                         sync += r1.wb.cyc.eq(1)
1374                         sync += r1.wb.stb.eq(1)
1375
1376                     # OP_NONE and OP_BAD do nothing
1377                     # OP_BAD & OP_STCX_FAIL were
1378                     # handled above already
1379                     with m.Case(Op.OP_NONE):
1380                         pass
1381                     with m.Case(Op.OP_BAD):
1382                         pass
1383                     with m.Case(Op.OP_STCX_FAIL):
1384                         pass
1385
1386             with m.Case(State.RELOAD_WAIT_ACK):
1387                 ld_stbs_done = Signal()
1388                 # Requests are all sent if stb is 0
1389                 comb += ld_stbs_done.eq(~r1.wb.stb)
1390
1391                 with m.If((~wb_in.stall) & r1.wb.stb):
1392                     # That was the last word?
1393                     # We are done sending.
1394                     # Clear stb and set ld_stbs_done
1395                     # so we can handle an eventual
1396                     # last ack on the same cycle.
1397                     with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
1398                         sync += r1.wb.stb.eq(0)
1399                         comb += ld_stbs_done.eq(1)
1400
1401                     # Calculate the next row address in the current cache line
1402                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1403                     comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
1404                     sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
1405
1406                 # Incoming acks processing
1407                 sync += r1.forward_valid1.eq(wb_in.ack)
1408                 with m.If(wb_in.ack):
1409                     srow = Signal(ROW_LINE_BITS)
1410                     comb += srow.eq(r1.store_row)
1411                     sync += r1.rows_valid[srow].eq(1)
1412
1413                     # If this is the data we were looking for,
1414                     # we can complete the request next cycle.
1415                     # Compare the whole address in case the
1416                     # request in r1.req is not the one that
1417                     # started this refill.
1418                     with m.If(r1.full & r1.req.same_tag &
1419                               ((r1.dcbz & r1.req.dcbz) |
1420                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1421                                 (r1.store_row == get_row(r1.req.real_addr))):
1422                         sync += r1.full.eq(0)
1423                         sync += r1.slow_valid.eq(1)
1424                         with m.If(~r1.mmu_req):
1425                             sync += r1.ls_valid.eq(1)
1426                         with m.Else():
1427                             sync += r1.mmu_done.eq(1)
1428                         sync += r1.forward_sel.eq(~0) # all 1s
1429                         sync += r1.use_forward1.eq(1)
1430
1431                     # Check for completion
1432                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1433                                                       r1.end_row_ix)):
1434                         # Complete wishbone cycle
1435                         sync += r1.wb.cyc.eq(0)
1436
1437                         # Cache line is now valid
1438                         cv = Signal(INDEX_BITS)
1439                         comb += cv.eq(cache_valids[r1.store_index])
1440                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1441                         sync += cache_valids[r1.store_index].eq(cv)
1442                         sync += r1.state.eq(State.IDLE)
1443
1444                     # Increment store row counter
1445                     sync += r1.store_row.eq(next_row(r1.store_row))
1446
1447             with m.Case(State.STORE_WAIT_ACK):
1448                 st_stbs_done = Signal()
1449                 comb += st_stbs_done.eq(~r1.wb.stb)
1450                 comb += acks.eq(r1.acks_pending)
1451
1452                 with m.If(r1.inc_acks != r1.dec_acks):
1453                     with m.If(r1.inc_acks):
1454                         comb += adjust_acks.eq(acks + 1)
1455                     with m.Else():
1456                         comb += adjust_acks.eq(acks - 1)
1457                 with m.Else():
1458                     comb += adjust_acks.eq(acks)
1459
1460                 sync += r1.acks_pending.eq(adjust_acks)
1461
1462                 # Clear stb when slave accepted request
1463                 with m.If(~wb_in.stall):
1464                     # See if there is another store waiting
1465                     # to be done which is in the same real page.
1466                     with m.If(req.valid):
1467                         ra = req.real_addr[0:SET_SIZE_BITS]
1468                         sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
1469                         sync += r1.wb.dat.eq(req.data)
1470                         sync += r1.wb.sel.eq(req.byte_sel)
1471
1472                     with m.Elif((adjust_acks < 7) & req.same_tag &
1473                                 ((req.op == Op.OP_STORE_MISS)
1474                                  | (req.op == Op.OP_STORE_HIT))):
1475                         sync += r1.wb.stb.eq(1)
1476                         comb += st_stbs_done.eq(0)
1477
1478                         with m.If(req.op == Op.OP_STORE_HIT):
1479                             sync += r1.write_bram.eq(1)
1480                         sync += r1.full.eq(0)
1481                         sync += r1.slow_valid.eq(1)
1482
1483                         # Store requests never come from the MMU
1484                         sync += r1.ls_valid.eq(1)
1485                         comb += st_stbs_done.eq(0)
1486                         sync += r1.inc_acks.eq(1)
1487                     with m.Else():
1488                         sync += r1.wb.stb.eq(0)
1489                         comb += st_stbs_done.eq(1)
1490
1491                 # Got ack ? See if complete.
1492                 with m.If(wb_in.ack):
1493                     with m.If(st_stbs_done & (adjust_acks == 1)):
1494                         sync += r1.state.eq(State.IDLE)
1495                         sync += r1.wb.cyc.eq(0)
1496                         sync += r1.wb.stb.eq(0)
1497                     sync += r1.dec_acks.eq(1)
1498
1499             with m.Case(State.NC_LOAD_WAIT_ACK):
1500                 # Clear stb when slave accepted request
1501                 with m.If(~wb_in.stall):
1502                     sync += r1.wb.stb.eq(0)
1503
1504                 # Got ack ? complete.
1505                 with m.If(wb_in.ack):
1506                     sync += r1.state.eq(State.IDLE)
1507                     sync += r1.full.eq(0)
1508                     sync += r1.slow_valid.eq(1)
1509
1510                     with m.If(~r1.mmu_req):
1511                         sync += r1.ls_valid.eq(1)
1512                     with m.Else():
1513                         sync += r1.mmu_done.eq(1)
1514
1515                     sync += r1.forward_sel.eq(~0) # all 1s
1516                     sync += r1.use_forward1.eq(1)
1517                     sync += r1.wb.cyc.eq(0)
1518                     sync += r1.wb.stb.eq(0)
1519
1520     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1521
1522         sync = m.d.sync
1523         d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1524
1525         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1526                                stall_out, req_op[:3], d_out.valid, d_out.error,
1527                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1528                                r1.real_adr[3:6]))
1529
    def elaborate(self, platform):
        """Build and return the nmigen Module implementing the dcache.

        Declares the storage arrays, the stage-0/stage-1 registers and the
        signals shared between the different parts of the cache, then passes
        them to the helper methods (stage_0, tlb_read, ... dcache_slow), each
        of which adds its own logic to the module.

        :param platform: not referenced by this method
        :returns: the populated Module
        """

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)
        cache_valids = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags       = TLBTagsArray()
        dtlb_ptes       = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        # Stage-0 register plus its "occupied" flag
        r0      = RegStage0("r0")
        r0_full = Signal()

        # Stage-1 register
        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        # NOTE(review): req_data is declared here but not passed to any of
        # the helper methods called below.
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row     = Signal(ROW_BITS)

        cancel_store      = Signal()
        set_rsrv          = Signal()
        clear_rsrv        = Signal()

        r0_valid          = Signal()
        r0_stall          = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row     = Signal(WB_DATA_BITS)

        plru_victim       = PLRUOut()
        replace_way       = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        # NOTE(review): bus_sel is not passed to any helper method below.
        bus_sel           = Signal(8)

        # TLB signals
        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = Signal()
        tlb_hit_way   = Signal(TLB_WAY_BITS)
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request:
        # a full r0 is either stalled (r1 also full) or valid (r1 free),
        # and the stall is exported unchanged on stall_out.
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += r1.wb.adr.eq(r1.real_adr[ROW_OFF_BITS:]) # truncate LSBs
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                           r0_valid, r1, cache_valids, replace_way,
                           use_forward1_next, use_forward2_next,
                           req_hit_way, plru_victim, rc_ok, perm_attr,
                           valid_ra, perm_ok, access_ok, req_op, req_go,
                           tlb_pte_way,
                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                           cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                           r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                           reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                    cache_valids, r0, replace_way,
                    req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # Logging is left disabled.  NOTE(review): there is no local called
        # `stall_out` in this method (the stall output is self.stall_out),
        # so the call below would need fixing before it can be re-enabled.
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m
1660
def dcache_load(dut, addr, nc=0):
    """Simulation helper: issue a single load and return the loaded data.

    The request is presented on ``dut.d_in`` for one clock and then
    withdrawn; the helper then polls ``dut.d_out.valid`` once per clock
    and, as soon as it reads non-zero, returns ``dut.d_out.data``.

    :param dut: the cache under test (needs ``d_in`` and ``d_out``)
    :param addr: address to load from
    :param nc: value driven onto ``d_in.nc`` (1 = non-cacheable access)
    :returns: the value read from ``d_out.data``
    """
    d_in, d_out = dut.d_in, dut.d_out

    # present the request with every byte selected
    request = (
        (d_in.load, 1),
        (d_in.nc, nc),
        (d_in.addr, addr),
        (d_in.byte_sel, ~0),
        (d_in.valid, 1),
    )
    for sig, value in request:
        yield sig.eq(value)
    yield  # keep the request asserted for one clock

    # withdraw the request
    yield d_in.valid.eq(0)
    yield d_in.byte_sel.eq(0)

    # wait (one clock per poll) for the cache to answer
    while True:
        done = yield d_out.valid
        if done:
            break
        yield

    return (yield d_out.data)
1674
1675
def dcache_store(dut, addr, data, nc=0):
    """Simulation helper: issue a single store and wait for completion.

    The request is presented on ``dut.d_in`` for one clock and then
    withdrawn; the helper returns once ``dut.d_out.valid`` reads non-zero
    (polled once per clock).

    :param dut: the cache under test (needs ``d_in`` and ``d_out``)
    :param addr: address to store to
    :param data: value driven onto ``d_in.data``
    :param nc: value driven onto ``d_in.nc`` (1 = non-cacheable access)
    """
    d_in = dut.d_in

    # present the request with every byte selected
    request = (
        (d_in.load, 0),
        (d_in.nc, nc),
        (d_in.data, data),
        (d_in.byte_sel, ~0),
        (d_in.addr, addr),
        (d_in.valid, 1),
    )
    for sig, value in request:
        yield sig.eq(value)
    yield  # keep the request asserted for one clock

    # withdraw the request
    yield d_in.valid.eq(0)
    yield d_in.byte_sel.eq(0)

    # wait (one clock per poll) for the cache to acknowledge
    while True:
        done = yield dut.d_out.valid
        if done:
            break
        yield
1688
1689
def dcache_random_sim(dut):
    """Randomised load/store test against a software model of memory.

    A 1024-entry list models memory, one entry per 8-byte row, with row
    ``i`` starting out as ``i``.  Each of the 1024 iterations loads and
    then overwrites a random row with a random 64-bit value, after which
    a second random row is loaded and compared with the model.  At the
    end every row is read back and checked.

    Uses the module-level ``randint``; seed the generator beforehand for
    reproducible runs.

    NOTE(review): the checks only hold if the memory behind the cache was
    initialised with row i == i for all 1024 rows - confirm against the
    image the caller passes to the simulation.
    """
    # software model: row i initially holds i
    sim_mem = list(range(1024))

    # put the request interfaces into a known idle state
    d_in, m_in = dut.d_in, dut.m_in
    idle = (
        (d_in.valid, 0), (d_in.load, 0), (d_in.priv_mode, 1),
        (d_in.nc, 0), (d_in.addr, 0), (d_in.data, 0),
        (m_in.valid, 0), (m_in.addr, 0), (m_in.pte, 0),
    )
    for sig, value in idle:
        yield sig.eq(value)

    # wait 4 * clk_period
    for _ in range(4):
        yield

    print ()

    for i in range(1024):
        # overwrite a random row (load first, then store)
        row = randint(0, 1023)
        data = randint(0, (1<<64)-1)
        sim_mem[row] = data
        addr = row * 8

        print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))

        yield from dcache_load(dut, addr)
        yield from dcache_store(dut, addr, data)

        # read back another random row and compare with the model
        row = randint(0, 1023)
        sim_data = sim_mem[row]
        addr = row * 8

        print ("    load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
        data = yield from dcache_load(dut, addr)
        assert data == sim_data, \
            "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)

    # final sweep: every row must match the model
    for row, expected in enumerate(sim_mem):
        data = yield from dcache_load(dut, row*8)
        assert data == expected, \
            "final check %x data %x != %x" % (row*8, data, expected)
1742
1743
def dcache_sim(dut):
    """Directed test: a fixed sequence of loads and stores with known data.

    The expected values correspond to a memory image in which 8-byte row
    ``i`` holds ``(2*i) | ((2*i+1) << 32)``.  Each load is checked with an
    assertion; the address reported in a failure message is read back from
    ``dut.d_in.addr``.
    """
    def check_load(addr, expected, fmt, nc=0):
        # load, read back the address still driven on d_in, then compare
        data = yield from dcache_load(dut, addr, nc=nc)
        seen = yield dut.d_in.addr
        assert data == expected, fmt % (seen, data)

    # put the request interfaces into a known idle state
    d_in, m_in = dut.d_in, dut.m_in
    idle = (
        (d_in.valid, 0), (d_in.load, 0), (d_in.priv_mode, 1),
        (d_in.nc, 0), (d_in.addr, 0), (d_in.data, 0),
        (m_in.valid, 0), (m_in.addr, 0), (m_in.pte, 0),
    )
    for sig, value in idle:
        yield sig.eq(value)

    # wait 4 * clk_period
    for _ in range(4):
        yield

    # Cacheable read of address 0x58
    yield from check_load(0x58, 0x0000001700000016,
                          "data @%x=%x expected 0x0000001700000016")

    # Cacheable read of address 0x20
    yield from check_load(0x20, 0x0000000900000008,
                          "data @%x=%x expected 0x0000000900000008")

    # Cacheable read of address 0x530
    yield from check_load(0x530, 0x0000014D0000014C,
                          "data @%x=%x expected 0000014D0000014C")

    # 2nd cacheable read of address 0x530
    yield from check_load(0x530, 0x0000014D0000014C,
                          "data @%x=%x expected 0000014D0000014C")

    # Non-cacheable read of address 0x100
    yield from check_load(0x100, 0x0000004100000040,
                          "data @%x=%x expected 0000004100000040", nc=1)

    # Two back-to-back stores to address 0x530; the second value must win
    yield from dcache_store(dut, 0x530, 0x121)
    yield from dcache_store(dut, 0x530, 0x12345678)

    # 3rd cacheable read of address 0x530: must see the last store
    yield from check_load(0x530, 0x12345678,
                          "data @%x=%x expected 0x12345678")

    # 2nd cacheable read of address 0x20: must be unchanged by the stores
    yield from check_load(0x20, 0x0000000900000008,
                          "data @%x=%x expected 0x0000000900000008")

    # a few idle clocks at the end of the run
    for _ in range(4):
        yield
1813
1814
def test_dcache(mem, test_fn, test_name):
    """Simulate a DCache wired to an SRAM and run ``test_fn`` against it.

    :param mem: initial contents for the 64-bit wide, 16*64 deep Memory
                that backs the SRAM
    :param test_fn: generator function taking the DCache instance; it is
                    run as a sync process of the simulation
    :param test_name: suffix for the VCD file, which is written to
                      ``test_dcache<test_name>.vcd``
    """
    dut = DCache()

    # backing store behind the cache
    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.dcache = dut
    m.submodules.sram = sram
    comb = m.d.comb

    # cache -> SRAM: same-named wishbone request signals, then write data
    for name in ("cyc", "stb", "we", "sel", "adr"):
        comb += getattr(sram.bus, name).eq(getattr(dut.wb_out, name))
    comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    # SRAM -> cache: acknowledge and read data (wb_in.stall is not driven)
    comb += dut.wb_in.ack.eq(sram.bus.ack)
    comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(test_fn(dut)))

    vcd_name = 'test_dcache%s.vcd' % test_name
    with sim.write_vcd(vcd_name):
        sim.run()
1842
if __name__ == '__main__':
    # fixed seed so that the random test is reproducible
    seed(0)

    # dump the design as RTLIL
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    # Random test: dcache_random_sim models all 1024 rows as initially
    # holding their own index, and the simulated Memory is 16*64 = 1024
    # rows deep, so the image has to cover every row.  Rows missing from
    # the init list power up as zero, which would disagree with the model
    # for any row that is loaded before it has been stored to.
    mem = list(range(1024))
    test_dcache(mem, dcache_random_sim, "random")

    # Directed test: row i holds 2*i in the low word and 2*i+1 in the
    # high word, which is what dcache_sim's expected values assume.
    mem = [(i*2) | ((i*2+1) << 32) for i in range(512)]
    test_dcache(mem, dcache_sim, "")
1861