src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8
   9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  10 is raised.  sigh
  11
  12 Links:
  13
  14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  16
  17 """
  18
  19 import sys
  20
  21 from nmutil.gtkw import write_gtkw
  22
  23 sys.setrecursionlimit(1000000)
  24
  25 from enum import Enum, unique
  26
  27 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  28 from nmutil.util import Display
  29
  30 from copy import deepcopy
  31 from random import randint, seed
  32
  33 from nmigen.cli import main
  34 from nmutil.iocontrol import RecordObject
  35 from nmigen.utils import log2_int
  36 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  37                                      DCacheToLoadStore1Type,
  38                                      MMUToDCacheType,
  39                                      DCacheToMMUType)
  40
  41 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  42                                 WBAddrType, WBDataType, WBSelType,
  43                                 WBMasterOut, WBSlaveOut,
  44                                 WBMasterOutVector, WBSlaveOutVector,
  45                                 WBIOMasterOut, WBIOSlaveOut)
  46
  47 from soc.experiment.cache_ram import CacheRam
  48 #from soc.experiment.plru import PLRU
  49 from nmutil.plru import PLRU
  50
  51 # for test
  52 from soc.bus.sram import SRAM
  53 from nmigen import Memory
  54 from nmigen.cli import rtlil
  55
  56 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  57 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  58 from nmutil.sim_tmp_alternative import Simulator
  59
  60 from nmutil.util import wrap
  61
  62
  63 # TODO: make these parameters of DCache at some point
  64 LINE_SIZE = 64    # Line size in bytes
  65 NUM_LINES = 16    # Number of lines in a set
  66 NUM_WAYS = 4      # Number of ways
  67 TLB_SET_SIZE = 64 # L1 DTLB entries per set
  68 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
  69 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  70 LOG_LENGTH = 0    # Non-zero to enable log data collection
  71
  72 # BRAM organisation: We never access more than
  73 #     -- WB_DATA_BITS at a time so to save
  74 #     -- resources we make the array only that wide, and
  75 #     -- use consecutive indices for to make a cache "line"
  76 #     --
  77 #     -- ROW_SIZE is the width in bytes of the BRAM
  78 #     -- (based on WB, so 64-bits)
  79 ROW_SIZE = WB_DATA_BITS // 8;
  80
  81 # ROW_PER_LINE is the number of row (wishbone
  82 # transactions) in a line
  83 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  84
  85 # BRAM_ROWS is the number of rows in BRAM needed
  86 # to represent the full dcache
  87 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  88
  89 print ("ROW_SIZE", ROW_SIZE)
  90 print ("ROW_PER_LINE", ROW_PER_LINE)
  91 print ("BRAM_ROWS", BRAM_ROWS)
  92 print ("NUM_WAYS", NUM_WAYS)
  93
  94 # Bit fields counts in the address
  95
  96 # REAL_ADDR_BITS is the number of real address
  97 # bits that we store
  98 REAL_ADDR_BITS = 56
  99
 100 # ROW_BITS is the number of bits to select a row
 101 ROW_BITS = log2_int(BRAM_ROWS)
 102
 103 # ROW_LINE_BITS is the number of bits to select
 104 # a row within a line
 105 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
 106
 107 # LINE_OFF_BITS is the number of bits for
 108 # the offset in a cache line
 109 LINE_OFF_BITS = log2_int(LINE_SIZE)
 110
 111 # ROW_OFF_BITS is the number of bits for
 112 # the offset in a row
 113 ROW_OFF_BITS = log2_int(ROW_SIZE)
 114
 115 # INDEX_BITS is the number if bits to
 116 # select a cache line
 117 INDEX_BITS = log2_int(NUM_LINES)
 118
 119 # SET_SIZE_BITS is the log base 2 of the set size
 120 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 121
 122 # TAG_BITS is the number of bits of
 123 # the tag part of the address
 124 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 125
 126 # TAG_WIDTH is the width in bits of each way of the tag RAM
 127 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 128
 129 # WAY_BITS is the number of bits to select a way
 130 WAY_BITS = log2_int(NUM_WAYS)
 131
 132 # Example of layout for 32 lines of 64 bytes:
 133 layout = """\
 134   ..  tag    |index|  line  |
 135   ..         |   row   |    |
 136   ..         |     |---|    | ROW_LINE_BITS  (3)
 137   ..         |     |--- - --| LINE_OFF_BITS (6)
 138   ..         |         |- --| ROW_OFF_BITS  (3)
 139   ..         |----- ---|    | ROW_BITS      (8)
 140   ..         |-----|        | INDEX_BITS    (5)
 141   .. --------|              | TAG_BITS      (45)
 142 """
 143 print (layout)
 144 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 145             (TAG_BITS, INDEX_BITS, ROW_BITS,
 146              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 147 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 148 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 149 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 150
 151 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 152
 153 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 154
 155 def CacheTagArray():
 156     return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
 157                         for x in range(NUM_LINES))
 158
 159 def CacheValidBitsArray():
 160     return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
 161                         for x in range(NUM_LINES))
 162
 163 def RowPerLineValidArray():
 164     return Array(Signal(name="rows_valid%d" % x) \
 165                         for x in range(ROW_PER_LINE))
 166
 167 # L1 TLB
 168 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 169 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 170 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 171 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 172 TLB_PTE_BITS     = 64
 173 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 174
 175 def ispow2(x):
 176     return (1<<log2_int(x, False)) == x
 177
 178 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 179 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 180 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 181 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 182 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 183 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 184         "geometry bits don't add up"
 185 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 186         "geometry bits don't add up"
 187 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 188          "geometry bits don't add up"
 189 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 190 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 191
 192
 193 def TLBValidBitsArray():
 194     return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
 195                 for x in range(TLB_SET_SIZE))
 196
 197 def TLBTagEAArray():
 198     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 199                 for x in range (TLB_NUM_WAYS))
 200
 201 def TLBTagsArray():
 202     return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
 203                 for x in range (TLB_SET_SIZE))
 204
 205 def TLBPtesArray():
 206     return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
 207                 for x in range(TLB_SET_SIZE))
 208
 209 def HitWaySet():
 210     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 211                         for x in range(TLB_NUM_WAYS))
 212
 213 # Cache RAM interface
 214 def CacheRamOut():
 215     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 216                  for x in range(NUM_WAYS))
 217
 218 # PLRU output interface
 219 def PLRUOut():
 220     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 221                 for x in range(NUM_LINES))
 222
 223 # TLB PLRU output interface
 224 def TLBPLRUOut():
 225     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 226                 for x in range(TLB_SET_SIZE))
 227
 228 # Helper functions to decode incoming requests
 229 #
 230 # Return the cache line index (tag index) for an address
 231 def get_index(addr):
 232     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 233
 234 # Return the cache row index (data memory) for an address
 235 def get_row(addr):
 236     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 237
 238 # Return the index of a row within a line
 239 def get_row_of_line(row):
 240     return row[:ROW_BITS][:ROW_LINE_BITS]
 241
 242 # Returns whether this is the last row of a line
 243 def is_last_row_addr(addr, last):
 244     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 245
 246 # Returns whether this is the last row of a line
 247 def is_last_row(row, last):
 248     return get_row_of_line(row) == last
 249
 250 # Return the next row in the current cache line. We use a
 251 # dedicated function in order to limit the size of the
 252 # generated adder to be only the bits within a cache line
 253 # (3 bits with default settings)
 254 def next_row(row):
 255     row_v = row[0:ROW_LINE_BITS] + 1
 256     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 257
 258 # Get the tag value from the address
 259 def get_tag(addr):
 260     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 261
 262 # Read a tag from a tag memory row
 263 def read_tag(way, tagset):
 264     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 265
 266 # Read a TLB tag from a TLB tag memory row
 267 def read_tlb_tag(way, tags):
 268     return tags.word_select(way, TLB_EA_TAG_BITS)
 269
 270 # Write a TLB tag to a TLB tag memory row
 271 def write_tlb_tag(way, tags, tag):
 272     return read_tlb_tag(way, tags).eq(tag)
 273
 274 # Read a PTE from a TLB PTE memory row
 275 def read_tlb_pte(way, ptes):
 276     return ptes.word_select(way, TLB_PTE_BITS)
 277
 278 def write_tlb_pte(way, ptes, newpte):
 279     return read_tlb_pte(way, ptes).eq(newpte)
 280
 281
 282 # Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission / attribute flags associated with a translation.

    Each field is a single-bit Signal.  DCache.tlb_search fills them
    from fixed bit positions of the PTE in virtual mode, and with
    constants in real mode.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal() # from pte[8] in virtual mode
        self.changed   = Signal() # from pte[7] in virtual mode
        self.nocache   = Signal() # from pte[5] in virtual mode
        self.priv      = Signal() # from pte[3] in virtual mode
        self.rd_perm   = Signal() # from pte[2] in virtual mode
        self.wr_perm   = Signal() # from pte[1] in virtual mode
 292
 293
def extract_perm_attr(pte):
    """Return a fresh PermAttr record.

    NOTE(review): this is a stub - the pte argument is not used and no
    field of the returned record is assigned here.  The actual PTE bit
    extraction is done inline in DCache.tlb_search.
    """
    pa = PermAttr()
    return pa;
 297
 298
 299 # Type of operation on a "valid" input
@unique
class Op(Enum):
    """Kind of operation decoded for a request presented with "valid"."""
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache
 310
 311
 312 # Cache state machine
@unique
class State(Enum):
    """States of the cache state machine (held in RegStage1.state)."""
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 319
 320
 321 # Dcache operations:
 322 #
 323 # In order to make timing, we use the BRAMs with
 324 # an output buffer, which means that the BRAM
 325 # output is delayed by an extra cycle.
 326 #
 327 # Thus, the dcache has a 2-stage internal pipeline
 328 # for cache hits with no stalls.
 329 #
 330 # All other operations are handled via stalling
 331 # in the first stage.
 332 #
 333 # The second stage can thus complete a hit at the same
 334 # time as the first stage emits a stall for a complex op.
 335 #
 336 # Stage 0 register, basically contains just the latched request
 337
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus flags describing
    where it came from (loadstore1 or the MMU) and what kind of TLB
    operation, if any, it carries.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # the request itself, in loadstore1 format (MMU requests are
        # converted to this format in DCache.stage_0)
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
        self.doall   = Signal() # with tlbie, indicates flush whole TLB
        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now
 347
 348
class MemAccessRequest(RecordObject):
    """Decoded memory access request, as held in RegStage1.req."""
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)             # decoded operation type
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS) # REAL_ADDR_BITS wide
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)              # one bit per data byte
        self.hit_way   = Signal(WAY_BITS)       # a cache way number
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 361
 362
 363 # First stage register, contains state for stage 1 of load hits
 364 # and for the state machine used by all other operations
 365 class RegStage1(RecordObject):
 366     def __init__(self, name=None):
 367         super().__init__(name=name)
 368         # Info about the request
 369         self.full             = Signal() # have uncompleted request
 370         self.mmu_req          = Signal() # request is from MMU
 371         self.req              = MemAccessRequest(name="reqmem")
 372
 373         # Cache hit state
 374         self.hit_way          = Signal(WAY_BITS)
 375         self.hit_load_valid   = Signal()
 376         self.hit_index        = Signal(INDEX_BITS)
 377         self.cache_hit        = Signal()
 378
 379         # TLB hit state
 380         self.tlb_hit          = Signal()
 381         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 382         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 383
 384         # 2-stage data buffer for data forwarded from writes to reads
 385         self.forward_data1    = Signal(64)
 386         self.forward_data2    = Signal(64)
 387         self.forward_sel1     = Signal(8)
 388         self.forward_valid1   = Signal()
 389         self.forward_way1     = Signal(WAY_BITS)
 390         self.forward_row1     = Signal(ROW_BITS)
 391         self.use_forward1     = Signal()
 392         self.forward_sel      = Signal(8)
 393
 394         # Cache miss state (reload state machine)
 395         self.state            = Signal(State)
 396         self.dcbz             = Signal()
 397         self.write_bram       = Signal()
 398         self.write_tag        = Signal()
 399         self.slow_valid       = Signal()
 400         self.wb               = WBMasterOut("wb")
 401         self.reload_tag       = Signal(TAG_BITS)
 402         self.store_way        = Signal(WAY_BITS)
 403         self.store_row        = Signal(ROW_BITS)
 404         self.store_index      = Signal(INDEX_BITS)
 405         self.end_row_ix       = Signal(ROW_LINE_BITS)
 406         self.rows_valid       = RowPerLineValidArray()
 407         self.acks_pending     = Signal(3)
 408         self.inc_acks         = Signal()
 409         self.dec_acks         = Signal()
 410
 411         # Signals to complete (possibly with error)
 412         self.ls_valid         = Signal()
 413         self.ls_error         = Signal()
 414         self.mmu_done         = Signal()
 415         self.mmu_error        = Signal()
 416         self.cache_paradox    = Signal()
 417
 418         # Signal to complete a failed stcx.
 419         self.stcx_fail        = Signal()
 420
 421
 422 # Reservation information
class Reservation(RecordObject):
    """Reservation information: a valid flag plus an address."""
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        # 64-bit address with the LINE_OFF_BITS in-line offset bits
        # dropped, i.e. cache-line granularity
        self.addr  = Signal(64-LINE_OFF_BITS)
 428
 429
 430 class DTLBUpdate(Elaboratable):
 431     def __init__(self):
 432         self.tlbie    = Signal()
 433         self.tlbwe    = Signal()
 434         self.doall    = Signal()
 435         self.updated  = Signal()
 436         self.v_updated  = Signal()
 437         self.tlb_hit    = Signal()
 438         self.tlb_req_index = Signal(TLB_SET_BITS)
 439
 440         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
 441         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 442         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 443         self.repl_way        = Signal(TLB_WAY_BITS)
 444         self.eatag           = Signal(TLB_EA_TAG_BITS)
 445         self.pte_data        = Signal(TLB_PTE_BITS)
 446
 447         self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
 448
 449         self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
 450         self.pb_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
 451         self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
 452
 453     def elaborate(self, platform):
 454         m = Module()
 455         comb = m.d.comb
 456         sync = m.d.sync
 457
 458         tagset   = Signal(TLB_TAG_WAY_BITS)
 459         pteset   = Signal(TLB_PTE_WAY_BITS)
 460
 461         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 462         comb += db_out.eq(self.dv)
 463
 464         with m.If(self.tlbie & self.doall):
 465             pass # clear all back in parent
 466         with m.Elif(self.tlbie):
 467             with m.If(self.tlb_hit):
 468                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
 469                 comb += self.v_updated.eq(1)
 470
 471         with m.Elif(self.tlbwe):
 472
 473             comb += tagset.eq(self.tlb_tag_way)
 474             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 475             comb += tb_out.eq(tagset)
 476
 477             comb += pteset.eq(self.tlb_pte_way)
 478             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 479             comb += pb_out.eq(pteset)
 480
 481             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 482
 483             comb += self.updated.eq(1)
 484             comb += self.v_updated.eq(1)
 485
 486         return m
 487
 488
class DCachePendingHit(Elaboratable):
    """Combinatorial cache hit detection for the pending request.

    Outputs is_hit / hit_way (whether and where the request hits in the
    cache) and rel_match (whether the request's tag equals reload_tag).
    In virtual mode the comparison is done against the real address
    from every TLB way in parallel, and tlb_hit_way then selects which
    result to use.
    """

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                      cache_valid_idx, cache_tag_set,
                    req_addr,
                    hit_set):

        # signals owned by this module (driven or read in elaborate)
        self.go          = Signal()
        self.virt_mode   = Signal()
        self.is_hit      = Signal()
        self.tlb_hit     = Signal()
        self.hit_way     = Signal(WAY_BITS)
        self.rel_match   = Signal()
        self.req_index   = Signal(INDEX_BITS)
        self.reload_tag  = Signal(TAG_BITS)

        # signals / arrays shared with the parent
        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        # per-TLB-way intermediate results
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal()
                s_pte       = Signal(TLB_PTE_BITS)
                s_ra        = Signal(REAL_ADDR_BITS)
                # real address = page offset from the request, page
                # number from the PTE of TLB way j
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            # pick the results belonging to the TLB way that hit
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            # real mode: the tag comes straight from the request address
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
 584
 585
 586 class DCache(Elaboratable):
 587     """Set associative dcache write-through
 588
 589     TODO (in no specific order):
 590     * See list in icache.vhdl
 591     * Complete load misses on the cycle when WB data comes instead of
 592       at the end of line (this requires dealing with requests coming in
 593       while not idle...)
 594     """
    def __init__(self):
        """Create the external interfaces of the dcache."""
        # loadstore1 request / response
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        # MMU request / response
        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # wishbone master (out) and slave response (in)
        self.wb_out    = WBMasterOut("wb_out")
        self.wb_in     = WBSlaveOut("wb_in")

        self.log_out   = Signal(20)
 608
    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling

        A candidate request "r" is built combinatorially: when
        m_in.valid is set it is an MMU request converted to loadstore1
        format, otherwise it is a copy of d_in with the data zeroed.
        r is registered into r0 (and r0_full updated) whenever
        (~r1.full & ~d_in.hold) | ~r0_full.
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        # MMU requests take precedence over loadstore1 requests
        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            # anything that is not a tlbie or tlbld is treated as a load
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
        with m.Else():
            comb += r.req.eq(d_in)
            # store data is not taken here: it is sampled a cycle later
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
            # Sample data the cycle after a request comes in from loadstore1.
            # If another request has come in already then the data will get
            # put directly into req.data below.
            # (these later assignments take priority over r0.eq(r) above)
            with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                     ~r0.mmu_req):
                sync += r0.req.data.eq(d_in.data)
                sync += r0.d_valid.eq(1)
 654
    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.

        The TLB set is selected by address bits TLB_LG_PGSZ up to
        TLB_LG_PGSZ+TLB_SET_BITS of the incoming request (m_in when it
        is valid, otherwise d_in).  The valid bits, tags and PTEs of
        that set are registered into tlb_valid_way / tlb_tag_way /
        tlb_pte_way unless r0_stall is set.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index    = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        # bit range of the address that selects the TLB set
        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])
 684
 685     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 686         """Generate TLB PLRUs
 687         """
 688         comb = m.d.comb
 689         sync = m.d.sync
 690
 691         if TLB_NUM_WAYS == 0:
 692             return
 693         for i in range(TLB_SET_SIZE):
 694             # TLB PLRU interface
 695             tlb_plru        = PLRU(TLB_WAY_BITS)
 696             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 697             tlb_plru_acc_en = Signal()
 698
 699             comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
 700             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 701             comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
 702             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 703
    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
        """Look up the request in r0 in the TLB set read by tlb_read.

        Drives tlb_req_index, tlb_hit / tlb_hit_way, pte, valid_ra, the
        real address ra and the permission attributes perm_attr.  In
        real mode (r0.req.virt_mode clear) the address passes through
        and all permissions are granted.
        """

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit    = Signal()
        eatag  = Signal(TLB_EA_TAG_BITS)

        # split the effective address into set index and tag
        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        # compare the tag against every valid way of the set
        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal()
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        # a real address is available on a TLB hit or in real mode
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            # low ROW_OFF_BITS zeroed, page offset from the request,
            # page number from the PTE
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)
 752
 753     def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
 754                     tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
 755                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 756
 757         dtlb_valids = TLBValidBitsArray()
 758
 759         comb = m.d.comb
 760         sync = m.d.sync
 761
 762         tlbie    = Signal()
 763         tlbwe    = Signal()
 764
 765         comb += tlbie.eq(r0_valid & r0.tlbie)
 766         comb += tlbwe.eq(r0_valid & r0.tlbld)
 767
 768         m.submodules.tlb_update = d = DTLBUpdate()
 769         with m.If(tlbie & r0.doall):
 770             # clear all valid bits at once
 771             for i in range(TLB_SET_SIZE):
 772                 sync += dtlb_valid_bits[i].eq(0)
 773         with m.If(d.updated):
 774             sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
 775             sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
 776         with m.If(d.v_updated):
 777             sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
 778
 779         comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
 780
 781         comb += d.tlbie.eq(tlbie)
 782         comb += d.tlbwe.eq(tlbwe)
 783         comb += d.doall.eq(r0.doall)
 784         comb += d.tlb_hit.eq(tlb_hit)
 785         comb += d.tlb_hit_way.eq(tlb_hit_way)
 786         comb += d.tlb_tag_way.eq(tlb_tag_way)
 787         comb += d.tlb_pte_way.eq(tlb_pte_way)
 788         comb += d.tlb_req_index.eq(tlb_req_index)
 789
 790         with m.If(tlb_hit):
 791             comb += d.repl_way.eq(tlb_hit_way)
 792         with m.Else():
 793             comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
 794         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 795         comb += d.pte_data.eq(r0.req.data)
 796
 797     def maybe_plrus(self, m, r1, plru_victim):
 798         """Generate PLRUs
 799         """
 800         comb = m.d.comb
 801         sync = m.d.sync
 802
 803         if TLB_NUM_WAYS == 0:
 804             return
 805
 806         for i in range(NUM_LINES):
 807             # PLRU interface
 808             plru        = PLRU(WAY_BITS)
 809             setattr(m.submodules, "plru%d" % i, plru)
 810             plru_acc_en = Signal()
 811
 812             comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
 813             comb += plru.acc_en.eq(plru_acc_en)
 814             comb += plru.acc_i.eq(r1.hit_way)
 815             comb += plru_victim[i].eq(plru.lru_o)
 816
 817     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 818         """Cache tag RAM read port
 819         """
 820         comb = m.d.comb
 821         sync = m.d.sync
 822         m_in, d_in = self.m_in, self.d_in
 823
 824         index = Signal(INDEX_BITS)
 825
 826         with m.If(r0_stall):
 827             comb += index.eq(req_index)
 828         with m.Elif(m_in.valid):
 829             comb += index.eq(get_index(m_in.addr))
 830         with m.Else():
 831             comb += index.eq(get_index(d_in.addr))
 832         sync += cache_tag_set.eq(cache_tags[index])
 833
    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valids, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection

        Purely combinatorial.  From the stage-0 request (r0), the real
        address (ra), the TLB/permission results and the stage-1 state
        (r1) this drives:

        * req_index, req_row, req_tag: line, row and tag of the request
        * req_hit_way, req_same_tag: hit way and "matches the tag being
          reloaded" flag (taken from the DCachePendingHit submodule,
          with an override for a line that is currently being reloaded)
        * use_forward1_next, use_forward2_next: whether a load should
          use forwarded store data
        * replace_way: the way to replace on a miss
        * rc_ok, perm_ok, access_ok: permission check results
        * req_op, req_go: the decoded operation and the "go" qualifier
        * early_req_row: row number used for reading the cache data RAM

        Statement order matters where a signal is assigned more than
        once: a later comb assignment takes priority over an earlier one.
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit      = Signal()
        hit_way     = Signal(WAY_BITS)
        op          = Signal(Op)
        opsel       = Signal(3)
        go          = Signal()
        nc          = Signal()
        hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        cache_valid_idx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        # (index and row come from the request address, the tag from
        # the real address)
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                    r0.req.addr, ra, req_index, req_tag, req_row)

        # "go": a valid request that is neither a TLB operation
        # (tlbie/tlbld) nor blocked by a pending load/store error
        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        # valid bits of the line addressed by the request
        comb += cache_valid_idx.eq(cache_valids[req_index])

        # hit detection proper is delegated to a submodule
        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                tlb_valid_way, tlb_hit_way,
                                cache_valid_idx, cache_tag_set,
                                r0.req.addr,
                                hit_set)

        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        # (this overrides the is_hit/hit_way values assigned above)
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        # (the PLRU victim while the tag is being written, afterwards
        # the way recorded in r1.store_way)
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                              (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        # default when "go" is not set
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                # opsel bits, MSB first: load, nc, is_hit
                # (Cat places its first argument in the LSB)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            # not stalled: take the row from the incoming request,
            # the MMU's address having priority over loadstore1's
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)
 963
 964     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 965                          r0_valid, r0, reservation):
 966         """Handle load-with-reservation and store-conditional instructions
 967         """
 968         comb = m.d.comb
 969
 970         with m.If(r0_valid & r0.req.reserve):
 971             # XXX generate alignment interrupt if address
 972             # is not aligned XXX or if r0.req.nc = '1'
 973             with m.If(r0.req.load):
 974                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
 975             with m.Else():
 976                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
 977                 with m.If((~reservation.valid) |
 978                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
 979                     comb += cancel_store.eq(1)
 980
 981     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
 982                         reservation, r0):
 983
 984         comb = m.d.comb
 985         sync = m.d.sync
 986
 987         with m.If(r0_valid & access_ok):
 988             with m.If(clear_rsrv):
 989                 sync += reservation.valid.eq(0)
 990             with m.Elif(set_rsrv):
 991                 sync += reservation.valid.eq(1)
 992                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 993
 994     def writeback_control(self, m, r1, cache_out_row):
 995         """Return data for loads & completion control logic
 996         """
 997         comb = m.d.comb
 998         sync = m.d.sync
 999         d_out, m_out = self.d_out, self.m_out
1000
1001         data_out = Signal(64)
1002         data_fwd = Signal(64)
1003
1004         # Use the bypass if are reading the row that was
1005         # written 1 or 2 cycles ago, including for the
1006         # slow_valid = 1 case (i.e. completing a load
1007         # miss or a non-cacheable load).
1008         with m.If(r1.use_forward1):
1009             comb += data_fwd.eq(r1.forward_data1)
1010         with m.Else():
1011             comb += data_fwd.eq(r1.forward_data2)
1012
1013         comb += data_out.eq(cache_out_row)
1014
1015         for i in range(8):
1016             with m.If(r1.forward_sel[i]):
1017                 dsel = data_fwd.word_select(i, 8)
1018                 comb += data_out.word_select(i, 8).eq(dsel)
1019
1020         comb += d_out.valid.eq(r1.ls_valid)
1021         comb += d_out.data.eq(data_out)
1022         comb += d_out.store_done.eq(~r1.stcx_fail)
1023         comb += d_out.error.eq(r1.ls_error)
1024         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1025
1026         # Outputs to MMU
1027         comb += m_out.done.eq(r1.mmu_done)
1028         comb += m_out.err.eq(r1.mmu_error)
1029         comb += m_out.data.eq(data_out)
1030
1031         # We have a valid load or store hit or we just completed
1032         # a slow op such as a load miss, a NC load or a store
1033         #
1034         # Note: the load hit is delayed by one cycle. However it
1035         # can still not collide with r.slow_valid (well unless I
1036         # miscalculated) because slow_valid can only be set on a
1037         # subsequent request and not on its first cycle (the state
1038         # machine must have advanced), which makes slow_valid
1039         # at least 2 cycles from the previous hit_load_valid.
1040
1041         # Sanity: Only one of these must be set in any given cycle
1042
1043         if False: # TODO: need Display to get this to work
1044             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1045             "unexpected slow_valid collision with stcx_fail"
1046
1047             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1048              "unexpected hit_load_delayed collision with slow_valid"
1049
1050         with m.If(~r1.mmu_req):
1051             # Request came from loadstore1...
1052             # Load hit case is the standard path
1053             with m.If(r1.hit_load_valid):
1054                 sync += Display("completing load hit data=%x", data_out)
1055
1056             # error cases complete without stalling
1057             with m.If(r1.ls_error):
1058                 sync += Display("completing ld/st with error")
1059
1060             # Slow ops (load miss, NC, stores)
1061             with m.If(r1.slow_valid):
1062                 sync += Display("completing store or load miss adr=%x data=%x",
1063                                 r1.req.real_addr, data_out)
1064
1065         with m.Else():
1066             # Request came from MMU
1067             with m.If(r1.hit_load_valid):
1068                 sync += Display("completing load hit to MMU, data=%x",
1069                                 m_out.data)
1070             # error cases complete without stalling
1071             with m.If(r1.mmu_error):
1072                 sync += Display("combpleting MMU ld with error")
1073
1074             # Slow ops (i.e. load miss)
1075             with m.If(r1.slow_valid):
1076                 sync += Display("completing MMU load miss, data=%x",
1077                                 m_out.data)
1078
1079     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1080         """rams
1081         Generate a cache RAM for each way. This handles the normal
1082         reads, writes from reloads and the special store-hit update
1083         path as well.
1084
1085         Note: the BRAMs have an extra read buffer, meaning the output
1086         is pipelined an extra cycle. This differs from the
1087         icache. The writeback logic needs to take that into
1088         account by using 1-cycle delayed signals for load hits.
1089         """
1090         comb = m.d.comb
1091         wb_in = self.wb_in
1092
1093         for i in range(NUM_WAYS):
1094             do_read  = Signal(name="do_rd%d" % i)
1095             rd_addr  = Signal(ROW_BITS, name="rd_addr_%d" % i)
1096             do_write = Signal(name="do_wr%d" % i)
1097             wr_addr  = Signal(ROW_BITS, name="wr_addr_%d" % i)
1098             wr_data  = Signal(WB_DATA_BITS, name="din_%d" % i)
1099             wr_sel   = Signal(ROW_SIZE)
1100             wr_sel_m = Signal(ROW_SIZE)
1101             _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1102
1103             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
1104             setattr(m.submodules, "cacheram_%d" % i, way)
1105
1106             comb += way.rd_en.eq(do_read)
1107             comb += way.rd_addr.eq(rd_addr)
1108             comb += _d_out.eq(way.rd_data_o)
1109             comb += way.wr_sel.eq(wr_sel_m)
1110             comb += way.wr_addr.eq(wr_addr)
1111             comb += way.wr_data.eq(wr_data)
1112
1113             # Cache hit reads
1114             comb += do_read.eq(1)
1115             comb += rd_addr.eq(early_req_row)
1116             with m.If(r1.hit_way == i):
1117                 comb += cache_out_row.eq(_d_out)
1118
1119             # Write mux:
1120             #
1121             # Defaults to wishbone read responses (cache refill)
1122             #
1123             # For timing, the mux on wr_data/sel/addr is not
1124             # dependent on anything other than the current state.
1125
1126             with m.If(r1.write_bram):
1127                 # Write store data to BRAM.  This happens one
1128                 # cycle after the store is in r0.
1129                 comb += wr_data.eq(r1.req.data)
1130                 comb += wr_sel.eq(r1.req.byte_sel)
1131                 comb += wr_addr.eq(get_row(r1.req.real_addr))
1132
1133                 with m.If(i == r1.req.hit_way):
1134                     comb += do_write.eq(1)
1135             with m.Else():
1136                 # Otherwise, we might be doing a reload or a DCBZ
1137                 with m.If(r1.dcbz):
1138                     comb += wr_data.eq(0)
1139                 with m.Else():
1140                     comb += wr_data.eq(wb_in.dat)
1141                 comb += wr_addr.eq(r1.store_row)
1142                 comb += wr_sel.eq(~0) # all 1s
1143
1144                 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1145                           & wb_in.ack & (replace_way == i)):
1146                     comb += do_write.eq(1)
1147
1148             # Mask write selects with do_write since BRAM
1149             # doesn't have a global write-enable
1150             with m.If(do_write):
1151                 comb += wr_sel_m.eq(wr_sel)
1152
1153     # Cache hit synchronous machine for the easy case.
1154     # This handles load hits.
1155     # It also handles error cases (TLB miss, cache paradox)
1156     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1157                         req_hit_way, req_index, req_tag, access_ok,
1158                         tlb_hit, tlb_hit_way, tlb_req_index):
1159
1160         comb = m.d.comb
1161         sync = m.d.sync
1162
1163         with m.If(req_op != Op.OP_NONE):
1164             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1165                     req_op, r0.req.addr, r0.req.nc,
1166                     req_index, req_tag, req_hit_way)
1167
1168         with m.If(r0_valid):
1169             sync += r1.mmu_req.eq(r0.mmu_req)
1170
1171         # Fast path for load/store hits.
1172         # Set signals for the writeback controls.
1173         sync += r1.hit_way.eq(req_hit_way)
1174         sync += r1.hit_index.eq(req_index)
1175
1176         with m.If(req_op == Op.OP_LOAD_HIT):
1177             sync += r1.hit_load_valid.eq(1)
1178         with m.Else():
1179             sync += r1.hit_load_valid.eq(0)
1180
1181         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1182             sync += r1.cache_hit.eq(1)
1183         with m.Else():
1184             sync += r1.cache_hit.eq(0)
1185
1186         with m.If(req_op == Op.OP_BAD):
1187             # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1188             #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
1189             sync += r1.ls_error.eq(~r0.mmu_req)
1190             sync += r1.mmu_error.eq(r0.mmu_req)
1191             sync += r1.cache_paradox.eq(access_ok)
1192
1193         with m.Else():
1194             sync += r1.ls_error.eq(0)
1195             sync += r1.mmu_error.eq(0)
1196             sync += r1.cache_paradox.eq(0)
1197
1198         with m.If(req_op == Op.OP_STCX_FAIL):
1199             sync += r1.stcx_fail.eq(1)
1200         with m.Else():
1201             sync += r1.stcx_fail.eq(0)
1202
1203         # Record TLB hit information for updating TLB PLRU
1204         sync += r1.tlb_hit.eq(tlb_hit)
1205         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1206         sync += r1.tlb_hit_index.eq(tlb_req_index)
1207
1208     # Memory accesses are handled by this state machine:
1209     #
1210     #   * Cache load miss/reload (in conjunction with "rams")
1211     #   * Load hits for non-cachable forms
1212     #   * Stores (the collision case is handled in "rams")
1213     #
1214     # All wishbone requests generation is done here.
1215     # This machine operates at stage 1.
1216     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1217                     cache_valids, r0, replace_way,
1218                     req_hit_way, req_same_tag,
1219                     r0_valid, req_op, cache_tags, req_go, ra):
1220
1221         comb = m.d.comb
1222         sync = m.d.sync
1223         wb_in = self.wb_in
1224         d_in = self.d_in
1225
1226         req         = MemAccessRequest("mreq_ds")
1227
1228         req_row = Signal(ROW_BITS)
1229         req_idx = Signal(INDEX_BITS)
1230         req_tag = Signal(TAG_BITS)
1231         comb += req_idx.eq(get_index(req.real_addr))
1232         comb += req_row.eq(get_row(req.real_addr))
1233         comb += req_tag.eq(get_tag(req.real_addr))
1234
1235         sync += r1.use_forward1.eq(use_forward1_next)
1236         sync += r1.forward_sel.eq(0)
1237
1238         with m.If(use_forward1_next):
1239             sync += r1.forward_sel.eq(r1.req.byte_sel)
1240         with m.Elif(use_forward2_next):
1241             sync += r1.forward_sel.eq(r1.forward_sel1)
1242
1243         sync += r1.forward_data2.eq(r1.forward_data1)
1244         with m.If(r1.write_bram):
1245             sync += r1.forward_data1.eq(r1.req.data)
1246             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1247             sync += r1.forward_way1.eq(r1.req.hit_way)
1248             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1249             sync += r1.forward_valid1.eq(1)
1250         with m.Else():
1251             with m.If(r1.dcbz):
1252                 sync += r1.forward_data1.eq(0)
1253             with m.Else():
1254                 sync += r1.forward_data1.eq(wb_in.dat)
1255             sync += r1.forward_sel1.eq(~0) # all 1s
1256             sync += r1.forward_way1.eq(replace_way)
1257             sync += r1.forward_row1.eq(r1.store_row)
1258             sync += r1.forward_valid1.eq(0)
1259
1260         # One cycle pulses reset
1261         sync += r1.slow_valid.eq(0)
1262         sync += r1.write_bram.eq(0)
1263         sync += r1.inc_acks.eq(0)
1264         sync += r1.dec_acks.eq(0)
1265
1266         sync += r1.ls_valid.eq(0)
1267         # complete tlbies and TLB loads in the third cycle
1268         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1269
1270         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1271             with m.If(~r0.mmu_req):
1272                 sync += r1.ls_valid.eq(1)
1273             with m.Else():
1274                 sync += r1.mmu_done.eq(1)
1275
1276         with m.If(r1.write_tag):
1277             # Store new tag in selected way
1278             for i in range(NUM_WAYS):
1279                 with m.If(i == replace_way):
1280                     ct = Signal(TAG_RAM_WIDTH)
1281                     comb += ct.eq(cache_tags[r1.store_index])
1282                     """
1283 TODO: check this
1284 cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
1285                     (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
1286                     """
1287                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1288                     sync += cache_tags[r1.store_index].eq(ct)
1289             sync += r1.store_way.eq(replace_way)
1290             sync += r1.write_tag.eq(0)
1291
1292         # Take request from r1.req if there is one there,
1293         # else from req_op, ra, etc.
1294         with m.If(r1.full):
1295             comb += req.eq(r1.req)
1296         with m.Else():
1297             comb += req.op.eq(req_op)
1298             comb += req.valid.eq(req_go)
1299             comb += req.mmu_req.eq(r0.mmu_req)
1300             comb += req.dcbz.eq(r0.req.dcbz)
1301             comb += req.real_addr.eq(ra)
1302
1303             with m.If(r0.req.dcbz):
1304                 # force data to 0 for dcbz
1305                 comb += req.data.eq(0)
1306             with m.Elif(r0.d_valid):
1307                 comb += req.data.eq(r0.req.data)
1308             with m.Else():
1309                 comb += req.data.eq(d_in.data)
1310
1311             # Select all bytes for dcbz
1312             # and for cacheable loads
1313             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1314                 comb += req.byte_sel.eq(~0) # all 1s
1315             with m.Else():
1316                 comb += req.byte_sel.eq(r0.req.byte_sel)
1317             comb += req.hit_way.eq(req_hit_way)
1318             comb += req.same_tag.eq(req_same_tag)
1319
1320             # Store the incoming request from r0,
1321             # if it is a slow request
1322             # Note that r1.full = 1 implies req_op = OP_NONE
1323             with m.If((req_op == Op.OP_LOAD_MISS)
1324                       | (req_op == Op.OP_LOAD_NC)
1325                       | (req_op == Op.OP_STORE_MISS)
1326                       | (req_op == Op.OP_STORE_HIT)):
1327                 sync += r1.req.eq(req)
1328                 sync += r1.full.eq(1)
1329
1330         # Main state machine
1331         with m.Switch(r1.state):
1332
1333             with m.Case(State.IDLE):
1334                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1335                 sync += r1.wb.sel.eq(req.byte_sel)
1336                 sync += r1.wb.dat.eq(req.data)
1337                 sync += r1.dcbz.eq(req.dcbz)
1338
1339                 # Keep track of our index and way
1340                 # for subsequent stores.
1341                 sync += r1.store_index.eq(req_idx)
1342                 sync += r1.store_row.eq(req_row)
1343                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1344                 sync += r1.reload_tag.eq(req_tag)
1345                 sync += r1.req.same_tag.eq(1)
1346
1347                 with m.If(req.op == Op.OP_STORE_HIT):
1348                     sync += r1.store_way.eq(req.hit_way)
1349
1350                 # Reset per-row valid bits,
1351                 # ready for handling OP_LOAD_MISS
1352                 for i in range(ROW_PER_LINE):
1353                     sync += r1.rows_valid[i].eq(0)
1354
1355                 with m.If(req_op != Op.OP_NONE):
1356                     sync += Display("cache op %d", req.op)
1357
1358                 with m.Switch(req.op):
1359                     with m.Case(Op.OP_LOAD_HIT):
1360                         # stay in IDLE state
1361                         pass
1362
1363                     with m.Case(Op.OP_LOAD_MISS):
1364                         sync += Display("cache miss real addr: %x " \
1365                                 "idx: %x tag: %x",
1366                                 req.real_addr, req_row, req_tag)
1367
1368                         # Start the wishbone cycle
1369                         sync += r1.wb.we.eq(0)
1370                         sync += r1.wb.cyc.eq(1)
1371                         sync += r1.wb.stb.eq(1)
1372
1373                         # Track that we had one request sent
1374                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1375                         sync += r1.write_tag.eq(1)
1376
1377                     with m.Case(Op.OP_LOAD_NC):
1378                         sync += r1.wb.cyc.eq(1)
1379                         sync += r1.wb.stb.eq(1)
1380                         sync += r1.wb.we.eq(0)
1381                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1382
1383                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1384                         with m.If(~req.dcbz):
1385                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1386                             sync += r1.acks_pending.eq(1)
1387                             sync += r1.full.eq(0)
1388                             sync += r1.slow_valid.eq(1)
1389
1390                             with m.If(~req.mmu_req):
1391                                 sync += r1.ls_valid.eq(1)
1392                             with m.Else():
1393                                 sync += r1.mmu_done.eq(1)
1394
1395                             with m.If(req.op == Op.OP_STORE_HIT):
1396                                 sync += r1.write_bram.eq(1)
1397                         with m.Else():
1398                             # dcbz is handled much like a load miss except
1399                             # that we are writing to memory instead of reading
1400                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1401
1402                             with m.If(req.op == Op.OP_STORE_MISS):
1403                                 sync += r1.write_tag.eq(1)
1404
1405                         sync += r1.wb.we.eq(1)
1406                         sync += r1.wb.cyc.eq(1)
1407                         sync += r1.wb.stb.eq(1)
1408
1409                     # OP_NONE and OP_BAD do nothing
1410                     # OP_BAD & OP_STCX_FAIL were
1411                     # handled above already
1412                     with m.Case(Op.OP_NONE):
1413                         pass
1414                     with m.Case(Op.OP_BAD):
1415                         pass
1416                     with m.Case(Op.OP_STCX_FAIL):
1417                         pass
1418
1419             with m.Case(State.RELOAD_WAIT_ACK):
1420                 ld_stbs_done = Signal()
1421                 # Requests are all sent if stb is 0
1422                 comb += ld_stbs_done.eq(~r1.wb.stb)
1423
1424                 # If we are still sending requests, was one accepted?
1425                 with m.If((~wb_in.stall) & r1.wb.stb):
1426                     # That was the last word?  We are done sending.
1427                     # Clear stb and set ld_stbs_done so we can handle an
1428                     # eventual last ack on the same cycle.
1429                     # sigh - reconstruct wb adr with 3 extra 0s at front
1430                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1431                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1432                         sync += r1.wb.stb.eq(0)
1433                         comb += ld_stbs_done.eq(1)
1434
1435                     # Calculate the next row address in the current cache line
1436                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1437                     comb += row.eq(r1.wb.adr)
1438                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1439
1440                 # Incoming acks processing
1441                 sync += r1.forward_valid1.eq(wb_in.ack)
1442                 with m.If(wb_in.ack):
1443                     srow = Signal(ROW_LINE_BITS)
1444                     comb += srow.eq(r1.store_row)
1445                     sync += r1.rows_valid[srow].eq(1)
1446
1447                     # If this is the data we were looking for,
1448                     # we can complete the request next cycle.
1449                     # Compare the whole address in case the
1450                     # request in r1.req is not the one that
1451                     # started this refill.
1452                     with m.If(req.valid & r1.req.same_tag &
1453                               ((r1.dcbz & r1.req.dcbz) |
1454                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1455                                 (r1.store_row == get_row(req.real_addr))):
1456                         sync += r1.full.eq(0)
1457                         sync += r1.slow_valid.eq(1)
1458                         with m.If(~r1.mmu_req):
1459                             sync += r1.ls_valid.eq(1)
1460                         with m.Else():
1461                             sync += r1.mmu_done.eq(1)
1462                         sync += r1.forward_sel.eq(~0) # all 1s
1463                         sync += r1.use_forward1.eq(1)
1464
1465                     # Check for completion
1466                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1467                                                       r1.end_row_ix)):
1468                         # Complete wishbone cycle
1469                         sync += r1.wb.cyc.eq(0)
1470
1471                         # Cache line is now valid
1472                         cv = Signal(INDEX_BITS)
1473                         comb += cv.eq(cache_valids[r1.store_index])
1474                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1475                         sync += cache_valids[r1.store_index].eq(cv)
1476
1477                         sync += r1.state.eq(State.IDLE)
1478
1479                     # Increment store row counter
1480                     sync += r1.store_row.eq(next_row(r1.store_row))
1481
1482             with m.Case(State.STORE_WAIT_ACK):
1483                 st_stbs_done = Signal()
1484                 acks        = Signal(3)
1485                 adjust_acks = Signal(3)
1486
1487                 comb += st_stbs_done.eq(~r1.wb.stb)
1488                 comb += acks.eq(r1.acks_pending)
1489
1490                 with m.If(r1.inc_acks != r1.dec_acks):
1491                     with m.If(r1.inc_acks):
1492                         comb += adjust_acks.eq(acks + 1)
1493                     with m.Else():
1494                         comb += adjust_acks.eq(acks - 1)
1495                 with m.Else():
1496                     comb += adjust_acks.eq(acks)
1497
1498                 sync += r1.acks_pending.eq(adjust_acks)
1499
1500                 # Clear stb when slave accepted request
1501                 with m.If(~wb_in.stall):
1502                     # See if there is another store waiting
1503                     # to be done which is in the same real page.
1504                     with m.If(req.valid):
1505                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1506                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1507                         sync += r1.wb.dat.eq(req.data)
1508                         sync += r1.wb.sel.eq(req.byte_sel)
1509
1510                     with m.If((adjust_acks < 7) & req.same_tag &
1511                                 ((req.op == Op.OP_STORE_MISS)
1512                                  | (req.op == Op.OP_STORE_HIT))):
1513                         sync += r1.wb.stb.eq(1)
1514                         comb += st_stbs_done.eq(0)
1515
1516                         with m.If(req.op == Op.OP_STORE_HIT):
1517                             sync += r1.write_bram.eq(1)
1518                         sync += r1.full.eq(0)
1519                         sync += r1.slow_valid.eq(1)
1520
1521                         # Store requests never come from the MMU
1522                         sync += r1.ls_valid.eq(1)
1523                         comb += st_stbs_done.eq(0)
1524                         sync += r1.inc_acks.eq(1)
1525                     with m.Else():
1526                         sync += r1.wb.stb.eq(0)
1527                         comb += st_stbs_done.eq(1)
1528
1529                 # Got ack ? See if complete.
1530                 with m.If(wb_in.ack):
1531                     with m.If(st_stbs_done & (adjust_acks == 1)):
1532                         sync += r1.state.eq(State.IDLE)
1533                         sync += r1.wb.cyc.eq(0)
1534                         sync += r1.wb.stb.eq(0)
1535                     sync += r1.dec_acks.eq(1)
1536
1537             with m.Case(State.NC_LOAD_WAIT_ACK):
1538                 # Clear stb when slave accepted request
1539                 with m.If(~wb_in.stall):
1540                     sync += r1.wb.stb.eq(0)
1541
1542                 # Got ack ? complete.
1543                 with m.If(wb_in.ack):
1544                     sync += r1.state.eq(State.IDLE)
1545                     sync += r1.full.eq(0)
1546                     sync += r1.slow_valid.eq(1)
1547
1548                     with m.If(~r1.mmu_req):
1549                         sync += r1.ls_valid.eq(1)
1550                     with m.Else():
1551                         sync += r1.mmu_done.eq(1)
1552
1553                     sync += r1.forward_sel.eq(~0) # all 1s
1554                     sync += r1.use_forward1.eq(1)
1555                     sync += r1.wb.cyc.eq(0)
1556                     sync += r1.wb.stb.eq(0)
1557
1558     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1559
1560         sync = m.d.sync
1561         d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1562
1563         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1564                                stall_out, req_op[:3], d_out.valid, d_out.error,
1565                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1566                                r1.real_adr[3:6]))
1567
    def elaborate(self, platform):
        """Build and return the Module implementing the data cache.

        Declares the storage arrays, the two pipeline-stage register
        records, the reservation record and all the signals shared
        between pipeline pieces, wires up the top-level stall and
        wishbone outputs, then passes those shared objects to the helper
        methods of this class, each of which receives the same Module
        "m" to add its own logic to.

        "platform" is accepted but not used in this method.
        """

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)
        cache_valids = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TLB storage: valid bits, tags and PTEs
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags       = TLBTagsArray()
        dtlb_ptes       = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        # stage 0 register record plus its "occupied" flag
        r0      = RegStage0("r0")
        r0_full = Signal()

        # stage 1 register record (its wb field drives wb_out below)
        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        req_data     = Signal(64)     # not referenced again in this method
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row     = Signal(ROW_BITS)

        # reservation control, shared by reservation_comb/reservation_reg
        cancel_store      = Signal()
        set_rsrv          = Signal()
        clear_rsrv        = Signal()

        r0_valid          = Signal()
        r0_stall          = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row     = Signal(WB_DATA_BITS)

        plru_victim       = PLRUOut()
        replace_way       = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel           = Signal(8) # not referenced again in this method

        # TLB signals
        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = Signal()
        tlb_hit_way   = Signal(TLB_WAY_BITS)
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        # (or when d_in.hold is asserted): r0_stall and r0_valid are both
        # gated by r0_full and are otherwise complementary
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up
        # (stall is generated here as "cycle active and not yet acked")
        comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                           r0_valid, r1, cache_valids, replace_way,
                           use_forward1_next, use_forward2_next,
                           req_hit_way, plru_victim, rc_ok, perm_attr,
                           valid_ra, perm_ok, access_ok, req_op, req_go,
                           tlb_pte_way,
                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                           cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                           r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                           reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                    cache_valids, r0, replace_way,
                    req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # NOTE(review): logging is disabled.  There is no local named
        # "stall_out" in this method (only self.stall_out is driven
        # above), so this call needs adjusting before being re-enabled.
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m
1703
def dcache_load(dut, addr, nc=0):
    """Simulation helper: issue one load request and return the data read.

    The request (load=1, nc, addr, all-ones byte_sel, valid=1) is
    presented on dut.d_in for one clock, then valid and byte_sel are
    dropped.  dut.d_out.valid is polled once per clock until it reads
    true, and dut.d_out.data is sampled in that same cycle and returned.
    """
    request, response = dut.d_in, dut.d_out

    # drive the request fields, then let one clock pass
    for drive in (request.load.eq(1),
                  request.nc.eq(nc),
                  request.addr.eq(addr),
                  request.byte_sel.eq(~0),
                  request.valid.eq(1)):
        yield drive
    yield

    # withdraw the request
    for drive in (request.valid.eq(0), request.byte_sel.eq(0)):
        yield drive

    # wait, one clock per poll, for the cache to flag completion
    while True:
        done = yield response.valid
        if done:
            break
        yield

    # data is read in the same cycle that valid is seen (no extra clock)
    result = yield response.data
    return result
1718
1719
def dcache_store(dut, addr, data, nc=0):
    """Simulation helper: issue one store request and wait for completion.

    The request (load=0, nc, all-ones byte_sel, addr, valid=1) is
    presented on dut.d_in for one clock.  The store data is only driven
    AFTER that clock, at the same time valid and byte_sel are dropped,
    and is then left set.  dut.d_out.valid is polled once per clock
    until it reads true.  Nothing is returned.
    """
    request = dut.d_in

    # drive the request fields (no data yet), then let one clock pass
    for drive in (request.load.eq(0),
                  request.nc.eq(nc),
                  request.byte_sel.eq(~0),
                  request.addr.eq(addr),
                  request.valid.eq(1)):
        yield drive
    yield

    # data goes out the cycle after valid was raised; request is withdrawn
    for drive in (request.data.eq(data),
                  request.valid.eq(0),
                  request.byte_sel.eq(0)):
        yield drive

    # wait, one clock per poll, for the cache to flag completion
    while True:
        done = yield dut.d_out.valid
        if done:
            break
        yield
1732
1733
def dcache_random_sim(dut, mem, nc=0):
    """Simulation process: randomised store/load check against a model.

    A deep copy of "mem" acts as the reference model.  For 1024
    iterations a random row is loaded and then overwritten with a random
    64-bit value (the model is updated to match), after which a second
    random row is loaded and compared with the model.  Finally every row
    is loaded in order and compared.  Rows are 8 bytes apart, so the
    byte address is always row*8.  "nc" is passed through to every
    load and store.
    """
    # reference model, independent of the caller's list
    model = deepcopy(mem)
    nrows = len(model)
    print ("mem len", nrows)

    # clear stuff
    d_in, m_in = dut.d_in, dut.m_in
    for clear in (d_in.valid.eq(0),
                  d_in.load.eq(0),
                  d_in.priv_mode.eq(1),
                  d_in.nc.eq(0),
                  d_in.addr.eq(0),
                  d_in.data.eq(0),
                  m_in.valid.eq(0),
                  m_in.addr.eq(0),
                  m_in.pte.eq(0)):
        yield clear

    # wait 4 * clk_period
    for _ in range(4):
        yield

    print ()

    for step in range(1024):
        # pick a row and a new value (row first, then value), record it
        # in the model, then load and overwrite that row in the cache
        wr_row = randint(0, nrows-1)
        wr_data = randint(0, (1<<64)-1)
        model[wr_row] = wr_data
        wr_addr = wr_row * 8

        print ("random testing %d 0x%x row %d data 0x%x" %
               (step, wr_addr, wr_row, wr_data))

        yield from dcache_load(dut, wr_addr, nc)
        yield from dcache_store(dut, wr_addr, wr_data, nc)

        # read back an independently chosen row and compare with the model
        rd_row = randint(0, nrows-1)
        expected = model[rd_row]
        rd_addr = rd_row * 8

        print ("    load 0x%x row %d expect data 0x%x" %
               (rd_addr, rd_row, expected))
        got = yield from dcache_load(dut, rd_addr, nc)
        assert got == expected, \
            "check addr 0x%x row %d data %x != %x" % (rd_addr, rd_row,
                                                      got, expected)

    # full sweep: every row must match the model
    for row in range(nrows):
        got = yield from dcache_load(dut, row*8, nc)
        assert got == model[row], \
            "final check %x data %x != %x" % (row*8, got, model[row])
1788
1789
def dcache_regression_sim(dut, mem, nc=0):
    """Simulation process: fixed two-load regression sequence.

    After clearing the dut inputs and waiting four clocks, row 0 is
    loaded (result ignored) and then row 2 is loaded and compared with
    the corresponding entry of a deep copy of "mem".  Rows are 8 bytes
    apart, so the byte address is row*8.  "nc" is passed through to
    both loads.

    The progress message used to format a loop index "i" that is never
    defined in this function, which raised NameError unless a module
    global of that name happened to exist; the message no longer
    refers to it.
    """

    # start copy of mem
    sim_mem = deepcopy(mem)
    memsize = len(sim_mem)
    print ("mem len", memsize)

    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    # first access: row 0, data not checked
    addr = 0
    row = addr
    addr *= 8

    print ("regression testing 0x%x row %d" % (addr, row))

    yield from dcache_load(dut, addr, nc)

    # second access: row 2, checked against the copy of mem
    addr = 2
    sim_data = sim_mem[addr]
    row = addr
    addr *= 8

    print ("    load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
    data = yield from dcache_load(dut, addr, nc)
    assert data == sim_data, \
        "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1830
1831
1832
def dcache_sim(dut, mem):
    """Simulation process: directed sequence of loads and stores.

    Clears the dut inputs, waits four clocks, then performs a fixed
    series of loads (each checked against a hard-coded expected value),
    two stores to 0x530 and two final loads, followed by four idle
    clocks.  The expected load values match a memory image in which
    64-bit word i holds 2*i in its low half and 2*i+1 in its high half.
    "mem" is accepted but not read here.
    """
    def load_and_check(address, want, want_text, nc=0):
        # load, read back the address currently on d_in, and compare;
        # want_text is the expected value exactly as spelt in the message
        got = yield from dcache_load(dut, address, nc=nc)
        seen_addr = yield dut.d_in.addr
        assert got == want, \
            ("data @%x=%x expected " + want_text) % (seen_addr, got)

    # clear stuff
    d_in, m_in = dut.d_in, dut.m_in
    for clear in (d_in.valid.eq(0),
                  d_in.load.eq(0),
                  d_in.priv_mode.eq(1),
                  d_in.nc.eq(0),
                  d_in.addr.eq(0),
                  d_in.data.eq(0),
                  m_in.valid.eq(0),
                  m_in.addr.eq(0),
                  m_in.pte.eq(0)):
        yield clear

    # wait 4 * clk_period
    for _ in range(4):
        yield

    # Cacheable read of address 0x58
    yield from load_and_check(0x58, 0x0000001700000016,
                              "0x0000001700000016")

    # Cacheable read of address 0x20
    yield from load_and_check(0x20, 0x0000000900000008,
                              "0x0000000900000008")

    # Cacheable read of address 0x530
    yield from load_and_check(0x530, 0x0000014D0000014C,
                              "0000014D0000014C")

    # 2nd cacheable read of address 0x530
    yield from load_and_check(0x530, 0x0000014D0000014C,
                              "0000014D0000014C")

    # Non-cacheable read of address 0x100
    yield from load_and_check(0x100, 0x0000004100000040,
                              "0000004100000040", nc=1)

    # Two back-to-back stores to address 0x530; the second value wins
    yield from dcache_store(dut, 0x530, 0x121)
    yield from dcache_store(dut, 0x530, 0x12345678)

    # 3rd cacheable read of address 0x530: must see the last store
    yield from load_and_check(0x530, 0x12345678, "0x12345678")

    # 2nd cacheable read of address 0x20: must be unchanged
    yield from load_and_check(0x20, 0x0000000900000008,
                              "0x0000000900000008")

    # let the simulation run on for four more clocks
    for _ in range(4):
        yield
1902
1903
def test_dcache(mem, test_fn, test_name):
    """Simulate a DCache wired to an SRAM and run one test process on it.

    * mem       - list of 64-bit words used as the SRAM's initial contents
                  and also handed to the test process
    * test_fn   - generator function called as test_fn(dut, mem)
    * test_name - suffix for the output files: test_dcache<name>.gtkw
                  and test_dcache<name>.vcd
    """
    dut = DCache()

    # backing store: 64 bits wide, byte granularity
    backing = Memory(width=64, depth=len(mem), init=mem, simulate=True)
    sram = SRAM(memory=backing, granularity=8)

    top = Module()
    top.submodules.dcache = dut
    top.submodules.sram = sram

    bus, master_out, master_in = sram.bus, dut.wb_out, dut.wb_in
    top.d.comb += [
        # cache -> sram (request side)
        bus.cyc.eq(master_out.cyc),
        bus.stb.eq(master_out.stb),
        bus.we.eq(master_out.we),
        bus.sel.eq(master_out.sel),
        bus.adr.eq(master_out.adr),
        bus.dat_w.eq(master_out.dat),
        # sram -> cache (response side)
        master_in.ack.eq(bus.ack),
        master_in.dat.eq(bus.dat_r),
    ]

    # gtkwave save file to go with the VCD written below
    dcache_write_gtkw(test_name)

    # nmigen Simulation
    sim = Simulator(top)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(test_fn(dut, mem)))
    with sim.write_vcd('test_dcache%s.vcd' % test_name):
        sim.run()
1933
1934
def dcache_write_gtkw(test_name):
    """Write the gtkwave save file test_dcache<test_name>.gtkw.

    The file refers to test_dcache<test_name>.vcd and lists the clock
    plus four trace groups (d_in, d_out, wb_out, wb_in), with the
    module given as 'top.dcache'.
    """
    stem = 'test_dcache%s' % test_name

    request_group = ('d_in', [
        'd_in_load', 'd_in_nc', 'd_in_addr[63:0]', 'd_in_data[63:0]',
        'd_in_byte_sel[7:0]', 'd_in_valid',
    ])
    response_group = ('d_out', [
        'd_out_valid', 'd_out_data[63:0]',
    ])
    wb_request_group = ('wb_out', [
        'wb_out_cyc', 'wb_out_stb', 'wb_out_we',
        'wb_out_adr[31:0]', 'wb_out_sel[7:0]', 'wb_out_dat[63:0]',
    ])
    wb_response_group = ('wb_in', [
        'wb_in_stall', 'wb_in_ack', 'wb_in_dat[63:0]',
    ])

    write_gtkw(stem + '.gtkw',
               stem + '.vcd',
               ['clk', request_group, response_group,
                wb_request_group, wb_response_group],
               module='top.dcache')
1956
1957
if __name__ == '__main__':
    # fixed seed, so the randint() sequence used by the random test is
    # the same on every run
    seed(0)

    # convert a stand-alone DCache to RTLIL and save it
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    # 16-word memory, word i contains i.
    # note: the loop variable "i" and "memsize" remain module-level
    # names after these loops.
    mem = []
    memsize = 16
    for i in range(memsize):
        mem.append(i)

    test_dcache(mem, dcache_regression_sim, "simpleregression")

    # 256-word memory, word i contains i
    mem = []
    memsize = 256
    for i in range(memsize):
        mem.append(i)

    test_dcache(mem, dcache_random_sim, "random")

    # 1024-word memory: low 32 bits of word i are 2*i, high 32 bits 2*i+1
    mem = []
    for i in range(1024):
        mem.append((i*2)| ((i*2+1)<<32))

    # empty test name: output files are test_dcache.gtkw / test_dcache.vcd
    test_dcache(mem, dcache_sim, "")
1984