src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8 """
   9
  10 import sys
  11 sys.setrecursionlimit(1000000)
  12
  13 from enum import Enum, unique
  14
  15 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  16 from nmutil.util import Display
  17
  18 from copy import deepcopy
  19 from random import randint, seed
  20
  21 from nmigen.cli import main
  22 from nmutil.iocontrol import RecordObject
  23 from nmigen.utils import log2_int
  24 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  25                                      DCacheToLoadStore1Type,
  26                                      MMUToDCacheType,
  27                                      DCacheToMMUType)
  28
  29 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  30                                 WBAddrType, WBDataType, WBSelType,
  31                                 WBMasterOut, WBSlaveOut,
  32                                 WBMasterOutVector, WBSlaveOutVector,
  33                                 WBIOMasterOut, WBIOSlaveOut)
  34
  35 from soc.experiment.cache_ram import CacheRam
  36 #from soc.experiment.plru import PLRU
  37 from nmutil.plru import PLRU
  38
  39 # for test
  40 from soc.bus.sram import SRAM
  41 from nmigen import Memory
  42 from nmigen.cli import rtlil
  43
  44 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  45 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  46 from nmutil.sim_tmp_alternative import Simulator
  47
  48 from nmutil.util import wrap
  49
  50
# Tunable cache / TLB geometry.
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB number of sets (the TLB arrays have this many rows)
TLB_NUM_WAYS = 2  # L1 DTLB number of ways (entries held in each row)
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection
  59
  60 # BRAM organisation: We never access more than
  61 #     -- WB_DATA_BITS at a time so to save
  62 #     -- resources we make the array only that wide, and
  63 #     -- use consecutive indices for to make a cache "line"
  64 #     --
  65 #     -- ROW_SIZE is the width in bytes of the BRAM
  66 #     -- (based on WB, so 64-bits)
  67 ROW_SIZE = WB_DATA_BITS // 8;
  68
  69 # ROW_PER_LINE is the number of row (wishbone
  70 # transactions) in a line
  71 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  72
  73 # BRAM_ROWS is the number of rows in BRAM needed
  74 # to represent the full dcache
  75 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  76
  77 print ("ROW_SIZE", ROW_SIZE)
  78 print ("ROW_PER_LINE", ROW_PER_LINE)
  79 print ("BRAM_ROWS", BRAM_ROWS)
  80 print ("NUM_WAYS", NUM_WAYS)
  81
# Bit field counts in the address
# (all derived with log2_int from the geometry constants above)

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM:
# TAG_BITS rounded up to the next multiple of 8
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
 119
# Example of layout for 32 lines of 64 bytes:
# (the bit counts in the figure belong to that 32-line example; they are
# not recomputed from the NUM_LINES value configured above)
layout = """\
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS  (3)
  ..         |     |--- - --| LINE_OFF_BITS (6)
  ..         |         |- --| ROW_OFF_BITS  (3)
  ..         |----- ---|    | ROW_BITS      (8)
  ..         |-----|        | INDEX_BITS    (5)
  .. --------|              | TAG_BITS      (45)
"""
# printed once, when the module is imported
print (layout)
 132 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 133             (TAG_BITS, INDEX_BITS, ROW_BITS,
 134              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 135 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 136 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 137 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 138
 139 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 140
 141 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 142
 143 def CacheTagArray():
 144     return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
 145                         for x in range(NUM_LINES))
 146
 147 def CacheValidBitsArray():
 148     return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
 149                         for x in range(NUM_LINES))
 150
 151 def RowPerLineValidArray():
 152     return Array(Signal(name="rows_valid%d" % x) \
 153                         for x in range(ROW_PER_LINE))
 154
 155 # L1 TLB
 156 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 157 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 158 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 159 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 160 TLB_PTE_BITS     = 64
 161 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 162
 163 def ispow2(x):
 164     return (1<<log2_int(x, False)) == x
 165
# Import-time sanity checks on the configured geometry.

# a line must be made of a whole, power-of-2 number of rows
assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
# the derived bit-field widths must tile the address exactly
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
         "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
# the set index must fit inside the page offset, i.e. be taken from
# address bits that are not changed by translation
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 179
 180
 181 def TLBValidBitsArray():
 182     return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
 183                 for x in range(TLB_SET_SIZE))
 184
 185 def TLBTagEAArray():
 186     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 187                 for x in range (TLB_NUM_WAYS))
 188
 189 def TLBTagsArray():
 190     return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
 191                 for x in range (TLB_SET_SIZE))
 192
 193 def TLBPtesArray():
 194     return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
 195                 for x in range(TLB_SET_SIZE))
 196
 197 def HitWaySet():
 198     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 199                         for x in range(TLB_NUM_WAYS))
 200
 201 # Cache RAM interface
 202 def CacheRamOut():
 203     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 204                  for x in range(NUM_WAYS))
 205
 206 # PLRU output interface
 207 def PLRUOut():
 208     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 209                 for x in range(NUM_LINES))
 210
 211 # TLB PLRU output interface
 212 def TLBPLRUOut():
 213     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 214                 for x in range(TLB_SET_SIZE))
 215
 216 # Helper functions to decode incoming requests
 217 #
 218 # Return the cache line index (tag index) for an address
 219 def get_index(addr):
 220     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 221
 222 # Return the cache row index (data memory) for an address
 223 def get_row(addr):
 224     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 225
 226 # Return the index of a row within a line
 227 def get_row_of_line(row):
 228     return row[:ROW_BITS][:ROW_LINE_BITS]
 229
 230 # Returns whether this is the last row of a line
 231 def is_last_row_addr(addr, last):
 232     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 233
 234 # Returns whether this is the last row of a line
 235 def is_last_row(row, last):
 236     return get_row_of_line(row) == last
 237
 238 # Return the next row in the current cache line. We use a
 239 # dedicated function in order to limit the size of the
 240 # generated adder to be only the bits within a cache line
 241 # (3 bits with default settings)
 242 def next_row(row):
 243     row_v = row[0:ROW_LINE_BITS] + 1
 244     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 245
 246 # Get the tag value from the address
 247 def get_tag(addr):
 248     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 249
 250 # Read a tag from a tag memory row
 251 def read_tag(way, tagset):
 252     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 253
 254 # Read a TLB tag from a TLB tag memory row
 255 def read_tlb_tag(way, tags):
 256     return tags.word_select(way, TLB_EA_TAG_BITS)
 257
 258 # Write a TLB tag to a TLB tag memory row
 259 def write_tlb_tag(way, tags, tag):
 260     return read_tlb_tag(way, tags).eq(tag)
 261
 262 # Read a PTE from a TLB PTE memory row
 263 def read_tlb_pte(way, ptes):
 264     return ptes.word_select(way, TLB_PTE_BITS)
 265
 266 def write_tlb_pte(way, ptes, newpte):
 267     return read_tlb_pte(way, ptes).eq(newpte)
 268
 269
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission / attribute flags for a translation, one bit each.

    The PTE bit positions noted below are the ones DCache.tlb_search
    uses when it fills this record in virtual mode.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal() # from pte[8]
        self.changed   = Signal() # from pte[7]
        self.nocache   = Signal() # from pte[5]
        self.priv      = Signal() # from pte[3]
        self.rd_perm   = Signal() # from pte[2]
        self.wr_perm   = Signal() # from pte[1]
 280
 281
 282 def extract_perm_attr(pte):
 283     pa = PermAttr()
 284     return pa;
 285
 286
 287 # Type of operation on a "valid" input
 288 @unique
 289 class Op(Enum):
 290     OP_NONE       = 0
 291     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 292     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 293     OP_LOAD_HIT   = 3 # Cache hit on load
 294     OP_LOAD_MISS  = 4 # Load missing cache
 295     OP_LOAD_NC    = 5 # Non-cachable load
 296     OP_STORE_HIT  = 6 # Store hitting cache
 297     OP_STORE_MISS = 7 # Store missing cache
 298
 299
 300 # Cache state machine
 301 @unique
 302 class State(Enum):
 303     IDLE             = 0 # Normal load hit processing
 304     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 305     STORE_WAIT_ACK   = 2 # Store wait ack
 306     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 307
 308
 309 # Dcache operations:
 310 #
 311 # In order to make timing, we use the BRAMs with
 312 # an output buffer, which means that the BRAM
 313 # output is delayed by an extra cycle.
 314 #
 315 # Thus, the dcache has a 2-stage internal pipeline
 316 # for cache hits with no stalls.
 317 #
 318 # All other operations are handled via stalling
 319 # in the first stage.
 320 #
 321 # The second stage can thus complete a hit at the same
 322 # time as the first stage emits a stall for a complex op.
 323 #
 324 # Stage 0 register, basically contains just the latched request
 325
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus MMU side-band flags.

    DCache.stage_0 fills a record of this type from either the MMU
    port or the load/store port.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # the request itself, in load/store request format
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
        self.doall   = Signal() # with tlbie, indicates flush whole TLB
        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now
 335
 336
class MemAccessRequest(RecordObject):
    """A decoded memory access request (held as RegStage1.req)."""
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)             # operation type, see Op
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS) # real address
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)              # one bit per data byte
        self.hit_way   = Signal(WAY_BITS)       # cache way index
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 349
 350
 351 # First stage register, contains state for stage 1 of load hits
 352 # and for the state machine used by all other operations
 353 class RegStage1(RecordObject):
 354     def __init__(self, name=None):
 355         super().__init__(name=name)
 356         # Info about the request
 357         self.full             = Signal() # have uncompleted request
 358         self.mmu_req          = Signal() # request is from MMU
 359         self.req              = MemAccessRequest(name="reqmem")
 360
 361         # Cache hit state
 362         self.hit_way          = Signal(WAY_BITS)
 363         self.hit_load_valid   = Signal()
 364         self.hit_index        = Signal(INDEX_BITS)
 365         self.cache_hit        = Signal()
 366
 367         # TLB hit state
 368         self.tlb_hit          = Signal()
 369         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 370         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 371
 372         # 2-stage data buffer for data forwarded from writes to reads
 373         self.forward_data1    = Signal(64)
 374         self.forward_data2    = Signal(64)
 375         self.forward_sel1     = Signal(8)
 376         self.forward_valid1   = Signal()
 377         self.forward_way1     = Signal(WAY_BITS)
 378         self.forward_row1     = Signal(ROW_BITS)
 379         self.use_forward1     = Signal()
 380         self.forward_sel      = Signal(8)
 381
 382         # Cache miss state (reload state machine)
 383         self.state            = Signal(State)
 384         self.dcbz             = Signal()
 385         self.write_bram       = Signal()
 386         self.write_tag        = Signal()
 387         self.slow_valid       = Signal()
 388         self.real_adr         = Signal(REAL_ADDR_BITS)
 389         self.wb               = WBMasterOut("wb")
 390         self.reload_tag       = Signal(TAG_BITS)
 391         self.store_way        = Signal(WAY_BITS)
 392         self.store_row        = Signal(ROW_BITS)
 393         self.store_index      = Signal(INDEX_BITS)
 394         self.end_row_ix       = Signal(ROW_LINE_BITS)
 395         self.rows_valid       = RowPerLineValidArray()
 396         self.acks_pending     = Signal(3)
 397         self.inc_acks         = Signal()
 398         self.dec_acks         = Signal()
 399
 400         # Signals to complete (possibly with error)
 401         self.ls_valid         = Signal()
 402         self.ls_error         = Signal()
 403         self.mmu_done         = Signal()
 404         self.mmu_error        = Signal()
 405         self.cache_paradox    = Signal()
 406
 407         # Signal to complete a failed stcx.
 408         self.stcx_fail        = Signal()
 409
 410
# Reservation information
class Reservation(RecordObject):
    """A reservation: a valid flag plus a line-granular address."""
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        # 64-bit address with the LINE_OFF_BITS in-line offset bits dropped
        self.addr  = Signal(64-LINE_OFF_BITS)
 417
 418
 419 class DTLBUpdate(Elaboratable):
 420     def __init__(self):
 421         self.tlbie    = Signal()
 422         self.tlbwe    = Signal()
 423         self.doall    = Signal()
 424         self.updated  = Signal()
 425         self.v_updated  = Signal()
 426         self.tlb_hit    = Signal()
 427         self.tlb_req_index = Signal(TLB_SET_BITS)
 428
 429         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
 430         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 431         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 432         self.repl_way        = Signal(TLB_WAY_BITS)
 433         self.eatag           = Signal(TLB_EA_TAG_BITS)
 434         self.pte_data        = Signal(TLB_PTE_BITS)
 435
 436         self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
 437
 438         self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
 439         self.pb_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
 440         self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
 441
 442     def elaborate(self, platform):
 443         m = Module()
 444         comb = m.d.comb
 445         sync = m.d.sync
 446
 447         tagset   = Signal(TLB_TAG_WAY_BITS)
 448         pteset   = Signal(TLB_PTE_WAY_BITS)
 449
 450         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 451         comb += db_out.eq(self.dv)
 452
 453         with m.If(self.tlbie & self.doall):
 454             pass # clear all back in parent
 455         with m.Elif(self.tlbie):
 456             with m.If(self.tlb_hit):
 457                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
 458                 comb += self.v_updated.eq(1)
 459
 460         with m.Elif(self.tlbwe):
 461
 462             comb += tagset.eq(self.tlb_tag_way)
 463             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 464             comb += tb_out.eq(tagset)
 465
 466             comb += pteset.eq(self.tlb_pte_way)
 467             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 468             comb += pb_out.eq(pteset)
 469
 470             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 471
 472             comb += self.updated.eq(1)
 473             comb += self.v_updated.eq(1)
 474
 475         return m
 476
 477
class DCachePendingHit(Elaboratable):
    """Combinatorial cache hit detection for the pending request.

    Drives is_hit, hit_way and rel_match (and the caller-supplied
    hit_set) from the cache tags/valid bits of the indexed line and,
    in virtual mode, from the real addresses held in each TLB way.
    """

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                      cache_valid_idx, cache_tag_set,
                    req_addr,
                    hit_set):
        """Store the externally supplied signals and create the ports.

        The constructor arguments are signals owned by the caller; they
        are only stored here and used in elaborate().
        """

        # ports created here
        self.go          = Signal()          # gates every tag comparison
        self.virt_mode   = Signal()          # selects the TLB-based path
        self.is_hit      = Signal()          # result: request hits
        self.tlb_hit     = Signal()          # input: TLB lookup hit
        self.hit_way     = Signal(WAY_BITS)  # result: way that hit
        self.rel_match   = Signal()          # result: tag equals reload_tag
        self.req_index   = Signal(INDEX_BITS)
        self.reload_tag  = Signal(TAG_BITS)

        # signals supplied by the caller
        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        # per-TLB-way intermediate results, selected by tlb_hit_way below
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal()
                s_pte       = Signal(TLB_PTE_BITS)
                s_ra        = Signal(REAL_ADDR_BITS)
                # real address candidate for TLB way j: page offset from
                # the request, upper bits from that way's PTE
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & tlb_valid_way[j])
                    # later assignments take priority, so the highest
                    # matching way is the one recorded
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            # only the results of the TLB way that actually hit are used
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            # real mode: the tag comes straight from the request address
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
 573
 574
 575 class DCache(Elaboratable):
 576     """Set associative dcache write-through
 577     TODO (in no specific order):
 578     * See list in icache.vhdl
 579     * Complete load misses on the cycle when WB data comes instead of
 580       at the end of line (this requires dealing with requests coming in
 581       while not idle...)
 582     """
    def __init__(self):
        # load/store port: request in, response out
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        # MMU port: request in, response out
        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # wishbone: master signals out, slave signals in
        self.wb_out    = WBMasterOut()
        self.wb_in     = WBSlaveOut()

        # presumably the log data enabled by LOG_LENGTH - TODO confirm
        self.log_out   = Signal(20)
 596
    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling

        A candidate request `r` is built combinatorially from the MMU
        port when m_in.valid is set, otherwise from the load/store port.
        It is copied into r0 (and r0_full updated) whenever r0 is empty,
        or r1 is not full and d_in.hold is not asserted.
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        # combinatorial candidate for the next contents of r0
        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        # an MMU request takes precedence over a load/store request
        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            # a load unless it is a tlbie or a TLB load
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
        with m.Else():
            comb += r.req.eq(d_in)
            # data is zeroed here; it is sampled from d_in.data further down
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
            # Sample data the cycle after a request comes in from loadstore1.
            # If another request has come in already then the data will get
            # put directly into req.data below.
            # (these assignments come after r0.eq(r), so they take
            # priority over it for r0.req.data and r0.d_valid)
            with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                     ~r0.mmu_req):
                sync += r0.req.data.eq(d_in.data)
                sync += r0.d_valid.eq(1)
 641                 sync += r0.d_valid.eq(1)
 642
 643     def tlb_read(self, m, r0_stall, tlb_valid_way,
 644                  tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
 645                  dtlb_tags, dtlb_ptes):
 646         """TLB
 647         Operates in the second cycle on the request latched in r0.req.
 648         TLB updates write the entry at the end of the second cycle.
 649         """
 650         comb = m.d.comb
 651         sync = m.d.sync
 652         m_in, d_in = self.m_in, self.d_in
 653
 654         index    = Signal(TLB_SET_BITS)
 655         addrbits = Signal(TLB_SET_BITS)
 656
 657         amin = TLB_LG_PGSZ
 658         amax = TLB_LG_PGSZ + TLB_SET_BITS
 659
 660         with m.If(m_in.valid):
 661             comb += addrbits.eq(m_in.addr[amin : amax])
 662         with m.Else():
 663             comb += addrbits.eq(d_in.addr[amin : amax])
 664         comb += index.eq(addrbits)
 665
 666         # If we have any op and the previous op isn't finished,
 667         # then keep the same output for next cycle.
 668         with m.If(~r0_stall):
 669             sync += tlb_valid_way.eq(dtlb_valid_bits[index])
 670             sync += tlb_tag_way.eq(dtlb_tags[index])
 671             sync += tlb_pte_way.eq(dtlb_ptes[index])
 672
 673     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 674         """Generate TLB PLRUs
 675         """
 676         comb = m.d.comb
 677         sync = m.d.sync
 678
 679         if TLB_NUM_WAYS == 0:
 680             return
 681         for i in range(TLB_SET_SIZE):
 682             # TLB PLRU interface
 683             tlb_plru        = PLRU(TLB_WAY_BITS)
 684             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 685             tlb_plru_acc_en = Signal()
 686
 687             comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
 688             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 689             comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
 690             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 691
    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
        """Look the request in r0 up in the TLB set read by tlb_read.

        Drives, all combinatorially: tlb_req_index (the set number),
        tlb_hit / tlb_hit_way, pte (on a hit), valid_ra, the real
        address ra and the perm_attr record.
        """

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit    = Signal()
        eatag  = Signal(TLB_EA_TAG_BITS)

        # split the address above the page offset into set index and tag
        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        # compare the tag of every valid way; later assignments take
        # priority, so the highest matching way is the one reported
        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal()
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        # a hit only counts when the request in r0 is valid
        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        # the real address is usable on a TLB hit, or always in real mode
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            # translated: page offset from the request, upper bits from
            # the PTE; the low ROW_OFF_BITS are forced to zero
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            # permission / attribute bits are taken from the PTE
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            # untranslated: request address with the low ROW_OFF_BITS
            # forced to zero
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            # fixed attributes: everything set except nocache
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)
 740
 741     def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
 742                     tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
 743                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 744
 745         dtlb_valids = TLBValidBitsArray()
 746
 747         comb = m.d.comb
 748         sync = m.d.sync
 749
 750         tlbie    = Signal()
 751         tlbwe    = Signal()
 752
 753         comb += tlbie.eq(r0_valid & r0.tlbie)
 754         comb += tlbwe.eq(r0_valid & r0.tlbld)
 755
 756         m.submodules.tlb_update = d = DTLBUpdate()
 757         with m.If(tlbie & r0.doall):
 758             # clear all valid bits at once
 759             for i in range(TLB_SET_SIZE):
 760                 sync += dtlb_valid_bits[i].eq(0)
 761         with m.If(d.updated):
 762             sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
 763             sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
 764         with m.If(d.v_updated):
 765             sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
 766
 767         comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
 768
 769         comb += d.tlbie.eq(tlbie)
 770         comb += d.tlbwe.eq(tlbwe)
 771         comb += d.doall.eq(r0.doall)
 772         comb += d.tlb_hit.eq(tlb_hit)
 773         comb += d.tlb_hit_way.eq(tlb_hit_way)
 774         comb += d.tlb_tag_way.eq(tlb_tag_way)
 775         comb += d.tlb_pte_way.eq(tlb_pte_way)
 776         comb += d.tlb_req_index.eq(tlb_req_index)
 777
 778         with m.If(tlb_hit):
 779             comb += d.repl_way.eq(tlb_hit_way)
 780         with m.Else():
 781             comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
 782         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 783         comb += d.pte_data.eq(r0.req.data)
 784
 785     def maybe_plrus(self, m, r1, plru_victim):
 786         """Generate PLRUs
 787         """
 788         comb = m.d.comb
 789         sync = m.d.sync
 790
 791         if TLB_NUM_WAYS == 0:
 792             return
 793
 794         for i in range(NUM_LINES):
 795             # PLRU interface
 796             plru        = PLRU(WAY_BITS)
 797             setattr(m.submodules, "plru%d" % i, plru)
 798             plru_acc_en = Signal()
 799
 800             comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
 801             comb += plru.acc_en.eq(plru_acc_en)
 802             comb += plru.acc_i.eq(r1.hit_way)
 803             comb += plru_victim[i].eq(plru.lru_o)
 804
 805     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 806         """Cache tag RAM read port
 807         """
 808         comb = m.d.comb
 809         sync = m.d.sync
 810         m_in, d_in = self.m_in, self.d_in
 811
 812         index = Signal(INDEX_BITS)
 813
 814         with m.If(r0_stall):
 815             comb += index.eq(req_index)
 816         with m.Elif(m_in.valid):
 817             comb += index.eq(get_index(m_in.addr))
 818         with m.Else():
 819             comb += index.eq(get_index(d_in.addr))
 820         sync += cache_tag_set.eq(cache_tags[index])
 821
    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valids, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection

        Purely combinatorial.  From the request held in r0 and the
        address ra, this drives:

        * req_index, req_row (from r0.req.addr) and req_tag (from ra)
        * req_hit_way and req_same_tag, taken from the DCachePendingHit
          submodule (the hit way is overridden with replace_way while the
          requested line is the one being reloaded)
        * use_forward1_next / use_forward2_next: whether data written
          recently has to be bypassed to this request
        * replace_way: the way to replace on a miss
        * rc_ok, perm_ok, access_ok: access checks based on perm_attr
        * req_op (an Op value) and req_go: the decoded operation
        * early_req_row: row number valid one cycle earlier, used to
          read the cache data BRAM
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit      = Signal()
        hit_way     = Signal(WAY_BITS)
        op          = Signal(Op)
        opsel       = Signal(3)
        go          = Signal()
        nc          = Signal()
        hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        cache_valid_idx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                    r0.req.addr, ra, req_index, req_tag, req_row)

        # "go": a valid request that is neither a TLB operation
        # (tlbie/tlbld) nor arriving while r1.ls_error is set
        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        # valid bits (one per way) of the line selected by the request
        comb += cache_valid_idx.eq(cache_valids[req_index])

        # hit detection is delegated to the DCachePendingHit submodule
        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                tlb_valid_way, tlb_hit_way,
                                cache_valid_idx, cache_tag_set,
                                r0.req.addr,
                                hit_set)

        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)
        # defaults for is_hit/hit_way: these may be overridden just below
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                              (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                # opsel bits, LSB first: is_hit, nc, load
                # (so the case patterns below read load-nc-hit)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)
 951
 952     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 953                          r0_valid, r0, reservation):
 954         """Handle load-with-reservation and store-conditional instructions
 955         """
 956         comb = m.d.comb
 957
 958         with m.If(r0_valid & r0.req.reserve):
 959             # XXX generate alignment interrupt if address
 960             # is not aligned XXX or if r0.req.nc = '1'
 961             with m.If(r0.req.load):
 962                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
 963             with m.Else():
 964                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
 965                 with m.If((~reservation.valid) |
 966                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
 967                     comb += cancel_store.eq(1)
 968
 969     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
 970                         reservation, r0):
 971
 972         comb = m.d.comb
 973         sync = m.d.sync
 974
 975         with m.If(r0_valid & access_ok):
 976             with m.If(clear_rsrv):
 977                 sync += reservation.valid.eq(0)
 978             with m.Elif(set_rsrv):
 979                 sync += reservation.valid.eq(1)
 980                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 981
    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic

        Builds the 64-bit load result from cache_out_row, replacing each
        byte flagged in r1.forward_sel with forwarded (bypass) data, and
        drives the d_out and m_out outputs from the r1 status registers.
        The rest of the method only emits Display messages for debug.

        * m: the Module being built
        * r1: stage 1 registers (read only here)
        * cache_out_row: row of data read from the cache RAM
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        # default: data straight from the cache RAM...
        comb += data_out.eq(cache_out_row)

        # ...with individual bytes overridden by forwarded data
        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # Outputs to loadstore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        # NOTE(review): disabled code.  These are Python asserts on
        # signal expressions, so they cannot check simulation values.
        # The second expression ORs hit_load_valid in, where a collision
        # check would presumably AND it - confirm before enabling.
        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
            "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
             "unexpected hit_load_delayed collision with slow_valid"

        # Everything below is debug output only: no signals are driven
        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("combpleting MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, data=%x",
                                m_out.data)
1066
    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.

        * m: the Module being built
        * r1: stage 1 registers (read only here)
        * early_req_row: row address used to read every way
        * cache_out_row: driven with the read data of way r1.hit_way
        * replace_way: way that receives reload (refill) writes
        """
        comb = m.d.comb
        wb_in = self.wb_in

        for i in range(NUM_WAYS):
            do_read  = Signal(name="do_rd%d" % i)
            rd_addr  = Signal(ROW_BITS, name="rd_addr_%d" % i)
            do_write = Signal(name="do_wr%d" % i)
            wr_addr  = Signal(ROW_BITS, name="wr_addr_%d" % i)
            wr_data  = Signal(WB_DATA_BITS, name="din_%d" % i)
            wr_sel   = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            # one RAM per way, added as submodule "cacheram_<i>"
            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            # the RAM gets the *masked* write selects (see end of loop)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            # (every way is read, every cycle; only the data of the
            # way recorded in r1.hit_way is passed on)
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            with m.If(r1.hit_way == i):
                comb += cache_out_row.eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM.  This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

            # Reload write: an ack arrived for the way being replaced.
            # NOTE(review): this check sits outside the write_bram
            # If/Else above, so it can assert do_write regardless of
            # r1.write_bram - confirm that is intended.
            with m.If((r1.state == State.RELOAD_WAIT_ACK)
                      & wb_in.ack & (replace_way == i)):
                comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)
1140
1141     # Cache hit synchronous machine for the easy case.
1142     # This handles load hits.
1143     # It also handles error cases (TLB miss, cache paradox)
1144     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1145                         req_hit_way, req_index, req_tag, access_ok,
1146                         tlb_hit, tlb_hit_way, tlb_req_index):
1147
1148         comb = m.d.comb
1149         sync = m.d.sync
1150
1151         with m.If(req_op != Op.OP_NONE):
1152             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1153                     req_op, r0.req.addr, r0.req.nc,
1154                     req_index, req_tag, req_hit_way)
1155
1156         with m.If(r0_valid):
1157             sync += r1.mmu_req.eq(r0.mmu_req)
1158
1159         # Fast path for load/store hits.
1160         # Set signals for the writeback controls.
1161         sync += r1.hit_way.eq(req_hit_way)
1162         sync += r1.hit_index.eq(req_index)
1163
1164         with m.If(req_op == Op.OP_LOAD_HIT):
1165             sync += r1.hit_load_valid.eq(1)
1166         with m.Else():
1167             sync += r1.hit_load_valid.eq(0)
1168
1169         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1170             sync += r1.cache_hit.eq(1)
1171         with m.Else():
1172             sync += r1.cache_hit.eq(0)
1173
1174         with m.If(req_op == Op.OP_BAD):
1175             # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1176             #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
1177             sync += r1.ls_error.eq(~r0.mmu_req)
1178             sync += r1.mmu_error.eq(r0.mmu_req)
1179             sync += r1.cache_paradox.eq(access_ok)
1180
1181             with m.Else():
1182                 sync += r1.ls_error.eq(0)
1183                 sync += r1.mmu_error.eq(0)
1184                 sync += r1.cache_paradox.eq(0)
1185
1186         with m.If(req_op == Op.OP_STCX_FAIL):
1187             sync += r1.stcx_fail.eq(1)
1188         with m.Else():
1189             sync += r1.stcx_fail.eq(0)
1190
1191         # Record TLB hit information for updating TLB PLRU
1192         sync += r1.tlb_hit.eq(tlb_hit)
1193         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1194         sync += r1.tlb_hit_index.eq(tlb_req_index)
1195
1196     # Memory accesses are handled by this state machine:
1197     #
1198     #   * Cache load miss/reload (in conjunction with "rams")
1199     #   * Load hits for non-cachable forms
1200     #   * Stores (the collision case is handled in "rams")
1201     #
1202     # All wishbone requests generation is done here.
1203     # This machine operates at stage 1.
1204     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1205                     cache_valids, r0, replace_way,
1206                     req_hit_way, req_same_tag,
1207                     r0_valid, req_op, cache_tags, req_go, ra):
1208
1209         comb = m.d.comb
1210         sync = m.d.sync
1211         wb_in = self.wb_in
1212         d_in = self.d_in
1213
1214         req         = MemAccessRequest("mreq_ds")
1215
1216         req_row = Signal(ROW_BITS)
1217         req_idx = Signal(INDEX_BITS)
1218         req_tag = Signal(TAG_BITS)
1219         comb += req_idx.eq(get_index(req.real_addr))
1220         comb += req_row.eq(get_row(req.real_addr))
1221         comb += req_tag.eq(get_tag(req.real_addr))
1222
1223         sync += r1.use_forward1.eq(use_forward1_next)
1224         sync += r1.forward_sel.eq(0)
1225
1226         with m.If(use_forward1_next):
1227             sync += r1.forward_sel.eq(r1.req.byte_sel)
1228         with m.Elif(use_forward2_next):
1229             sync += r1.forward_sel.eq(r1.forward_sel1)
1230
1231         sync += r1.forward_data2.eq(r1.forward_data1)
1232         with m.If(r1.write_bram):
1233             sync += r1.forward_data1.eq(r1.req.data)
1234             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1235             sync += r1.forward_way1.eq(r1.req.hit_way)
1236             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1237             sync += r1.forward_valid1.eq(1)
1238         with m.Else():
1239             with m.If(r1.dcbz):
1240                 sync += r1.forward_data1.eq(0)
1241             with m.Else():
1242                 sync += r1.forward_data1.eq(wb_in.dat)
1243             sync += r1.forward_sel1.eq(~0) # all 1s
1244             sync += r1.forward_way1.eq(replace_way)
1245             sync += r1.forward_row1.eq(r1.store_row)
1246             sync += r1.forward_valid1.eq(0)
1247
1248         # One cycle pulses reset
1249         sync += r1.slow_valid.eq(0)
1250         sync += r1.write_bram.eq(0)
1251         sync += r1.inc_acks.eq(0)
1252         sync += r1.dec_acks.eq(0)
1253
1254         sync += r1.ls_valid.eq(0)
1255         # complete tlbies and TLB loads in the third cycle
1256         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1257
1258         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1259             with m.If(~r0.mmu_req):
1260                 sync += r1.ls_valid.eq(1)
1261             with m.Else():
1262                 sync += r1.mmu_done.eq(1)
1263
1264         with m.If(r1.write_tag):
1265             # Store new tag in selected way
1266             for i in range(NUM_WAYS):
1267                 with m.If(i == replace_way):
1268                     ct = Signal(TAG_RAM_WIDTH)
1269                     comb += ct.eq(cache_tags[r1.store_index])
1270                     """
1271 TODO: check this
1272 cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
1273                     (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
1274                     """
1275                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1276                     sync += cache_tags[r1.store_index].eq(ct)
1277             sync += r1.store_way.eq(replace_way)
1278             sync += r1.write_tag.eq(0)
1279
1280         # Take request from r1.req if there is one there,
1281         # else from req_op, ra, etc.
1282         with m.If(r1.full):
1283             comb += req.eq(r1.req)
1284         with m.Else():
1285             comb += req.op.eq(req_op)
1286             comb += req.valid.eq(req_go)
1287             comb += req.mmu_req.eq(r0.mmu_req)
1288             comb += req.dcbz.eq(r0.req.dcbz)
1289             comb += req.real_addr.eq(ra)
1290
1291             with m.If(r0.req.dcbz):
1292                 # force data to 0 for dcbz
1293                 comb += req.data.eq(0)
1294             with m.Elif(r0.d_valid):
1295                 comb += req.data.eq(r0.req.data)
1296             with m.Else():
1297                 comb += req.data.eq(d_in.data)
1298
1299             # Select all bytes for dcbz
1300             # and for cacheable loads
1301             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1302                 comb += req.byte_sel.eq(~0) # all 1s
1303             with m.Else():
1304                 comb += req.byte_sel.eq(r0.req.byte_sel)
1305             comb += req.hit_way.eq(req_hit_way)
1306             comb += req.same_tag.eq(req_same_tag)
1307
1308             # Store the incoming request from r0,
1309             # if it is a slow request
1310             # Note that r1.full = 1 implies req_op = OP_NONE
1311             with m.If((req_op == Op.OP_LOAD_MISS)
1312                       | (req_op == Op.OP_LOAD_NC)
1313                       | (req_op == Op.OP_STORE_MISS)
1314                       | (req_op == Op.OP_STORE_HIT)):
1315                 sync += r1.req.eq(req)
1316                 sync += r1.full.eq(1)
1317
1318         # Main state machine
1319         with m.Switch(r1.state):
1320
1321             with m.Case(State.IDLE):
1322                 sync += r1.real_adr.eq(req.real_addr)
1323                 sync += r1.wb.sel.eq(req.byte_sel)
1324                 sync += r1.wb.dat.eq(req.data)
1325                 sync += r1.dcbz.eq(req.dcbz)
1326
1327                 # Keep track of our index and way
1328                 # for subsequent stores.
1329                 sync += r1.store_index.eq(req_idx)
1330                 sync += r1.store_row.eq(req_row)
1331                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1332                 sync += r1.reload_tag.eq(req_tag)
1333                 sync += r1.req.same_tag.eq(1)
1334
1335                 with m.If(req.op == Op.OP_STORE_HIT):
1336                     sync += r1.store_way.eq(req.hit_way)
1337
1338                 # Reset per-row valid bits,
1339                 # ready for handling OP_LOAD_MISS
1340                 for i in range(ROW_PER_LINE):
1341                     sync += r1.rows_valid[i].eq(0)
1342
1343                 with m.If(req_op != Op.OP_NONE):
1344                     sync += Display("cache op %d", req.op)
1345
1346                 with m.Switch(req.op):
1347                     with m.Case(Op.OP_LOAD_HIT):
1348                         # stay in IDLE state
1349                         pass
1350
1351                     with m.Case(Op.OP_LOAD_MISS):
1352                         sync += Display("cache miss real addr: %x " \
1353                                 "idx: %x tag: %x",
1354                                 req.real_addr, req_row, req_tag)
1355
1356                         # Start the wishbone cycle
1357                         sync += r1.wb.we.eq(0)
1358                         sync += r1.wb.cyc.eq(1)
1359                         sync += r1.wb.stb.eq(1)
1360
1361                         # Track that we had one request sent
1362                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1363                         sync += r1.write_tag.eq(1)
1364
1365                     with m.Case(Op.OP_LOAD_NC):
1366                         sync += r1.wb.cyc.eq(1)
1367                         sync += r1.wb.stb.eq(1)
1368                         sync += r1.wb.we.eq(0)
1369                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1370
1371                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1372                         with m.If(~req.dcbz):
1373                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1374                             sync += r1.acks_pending.eq(1)
1375                             sync += r1.full.eq(0)
1376                             sync += r1.slow_valid.eq(1)
1377
1378                             with m.If(~req.mmu_req):
1379                                 sync += r1.ls_valid.eq(1)
1380                             with m.Else():
1381                                 sync += r1.mmu_done.eq(1)
1382
1383                             with m.If(req.op == Op.OP_STORE_HIT):
1384                                 sync += r1.write_bram.eq(1)
1385                         with m.Else():
1386                             # dcbz is handled much like a load miss except
1387                             # that we are writing to memory instead of reading
1388                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1389
1390                             with m.If(req.op == Op.OP_STORE_MISS):
1391                                 sync += r1.write_tag.eq(1)
1392
1393                         sync += r1.wb.we.eq(1)
1394                         sync += r1.wb.cyc.eq(1)
1395                         sync += r1.wb.stb.eq(1)
1396
1397                     # OP_NONE and OP_BAD do nothing
1398                     # OP_BAD & OP_STCX_FAIL were
1399                     # handled above already
1400                     with m.Case(Op.OP_NONE):
1401                         pass
1402                     with m.Case(Op.OP_BAD):
1403                         pass
1404                     with m.Case(Op.OP_STCX_FAIL):
1405                         pass
1406
1407             with m.Case(State.RELOAD_WAIT_ACK):
1408                 ld_stbs_done = Signal()
1409                 # Requests are all sent if stb is 0
1410                 comb += ld_stbs_done.eq(~r1.wb.stb)
1411
                # If we are still sending requests, was one accepted?
1413                 with m.If((~wb_in.stall) & r1.wb.stb):
1414                     # That was the last word?  We are done sending.
1415                     # Clear stb and set ld_stbs_done so we can handle an
1416                     # eventual last ack on the same cycle.
1417                     with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
1418                         sync += r1.wb.stb.eq(0)
1419                         comb += ld_stbs_done.eq(1)
1420
1421                     # Calculate the next row address in the current cache line
1422                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1423                     comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
1424                     sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
1425
1426                 # Incoming acks processing
1427                 sync += r1.forward_valid1.eq(wb_in.ack)
1428                 with m.If(wb_in.ack):
1429                     srow = Signal(ROW_LINE_BITS)
1430                     comb += srow.eq(r1.store_row)
1431                     sync += r1.rows_valid[srow].eq(1)
1432
1433                     # If this is the data we were looking for,
1434                     # we can complete the request next cycle.
1435                     # Compare the whole address in case the
1436                     # request in r1.req is not the one that
1437                     # started this refill.
1438                     with m.If(req.valid & r1.req.same_tag &
1439                               ((r1.dcbz & r1.req.dcbz) |
1440                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1441                                 (r1.store_row == get_row(req.real_addr))):
1442                         sync += r1.full.eq(0)
1443                         sync += r1.slow_valid.eq(1)
1444                         with m.If(~r1.mmu_req):
1445                             sync += r1.ls_valid.eq(1)
1446                         with m.Else():
1447                             sync += r1.mmu_done.eq(1)
1448                         sync += r1.forward_sel.eq(~0) # all 1s
1449                         sync += r1.use_forward1.eq(1)
1450
1451                     # Check for completion
1452                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1453                                                       r1.end_row_ix)):
1454                         # Complete wishbone cycle
1455                         sync += r1.wb.cyc.eq(0)
1456
1457                         # Cache line is now valid
1458                         cv = Signal(INDEX_BITS)
1459                         comb += cv.eq(cache_valids[r1.store_index])
1460                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1461                         sync += cache_valids[r1.store_index].eq(cv)
1462
1463                         sync += r1.state.eq(State.IDLE)
1464
1465                     # Increment store row counter
1466                     sync += r1.store_row.eq(next_row(r1.store_row))
1467
1468             with m.Case(State.STORE_WAIT_ACK):
1469                 st_stbs_done = Signal()
1470                 acks        = Signal(3)
1471                 adjust_acks = Signal(3)
1472
1473                 comb += st_stbs_done.eq(~r1.wb.stb)
1474                 comb += acks.eq(r1.acks_pending)
1475
1476                 with m.If(r1.inc_acks != r1.dec_acks):
1477                     with m.If(r1.inc_acks):
1478                         comb += adjust_acks.eq(acks + 1)
1479                     with m.Else():
1480                         comb += adjust_acks.eq(acks - 1)
1481                 with m.Else():
1482                     comb += adjust_acks.eq(acks)
1483
1484                 sync += r1.acks_pending.eq(adjust_acks)
1485
1486                 # Clear stb when slave accepted request
1487                 with m.If(~wb_in.stall):
1488                     # See if there is another store waiting
1489                     # to be done which is in the same real page.
1490                     with m.If(req.valid):
1491                         _ra = req.real_addr[0:SET_SIZE_BITS]
1492                         sync += r1.real_adr[0:SET_SIZE_BITS].eq(_ra)
1493                         sync += r1.wb.dat.eq(req.data)
1494                         sync += r1.wb.sel.eq(req.byte_sel)
1495
1496                     with m.If((adjust_acks < 7) & req.same_tag &
1497                                 ((req.op == Op.OP_STORE_MISS)
1498                                  | (req.op == Op.OP_STORE_HIT))):
1499                         sync += r1.wb.stb.eq(1)
1500                         comb += st_stbs_done.eq(0)
1501
1502                         with m.If(req.op == Op.OP_STORE_HIT):
1503                             sync += r1.write_bram.eq(1)
1504                         sync += r1.full.eq(0)
1505                         sync += r1.slow_valid.eq(1)
1506
1507                         # Store requests never come from the MMU
1508                         sync += r1.ls_valid.eq(1)
1509                         comb += st_stbs_done.eq(0)
1510                         sync += r1.inc_acks.eq(1)
1511                     with m.Else():
1512                         sync += r1.wb.stb.eq(0)
1513                         comb += st_stbs_done.eq(1)
1514
1515                 # Got ack ? See if complete.
1516                 with m.If(wb_in.ack):
1517                     with m.If(st_stbs_done & (adjust_acks == 1)):
1518                         sync += r1.state.eq(State.IDLE)
1519                         sync += r1.wb.cyc.eq(0)
1520                         sync += r1.wb.stb.eq(0)
1521                     sync += r1.dec_acks.eq(1)
1522
1523             with m.Case(State.NC_LOAD_WAIT_ACK):
1524                 # Clear stb when slave accepted request
1525                 with m.If(~wb_in.stall):
1526                     sync += r1.wb.stb.eq(0)
1527
1528                 # Got ack ? complete.
1529                 with m.If(wb_in.ack):
1530                     sync += r1.state.eq(State.IDLE)
1531                     sync += r1.full.eq(0)
1532                     sync += r1.slow_valid.eq(1)
1533
1534                     with m.If(~r1.mmu_req):
1535                         sync += r1.ls_valid.eq(1)
1536                     with m.Else():
1537                         sync += r1.mmu_done.eq(1)
1538
1539                     sync += r1.forward_sel.eq(~0) # all 1s
1540                     sync += r1.use_forward1.eq(1)
1541                     sync += r1.wb.cyc.eq(0)
1542                     sync += r1.wb.stb.eq(0)
1543
1544     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1545
1546         sync = m.d.sync
1547         d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1548
1549         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1550                                stall_out, req_op[:3], d_out.valid, d_out.error,
1551                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1552                                r1.real_adr[3:6]))
1553
    def elaborate(self, platform):
        """Build and return the nmigen Module implementing the data cache.

        The method only creates the shared storage, pipeline registers
        and interconnect signals, adds a little glue logic (stall/valid
        for stage 0, wishbone output wiring), and then hands everything
        to the per-function helper methods (stage_0, tlb_read, ...,
        dcache_slow) which add the actual logic to the same Module.

        `platform` is not used in this method.

        NOTE(review): nmigen normally derives a Signal's name from the
        variable it is assigned to, so renaming the locals below would
        presumably rename the signals in generated RTLIL/VCD output.
        """

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)
        cache_valids = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        # (the bare string below is an expression statement, not a comment;
        # it has no effect at runtime)
        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags       = TLBTagsArray()
        dtlb_ptes       = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        # Stage 0 request register and its "occupied" flag
        r0      = RegStage0("r0")
        r0_full = Signal()

        # Stage 1 register (state machine, wishbone request latch, ...)
        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        req_data     = Signal(64)   # declared but not used in this method
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row     = Signal(ROW_BITS)

        cancel_store      = Signal()
        set_rsrv          = Signal()
        clear_rsrv        = Signal()

        r0_valid          = Signal()
        r0_stall          = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row     = Signal(WB_DATA_BITS)

        plru_victim       = PLRUOut()
        replace_way       = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel           = Signal(8)   # declared but not used in this method

        # TLB signals
        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = Signal()
        tlb_hit_way   = Signal(TLB_WAY_BITS)
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        # (or when d_in.hold is asserted): r0_stall and r0_valid are
        # mutually exclusive whenever r0_full is set.
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += r1.wb.adr.eq(r1.real_adr)
        comb += self.wb_out.eq(r1.wb)
        # adr is assigned again after the whole-record copy above; keep
        # this order (presumably the later assignment is the one that
        # takes effect for the adr field - TODO confirm).
        comb += self.wb_out.adr.eq(r1.wb.adr[3:]) # truncate LSBs

        # deal with litex not doing wishbone pipeline mode
        # (stall is generated locally as "cycle active and not yet acked")
        comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                           r0_valid, r1, cache_valids, replace_way,
                           use_forward1_next, use_forward2_next,
                           req_hit_way, plru_victim, rc_ok, perm_attr,
                           valid_ra, perm_ok, access_ok, req_op, req_go,
                           tlb_pte_way,
                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                           cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                           r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                           reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                    cache_valids, r0, replace_way,
                    req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # Logging is currently disabled.  Note that `stall_out` is not a
        # local name in this method (only self.stall_out exists here).
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m
1689
def dcache_load(dut, addr, nc=0):
    """Simulation helper: issue one load on dut.d_in and return the data.

    Generator for use inside a sync simulation process (call it with
    "yield from").  The request (load=1, nc, addr, all byte_sel bits set)
    is presented with valid=1 for one clock, then valid and byte_sel are
    cleared and dut.d_out.valid is polled once per clock until it reads
    non-zero.  The value of dut.d_out.data read at that point is returned.
    """
    request, response = dut.d_in, dut.d_out

    # present the request
    for signal, value in ((request.load, 1),
                          (request.nc, nc),
                          (request.addr, addr),
                          (request.byte_sel, ~0),
                          (request.valid, 1)):
        yield signal.eq(value)
    yield                                   # one clock with valid high

    # withdraw the request
    for signal in (request.valid, request.byte_sel):
        yield signal.eq(0)

    # poll for completion; data is sampled in the same cycle that valid
    # is first seen high (no extra clock in between)
    while True:
        done = yield response.valid
        if done:
            break
        yield

    result = yield response.data
    return result
1704
1705
def dcache_store(dut, addr, data, nc=0):
    """Simulation helper: issue one store on dut.d_in and wait for it.

    Generator for use inside a sync simulation process (call it with
    "yield from").  The request (load=0, nc, data, all byte_sel bits set,
    addr) is presented with valid=1 for one clock, then valid and
    byte_sel are cleared and dut.d_out.valid is polled once per clock
    until it reads non-zero.  Nothing is returned.
    """
    request = dut.d_in

    # present the request
    for signal, value in ((request.load, 0),
                          (request.nc, nc),
                          (request.data, data),
                          (request.byte_sel, ~0),
                          (request.addr, addr),
                          (request.valid, 1)):
        yield signal.eq(value)
    yield                                   # one clock with valid high

    # withdraw the request
    for signal in (request.valid, request.byte_sel):
        yield signal.eq(0)

    # poll for completion
    while True:
        done = yield dut.d_out.valid
        if done:
            break
        yield
1718
1719
def dcache_random_sim(dut, mem):
    """Randomised load/store test against a software model of memory.

    Generator for use as a sync simulation process.  A deep copy of
    `mem` serves as the reference model, indexed by 64-bit row; the byte
    address given to the cache is row * 8.  Each of the 1024 iterations:

    * picks a random row and random 64-bit data, updates the model,
      then loads and stores that address through the cache;
    * picks a second random row, loads it and asserts that the value
      matches the model.

    Finally every row is loaded once more and compared with the model.
    """
    # reference model of the memory contents
    sim_mem = deepcopy(mem)
    memsize = len(sim_mem)
    print ("mem len", memsize)

    # drive all request inputs to a known idle state
    d_in, m_in = dut.d_in, dut.m_in
    for signal, value in ((d_in.valid, 0),
                          (d_in.load, 0),
                          (d_in.priv_mode, 1),
                          (d_in.nc, 0),
                          (d_in.addr, 0),
                          (d_in.data, 0),
                          (m_in.valid, 0),
                          (m_in.addr, 0),
                          (m_in.pte, 0)):
        yield signal.eq(value)

    # wait 4 clock periods
    for _ in range(4):
        yield

    print ()

    for iteration in range(1024):
        # write phase: random row, random data (model updated first)
        wr_row = randint(0, memsize-1)
        wr_data = randint(0, (1<<64)-1)
        sim_mem[wr_row] = wr_data
        wr_addr = wr_row * 8

        print ("random testing %d 0x%x row %d data 0x%x" %
               (iteration, wr_addr, wr_row, wr_data))

        yield from dcache_load(dut, wr_addr)
        yield from dcache_store(dut, wr_addr, wr_data)

        # read-back phase: an independently chosen random row
        rd_row = randint(0, memsize-1)
        expected = sim_mem[rd_row]
        rd_addr = rd_row * 8

        print ("    load 0x%x row %d expect data 0x%x" %
               (rd_addr, rd_row, expected))
        actual = yield from dcache_load(dut, rd_addr)
        assert actual == expected, \
            "check addr 0x%x row %d data %x != %x" % (rd_addr, rd_row,
                                                      actual, expected)

    # final sweep: every row must match the model
    for row in range(memsize):
        actual = yield from dcache_load(dut, row*8)
        assert actual == sim_mem[row], \
            "final check %x data %x != %x" % (row*8, actual, sim_mem[row])
1774
def dcache_regression_sim(dut, mem):
    """Small fixed regression scenario: load row 6, then check row 7.

    Generator for use as a sync simulation process.  A deep copy of
    `mem` is the reference model, indexed by 64-bit row (byte address =
    row * 8).  The model entry for row 6 is overwritten, but the
    matching store to the cache is commented out, so only a load of
    row 6 is performed.  Row 7 is then loaded and must equal the model.

    The function no longer depends on a module-level variable "i".
    Previously "i" was read without ever being bound here, which only
    worked when the file was run as a script (the __main__ block's loop
    leaves i == len(mem) - 1 behind) and raised NameError otherwise.
    The same value is now computed locally.  The data pattern is also
    masked to 64 bits so the model holds an unsigned memory word, not a
    negative Python int.
    """
    # reference model of the memory contents
    sim_mem = deepcopy(mem)
    memsize = len(sim_mem)
    print ("mem len", memsize)

    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    for _ in range(4):
        yield

    # tag for the log line and seed of the data pattern; equals the
    # value the leaked __main__ loop index used to provide
    i = memsize - 1

    addr = 6
    data = ~i & ((1 << 64) - 1)     # 64-bit unsigned complement of i
    sim_mem[addr] = data
    row = addr
    addr *= 8

    print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))

    yield from dcache_load(dut, addr)
    # the store is intentionally left disabled in this scenario
    #yield from dcache_store(dut, addr, data)

    addr = 7
    sim_data = sim_mem[addr]
    row = addr
    addr *= 8

    print ("    load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
    data = yield from dcache_load(dut, addr)
    assert data == sim_data, \
        "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1818
1819
1820
1821
def dcache_sim(dut, mem):
    """Directed test: fixed loads and stores with hard-coded expectations.

    Generator for use as a sync simulation process.  `mem` is accepted
    for signature compatibility with the other test functions but is
    not read; the expected values are constants.  They match a memory
    whose 64-bit row r holds (2*r) | ((2*r+1) << 32), which is how the
    script's __main__ block builds the memory for this test.
    """
    # drive all request inputs to a known idle state
    d_in, m_in = dut.d_in, dut.m_in
    for signal, value in ((d_in.valid, 0),
                          (d_in.load, 0),
                          (d_in.priv_mode, 1),
                          (d_in.nc, 0),
                          (d_in.addr, 0),
                          (d_in.data, 0),
                          (m_in.valid, 0),
                          (m_in.addr, 0),
                          (m_in.pte, 0)):
        yield signal.eq(value)

    # wait 4 clock periods
    for _ in range(4):
        yield

    def load_and_check(address, expected, message, **load_args):
        # load, read back the address still driven on d_in (used only
        # for the failure message), then compare with the expectation
        data = yield from dcache_load(dut, address, **load_args)
        addr = yield dut.d_in.addr
        assert data == expected, message % (addr, data)

    # (address, expected data, failure message, extra dcache_load args)
    loads_before_stores = (
        # cacheable read of 0x58
        (0x58, 0x0000001700000016,
         "data @%x=%x expected 0x0000001700000016", {}),
        # cacheable read of 0x20
        (0x20, 0x0000000900000008,
         "data @%x=%x expected 0x0000000900000008", {}),
        # cacheable read of 0x530
        (0x530, 0x0000014D0000014C,
         "data @%x=%x expected 0000014D0000014C", {}),
        # second cacheable read of 0x530
        (0x530, 0x0000014D0000014C,
         "data @%x=%x expected 0000014D0000014C", {}),
        # non-cacheable read of 0x100
        (0x100, 0x0000004100000040,
         "data @%x=%x expected 0000004100000040", {"nc": 1}),
    )
    for address, expected, message, load_args in loads_before_stores:
        yield from load_and_check(address, expected, message, **load_args)

    # two back-to-back stores to 0x530; the second value must win
    for value in (0x121, 0x12345678):
        yield from dcache_store(dut, 0x530, value)

    # third cacheable read of 0x530 sees the last stored value
    yield from load_and_check(0x530, 0x12345678,
                              "data @%x=%x expected 0x12345678")

    # another cacheable read of 0x20 is unchanged by the stores
    yield from load_and_check(0x20, 0x0000000900000008,
                              "data @%x=%x expected 0x0000000900000008")

    # let the simulation run on for 4 more clocks
    for _ in range(4):
        yield
1891
1892
def test_dcache(mem, test_fn, test_name):
    """Simulate a DCache wired to a wishbone SRAM and run one test on it.

    * mem       - list of initial memory words (one 64-bit word per row)
    * test_fn   - generator function taking (dut, mem); run as the
                  simulation's sync process
    * test_name - suffix for the VCD file name ("test_dcache<name>.vcd")
    """
    dut = DCache()

    # backing store: 64-bit wide memory behind a byte-granular SRAM
    memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.dcache = dut
    m.submodules.sram = sram

    comb = m.d.comb
    bus, wb_out, wb_in = sram.bus, dut.wb_out, dut.wb_in

    # (destination, source) pairs: request signals go cache -> SRAM,
    # response signals go SRAM -> cache
    wiring = (
        (bus.cyc, wb_out.cyc),
        (bus.stb, wb_out.stb),
        (bus.we, wb_out.we),
        (bus.sel, wb_out.sel),
        (bus.adr, wb_out.adr),
        (bus.dat_w, wb_out.dat),
        (wb_in.ack, bus.ack),
        (wb_in.dat, bus.dat_r),
    )
    for destination, source in wiring:
        comb += destination.eq(source)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(test_fn(dut, mem)))

    with sim.write_vcd('test_dcache%s.vcd' % test_name):
        sim.run()
1920
# Script entry point: dump the DCache as RTLIL, then run simulations.
if __name__ == '__main__':
    # fixed seed so the randomised test is reproducible
    seed(0)

    # write the RTLIL for a standalone DCache to test_dcache.il
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    # 16-row memory where row i holds the value i.
    # NOTE: this is a plain loop, so `i` remains bound at module level
    # afterwards (== memsize - 1).
    mem = []
    memsize = 16
    for i in range(memsize):
        mem.append(i)

    # NOTE(review): the name passed here is "random", so the VCD is
    # written to test_dcacherandom.vcd, the same file name the random
    # test below would use.
    test_dcache(mem, dcache_regression_sim, "random")

    # NOTE(review): this exits unconditionally, so nothing below runs;
    # presumably a temporary short-circuit while debugging - confirm
    # before removing.
    exit(0)

    # 256-row memory (row i holds i) for the randomised test
    mem = []
    memsize = 256
    for i in range(memsize):
        mem.append(i)

    test_dcache(mem, dcache_random_sim, "random")

    # 1024-row memory: low 32 bits hold 2*i, high 32 bits hold 2*i+1
    mem = []
    for i in range(1024):
        mem.append((i*2)| ((i*2+1)<<32))

    test_dcache(mem, dcache_sim, "")
1949