1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8 """
9
10 import sys
11 sys.setrecursionlimit(1000000)
12
13 from enum import Enum, unique
14
15 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
16 from nmutil.util import Display
17
18 from copy import deepcopy
19 from random import randint, seed
20
21 from nmigen.cli import main
22 from nmutil.iocontrol import RecordObject
23 from nmigen.utils import log2_int
24 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
25 DCacheToLoadStore1Type,
26 MMUToDCacheType,
27 DCacheToMMUType)
28
29 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
30 WBAddrType, WBDataType, WBSelType,
31 WBMasterOut, WBSlaveOut,
32 WBMasterOutVector, WBSlaveOutVector,
33 WBIOMasterOut, WBIOSlaveOut)
34
35 from soc.experiment.cache_ram import CacheRam
36 #from soc.experiment.plru import PLRU
37 from nmutil.plru import PLRU
38
39 # for test
40 from soc.bus.sram import SRAM
41 from nmigen import Memory
42 from nmigen.cli import rtlil
43
44 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
45 # Also, check out the cxxsim nmigen branch, and latest yosys from git
46 from nmutil.sim_tmp_alternative import Simulator
47
48 from nmutil.util import wrap
49
50
51 # TODO: make these parameters of DCache at some point
52 LINE_SIZE = 64 # Line size in bytes
53 NUM_LINES = 16 # Number of lines in a set
54 NUM_WAYS = 4 # Number of ways
55 TLB_SET_SIZE = 64 # L1 DTLB entries per set
56 TLB_NUM_WAYS = 2 # L1 DTLB number of ways
57 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
58 LOG_LENGTH = 0 # Non-zero to enable log data collection
59
60 # BRAM organisation: We never access more than
61 # -- WB_DATA_BITS at a time so to save
62 # -- resources we make the array only that wide, and
63 # -- use consecutive indices to make a cache "line"
64 # --
65 # -- ROW_SIZE is the width in bytes of the BRAM
66 # -- (based on WB, so 64-bits)
67 ROW_SIZE = WB_DATA_BITS // 8
68
69 # ROW_PER_LINE is the number of rows (wishbone
70 # transactions) in a line
71 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
72
73 # BRAM_ROWS is the number of rows in BRAM needed
74 # to represent the full dcache
75 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
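# Worked example (a sketch, assuming the defaults above and WB_DATA_BITS = 64,
# which the assert further down requires):
#   ROW_SIZE     = 64 // 8 = 8 bytes per BRAM row
#   ROW_PER_LINE = 64 // 8 = 8 wishbone transfers per 64-byte line
#   BRAM_ROWS    = 16 * 8  = 128 rows in the data BRAM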
76
77 print ("ROW_SIZE", ROW_SIZE)
78 print ("ROW_PER_LINE", ROW_PER_LINE)
79 print ("BRAM_ROWS", BRAM_ROWS)
80 print ("NUM_WAYS", NUM_WAYS)
81
82 # Bit fields counts in the address
83
84 # REAL_ADDR_BITS is the number of real address
85 # bits that we store
86 REAL_ADDR_BITS = 56
87
88 # ROW_BITS is the number of bits to select a row
89 ROW_BITS = log2_int(BRAM_ROWS)
90
91 # ROW_LINE_BITS is the number of bits to select
92 # a row within a line
93 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
94
95 # LINE_OFF_BITS is the number of bits for
96 # the offset in a cache line
97 LINE_OFF_BITS = log2_int(LINE_SIZE)
98
99 # ROW_OFF_BITS is the number of bits for
100 # the offset in a row
101 ROW_OFF_BITS = log2_int(ROW_SIZE)
102
103 # INDEX_BITS is the number of bits to
104 # select a cache line
105 INDEX_BITS = log2_int(NUM_LINES)
106
107 # SET_SIZE_BITS is the log base 2 of the set size
108 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
109
110 # TAG_BITS is the number of bits of
111 # the tag part of the address
112 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
113
114 # TAG_WIDTH is the width in bits of each way of the tag RAM
115 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
116
117 # WAY_BITS is the number of bits to select a way
118 WAY_BITS = log2_int(NUM_WAYS)
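# With the default geometry above (16 lines of 64 bytes, 4 ways) these work
# out to: ROW_BITS=7, ROW_LINE_BITS=3, LINE_OFF_BITS=6, ROW_OFF_BITS=3,
# INDEX_BITS=4, SET_SIZE_BITS=10, TAG_BITS=46, TAG_WIDTH=48, WAY_BITS=2.
# The diagram below illustrates a slightly larger 32-line configuration.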
119
120 # Example of layout for 32 lines of 64 bytes:
121 layout = """\
122 .. tag |index| line |
123 .. | row | |
124 .. | |---| | ROW_LINE_BITS (3)
125 .. | |--- - --| LINE_OFF_BITS (6)
126 .. | |- --| ROW_OFF_BITS (3)
127 .. |----- ---| | ROW_BITS (8)
128 .. |-----| | INDEX_BITS (5)
129 .. --------| | TAG_BITS (45)
130 """
131 print (layout)
132 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
133 (TAG_BITS, INDEX_BITS, ROW_BITS,
134 ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
135 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
136 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
137 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
138
139 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
140
141 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
142
143 def CacheTagArray():
144 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
145 for x in range(NUM_LINES))
146
147 def CacheValidBitsArray():
148 return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
149 for x in range(NUM_LINES))
150
151 def RowPerLineValidArray():
152 return Array(Signal(name="rows_valid%d" % x) \
153 for x in range(ROW_PER_LINE))
154
155 # L1 TLB
156 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
157 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
158 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
159 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
160 TLB_PTE_BITS = 64
161 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
162
163 def ispow2(x):
164 return (1<<log2_int(x, False)) == x
165
166 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
167 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
168 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
169 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
170 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
171 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
172 "geometry bits don't add up"
173 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
174 "geometry bits don't add up"
175 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
176 "geometry bits don't add up"
177 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
178 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
179
180
181 def TLBValidBitsArray():
182 return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
183 for x in range(TLB_SET_SIZE))
184
185 def TLBTagEAArray():
186 return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
187 for x in range (TLB_NUM_WAYS))
188
189 def TLBTagsArray():
190 return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
191 for x in range (TLB_SET_SIZE))
192
193 def TLBPtesArray():
194 return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
195 for x in range(TLB_SET_SIZE))
196
197 def HitWaySet():
198 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
199 for x in range(TLB_NUM_WAYS))
200
201 # Cache RAM interface
202 def CacheRamOut():
203 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
204 for x in range(NUM_WAYS))
205
206 # PLRU output interface
207 def PLRUOut():
208 return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
209 for x in range(NUM_LINES))
210
211 # TLB PLRU output interface
212 def TLBPLRUOut():
213 return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
214 for x in range(TLB_SET_SIZE))
215
216 # Helper functions to decode incoming requests
217 #
218 # Return the cache line index (tag index) for an address
219 def get_index(addr):
220 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
221
222 # Return the cache row index (data memory) for an address
223 def get_row(addr):
224 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
225
226 # Return the index of a row within a line
227 def get_row_of_line(row):
228 return row[:ROW_BITS][:ROW_LINE_BITS]
229
230 # Returns whether an address is in the last row of a line
231 def is_last_row_addr(addr, last):
232 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
233
234 # Returns whether this is the last row of a line
235 def is_last_row(row, last):
236 return get_row_of_line(row) == last
237
238 # Return the next row in the current cache line. We use a
239 # dedicated function in order to limit the size of the
240 # generated adder to be only the bits within a cache line
241 # (3 bits with default settings)
242 def next_row(row):
243 row_v = row[0:ROW_LINE_BITS] + 1
244 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
245
246 # Get the tag value from the address
247 def get_tag(addr):
248 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
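# Worked example for the helpers above (a sketch, assuming the default
# geometry): for the byte address 0x530 used in the testbench below,
#   get_index(0x530)      = (0x530 >> 6) & 0xF  = 4    (cache line index)
#   get_row(0x530)        = (0x530 >> 3) & 0x7F = 0x26 (BRAM row)
#   get_row_of_line(0x26) = 0x26 & 0x7          = 6    (row within the line)
#   get_tag(0x530)        = 0x530 >> 10         = 1
# and next_row only increments the low ROW_LINE_BITS, so it wraps within a
# line (e.g. row 0b0100111 -> 0b0100000) using just a 3-bit adder.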
249
250 # Read a tag from a tag memory row
251 def read_tag(way, tagset):
252 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
253
254 # Read a TLB tag from a TLB tag memory row
255 def read_tlb_tag(way, tags):
256 return tags.word_select(way, TLB_EA_TAG_BITS)
257
258 # Write a TLB tag to a TLB tag memory row
259 def write_tlb_tag(way, tags, tag):
260 return read_tlb_tag(way, tags).eq(tag)
261
262 # Read a PTE from a TLB PTE memory row
263 def read_tlb_pte(way, ptes):
264 return ptes.word_select(way, TLB_PTE_BITS)
265
266 def write_tlb_pte(way, ptes, newpte):
267 return read_tlb_pte(way, ptes).eq(newpte)
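# The per-set TLB tags and PTEs are stored flattened: all TLB_NUM_WAYS entries
# of one set sit side by side in a single wide Signal, and word_select() picks
# out one way. For example (assuming the defaults TLB_EA_TAG_BITS=46 and
# TLB_PTE_BITS=64), read_tlb_tag(1, tags) is bits [46:92] of the 92-bit tag
# word and read_tlb_pte(1, ptes) is bits [64:128] of the 128-bit PTE word.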
268
269
270 # Record for storing permission, attribute, etc. bits from a PTE
271 class PermAttr(RecordObject):
272 def __init__(self, name=None):
273 super().__init__(name=name)
274 self.reference = Signal()
275 self.changed = Signal()
276 self.nocache = Signal()
277 self.priv = Signal()
278 self.rd_perm = Signal()
279 self.wr_perm = Signal()
280
281
282 def extract_perm_attr(pte):
283 pa = PermAttr()
284 return pa
285
286
287 # Type of operation on a "valid" input
288 @unique
289 class Op(Enum):
290 OP_NONE = 0
291 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
292 OP_STCX_FAIL = 2 # conditional store w/o reservation
293 OP_LOAD_HIT = 3 # Cache hit on load
294 OP_LOAD_MISS = 4 # Load missing cache
295 OP_LOAD_NC = 5 # Non-cachable load
296 OP_STORE_HIT = 6 # Store hitting cache
297 OP_STORE_MISS = 7 # Store missing cache
298
299
300 # Cache state machine
301 @unique
302 class State(Enum):
303 IDLE = 0 # Normal load hit processing
304 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
305 STORE_WAIT_ACK = 2 # Store wait ack
306 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
307
308
309 # Dcache operations:
310 #
311 # In order to make timing, we use the BRAMs with
312 # an output buffer, which means that the BRAM
313 # output is delayed by an extra cycle.
314 #
315 # Thus, the dcache has a 2-stage internal pipeline
316 # for cache hits with no stalls.
317 #
318 # All other operations are handled via stalling
319 # in the first stage.
320 #
321 # The second stage can thus complete a hit at the same
322 # time as the first stage emits a stall for a complex op.
323 #
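# Rough load-hit timeline with the buffered-BRAM configuration used here
# (a sketch):
#
#   cycle 0: request accepted into r0            (stage_0)
#   cycle 1: TLB and tag compare, BRAM read issued (stage 1)
#   cycle 2: BRAM output buffer delivers data, d_out.valid is asserted
#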
324 # Stage 0 register, basically contains just the latched request
325
326 class RegStage0(RecordObject):
327 def __init__(self, name=None):
328 super().__init__(name=name)
329 self.req = LoadStore1ToDCacheType(name="lsmem")
330 self.tlbie = Signal()
331 self.doall = Signal()
332 self.tlbld = Signal()
333 self.mmu_req = Signal() # indicates source of request
334
335
336 class MemAccessRequest(RecordObject):
337 def __init__(self, name=None):
338 super().__init__(name=name)
339 self.op = Signal(Op)
340 self.valid = Signal()
341 self.dcbz = Signal()
342 self.real_addr = Signal(REAL_ADDR_BITS)
343 self.data = Signal(64)
344 self.byte_sel = Signal(8)
345 self.hit_way = Signal(WAY_BITS)
346 self.same_tag = Signal()
347 self.mmu_req = Signal()
348
349
350 # First stage register, contains state for stage 1 of load hits
351 # and for the state machine used by all other operations
352 class RegStage1(RecordObject):
353 def __init__(self, name=None):
354 super().__init__(name=name)
355 # Info about the request
356 self.full = Signal() # have uncompleted request
357 self.mmu_req = Signal() # request is from MMU
358 self.req = MemAccessRequest(name="reqmem")
359
360 # Cache hit state
361 self.hit_way = Signal(WAY_BITS)
362 self.hit_load_valid = Signal()
363 self.hit_index = Signal(INDEX_BITS)
364 self.cache_hit = Signal()
365
366 # TLB hit state
367 self.tlb_hit = Signal()
368 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
369 self.tlb_hit_index = Signal(TLB_WAY_BITS)
370
371 # 2-stage data buffer for data forwarded from writes to reads
372 self.forward_data1 = Signal(64)
373 self.forward_data2 = Signal(64)
374 self.forward_sel1 = Signal(8)
375 self.forward_valid1 = Signal()
376 self.forward_way1 = Signal(WAY_BITS)
377 self.forward_row1 = Signal(ROW_BITS)
378 self.use_forward1 = Signal()
379 self.forward_sel = Signal(8)
380
381 # Cache miss state (reload state machine)
382 self.state = Signal(State)
383 self.dcbz = Signal()
384 self.write_bram = Signal()
385 self.write_tag = Signal()
386 self.slow_valid = Signal()
387 self.real_adr = Signal(REAL_ADDR_BITS)
388 self.wb = WBMasterOut("wb")
389 self.reload_tag = Signal(TAG_BITS)
390 self.store_way = Signal(WAY_BITS)
391 self.store_row = Signal(ROW_BITS)
392 self.store_index = Signal(INDEX_BITS)
393 self.end_row_ix = Signal(ROW_LINE_BITS)
394 self.rows_valid = RowPerLineValidArray()
395 self.acks_pending = Signal(3)
396 self.inc_acks = Signal()
397 self.dec_acks = Signal()
398
399 # Signals to complete (possibly with error)
400 self.ls_valid = Signal()
401 self.ls_error = Signal()
402 self.mmu_done = Signal()
403 self.mmu_error = Signal()
404 self.cache_paradox = Signal()
405
406 # Signal to complete a failed stcx.
407 self.stcx_fail = Signal()
408
409
410 # Reservation information
411 class Reservation(RecordObject):
412 def __init__(self):
413 super().__init__()
414 self.valid = Signal()
415 self.addr = Signal(64-LINE_OFF_BITS)
416
417
418 class DTLBUpdate(Elaboratable):
419 def __init__(self):
420 self.tlbie = Signal()
421 self.tlbwe = Signal()
422 self.doall = Signal()
423 self.updated = Signal()
424 self.v_updated = Signal()
425 self.tlb_hit = Signal()
426 self.tlb_req_index = Signal(TLB_SET_BITS)
427
428 self.tlb_hit_way = Signal(TLB_WAY_BITS)
429 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
430 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
431 self.repl_way = Signal(TLB_WAY_BITS)
432 self.eatag = Signal(TLB_EA_TAG_BITS)
433 self.pte_data = Signal(TLB_PTE_BITS)
434
435 self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
436
437 self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
438 self.pb_out = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
439 self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
440
441 def elaborate(self, platform):
442 m = Module()
443 comb = m.d.comb
444 sync = m.d.sync
445
446 tagset = Signal(TLB_TAG_WAY_BITS)
447 pteset = Signal(TLB_PTE_WAY_BITS)
448
449 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
450 comb += db_out.eq(self.dv)
451
452 with m.If(self.tlbie & self.doall):
453 pass # clear all back in parent
454 with m.Elif(self.tlbie):
455 with m.If(self.tlb_hit):
456 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0) # invalidate hit way
457 comb += self.v_updated.eq(1)
458
459 with m.Elif(self.tlbwe):
460
461 comb += tagset.eq(self.tlb_tag_way)
462 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
463 comb += tb_out.eq(tagset)
464
465 comb += pteset.eq(self.tlb_pte_way)
466 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
467 comb += pb_out.eq(pteset)
468
469 comb += db_out.bit_select(self.repl_way, 1).eq(1)
470
471 comb += self.updated.eq(1)
472 comb += self.v_updated.eq(1)
473
474 return m
475
476
477 class DCachePendingHit(Elaboratable):
478
479 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
480 cache_valid_idx, cache_tag_set,
481 req_addr,
482 hit_set):
483
484 self.go = Signal()
485 self.virt_mode = Signal()
486 self.is_hit = Signal()
487 self.tlb_hit = Signal()
488 self.hit_way = Signal(WAY_BITS)
489 self.rel_match = Signal()
490 self.req_index = Signal(INDEX_BITS)
491 self.reload_tag = Signal(TAG_BITS)
492
493 self.tlb_hit_way = tlb_hit_way
494 self.tlb_pte_way = tlb_pte_way
495 self.tlb_valid_way = tlb_valid_way
496 self.cache_valid_idx = cache_valid_idx
497 self.cache_tag_set = cache_tag_set
498 self.req_addr = req_addr
499 self.hit_set = hit_set
500
501 def elaborate(self, platform):
502 m = Module()
503 comb = m.d.comb
504 sync = m.d.sync
505
506 go = self.go
507 virt_mode = self.virt_mode
508 is_hit = self.is_hit
509 tlb_pte_way = self.tlb_pte_way
510 tlb_valid_way = self.tlb_valid_way
511 cache_valid_idx = self.cache_valid_idx
512 cache_tag_set = self.cache_tag_set
513 req_addr = self.req_addr
514 tlb_hit_way = self.tlb_hit_way
515 tlb_hit = self.tlb_hit
516 hit_set = self.hit_set
517 hit_way = self.hit_way
518 rel_match = self.rel_match
519 req_index = self.req_index
520 reload_tag = self.reload_tag
521
522 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
523 for i in range(TLB_NUM_WAYS))
524 hit_way_set = HitWaySet()
525
526 # Test if pending request is a hit on any way
527 # In order to make timing in virtual mode,
528 # when we are using the TLB, we compare each
529 # way with each of the real addresses from each way of
530 # the TLB, and then decide later which match to use.
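        # With the defaults this is NUM_WAYS * TLB_NUM_WAYS = 4 * 2 = 8 tag
        # comparators evaluated in parallel; the muxes under
        # "with m.If(tlb_hit)" below then pick the set selected by tlb_hit_way.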
531
532 with m.If(virt_mode):
533 for j in range(TLB_NUM_WAYS): # tlb_num_way_t
534 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
535 s_hit = Signal()
536 s_pte = Signal(TLB_PTE_BITS)
537 s_ra = Signal(REAL_ADDR_BITS)
538 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
539 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
540 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
541 comb += s_tag.eq(get_tag(s_ra))
542
543 for i in range(NUM_WAYS): # way_t
544 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
545 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
546 (read_tag(i, cache_tag_set) == s_tag)
547 & tlb_valid_way[j])
548 with m.If(is_tag_hit):
549 comb += hit_way_set[j].eq(i)
550 comb += s_hit.eq(1)
551 comb += hit_set[j].eq(s_hit)
552 with m.If(s_tag == reload_tag):
553 comb += rel_matches[j].eq(1)
554 with m.If(tlb_hit):
555 comb += is_hit.eq(hit_set[tlb_hit_way])
556 comb += hit_way.eq(hit_way_set[tlb_hit_way])
557 comb += rel_match.eq(rel_matches[tlb_hit_way])
558 with m.Else():
559 s_tag = Signal(TAG_BITS)
560 comb += s_tag.eq(get_tag(req_addr))
561 for i in range(NUM_WAYS): # way_t
562 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
563 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
564 (read_tag(i, cache_tag_set) == s_tag))
565 with m.If(is_tag_hit):
566 comb += hit_way.eq(i)
567 comb += is_hit.eq(1)
568 with m.If(s_tag == reload_tag):
569 comb += rel_match.eq(1)
570
571 return m
572
573
574 class DCache(Elaboratable):
575 """Set associative dcache write-through
576 TODO (in no specific order):
577 * See list in icache.vhdl
578 * Complete load misses on the cycle when WB data comes instead of
579 at the end of line (this requires dealing with requests coming in
580 while not idle...)
581 """
582 def __init__(self):
583 self.d_in = LoadStore1ToDCacheType("d_in")
584 self.d_out = DCacheToLoadStore1Type("d_out")
585
586 self.m_in = MMUToDCacheType("m_in")
587 self.m_out = DCacheToMMUType("m_out")
588
589 self.stall_out = Signal()
590
591 self.wb_out = WBMasterOut()
592 self.wb_in = WBSlaveOut()
593
594 self.log_out = Signal(20)
595
596 def stage_0(self, m, r0, r1, r0_full):
597 """Latch the request in r0.req as long as we're not stalling
598 """
599 comb = m.d.comb
600 sync = m.d.sync
601 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
602
603 r = RegStage0("stage0")
604
605 # TODO, this goes in unit tests and formal proofs
606 with m.If(d_in.valid & m_in.valid):
607 sync += Display("request collision loadstore vs MMU")
608
609 with m.If(m_in.valid):
610 comb += r.req.valid.eq(1)
611 comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
612 comb += r.req.dcbz.eq(0)
613 comb += r.req.nc.eq(0)
614 comb += r.req.reserve.eq(0)
615 comb += r.req.virt_mode.eq(0)
616 comb += r.req.priv_mode.eq(1)
617 comb += r.req.addr.eq(m_in.addr)
618 comb += r.req.data.eq(m_in.pte)
619 comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
620 comb += r.tlbie.eq(m_in.tlbie)
621 comb += r.doall.eq(m_in.doall)
622 comb += r.tlbld.eq(m_in.tlbld)
623 comb += r.mmu_req.eq(1)
624 with m.Else():
625 comb += r.req.eq(d_in)
626 comb += r.tlbie.eq(0)
627 comb += r.doall.eq(0)
628 comb += r.tlbld.eq(0)
629 comb += r.mmu_req.eq(0)
630 with m.If(~(r1.full & r0_full)):
631 sync += r0.eq(r)
632 sync += r0_full.eq(r.req.valid)
633
634 def tlb_read(self, m, r0_stall, tlb_valid_way,
635 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
636 dtlb_tags, dtlb_ptes):
637 """TLB
638 Operates in the second cycle on the request latched in r0.req.
639 TLB updates write the entry at the end of the second cycle.
640 """
641 comb = m.d.comb
642 sync = m.d.sync
643 m_in, d_in = self.m_in, self.d_in
644
645 index = Signal(TLB_SET_BITS)
646 addrbits = Signal(TLB_SET_BITS)
647
648 amin = TLB_LG_PGSZ
649 amax = TLB_LG_PGSZ + TLB_SET_BITS
650
651 with m.If(m_in.valid):
652 comb += addrbits.eq(m_in.addr[amin : amax])
653 with m.Else():
654 comb += addrbits.eq(d_in.addr[amin : amax])
655 comb += index.eq(addrbits)
656
657 # If we have any op and the previous op isn't finished,
658 # then keep the same output for next cycle.
659 with m.If(~r0_stall):
660 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
661 sync += tlb_tag_way.eq(dtlb_tags[index])
662 sync += tlb_pte_way.eq(dtlb_ptes[index])
663
664 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
665 """Generate TLB PLRUs
666 """
667 comb = m.d.comb
668 sync = m.d.sync
669
670 if TLB_NUM_WAYS == 0:
671 return
672 for i in range(TLB_SET_SIZE):
673 # TLB PLRU interface
674 tlb_plru = PLRU(TLB_WAY_BITS)
675 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
676 tlb_plru_acc_en = Signal()
677
678 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
679 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
680 comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
681 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
682
683 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
684 tlb_valid_way, tlb_tag_way, tlb_hit_way,
685 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
686
687 comb = m.d.comb
688
689 hitway = Signal(TLB_WAY_BITS)
690 hit = Signal()
691 eatag = Signal(TLB_EA_TAG_BITS)
692
693 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
694 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
695 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
696
697 for i in range(TLB_NUM_WAYS):
698 is_tag_hit = Signal()
699 comb += is_tag_hit.eq(tlb_valid_way[i]
700 & (read_tlb_tag(i, tlb_tag_way) == eatag))
701 with m.If(is_tag_hit):
702 comb += hitway.eq(i)
703 comb += hit.eq(1)
704
705 comb += tlb_hit.eq(hit & r0_valid)
706 comb += tlb_hit_way.eq(hitway)
707
708 with m.If(tlb_hit):
709 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
710 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
711
712 with m.If(r0.req.virt_mode):
713 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
714 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
715 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
716 comb += perm_attr.reference.eq(pte[8])
717 comb += perm_attr.changed.eq(pte[7])
718 comb += perm_attr.nocache.eq(pte[5])
719 comb += perm_attr.priv.eq(pte[3])
720 comb += perm_attr.rd_perm.eq(pte[2])
721 comb += perm_attr.wr_perm.eq(pte[1])
722 with m.Else():
723 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
724 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
725 comb += perm_attr.reference.eq(1)
726 comb += perm_attr.changed.eq(1)
727 comb += perm_attr.nocache.eq(0)
728 comb += perm_attr.priv.eq(1)
729 comb += perm_attr.rd_perm.eq(1)
730 comb += perm_attr.wr_perm.eq(1)
731
732 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
733 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
734 dtlb_tags, tlb_pte_way, dtlb_ptes):
735
736 dtlb_valids = TLBValidBitsArray()
737
738 comb = m.d.comb
739 sync = m.d.sync
740
741 tlbie = Signal()
742 tlbwe = Signal()
743
744 comb += tlbie.eq(r0_valid & r0.tlbie)
745 comb += tlbwe.eq(r0_valid & r0.tlbld)
746
747 m.submodules.tlb_update = d = DTLBUpdate()
748 with m.If(tlbie & r0.doall):
749 # clear all valid bits at once
750 for i in range(TLB_SET_SIZE):
751 sync += dtlb_valid_bits[i].eq(0)
752 with m.If(d.updated):
753 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
754 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
755 with m.If(d.v_updated):
756 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
757
758 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
759
760 comb += d.tlbie.eq(tlbie)
761 comb += d.tlbwe.eq(tlbwe)
762 comb += d.doall.eq(r0.doall)
763 comb += d.tlb_hit.eq(tlb_hit)
764 comb += d.tlb_hit_way.eq(tlb_hit_way)
765 comb += d.tlb_tag_way.eq(tlb_tag_way)
766 comb += d.tlb_pte_way.eq(tlb_pte_way)
767 comb += d.tlb_req_index.eq(tlb_req_index)
768
769 with m.If(tlb_hit):
770 comb += d.repl_way.eq(tlb_hit_way)
771 with m.Else():
772 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
773 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
774 comb += d.pte_data.eq(r0.req.data)
775
776 def maybe_plrus(self, m, r1, plru_victim):
777 """Generate PLRUs
778 """
779 comb = m.d.comb
780 sync = m.d.sync
781
782 if NUM_WAYS == 0:
783 return
784
785 for i in range(NUM_LINES):
786 # PLRU interface
787 plru = PLRU(WAY_BITS)
788 setattr(m.submodules, "plru%d" % i, plru)
789 plru_acc_en = Signal()
790
791 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
792 comb += plru.acc_en.eq(plru_acc_en)
793 comb += plru.acc_i.eq(r1.hit_way)
794 comb += plru_victim[i].eq(plru.lru_o)
795
796 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
797 """Cache tag RAM read port
798 """
799 comb = m.d.comb
800 sync = m.d.sync
801 m_in, d_in = self.m_in, self.d_in
802
803 index = Signal(INDEX_BITS)
804
805 with m.If(r0_stall):
806 comb += index.eq(req_index)
807 with m.Elif(m_in.valid):
808 comb += index.eq(get_index(m_in.addr))
809 with m.Else():
810 comb += index.eq(get_index(d_in.addr))
811 sync += cache_tag_set.eq(cache_tags[index])
812
813 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
814 r0_valid, r1, cache_valids, replace_way,
815 use_forward1_next, use_forward2_next,
816 req_hit_way, plru_victim, rc_ok, perm_attr,
817 valid_ra, perm_ok, access_ok, req_op, req_go,
818 tlb_pte_way,
819 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
820 cancel_store, req_same_tag, r0_stall, early_req_row):
821 """Cache request parsing and hit detection
822 """
823
824 comb = m.d.comb
825 sync = m.d.sync
826 m_in, d_in = self.m_in, self.d_in
827
828 is_hit = Signal()
829 hit_way = Signal(WAY_BITS)
830 op = Signal(Op)
831 opsel = Signal(3)
832 go = Signal()
833 nc = Signal()
834 hit_set = Array(Signal(name="hit_set_%d" % i) \
835 for i in range(TLB_NUM_WAYS))
836 cache_valid_idx = Signal(NUM_WAYS)
837
838 # Extract line, row and tag from request
839 comb += req_index.eq(get_index(r0.req.addr))
840 comb += req_row.eq(get_row(r0.req.addr))
841 comb += req_tag.eq(get_tag(ra))
842
843 if False: # display on comb is a bit... busy.
844 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
845 r0.req.addr, ra, req_index, req_tag, req_row)
846
847 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
848 comb += cache_valid_idx.eq(cache_valids[req_index])
849
850 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
851 tlb_valid_way, tlb_hit_way,
852 cache_valid_idx, cache_tag_set,
853 r0.req.addr,
854 hit_set)
855
856 comb += dc.tlb_hit.eq(tlb_hit)
857 comb += dc.reload_tag.eq(r1.reload_tag)
858 comb += dc.virt_mode.eq(r0.req.virt_mode)
859 comb += dc.go.eq(go)
860 comb += dc.req_index.eq(req_index)
861 comb += is_hit.eq(dc.is_hit)
862 comb += hit_way.eq(dc.hit_way)
863 comb += req_same_tag.eq(dc.rel_match)
864
865 # See if the request matches the line currently being reloaded
866 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
867 (req_index == r1.store_index) & req_same_tag):
868 # For a store, consider this a hit even if the row isn't
869 # valid since it will be by the time we perform the store.
870 # For a load, check the appropriate row valid bit.
871 rrow = Signal(ROW_LINE_BITS)
872 comb += rrow.eq(req_row)
873 valid = r1.rows_valid[rrow]
874 comb += is_hit.eq((~r0.req.load) | valid)
875 comb += hit_way.eq(replace_way)
876
877 # Whether to use forwarded data for a load or not
878 with m.If((get_row(r1.req.real_addr) == req_row) &
879 (r1.req.hit_way == hit_way)):
880 # Only need to consider r1.write_bram here, since if we
881 # are writing refill data here, then we don't have a
882 # cache hit this cycle on the line being refilled.
883 # (There is the possibility that the load following the
884 # load miss that started the refill could be to the old
885 # contents of the victim line, since it is a couple of
886 # cycles after the refill starts before we see the updated
887 # cache tag. In that case we don't use the bypass.)
888 comb += use_forward1_next.eq(r1.write_bram)
889 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
890 comb += use_forward2_next.eq(r1.forward_valid1)
891
892 # The way that matched on a hit
893 comb += req_hit_way.eq(hit_way)
894
895 # The way to replace on a miss
896 with m.If(r1.write_tag):
897 comb += replace_way.eq(plru_victim[r1.store_index])
898 with m.Else():
899 comb += replace_way.eq(r1.store_way)
900
901 # work out whether we have permission for this access
902 # NB we don't yet implement AMR, thus no KUAP
903 comb += rc_ok.eq(perm_attr.reference
904 & (r0.req.load | perm_attr.changed))
905 comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
906 (perm_attr.wr_perm |
907 (r0.req.load & perm_attr.rd_perm)))
908 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
909 # Combine the request and cache hit status to decide what
910 # operation needs to be done
911 comb += nc.eq(r0.req.nc | perm_attr.nocache)
912 comb += op.eq(Op.OP_NONE)
913 with m.If(go):
914 with m.If(~access_ok):
915 comb += op.eq(Op.OP_BAD)
916 with m.Elif(cancel_store):
917 comb += op.eq(Op.OP_STCX_FAIL)
918 with m.Else():
919 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
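                # opsel bit 0 = is_hit, bit 1 = nc, bit 2 = load, so e.g.
                # 0b101 is a cacheable load hit, 0b100 a load miss, 0b110 a
                # non-cacheable load, 0b001/0b000 a store hit/miss, 0b010 a
                # non-cacheable store (sent to memory as a store miss), and
                # the 0bx11 cases (an NC access that hits) are paradoxes,
                # hence OP_BAD.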
920 with m.Switch(opsel):
921 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
922 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
923 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
924 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
925 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
926 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
927 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
928 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
929 comb += req_op.eq(op)
930 comb += req_go.eq(go)
931
932 # Version of the row number that is valid one cycle earlier
933 # in the cases where we need to read the cache data BRAM.
934 # If we're stalling then we need to keep reading the last
935 # row requested.
936 with m.If(~r0_stall):
937 with m.If(m_in.valid):
938 comb += early_req_row.eq(get_row(m_in.addr))
939 with m.Else():
940 comb += early_req_row.eq(get_row(d_in.addr))
941 with m.Else():
942 comb += early_req_row.eq(req_row)
943
944 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
945 r0_valid, r0, reservation):
946 """Handle load-with-reservation and store-conditional instructions
947 """
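        # Sketch of the flow: a load with r0.req.reserve (larx) records the
        # line address in `reservation`; a store with reserve (stcx) clears it
        # and, if no valid reservation exists for the same line, raises
        # cancel_store so the request becomes OP_STCX_FAIL and
        # d_out.store_done reports the failure instead of writing memory.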
948 comb = m.d.comb
949
950 with m.If(r0_valid & r0.req.reserve):
951 # XXX generate alignment interrupt if address
952 # is not aligned XXX or if r0.req.nc = '1'
953 with m.If(r0.req.load):
954 comb += set_rsrv.eq(1) # load with reservation
955 with m.Else():
956 comb += clear_rsrv.eq(1) # store conditional
957 with m.If((~reservation.valid) |
958 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
959 comb += cancel_store.eq(1)
960
961 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
962 reservation, r0):
963
964 comb = m.d.comb
965 sync = m.d.sync
966
967 with m.If(r0_valid & access_ok):
968 with m.If(clear_rsrv):
969 sync += reservation.valid.eq(0)
970 with m.Elif(set_rsrv):
971 sync += reservation.valid.eq(1)
972 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
973
974 def writeback_control(self, m, r1, cache_out_row):
975 """Return data for loads & completion control logic
976 """
977 comb = m.d.comb
978 sync = m.d.sync
979 d_out, m_out = self.d_out, self.m_out
980
981 data_out = Signal(64)
982 data_fwd = Signal(64)
983
984 # Use the bypass if are reading the row that was
985 # written 1 or 2 cycles ago, including for the
986 # slow_valid = 1 case (i.e. completing a load
987 # miss or a non-cacheable load).
988 with m.If(r1.use_forward1):
989 comb += data_fwd.eq(r1.forward_data1)
990 with m.Else():
991 comb += data_fwd.eq(r1.forward_data2)
992
993 comb += data_out.eq(cache_out_row)
994
995 for i in range(8):
996 with m.If(r1.forward_sel[i]):
997 dsel = data_fwd.word_select(i, 8)
998 comb += data_out.word_select(i, 8).eq(dsel)
999
1000 comb += d_out.valid.eq(r1.ls_valid)
1001 comb += d_out.data.eq(data_out)
1002 comb += d_out.store_done.eq(~r1.stcx_fail)
1003 comb += d_out.error.eq(r1.ls_error)
1004 comb += d_out.cache_paradox.eq(r1.cache_paradox)
1005
1006 # Outputs to MMU
1007 comb += m_out.done.eq(r1.mmu_done)
1008 comb += m_out.err.eq(r1.mmu_error)
1009 comb += m_out.data.eq(data_out)
1010
1011 # We have a valid load or store hit or we just completed
1012 # a slow op such as a load miss, a NC load or a store
1013 #
1014 # Note: the load hit is delayed by one cycle. However it
1015 # can still not collide with r.slow_valid (well unless I
1016 # miscalculated) because slow_valid can only be set on a
1017 # subsequent request and not on its first cycle (the state
1018 # machine must have advanced), which makes slow_valid
1019 # at least 2 cycles from the previous hit_load_valid.
1020
1021 # Sanity: Only one of these must be set in any given cycle
1022
1023 if False: # TODO: need Display to get this to work
1024 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1025 "unexpected slow_valid collision with stcx_fail"
1026
1027 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1028 "unexpected hit_load_delayed collision with slow_valid"
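        # One possible way to express these pairwise-exclusion checks when the
        # TODO above is addressed (a sketch only, assuming a formal/simulation
        # flow that honours nmigen.asserts.Assert):
        #
        #     from nmigen.asserts import Assert
        #     comb += Assert(~(r1.slow_valid & r1.stcx_fail))
        #     comb += Assert(~(r1.slow_valid & r1.hit_load_valid))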
1029
1030 with m.If(~r1.mmu_req):
1031 # Request came from loadstore1...
1032 # Load hit case is the standard path
1033 with m.If(r1.hit_load_valid):
1034 sync += Display("completing load hit data=%x", data_out)
1035
1036 # error cases complete without stalling
1037 with m.If(r1.ls_error):
1038 sync += Display("completing ld/st with error")
1039
1040 # Slow ops (load miss, NC, stores)
1041 with m.If(r1.slow_valid):
1042 sync += Display("completing store or load miss data=%x",
1043 data_out)
1044
1045 with m.Else():
1046 # Request came from MMU
1047 with m.If(r1.hit_load_valid):
1048 sync += Display("completing load hit to MMU, data=%x",
1049 m_out.data)
1050 # error cases complete without stalling
1051 with m.If(r1.mmu_error):
1052 sync += Display("completing MMU ld with error")
1053
1054 # Slow ops (i.e. load miss)
1055 with m.If(r1.slow_valid):
1056 sync += Display("completing MMU load miss, data=%x",
1057 m_out.data)
1058
1059 def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1060 """rams
1061 Generate a cache RAM for each way. This handles the normal
1062 reads, writes from reloads and the special store-hit update
1063 path as well.
1064
1065 Note: the BRAMs have an extra read buffer, meaning the output
1066 is pipelined an extra cycle. This differs from the
1067 icache. The writeback logic needs to take that into
1068 account by using 1-cycle delayed signals for load hits.
1069 """
1070 comb = m.d.comb
1071 wb_in = self.wb_in
1072
1073 for i in range(NUM_WAYS):
1074 do_read = Signal(name="do_rd%d" % i)
1075 rd_addr = Signal(ROW_BITS)
1076 do_write = Signal(name="do_wr%d" % i)
1077 wr_addr = Signal(ROW_BITS)
1078 wr_data = Signal(WB_DATA_BITS)
1079 wr_sel = Signal(ROW_SIZE)
1080 wr_sel_m = Signal(ROW_SIZE)
1081 _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1082
1083 way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
1084 setattr(m.submodules, "cacheram_%d" % i, way)
1085
1086 comb += way.rd_en.eq(do_read)
1087 comb += way.rd_addr.eq(rd_addr)
1088 comb += _d_out.eq(way.rd_data_o)
1089 comb += way.wr_sel.eq(wr_sel_m)
1090 comb += way.wr_addr.eq(wr_addr)
1091 comb += way.wr_data.eq(wr_data)
1092
1093 # Cache hit reads
1094 comb += do_read.eq(1)
1095 comb += rd_addr.eq(early_req_row)
1096 with m.If(r1.hit_way == i):
1097 comb += cache_out_row.eq(_d_out)
1098
1099 # Write mux:
1100 #
1101 # Defaults to wishbone read responses (cache refill)
1102 #
1103 # For timing, the mux on wr_data/sel/addr is not
1104 # dependent on anything other than the current state.
1105
1106 with m.If(r1.write_bram):
1107 # Write store data to BRAM. This happens one
1108 # cycle after the store is in r0.
1109 comb += wr_data.eq(r1.req.data)
1110 comb += wr_sel.eq(r1.req.byte_sel)
1111 comb += wr_addr.eq(get_row(r1.req.real_addr))
1112
1113 with m.If(i == r1.req.hit_way):
1114 comb += do_write.eq(1)
1115 with m.Else():
1116 # Otherwise, we might be doing a reload or a DCBZ
1117 with m.If(r1.dcbz):
1118 comb += wr_data.eq(0)
1119 with m.Else():
1120 comb += wr_data.eq(wb_in.dat)
1121 comb += wr_addr.eq(r1.store_row)
1122 comb += wr_sel.eq(~0) # all 1s
1123
1124 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1125 & wb_in.ack & (replace_way == i)):
1126 comb += do_write.eq(1)
1127
1128 # Mask write selects with do_write since BRAM
1129 # doesn't have a global write-enable
1130 with m.If(do_write):
1131 comb += wr_sel_m.eq(wr_sel)
1132
1133 # Cache hit synchronous machine for the easy case.
1134 # This handles load hits.
1135 # It also handles error cases (TLB miss, cache paradox)
1136 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1137 req_hit_way, req_index, req_tag, access_ok,
1138 tlb_hit, tlb_hit_way, tlb_req_index):
1139
1140 comb = m.d.comb
1141 sync = m.d.sync
1142
1143 with m.If(req_op != Op.OP_NONE):
1144 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1145 req_op, r0.req.addr, r0.req.nc,
1146 req_index, req_tag, req_hit_way)
1147
1148 with m.If(r0_valid):
1149 sync += r1.mmu_req.eq(r0.mmu_req)
1150
1151 # Fast path for load/store hits.
1152 # Set signals for the writeback controls.
1153 sync += r1.hit_way.eq(req_hit_way)
1154 sync += r1.hit_index.eq(req_index)
1155
1156 with m.If(req_op == Op.OP_LOAD_HIT):
1157 sync += r1.hit_load_valid.eq(1)
1158 with m.Else():
1159 sync += r1.hit_load_valid.eq(0)
1160
1161 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1162 sync += r1.cache_hit.eq(1)
1163 with m.Else():
1164 sync += r1.cache_hit.eq(0)
1165
1166 with m.If(req_op == Op.OP_BAD):
1167 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1168 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1169 sync += r1.ls_error.eq(~r0.mmu_req)
1170 sync += r1.mmu_error.eq(r0.mmu_req)
1171 sync += r1.cache_paradox.eq(access_ok)
1172
1173 with m.Else():
1174 sync += r1.ls_error.eq(0)
1175 sync += r1.mmu_error.eq(0)
1176 sync += r1.cache_paradox.eq(0)
1177
1178 with m.If(req_op == Op.OP_STCX_FAIL):
1179 sync += r1.stcx_fail.eq(1)
1180 with m.Else():
1181 sync += r1.stcx_fail.eq(0)
1182
1183 # Record TLB hit information for updating TLB PLRU
1184 sync += r1.tlb_hit.eq(tlb_hit)
1185 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1186 sync += r1.tlb_hit_index.eq(tlb_req_index)
1187
1188 # Memory accesses are handled by this state machine:
1189 #
1190 # * Cache load miss/reload (in conjunction with "rams")
1191 # * Load hits for non-cachable forms
1192 # * Stores (the collision case is handled in "rams")
1193 #
1194 # All wishbone requests generation is done here.
1195 # This machine operates at stage 1.
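    # State transitions, summarised (derived from the Switch below):
    #   IDLE -> RELOAD_WAIT_ACK   on a load miss or dcbz
    #   IDLE -> STORE_WAIT_ACK    on a (non-dcbz) store hit/miss
    #   IDLE -> NC_LOAD_WAIT_ACK  on a non-cacheable load
    # and each waiting state returns to IDLE once the final wishbone ack
    # for that operation has been received.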
1196 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1197 cache_valids, r0, replace_way,
1198 req_hit_way, req_same_tag,
1199 r0_valid, req_op, cache_tags, req_go, ra):
1200
1201 comb = m.d.comb
1202 sync = m.d.sync
1203 wb_in = self.wb_in
1204
1205 req = MemAccessRequest("mreq_ds")
1206 acks = Signal(3)
1207 adjust_acks = Signal(3)
1208
1209 req_row = Signal(ROW_BITS)
1210 req_idx = Signal(INDEX_BITS)
1211 req_tag = Signal(TAG_BITS)
1212 comb += req_idx.eq(get_index(req.real_addr))
1213 comb += req_row.eq(get_row(req.real_addr))
1214 comb += req_tag.eq(get_tag(req.real_addr))
1215
1216 sync += r1.use_forward1.eq(use_forward1_next)
1217 sync += r1.forward_sel.eq(0)
1218
1219 with m.If(use_forward1_next):
1220 sync += r1.forward_sel.eq(r1.req.byte_sel)
1221 with m.Elif(use_forward2_next):
1222 sync += r1.forward_sel.eq(r1.forward_sel1)
1223
1224 sync += r1.forward_data2.eq(r1.forward_data1)
1225 with m.If(r1.write_bram):
1226 sync += r1.forward_data1.eq(r1.req.data)
1227 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1228 sync += r1.forward_way1.eq(r1.req.hit_way)
1229 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1230 sync += r1.forward_valid1.eq(1)
1231 with m.Else():
1232 with m.If(r1.dcbz):
1233 sync += r1.forward_data1.eq(0)
1234 with m.Else():
1235 sync += r1.forward_data1.eq(wb_in.dat)
1236 sync += r1.forward_sel1.eq(~0) # all 1s
1237 sync += r1.forward_way1.eq(replace_way)
1238 sync += r1.forward_row1.eq(r1.store_row)
1239 sync += r1.forward_valid1.eq(0)
1240
1241 # One cycle pulses reset
1242 sync += r1.slow_valid.eq(0)
1243 sync += r1.write_bram.eq(0)
1244 sync += r1.inc_acks.eq(0)
1245 sync += r1.dec_acks.eq(0)
1246
1247 sync += r1.ls_valid.eq(0)
1248 # complete tlbies and TLB loads in the third cycle
1249 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1250
1251 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1252 with m.If(~r0.mmu_req):
1253 sync += r1.ls_valid.eq(1)
1254 with m.Else():
1255 sync += r1.mmu_done.eq(1)
1256
1257 with m.If(r1.write_tag):
1258 # Store new tag in selected way
1259 for i in range(NUM_WAYS):
1260 with m.If(i == replace_way):
1261 ct = Signal(TAG_RAM_WIDTH)
1262 comb += ct.eq(cache_tags[r1.store_index])
1263 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1264 sync += cache_tags[r1.store_index].eq(ct)
1265 sync += r1.store_way.eq(replace_way)
1266 sync += r1.write_tag.eq(0)
1267
1268 # Take request from r1.req if there is one there,
1269 # else from req_op, ra, etc.
1270 with m.If(r1.full):
1271 comb += req.eq(r1.req)
1272 with m.Else():
1273 comb += req.op.eq(req_op)
1274 comb += req.valid.eq(req_go)
1275 comb += req.mmu_req.eq(r0.mmu_req)
1276 comb += req.dcbz.eq(r0.req.dcbz)
1277 comb += req.real_addr.eq(ra)
1278
1279 with m.If(~r0.req.dcbz):
1280 comb += req.data.eq(r0.req.data)
1281 with m.Else():
1282 comb += req.data.eq(0)
1283
1284 # Select all bytes for dcbz
1285 # and for cacheable loads
1286 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1287 comb += req.byte_sel.eq(~0) # all 1s
1288 with m.Else():
1289 comb += req.byte_sel.eq(r0.req.byte_sel)
1290 comb += req.hit_way.eq(req_hit_way)
1291 comb += req.same_tag.eq(req_same_tag)
1292
1293 # Store the incoming request from r0,
1294 # if it is a slow request
1295 # Note that r1.full = 1 implies req_op = OP_NONE
1296 with m.If((req_op == Op.OP_LOAD_MISS)
1297 | (req_op == Op.OP_LOAD_NC)
1298 | (req_op == Op.OP_STORE_MISS)
1299 | (req_op == Op.OP_STORE_HIT)):
1300 sync += r1.req.eq(req)
1301 sync += r1.full.eq(1)
1302
1303 # Main state machine
1304 with m.Switch(r1.state):
1305
1306 with m.Case(State.IDLE):
1307 sync += r1.real_adr.eq(req.real_addr)
1308 sync += r1.wb.sel.eq(req.byte_sel)
1309 sync += r1.wb.dat.eq(req.data)
1310 sync += r1.dcbz.eq(req.dcbz)
1311
1312 # Keep track of our index and way
1313 # for subsequent stores.
1314 sync += r1.store_index.eq(req_idx)
1315 sync += r1.store_row.eq(req_row)
1316 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1317 sync += r1.reload_tag.eq(req_tag)
1318 sync += r1.req.same_tag.eq(1)
1319
1320 with m.If(req.op == Op.OP_STORE_HIT):
1321 sync += r1.store_way.eq(req.hit_way)
1322
1323 # Reset per-row valid bits,
1324 # ready for handling OP_LOAD_MISS
1325 for i in range(ROW_PER_LINE):
1326 sync += r1.rows_valid[i].eq(0)
1327
1328 with m.If(req_op != Op.OP_NONE):
1329 sync += Display("cache op %d", req.op)
1330
1331 with m.Switch(req.op):
1332 with m.Case(Op.OP_LOAD_HIT):
1333 # stay in IDLE state
1334 pass
1335
1336 with m.Case(Op.OP_LOAD_MISS):
1337 sync += Display("cache miss real addr: %x " \
1338 "idx: %x tag: %x",
1339 req.real_addr, req_row, req_tag)
1340
1341 # Start the wishbone cycle
1342 sync += r1.wb.we.eq(0)
1343 sync += r1.wb.cyc.eq(1)
1344 sync += r1.wb.stb.eq(1)
1345
1346 # Track that we had one request sent
1347 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1348 sync += r1.write_tag.eq(1)
1349
1350 with m.Case(Op.OP_LOAD_NC):
1351 sync += r1.wb.cyc.eq(1)
1352 sync += r1.wb.stb.eq(1)
1353 sync += r1.wb.we.eq(0)
1354 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1355
1356 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1357 with m.If(~req.dcbz):
1358 sync += r1.state.eq(State.STORE_WAIT_ACK)
1359 sync += r1.acks_pending.eq(1)
1360 sync += r1.full.eq(0)
1361 sync += r1.slow_valid.eq(1)
1362
1363 with m.If(~req.mmu_req):
1364 sync += r1.ls_valid.eq(1)
1365 with m.Else():
1366 sync += r1.mmu_done.eq(1)
1367
1368 with m.If(req.op == Op.OP_STORE_HIT):
1369 sync += r1.write_bram.eq(1)
1370 with m.Else():
1371 # dcbz is handled much like a load miss except
1372 # that we are writing to memory instead of reading
1373 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1374
1375 with m.If(req.op == Op.OP_STORE_MISS):
1376 sync += r1.write_tag.eq(1)
1377
1378 sync += r1.wb.we.eq(1)
1379 sync += r1.wb.cyc.eq(1)
1380 sync += r1.wb.stb.eq(1)
1381
1382 # OP_NONE and OP_BAD do nothing
1383 # OP_BAD & OP_STCX_FAIL were
1384 # handled above already
1385 with m.Case(Op.OP_NONE):
1386 pass
1387 with m.Case(Op.OP_BAD):
1388 pass
1389 with m.Case(Op.OP_STCX_FAIL):
1390 pass
1391
1392 with m.Case(State.RELOAD_WAIT_ACK):
1393 ld_stbs_done = Signal()
1394 # Requests are all sent if stb is 0
1395 comb += ld_stbs_done.eq(~r1.wb.stb)
1396
1397 with m.If((~wb_in.stall) & r1.wb.stb):
1398 # That was the last word? We are done sending.
1399 # Clear stb and set ld_stbs_done so we can handle an
1400 # eventual last ack on the same cycle.
1401 with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
1402 sync += r1.wb.stb.eq(0)
1403 comb += ld_stbs_done.eq(1)
1404
1405 # Calculate the next row address in the current cache line
1406 row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1407 comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
1408 sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
1409
1410 # Incoming acks processing
1411 sync += r1.forward_valid1.eq(wb_in.ack)
1412 with m.If(wb_in.ack):
1413 srow = Signal(ROW_LINE_BITS)
1414 comb += srow.eq(r1.store_row)
1415 sync += r1.rows_valid[srow].eq(1)
1416
1417 # If this is the data we were looking for,
1418 # we can complete the request next cycle.
1419 # Compare the whole address in case the
1420 # request in r1.req is not the one that
1421 # started this refill.
1422 with m.If(r1.full & r1.req.same_tag &
1423 ((r1.dcbz & r1.req.dcbz) |
1424 ((~r1.dcbz) & (r1.req.op == Op.OP_LOAD_MISS))) &
1425 (r1.store_row == get_row(r1.req.real_addr))):
1426 sync += r1.full.eq(0)
1427 sync += r1.slow_valid.eq(1)
1428 with m.If(~r1.mmu_req):
1429 sync += r1.ls_valid.eq(1)
1430 with m.Else():
1431 sync += r1.mmu_done.eq(1)
1432 sync += r1.forward_sel.eq(~0) # all 1s
1433 sync += r1.use_forward1.eq(1)
1434
1435 # Check for completion
1436 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1437 r1.end_row_ix)):
1438 # Complete wishbone cycle
1439 sync += r1.wb.cyc.eq(0)
1440
1441 # Cache line is now valid
1442 cv = Signal(NUM_WAYS) # one valid bit per way
1443 comb += cv.eq(cache_valids[r1.store_index])
1444 comb += cv.bit_select(r1.store_way, 1).eq(1)
1445 sync += cache_valids[r1.store_index].eq(cv)
1446
1447 sync += r1.state.eq(State.IDLE)
1448
1449 # Increment store row counter
1450 sync += r1.store_row.eq(next_row(r1.store_row))
1451
1452 with m.Case(State.STORE_WAIT_ACK):
1453 st_stbs_done = Signal()
1454 comb += st_stbs_done.eq(~r1.wb.stb)
1455 comb += acks.eq(r1.acks_pending)
1456
1457 with m.If(r1.inc_acks != r1.dec_acks):
1458 with m.If(r1.inc_acks):
1459 comb += adjust_acks.eq(acks + 1)
1460 with m.Else():
1461 comb += adjust_acks.eq(acks - 1)
1462 with m.Else():
1463 comb += adjust_acks.eq(acks)
1464
1465 sync += r1.acks_pending.eq(adjust_acks)
1466
1467 # Clear stb when slave accepted request
1468 with m.If(~wb_in.stall):
1469 # See if there is another store waiting
1470 # to be done which is in the same real page.
1471 with m.If(req.valid):
1472 ra = req.real_addr[0:SET_SIZE_BITS]
1473 sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
1474 sync += r1.wb.dat.eq(req.data)
1475 sync += r1.wb.sel.eq(req.byte_sel)
1476
1477 with m.Elif((adjust_acks < 7) & req.same_tag &
1478 ((req.op == Op.OP_STORE_MISS)
1479 | (req.op == Op.OP_STORE_HIT))):
1480 sync += r1.wb.stb.eq(1)
1481 comb += st_stbs_done.eq(0)
1482
1483 with m.If(req.op == Op.OP_STORE_HIT):
1484 sync += r1.write_bram.eq(1)
1485 sync += r1.full.eq(0)
1486 sync += r1.slow_valid.eq(1)
1487
1488 # Store requests never come from the MMU
1489 sync += r1.ls_valid.eq(1)
1490 comb += st_stbs_done.eq(0)
1491 sync += r1.inc_acks.eq(1)
1492 with m.Else():
1493 sync += r1.wb.stb.eq(0)
1494 comb += st_stbs_done.eq(1)
1495
1496 # Got ack ? See if complete.
1497 with m.If(wb_in.ack):
1498 with m.If(st_stbs_done & (adjust_acks == 1)):
1499 sync += r1.state.eq(State.IDLE)
1500 sync += r1.wb.cyc.eq(0)
1501 sync += r1.wb.stb.eq(0)
1502 sync += r1.dec_acks.eq(1)
1503
1504 with m.Case(State.NC_LOAD_WAIT_ACK):
1505 # Clear stb when slave accepted request
1506 with m.If(~wb_in.stall):
1507 sync += r1.wb.stb.eq(0)
1508
1509 # Got ack ? complete.
1510 with m.If(wb_in.ack):
1511 sync += r1.state.eq(State.IDLE)
1512 sync += r1.full.eq(0)
1513 sync += r1.slow_valid.eq(1)
1514
1515 with m.If(~r1.mmu_req):
1516 sync += r1.ls_valid.eq(1)
1517 with m.Else():
1518 sync += r1.mmu_done.eq(1)
1519
1520 sync += r1.forward_sel.eq(~0) # all 1s
1521 sync += r1.use_forward1.eq(1)
1522 sync += r1.wb.cyc.eq(0)
1523 sync += r1.wb.stb.eq(0)
1524
1525 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):
1526
1527 sync = m.d.sync
1528 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1529
1530 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1531 stall_out, req_op[:3], d_out.valid, d_out.error,
1532 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1533 r1.real_adr[3:6]))
1534
1535 def elaborate(self, platform):
1536
1537 m = Module()
1538 comb = m.d.comb
1539
1540 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1541 cache_tags = CacheTagArray()
1542 cache_tag_set = Signal(TAG_RAM_WIDTH)
1543 cache_valids = CacheValidBitsArray()
1544
1545 # TODO attribute ram_style : string;
1546 # TODO attribute ram_style of cache_tags : signal is "distributed";
1547
1548 """note: these are passed to nmigen.hdl.Memory as "attributes".
1549 don't know how, just that they are.
1550 """
1551 dtlb_valid_bits = TLBValidBitsArray()
1552 dtlb_tags = TLBTagsArray()
1553 dtlb_ptes = TLBPtesArray()
1554 # TODO attribute ram_style of
1555 # dtlb_tags : signal is "distributed";
1556 # TODO attribute ram_style of
1557 # dtlb_ptes : signal is "distributed";
1558
1559 r0 = RegStage0("r0")
1560 r0_full = Signal()
1561
1562 r1 = RegStage1("r1")
1563
1564 reservation = Reservation()
1565
1566 # Async signals on incoming request
1567 req_index = Signal(INDEX_BITS)
1568 req_row = Signal(ROW_BITS)
1569 req_hit_way = Signal(WAY_BITS)
1570 req_tag = Signal(TAG_BITS)
1571 req_op = Signal(Op)
1572 req_data = Signal(64)
1573 req_same_tag = Signal()
1574 req_go = Signal()
1575
1576 early_req_row = Signal(ROW_BITS)
1577
1578 cancel_store = Signal()
1579 set_rsrv = Signal()
1580 clear_rsrv = Signal()
1581
1582 r0_valid = Signal()
1583 r0_stall = Signal()
1584
1585 use_forward1_next = Signal()
1586 use_forward2_next = Signal()
1587
1588 cache_out_row = Signal(WB_DATA_BITS)
1589
1590 plru_victim = PLRUOut()
1591 replace_way = Signal(WAY_BITS)
1592
1593 # Wishbone read/write/cache write formatting signals
1594 bus_sel = Signal(8)
1595
1596 # TLB signals
1597 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1598 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1599 tlb_valid_way = Signal(TLB_NUM_WAYS)
1600 tlb_req_index = Signal(TLB_SET_BITS)
1601 tlb_hit = Signal()
1602 tlb_hit_way = Signal(TLB_WAY_BITS)
1603 pte = Signal(TLB_PTE_BITS)
1604 ra = Signal(REAL_ADDR_BITS)
1605 valid_ra = Signal()
1606 perm_attr = PermAttr("dc_perms")
1607 rc_ok = Signal()
1608 perm_ok = Signal()
1609 access_ok = Signal()
1610
1611 tlb_plru_victim = TLBPLRUOut()
1612
1613 # we don't yet handle collisions between loadstore1 requests
1614 # and MMU requests
1615 comb += self.m_out.stall.eq(0)
1616
1617 # Hold off the request in r0 when r1 has an uncompleted request
1618 comb += r0_stall.eq(r0_full & r1.full)
1619 comb += r0_valid.eq(r0_full & ~r1.full)
1620 comb += self.stall_out.eq(r0_stall)
1621
1622 # Wire up wishbone request latch out of stage 1
1623 comb += r1.wb.adr.eq(r1.real_adr)
1624 comb += self.wb_out.eq(r1.wb)
1625 comb += self.wb_out.adr.eq(r1.wb.adr[3:]) # truncate LSBs
1626
1627 # deal with litex not doing wishbone pipeline mode
1628 comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
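        # (As noted in the module docstring: a pipelined WB4 slave would drive
        # stall itself; synthesising it as cyc & ~ack restricts the interface
        # to one outstanding transfer at a time, i.e. classic-cycle behaviour.)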
1629
1630 # call sub-functions putting everything together, using shared
1631 # signals established above
1632 self.stage_0(m, r0, r1, r0_full)
1633 self.tlb_read(m, r0_stall, tlb_valid_way,
1634 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1635 dtlb_tags, dtlb_ptes)
1636 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1637 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1638 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1639 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1640 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1641 dtlb_tags, tlb_pte_way, dtlb_ptes)
1642 self.maybe_plrus(m, r1, plru_victim)
1643 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1644 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1645 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1646 r0_valid, r1, cache_valids, replace_way,
1647 use_forward1_next, use_forward2_next,
1648 req_hit_way, plru_victim, rc_ok, perm_attr,
1649 valid_ra, perm_ok, access_ok, req_op, req_go,
1650 tlb_pte_way,
1651 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1652 cancel_store, req_same_tag, r0_stall, early_req_row)
1653 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1654 r0_valid, r0, reservation)
1655 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1656 reservation, r0)
1657 self.writeback_control(m, r1, cache_out_row)
1658 self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1659 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1660 req_hit_way, req_index, req_tag, access_ok,
1661 tlb_hit, tlb_hit_way, tlb_req_index)
1662 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1663 cache_valids, r0, replace_way,
1664 req_hit_way, req_same_tag,
1665 r0_valid, req_op, cache_tags, req_go, ra)
1666 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out, req_op)
1667
1668 return m
1669
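# Simulation helpers: each drives d_in for one clock with valid=1, then drops
# valid and polls d_out.valid every cycle until the dcache completes the
# request; addresses are byte addresses and byte_sel=~0 selects all 8 bytes.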
1670 def dcache_load(dut, addr, nc=0):
1671 yield dut.d_in.load.eq(1)
1672 yield dut.d_in.nc.eq(nc)
1673 yield dut.d_in.addr.eq(addr)
1674 yield dut.d_in.byte_sel.eq(~0)
1675 yield dut.d_in.valid.eq(1)
1676 yield
1677 yield dut.d_in.valid.eq(0)
1678 yield dut.d_in.byte_sel.eq(0)
1679 while not (yield dut.d_out.valid):
1680 yield
1681 data = yield dut.d_out.data
1682 return data
1683
1684
1685 def dcache_store(dut, addr, data, nc=0):
1686 yield dut.d_in.load.eq(0)
1687 yield dut.d_in.nc.eq(nc)
1688 yield dut.d_in.data.eq(data)
1689 yield dut.d_in.byte_sel.eq(~0)
1690 yield dut.d_in.addr.eq(addr)
1691 yield dut.d_in.valid.eq(1)
1692 yield
1693 yield dut.d_in.valid.eq(0)
1694 yield dut.d_in.byte_sel.eq(0)
1695 while not (yield dut.d_out.valid):
1696 yield
1697
1698
1699 def dcache_random_sim(dut, mem):
1700
1701 # start copy of mem
1702 sim_mem = deepcopy(mem)
1703 memsize = len(sim_mem)
1704 print ("mem len", memsize)
1705
1706 # clear stuff
1707 yield dut.d_in.valid.eq(0)
1708 yield dut.d_in.load.eq(0)
1709 yield dut.d_in.priv_mode.eq(1)
1710 yield dut.d_in.nc.eq(0)
1711 yield dut.d_in.addr.eq(0)
1712 yield dut.d_in.data.eq(0)
1713 yield dut.m_in.valid.eq(0)
1714 yield dut.m_in.addr.eq(0)
1715 yield dut.m_in.pte.eq(0)
1716 # wait 4 * clk_period
1717 yield
1718 yield
1719 yield
1720 yield
1721
1722 print ()
1723
1724 #for i in range(1024):
1725 # sim_mem[i] = i
1726
1727 for i in range(1024):
1728 addr = randint(0, memsize-1)
1729 data = randint(0, (1<<64)-1)
1730 sim_mem[addr] = data
1731 row = addr
1732 addr *= 8
1733
1734 print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))
1735
1736 yield from dcache_load(dut, addr)
1737 yield from dcache_store(dut, addr, data)
1738
1739 addr = randint(0, memsize-1)
1740 sim_data = sim_mem[addr]
1741 row = addr
1742 addr *= 8
1743
1744 print (" load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
1745 data = yield from dcache_load(dut, addr)
1746 assert data == sim_data, \
1747 "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)
1748
1749 for addr in range(memsize):
1750 data = yield from dcache_load(dut, addr*8)
1751 assert data == sim_mem[addr], \
1752 "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
1753
1754
1755 def dcache_sim(dut, mem):
1756 # clear stuff
1757 yield dut.d_in.valid.eq(0)
1758 yield dut.d_in.load.eq(0)
1759 yield dut.d_in.priv_mode.eq(1)
1760 yield dut.d_in.nc.eq(0)
1761 yield dut.d_in.addr.eq(0)
1762 yield dut.d_in.data.eq(0)
1763 yield dut.m_in.valid.eq(0)
1764 yield dut.m_in.addr.eq(0)
1765 yield dut.m_in.pte.eq(0)
1766 # wait 4 * clk_period
1767 yield
1768 yield
1769 yield
1770 yield
1771
1772 # Cacheable read of address 0x58
1773 data = yield from dcache_load(dut, 0x58)
1774 addr = yield dut.d_in.addr
1775 assert data == 0x0000001700000016, \
1776 "data @%x=%x expected 0x0000001700000016" % (addr, data)
1777
1778 # Cacheable read of address 0x20
1779 data = yield from dcache_load(dut, 0x20)
1780 addr = yield dut.d_in.addr
1781 assert data == 0x0000000900000008, \
1782 "data @%x=%x expected 0x0000000900000008" % (addr, data)
1783
1784 # Cacheable read of address 0x530
1785 data = yield from dcache_load(dut, 0x530)
1786 addr = yield dut.d_in.addr
1787 assert data == 0x0000014D0000014C, \
1788 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1789
1790 # 2nd Cacheable read of address 0x530
1791 data = yield from dcache_load(dut, 0x530)
1792 addr = yield dut.d_in.addr
1793 assert data == 0x0000014D0000014C, \
1794 "data @%x=%x expected 0x0000014D0000014C" % (addr, data)
1795
1796 # Non-cacheable read of address 0x100
1797 data = yield from dcache_load(dut, 0x100, nc=1)
1798 addr = yield dut.d_in.addr
1799 assert data == 0x0000004100000040, \
1800 "data @%x=%x expected 0x0000004100000040" % (addr, data)
1801
1802 # Store at address 0x530
1803 yield from dcache_store(dut, 0x530, 0x121)
1804
1805 # Second store at address 0x530, overwriting the first
1806 yield from dcache_store(dut, 0x530, 0x12345678)
1807
1808 # 3rd Cacheable read of address 0x530
1809 data = yield from dcache_load(dut, 0x530)
1810 addr = yield dut.d_in.addr
1811 assert data == 0x12345678, \
1812 "data @%x=%x expected 0x12345678" % (addr, data)
1813
1814 # 4th Cacheable read of address 0x20
1815 data = yield from dcache_load(dut, 0x20)
1816 addr = yield dut.d_in.addr
1817 assert data == 0x0000000900000008, \
1818 "data @%x=%x expected 0x0000000900000008" % (addr, data)
1819
1820 yield
1821 yield
1822 yield
1823 yield
1824
1825
1826 def test_dcache(mem, test_fn, test_name):
1827 dut = DCache()
1828
1829 memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
1830 sram = SRAM(memory=memory, granularity=8)
1831
1832 m = Module()
1833 m.submodules.dcache = dut
1834 m.submodules.sram = sram
1835
1836 m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
1837 m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
1838 m.d.comb += sram.bus.we.eq(dut.wb_out.we)
1839 m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
1840 m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
1841 m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)
1842
1843 m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
1844 m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)
1845
1846 # nmigen Simulation
1847 sim = Simulator(m)
1848 sim.add_clock(1e-6)
1849
1850 sim.add_sync_process(wrap(test_fn(dut, mem)))
1851 with sim.write_vcd('test_dcache%s.vcd' % test_name):
1852 sim.run()
1853
1854 if __name__ == '__main__':
1855 seed(0)
1856 dut = DCache()
1857 vl = rtlil.convert(dut, ports=[])
1858 with open("test_dcache.il", "w") as f:
1859 f.write(vl)
1860
1861 mem = []
1862 for i in range(1024):
1863 mem.append((i*2)| ((i*2+1)<<32))
1864
1865 test_dcache(mem, dcache_sim, "")
1866
1867 mem = []
1868 memsize = 16384
1869 for i in range(memsize):
1870 mem.append(i)
1871
1872 test_dcache(mem, dcache_random_sim, "random")
1873