src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 """
   6
   7 from enum import Enum, unique
   8
   9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  10 from nmutil.util import Display
  11
  12 from random import randint
  13
  14 from nmigen.cli import main
  15 from nmutil.iocontrol import RecordObject
  16 from nmigen.utils import log2_int
  17 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  18                                      DCacheToLoadStore1Type,
  19                                      MMUToDCacheType,
  20                                      DCacheToMMUType)
  21
  22 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  23                                 WBAddrType, WBDataType, WBSelType,
  24                                 WBMasterOut, WBSlaveOut,
  25                                 WBMasterOutVector, WBSlaveOutVector,
  26                                 WBIOMasterOut, WBIOSlaveOut)
  27
  28 from soc.experiment.cache_ram import CacheRam
  29 #from soc.experiment.plru import PLRU
  30 from nmutil.plru import PLRU
  31
  32 # for test
  33 from nmigen_soc.wishbone.sram import SRAM
  34 from nmigen import Memory
  35 from nmigen.cli import rtlil
  36 if True:
  37     from nmigen.back.pysim import Simulator, Delay, Settle
  38 else:
  39     from nmigen.sim.cxxsim import Simulator, Delay, Settle
  40 from nmutil.util import wrap
  41
  42
# Top-level cache/TLB configuration. Every derived geometry constant
# further down is computed from these values.
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection
  51
  52 # BRAM organisation: We never access more than
  53 #     -- WB_DATA_BITS at a time so to save
  54 #     -- resources we make the array only that wide, and
  55 #     -- use consecutive indices for to make a cache "line"
  56 #     --
  57 #     -- ROW_SIZE is the width in bytes of the BRAM
  58 #     -- (based on WB, so 64-bits)
  59 ROW_SIZE = WB_DATA_BITS // 8;
  60
  61 # ROW_PER_LINE is the number of row (wishbone
  62 # transactions) in a line
  63 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  64
  65 # BRAM_ROWS is the number of rows in BRAM needed
  66 # to represent the full dcache
  67 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  68
  69 print ("ROW_SIZE", ROW_SIZE)
  70 print ("ROW_PER_LINE", ROW_PER_LINE)
  71 print ("BRAM_ROWS", BRAM_ROWS)
  72 print ("NUM_WAYS", NUM_WAYS)
  73
  74 # Bit fields counts in the address
  75
  76 # REAL_ADDR_BITS is the number of real address
  77 # bits that we store
  78 REAL_ADDR_BITS = 56
  79
  80 # ROW_BITS is the number of bits to select a row
  81 ROW_BITS = log2_int(BRAM_ROWS)
  82
  83 # ROW_LINE_BITS is the number of bits to select
  84 # a row within a line
  85 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
  86
  87 # LINE_OFF_BITS is the number of bits for
  88 # the offset in a cache line
  89 LINE_OFF_BITS = log2_int(LINE_SIZE)
  90
  91 # ROW_OFF_BITS is the number of bits for
  92 # the offset in a row
  93 ROW_OFF_BITS = log2_int(ROW_SIZE)
  94
  95 # INDEX_BITS is the number if bits to
  96 # select a cache line
  97 INDEX_BITS = log2_int(NUM_LINES)
  98
  99 # SET_SIZE_BITS is the log base 2 of the set size
 100 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 101
 102 # TAG_BITS is the number of bits of
 103 # the tag part of the address
 104 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 105
 106 # TAG_WIDTH is the width in bits of each way of the tag RAM
 107 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 108
 109 # WAY_BITS is the number of bits to select a way
 110 WAY_BITS = log2_int(NUM_WAYS)
 111
 112 # Example of layout for 32 lines of 64 bytes:
 113 layout = """\
 114   ..  tag    |index|  line  |
 115   ..         |   row   |    |
 116   ..         |     |---|    | ROW_LINE_BITS  (3)
 117   ..         |     |--- - --| LINE_OFF_BITS (6)
 118   ..         |         |- --| ROW_OFF_BITS  (3)
 119   ..         |----- ---|    | ROW_BITS      (8)
 120   ..         |-----|        | INDEX_BITS    (5)
 121   .. --------|              | TAG_BITS      (45)
 122 """
 123 print (layout)
 124 print ("Dcache TAG %d IDX %d ROW %d ROFF %d LOFF %d RLB %d" % \
 125             (TAG_BITS, INDEX_BITS, ROW_BITS,
 126              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 127 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 128 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 129 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 130
 131 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 132
 133 def CacheTagArray():
 134     return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
 135                         for x in range(NUM_LINES))
 136
 137 def CacheValidBitsArray():
 138     return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
 139                         for x in range(NUM_LINES))
 140
 141 def RowPerLineValidArray():
 142     return Array(Signal(name="rows_valid%d" % x) \
 143                         for x in range(ROW_PER_LINE))
 144
 145 # L1 TLB
 146 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 147 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 148 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 149 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 150 TLB_PTE_BITS     = 64
 151 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 152
 153 def ispow2(x):
 154     return (1<<log2_int(x, False)) == x
 155
 156 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 157 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 158 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 159 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 160 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 161 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 162         "geometry bits don't add up"
 163 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 164         "geometry bits don't add up"
 165 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 166          "geometry bits don't add up"
 167 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 168 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 169
 170
 171 def TLBValidBitsArray():
 172     return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
 173
 174 def TLBTagEAArray():
 175     return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
 176
 177 def TLBTagsArray():
 178     return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
 179
 180 def TLBPtesArray():
 181     return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
 182
 183 def HitWaySet():
 184     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 185                         for x in range(TLB_NUM_WAYS))
 186
 187 # Cache RAM interface
 188 def CacheRamOut():
 189     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 190                  for x in range(NUM_WAYS))
 191
 192 # PLRU output interface
 193 def PLRUOut():
 194     return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
 195
 196 # TLB PLRU output interface
 197 def TLBPLRUOut():
 198     return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
 199
 200 # Helper functions to decode incoming requests
 201 #
 202 # Return the cache line index (tag index) for an address
 203 def get_index(addr):
 204     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 205
 206 # Return the cache row index (data memory) for an address
 207 def get_row(addr):
 208     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 209
 210 # Return the index of a row within a line
 211 def get_row_of_line(row):
 212     return row[:ROW_BITS][:ROW_LINE_BITS]
 213
 214 # Returns whether this is the last row of a line
 215 def is_last_row_addr(addr, last):
 216     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 217
 218 # Returns whether this is the last row of a line
 219 def is_last_row(row, last):
 220     return get_row_of_line(row) == last
 221
 222 # Return the next row in the current cache line. We use a
 223 # dedicated function in order to limit the size of the
 224 # generated adder to be only the bits within a cache line
 225 # (3 bits with default settings)
 226 def next_row(row):
 227     row_v = row[0:ROW_LINE_BITS] + 1
 228     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 229
 230 # Get the tag value from the address
 231 def get_tag(addr):
 232     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 233
 234 # Read a tag from a tag memory row
 235 def read_tag(way, tagset):
 236     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 237
 238 # Read a TLB tag from a TLB tag memory row
 239 def read_tlb_tag(way, tags):
 240     return tags.word_select(way, TLB_EA_TAG_BITS)
 241
 242 # Write a TLB tag to a TLB tag memory row
 243 def write_tlb_tag(way, tags, tag):
 244     return read_tlb_tag(way, tags).eq(tag)
 245
 246 # Read a PTE from a TLB PTE memory row
 247 def read_tlb_pte(way, ptes):
 248     return ptes.word_select(way, TLB_PTE_BITS)
 249
 250 def write_tlb_pte(way, ptes, newpte):
 251     return read_tlb_pte(way, ptes).eq(newpte)
 252
 253
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission/attribute flags associated with a translation.

    All six fields are single-bit Signals. DCache.tlb_search drives them
    either from individual PTE bits (virtual mode) or from fixed
    defaults (real mode).
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # the field order below defines the record layout
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()
 264
 265
 266 def extract_perm_attr(pte):
 267     pa = PermAttr()
 268     return pa;
 269
 270
 271 # Type of operation on a "valid" input
 272 @unique
 273 class Op(Enum):
 274     OP_NONE       = 0
 275     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 276     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 277     OP_LOAD_HIT   = 3 # Cache hit on load
 278     OP_LOAD_MISS  = 4 # Load missing cache
 279     OP_LOAD_NC    = 5 # Non-cachable load
 280     OP_STORE_HIT  = 6 # Store hitting cache
 281     OP_STORE_MISS = 7 # Store missing cache
 282
 283
 284 # Cache state machine
 285 @unique
 286 class State(Enum):
 287     IDLE             = 0 # Normal load hit processing
 288     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 289     STORE_WAIT_ACK   = 2 # Store wait ack
 290     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 291
 292
 293 # Dcache operations:
 294 #
 295 # In order to make timing, we use the BRAMs with
 296 # an output buffer, which means that the BRAM
 297 # output is delayed by an extra cycle.
 298 #
 299 # Thus, the dcache has a 2-stage internal pipeline
 300 # for cache hits with no stalls.
 301 #
 302 # All other operations are handled via stalling
 303 # in the first stage.
 304 #
 305 # The second stage can thus complete a hit at the same
 306 # time as the first stage emits a stall for a complex op.
 307 #
 308 # Stage 0 register, basically contains just the latched request
 309
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus its MMU control flags.

    DCache.stage_0 fills it either from the MMU port or from the
    load/store port.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # the request itself, in load/store request format
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        # copies of the MMU port's tlbie / doall / tlbld flags
        # (zero for load/store requests)
        self.tlbie   = Signal()
        self.doall   = Signal()
        self.tlbld   = Signal()
        self.mmu_req = Signal() # indicates source of request
 318
 319
class MemAccessRequest(RecordObject):
    """A decoded memory access request.

    RegStage1 holds one of these as its req field.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)             # operation type (see Op)
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS) # real address
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)              # one select bit per data byte
        self.hit_way   = Signal(WAY_BITS)       # cache way number
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 332
 333
 334 # First stage register, contains state for stage 1 of load hits
 335 # and for the state machine used by all other operations
 336 class RegStage1(RecordObject):
 337     def __init__(self, name=None):
 338         super().__init__(name=name)
 339         # Info about the request
 340         self.full             = Signal() # have uncompleted request
 341         self.mmu_req          = Signal() # request is from MMU
 342         self.req              = MemAccessRequest(name="reqmem")
 343
 344         # Cache hit state
 345         self.hit_way          = Signal(WAY_BITS)
 346         self.hit_load_valid   = Signal()
 347         self.hit_index        = Signal(INDEX_BITS)
 348         self.cache_hit        = Signal()
 349
 350         # TLB hit state
 351         self.tlb_hit          = Signal()
 352         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 353         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 354
 355         # 2-stage data buffer for data forwarded from writes to reads
 356         self.forward_data1    = Signal(64)
 357         self.forward_data2    = Signal(64)
 358         self.forward_sel1     = Signal(8)
 359         self.forward_valid1   = Signal()
 360         self.forward_way1     = Signal(WAY_BITS)
 361         self.forward_row1     = Signal(ROW_BITS)
 362         self.use_forward1     = Signal()
 363         self.forward_sel      = Signal(8)
 364
 365         # Cache miss state (reload state machine)
 366         self.state            = Signal(State)
 367         self.dcbz             = Signal()
 368         self.write_bram       = Signal()
 369         self.write_tag        = Signal()
 370         self.slow_valid       = Signal()
 371         self.real_adr         = Signal(REAL_ADDR_BITS)
 372         self.wb               = WBMasterOut("wb")
 373         self.reload_tag       = Signal(TAG_BITS)
 374         self.store_way        = Signal(WAY_BITS)
 375         self.store_row        = Signal(ROW_BITS)
 376         self.store_index      = Signal(INDEX_BITS)
 377         self.end_row_ix       = Signal(ROW_LINE_BITS)
 378         self.rows_valid       = RowPerLineValidArray()
 379         self.acks_pending     = Signal(3)
 380         self.inc_acks         = Signal()
 381         self.dec_acks         = Signal()
 382
 383         # Signals to complete (possibly with error)
 384         self.ls_valid         = Signal()
 385         self.ls_error         = Signal()
 386         self.mmu_done         = Signal()
 387         self.mmu_error        = Signal()
 388         self.cache_paradox    = Signal()
 389
 390         # Signal to complete a failed stcx.
 391         self.stcx_fail        = Signal()
 392
 393
# Reservation information
class Reservation(RecordObject):
    """A reservation: a valid flag plus an address at line granularity."""
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        # 64-bit address with the LINE_OFF_BITS offset bits removed
        self.addr  = Signal(64-LINE_OFF_BITS)
 400
 401
 402 class DTLBUpdate(Elaboratable):
 403     def __init__(self):
 404         self.tlbie    = Signal()
 405         self.tlbwe    = Signal()
 406         self.doall    = Signal()
 407         self.updated  = Signal()
 408         self.v_updated  = Signal()
 409         self.tlb_hit    = Signal()
 410         self.tlb_req_index = Signal(TLB_SET_BITS)
 411
 412         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
 413         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 414         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 415         self.repl_way        = Signal(TLB_WAY_BITS)
 416         self.eatag           = Signal(TLB_EA_TAG_BITS)
 417         self.pte_data        = Signal(TLB_PTE_BITS)
 418
 419         self.dv = Signal(TLB_PTE_WAY_BITS)
 420
 421         self.tb_out = Signal(TLB_TAG_WAY_BITS)
 422         self.pb_out = Signal(TLB_NUM_WAYS)
 423         self.db_out = Signal(TLB_PTE_WAY_BITS)
 424
 425     def elaborate(self, platform):
 426         m = Module()
 427         comb = m.d.comb
 428         sync = m.d.sync
 429
 430         tagset   = Signal(TLB_TAG_WAY_BITS)
 431         pteset   = Signal(TLB_PTE_WAY_BITS)
 432
 433         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 434
 435         with m.If(self.tlbie & self.doall):
 436             pass # clear all back in parent
 437         with m.Elif(self.tlbie):
 438             with m.If(self.tlb_hit):
 439                 comb += db_out.eq(self.dv)
 440                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
 441                 comb += self.v_updated.eq(1)
 442
 443         with m.Elif(self.tlbwe):
 444
 445             comb += tagset.eq(self.tlb_tag_way)
 446             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 447             comb += tb_out.eq(tagset)
 448
 449             comb += pteset.eq(self.tlb_pte_way)
 450             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 451             comb += pb_out.eq(pteset)
 452
 453             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 454
 455             comb += self.updated.eq(1)
 456             comb += self.v_updated.eq(1)
 457
 458         return m
 459
 460
class DCachePendingHit(Elaboratable):
    """Cache hit detection for the pending request.

    Compares the tag(s) of the request against every way of the
    selected cache tag row (cache_tag_set) and reports:
    * is_hit / hit_way: whether, and in which way, the request hits
    * rel_match: whether the request's tag equals reload_tag
    * hit_set[j]: per-TLB-way hit flags (virtual mode only)
    """

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                      cache_valid_idx, cache_tag_set,
                    req_addr,
                    hit_set):

        # signals owned by this module, driven/read by the parent
        self.go          = Signal()
        self.virt_mode   = Signal()
        self.is_hit      = Signal()
        self.tlb_hit     = Signal()
        self.hit_way     = Signal(WAY_BITS)
        self.rel_match   = Signal()
        self.req_index   = Signal(INDEX_BITS)
        self.reload_tag  = Signal(TAG_BITS)

        # signals shared with the parent (passed in, not created here)
        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        # per-TLB-way intermediate results; the final outputs are picked
        # from these using tlb_hit_way
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS):
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal()
                s_pte       = Signal(TLB_PTE_BITS)
                s_ra        = Signal(REAL_ADDR_BITS)
                # candidate real address for TLB way j: page offset from
                # the request, upper bits from that way's PTE
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS):
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    # hit needs: request active, cache way valid, tags
                    # equal, and TLB way j valid
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & tlb_valid_way[j])
                    # if several ways match, the assignment made for the
                    # highest i is the last one and takes priority
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            # select the results belonging to the TLB way that hit
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            # real mode: the tag comes straight from the request address
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS):
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
 556
 557
 558 class DCache(Elaboratable):
 559     """Set associative dcache write-through
 560     TODO (in no specific order):
 561     * See list in icache.vhdl
 562     * Complete load misses on the cycle when WB data comes instead of
 563       at the end of line (this requires dealing with requests coming in
 564       while not idle...)
 565     """
    def __init__(self):
        # request/response ports, load/store side
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        # request/response ports, MMU side
        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # wishbone master output / slave input records
        self.wb_out    = WBMasterOut()
        self.wb_in     = WBSlaveOut()

        # 20-bit log output
        self.log_out   = Signal(20)
 579
 580     def stage_0(self, m, r0, r1, r0_full):
 581         """Latch the request in r0.req as long as we're not stalling
 582         """
 583         comb = m.d.comb
 584         sync = m.d.sync
 585         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 586
 587         r = RegStage0("stage0")
 588
 589         # TODO, this goes in unit tests and formal proofs
 590         with m.If(d_in.valid & m_in.valid):
 591             sync += Display("request collision loadstore vs MMU")
 592
 593         with m.If(m_in.valid):
 594             sync += r.req.valid.eq(1)
 595             sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
 596             sync += r.req.dcbz.eq(0)
 597             sync += r.req.nc.eq(0)
 598             sync += r.req.reserve.eq(0)
 599             sync += r.req.virt_mode.eq(0)
 600             sync += r.req.priv_mode.eq(1)
 601             sync += r.req.addr.eq(m_in.addr)
 602             sync += r.req.data.eq(m_in.pte)
 603             sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 604             sync += r.tlbie.eq(m_in.tlbie)
 605             sync += r.doall.eq(m_in.doall)
 606             sync += r.tlbld.eq(m_in.tlbld)
 607             sync += r.mmu_req.eq(1)
 608         with m.Else():
 609             sync += r.req.eq(d_in)
 610             sync += r.tlbie.eq(0)
 611             sync += r.doall.eq(0)
 612             sync += r.tlbld.eq(0)
 613             sync += r.mmu_req.eq(0)
 614             with m.If(~(r1.full & r0_full)):
 615                 sync += r0.eq(r)
 616                 sync += r0_full.eq(r.req.valid)
 617
 618     def tlb_read(self, m, r0_stall, tlb_valid_way,
 619                  tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
 620                  dtlb_tags, dtlb_ptes):
 621         """TLB
 622         Operates in the second cycle on the request latched in r0.req.
 623         TLB updates write the entry at the end of the second cycle.
 624         """
 625         comb = m.d.comb
 626         sync = m.d.sync
 627         m_in, d_in = self.m_in, self.d_in
 628
 629         index    = Signal(TLB_SET_BITS)
 630         addrbits = Signal(TLB_SET_BITS)
 631
 632         amin = TLB_LG_PGSZ
 633         amax = TLB_LG_PGSZ + TLB_SET_BITS
 634
 635         with m.If(m_in.valid):
 636             comb += addrbits.eq(m_in.addr[amin : amax])
 637         with m.Else():
 638             comb += addrbits.eq(d_in.addr[amin : amax])
 639         comb += index.eq(addrbits)
 640
 641         # If we have any op and the previous op isn't finished,
 642         # then keep the same output for next cycle.
 643         with m.If(~r0_stall):
 644             sync += tlb_valid_way.eq(dtlb_valid_bits[index])
 645             sync += tlb_tag_way.eq(dtlb_tags[index])
 646             sync += tlb_pte_way.eq(dtlb_ptes[index])
 647
 648     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 649         """Generate TLB PLRUs
 650         """
 651         comb = m.d.comb
 652         sync = m.d.sync
 653
 654         if TLB_NUM_WAYS == 0:
 655             return
 656         for i in range(TLB_SET_SIZE):
 657             # TLB PLRU interface
 658             tlb_plru        = PLRU(TLB_WAY_BITS)
 659             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 660             tlb_plru_acc_en = Signal()
 661
 662             comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
 663             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 664             comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
 665             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 666
 667     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 668                    tlb_valid_way, tlb_tag_way, tlb_hit_way,
 669                    tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
 670
 671         comb = m.d.comb
 672         sync = m.d.sync
 673
 674         hitway = Signal(TLB_WAY_BITS)
 675         hit    = Signal()
 676         eatag  = Signal(TLB_EA_TAG_BITS)
 677
 678         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 679         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 680         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 681
 682         for i in range(TLB_NUM_WAYS):
 683             is_tag_hit = Signal()
 684             comb += is_tag_hit.eq(tlb_valid_way[i]
 685                                   & (read_tlb_tag(i, tlb_tag_way) == eatag))
 686             with m.If(is_tag_hit):
 687                 comb += hitway.eq(i)
 688                 comb += hit.eq(1)
 689
 690         comb += tlb_hit.eq(hit & r0_valid)
 691         comb += tlb_hit_way.eq(hitway)
 692
 693         with m.If(tlb_hit):
 694             comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
 695         with m.Else():
 696             comb += pte.eq(0)
 697         comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
 698         with m.If(r0.req.virt_mode):
 699             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 700                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 701                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 702             comb += perm_attr.reference.eq(pte[8])
 703             comb += perm_attr.changed.eq(pte[7])
 704             comb += perm_attr.nocache.eq(pte[5])
 705             comb += perm_attr.priv.eq(pte[3])
 706             comb += perm_attr.rd_perm.eq(pte[2])
 707             comb += perm_attr.wr_perm.eq(pte[1])
 708         with m.Else():
 709             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 710                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 711
 712             comb += perm_attr.reference.eq(1)
 713             comb += perm_attr.changed.eq(1)
 714             comb += perm_attr.nocache.eq(0)
 715             comb += perm_attr.priv.eq(1)
 716             comb += perm_attr.rd_perm.eq(1)
 717             comb += perm_attr.wr_perm.eq(1)
 718
 719     def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
 720                     tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
 721                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 722
 723         comb = m.d.comb
 724         sync = m.d.sync
 725
 726         tlbie    = Signal()
 727         tlbwe    = Signal()
 728
 729         comb += tlbie.eq(r0_valid & r0.tlbie)
 730         comb += tlbwe.eq(r0_valid & r0.tlbld)
 731
 732         m.submodules.tlb_update = d = DTLBUpdate()
 733         with m.If(tlbie & r0.doall):
 734             # clear all valid bits at once
 735             for i in range(TLB_SET_SIZE):
 736                 sync += dtlb_valid_bits[i].eq(0)
 737         with m.If(d.updated):
 738             sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
 739             sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
 740         with m.If(d.v_updated):
 741             sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
 742
 743         comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
 744
 745         comb += d.tlbie.eq(tlbie)
 746         comb += d.tlbwe.eq(tlbwe)
 747         comb += d.doall.eq(r0.doall)
 748         comb += d.tlb_hit.eq(tlb_hit)
 749         comb += d.tlb_hit_way.eq(tlb_hit_way)
 750         comb += d.tlb_tag_way.eq(tlb_tag_way)
 751         comb += d.tlb_pte_way.eq(tlb_pte_way)
 752         comb += d.tlb_req_index.eq(tlb_req_index)
 753
 754         with m.If(tlb_hit):
 755             comb += d.repl_way.eq(tlb_hit_way)
 756         with m.Else():
 757             comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
 758         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 759         comb += d.pte_data.eq(r0.req.data)
 760
 761     def maybe_plrus(self, m, r1, plru_victim):
 762         """Generate PLRUs
 763         """
 764         comb = m.d.comb
 765         sync = m.d.sync
 766
 767         if TLB_NUM_WAYS == 0:
 768             return
 769
 770         for i in range(NUM_LINES):
 771             # PLRU interface
 772             plru        = PLRU(WAY_BITS)
 773             setattr(m.submodules, "plru%d" % i, plru)
 774             plru_acc_en = Signal()
 775
 776             comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
 777             comb += plru.acc_en.eq(plru_acc_en)
 778             comb += plru.acc_i.eq(r1.hit_way)
 779             comb += plru_victim[i].eq(plru.lru_o)
 780
 781     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 782         """Cache tag RAM read port
 783         """
 784         comb = m.d.comb
 785         sync = m.d.sync
 786         m_in, d_in = self.m_in, self.d_in
 787
 788         index = Signal(INDEX_BITS)
 789
 790         with m.If(r0_stall):
 791             comb += index.eq(req_index)
 792         with m.Elif(m_in.valid):
 793             comb += index.eq(get_index(m_in.addr))
 794         with m.Else():
 795             comb += index.eq(get_index(d_in.addr))
 796         sync += cache_tag_set.eq(cache_tags[index])
 797
    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valids, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection

        Purely combinatorial.  From the request held in r0 and the
        translated address ra this method drives:

        * req_index / req_row (from r0.req.addr) and req_tag (from ra)
        * req_hit_way and req_same_tag, taken from a DCachePendingHit
          submodule ("dcache_pend"), with an override while the matching
          line is being reloaded
        * use_forward1_next / use_forward2_next (store-data bypass select)
        * replace_way (victim way on a miss)
        * rc_ok / perm_ok / access_ok (permission checks from perm_attr)
        * req_op / req_go (decoded operation and "request is live" flag)
        * early_req_row (row used for the data RAM read address)

        All other arguments are inputs that are only read here.
        """

        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit      = Signal()
        hit_way     = Signal(WAY_BITS)
        op          = Signal(Op)
        opsel       = Signal(3)
        go          = Signal()
        nc          = Signal()
        hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        cache_valid_idx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        # (index and row come from the request address, the tag from ra)
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                    r0.req.addr, ra, req_index, req_tag, req_row)

        # A request is acted upon only if r0 is valid, it is not a TLB
        # maintenance operation (tlbie/tlbld), and r1 is not flagging a
        # load/store error.
        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        # valid bits (one per way) of the line selected by req_index
        comb += cache_valid_idx.eq(cache_valids[req_index])

        # Hit detection is delegated to a DCachePendingHit submodule
        # (defined elsewhere in this file).
        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                tlb_valid_way, tlb_hit_way,
                                cache_valid_idx, cache_tag_set,
                                r0.req.addr,
                                hit_set)

        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)
        # Defaults for is_hit/hit_way; the reload check below may
        # override them (a later comb assignment takes precedence).
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            # assigning to the narrower rrow keeps only the low
            # ROW_LINE_BITS bits of req_row
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        # (the PLRU victim while the tag is being written, afterwards the
        # way recorded in r1.store_way)
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed)
                )
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
                           (perm_attr.wr_perm |
                              (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                # Cat() is LSB-first: bit 0 = is_hit, bit 1 = nc,
                # bit 2 = load, which is how the patterns below read.
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    # a hit on a non-cacheable access is flagged as bad
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)
 929
 930     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 931                          r0_valid, r0, reservation):
 932         """Handle load-with-reservation and store-conditional instructions
 933         """
 934         comb = m.d.comb
 935         sync = m.d.sync
 936
 937         with m.If(r0_valid & r0.req.reserve):
 938             # XXX generate alignment interrupt if address
 939             # is not aligned XXX or if r0.req.nc = '1'
 940             with m.If(r0.req.load):
 941                 comb += set_rsrv.eq(1) # load with reservation
 942             with m.Else():
 943                 comb += clear_rsrv.eq(1) # store conditional
 944                 with m.If(~reservation.valid |
 945                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
 946                     comb += cancel_store.eq(1)
 947
 948     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
 949                         reservation, r0):
 950
 951         comb = m.d.comb
 952         sync = m.d.sync
 953
 954         with m.If(r0_valid & access_ok):
 955             with m.If(clear_rsrv):
 956                 sync += reservation.valid.eq(0)
 957             with m.Elif(set_rsrv):
 958                 sync += reservation.valid.eq(1)
 959                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 960
 961     def writeback_control(self, m, r1, cache_out):
 962         """Return data for loads & completion control logic
 963         """
 964         comb = m.d.comb
 965         sync = m.d.sync
 966         d_out, m_out = self.d_out, self.m_out
 967
 968         data_out = Signal(64)
 969         data_fwd = Signal(64)
 970
 971         # Use the bypass if are reading the row that was
 972         # written 1 or 2 cycles ago, including for the
 973         # slow_valid = 1 case (i.e. completing a load
 974         # miss or a non-cacheable load).
 975         with m.If(r1.use_forward1):
 976             comb += data_fwd.eq(r1.forward_data1)
 977         with m.Else():
 978             comb += data_fwd.eq(r1.forward_data2)
 979
 980         comb += data_out.eq(cache_out[r1.hit_way])
 981
 982         for i in range(8):
 983             with m.If(r1.forward_sel[i]):
 984                 dsel = data_fwd.word_select(i, 8)
 985                 comb += data_out.word_select(i, 8).eq(dsel)
 986
 987         comb += d_out.valid.eq(r1.ls_valid)
 988         comb += d_out.data.eq(data_out)
 989         comb += d_out.store_done.eq(~r1.stcx_fail)
 990         comb += d_out.error.eq(r1.ls_error)
 991         comb += d_out.cache_paradox.eq(r1.cache_paradox)
 992
 993         # Outputs to MMU
 994         comb += m_out.done.eq(r1.mmu_done)
 995         comb += m_out.err.eq(r1.mmu_error)
 996         comb += m_out.data.eq(data_out)
 997
 998         # We have a valid load or store hit or we just completed
 999         # a slow op such as a load miss, a NC load or a store
1000         #
1001         # Note: the load hit is delayed by one cycle. However it
1002         # can still not collide with r.slow_valid (well unless I
1003         # miscalculated) because slow_valid can only be set on a
1004         # subsequent request and not on its first cycle (the state
1005         # machine must have advanced), which makes slow_valid
1006         # at least 2 cycles from the previous hit_load_valid.
1007
1008         # Sanity: Only one of these must be set in any given cycle
1009
1010         if False: # TODO: need Display to get this to work
1011             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1012             "unexpected slow_valid collision with stcx_fail"
1013
1014             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1015              "unexpected hit_load_delayed collision with slow_valid"
1016
1017         with m.If(~r1.mmu_req):
1018             # Request came from loadstore1...
1019             # Load hit case is the standard path
1020             with m.If(r1.hit_load_valid):
1021                 sync += Display("completing load hit data=%x", data_out)
1022
1023             # error cases complete without stalling
1024             with m.If(r1.ls_error):
1025                 sync += Display("completing ld/st with error")
1026
1027             # Slow ops (load miss, NC, stores)
1028             with m.If(r1.slow_valid):
1029                 sync += Display("completing store or load miss data=%x",
1030                                 data_out)
1031
1032         with m.Else():
1033             # Request came from MMU
1034             with m.If(r1.hit_load_valid):
1035                 sync += Display("completing load hit to MMU, data=%x",
1036                                 m_out.data)
1037             # error cases complete without stalling
1038             with m.If(r1.mmu_error):
1039                 sync += Display("combpleting MMU ld with error")
1040
1041             # Slow ops (i.e. load miss)
1042             with m.If(r1.slow_valid):
1043                 sync += Display("completing MMU load miss, data=%x",
1044                                 m_out.data)
1045
1046     def rams(self, m, r1, early_req_row, cache_out, replace_way):
1047         """rams
1048         Generate a cache RAM for each way. This handles the normal
1049         reads, writes from reloads and the special store-hit update
1050         path as well.
1051
1052         Note: the BRAMs have an extra read buffer, meaning the output
1053         is pipelined an extra cycle. This differs from the
1054         icache. The writeback logic needs to take that into
1055         account by using 1-cycle delayed signals for load hits.
1056         """
1057         comb = m.d.comb
1058         wb_in = self.wb_in
1059
1060         for i in range(NUM_WAYS):
1061             do_read  = Signal(name="do_rd%d" % i)
1062             rd_addr  = Signal(ROW_BITS)
1063             do_write = Signal(name="do_wr%d" % i)
1064             wr_addr  = Signal(ROW_BITS)
1065             wr_data  = Signal(WB_DATA_BITS)
1066             wr_sel   = Signal(ROW_SIZE)
1067             wr_sel_m = Signal(ROW_SIZE)
1068             _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i)
1069
1070             way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
1071             setattr(m.submodules, "cacheram_%d" % i, way)
1072
1073             comb += way.rd_en.eq(do_read)
1074             comb += way.rd_addr.eq(rd_addr)
1075             comb += _d_out.eq(way.rd_data_o)
1076             comb += way.wr_sel.eq(wr_sel_m)
1077             comb += way.wr_addr.eq(wr_addr)
1078             comb += way.wr_data.eq(wr_data)
1079
1080             # Cache hit reads
1081             comb += do_read.eq(1)
1082             comb += rd_addr.eq(early_req_row[:ROW_BITS])
1083             comb += cache_out[i].eq(_d_out)
1084
1085             # Write mux:
1086             #
1087             # Defaults to wishbone read responses (cache refill)
1088             #
1089             # For timing, the mux on wr_data/sel/addr is not
1090             # dependent on anything other than the current state.
1091
1092             with m.If(r1.write_bram):
1093                 # Write store data to BRAM.  This happens one
1094                 # cycle after the store is in r0.
1095                 comb += wr_data.eq(r1.req.data)
1096                 comb += wr_sel.eq(r1.req.byte_sel)
1097                 comb += wr_addr.eq(get_row(r1.req.real_addr))
1098
1099                 with m.If(i == r1.req.hit_way):
1100                     comb += do_write.eq(1)
1101             with m.Else():
1102                 # Otherwise, we might be doing a reload or a DCBZ
1103                 with m.If(r1.dcbz):
1104                     comb += wr_data.eq(0)
1105                 with m.Else():
1106                     comb += wr_data.eq(wb_in.dat)
1107                 comb += wr_addr.eq(r1.store_row)
1108                 comb += wr_sel.eq(~0) # all 1s
1109
1110             with m.If((r1.state == State.RELOAD_WAIT_ACK)
1111                       & wb_in.ack & (replace_way == i)):
1112                 comb += do_write.eq(1)
1113
1114             # Mask write selects with do_write since BRAM
1115             # doesn't have a global write-enable
1116             with m.If(do_write):
1117                 comb += wr_sel_m.eq(wr_sel)
1118
1119     # Cache hit synchronous machine for the easy case.
1120     # This handles load hits.
1121     # It also handles error cases (TLB miss, cache paradox)
1122     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1123                         req_hit_way, req_index, req_tag, access_ok,
1124                         tlb_hit, tlb_hit_way, tlb_req_index):
1125
1126         comb = m.d.comb
1127         sync = m.d.sync
1128
1129         with m.If(req_op != Op.OP_NONE):
1130             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1131                     req_op, r0.req.addr, r0.req.nc,
1132                     req_index, req_tag, req_hit_way)
1133
1134         with m.If(r0_valid):
1135             sync += r1.mmu_req.eq(r0.mmu_req)
1136
1137         # Fast path for load/store hits.
1138         # Set signals for the writeback controls.
1139         sync += r1.hit_way.eq(req_hit_way)
1140         sync += r1.hit_index.eq(req_index)
1141
1142         with m.If(req_op == Op.OP_LOAD_HIT):
1143             sync += r1.hit_load_valid.eq(1)
1144         with m.Else():
1145             sync += r1.hit_load_valid.eq(0)
1146
1147         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1148             sync += r1.cache_hit.eq(1)
1149         with m.Else():
1150             sync += r1.cache_hit.eq(0)
1151
1152         with m.If(req_op == Op.OP_BAD):
1153             # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1154             #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
1155             sync += r1.ls_error.eq(~r0.mmu_req)
1156             sync += r1.mmu_error.eq(r0.mmu_req)
1157             sync += r1.cache_paradox.eq(access_ok)
1158
1159             with m.Else():
1160                 sync += r1.ls_error.eq(0)
1161                 sync += r1.mmu_error.eq(0)
1162                 sync += r1.cache_paradox.eq(0)
1163
1164         with m.If(req_op == Op.OP_STCX_FAIL):
1165             r1.stcx_fail.eq(1)
1166         with m.Else():
1167             sync += r1.stcx_fail.eq(0)
1168
1169         # Record TLB hit information for updating TLB PLRU
1170         sync += r1.tlb_hit.eq(tlb_hit)
1171         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1172         sync += r1.tlb_hit_index.eq(tlb_req_index)
1173
    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cacheable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone request generation is done here.
    # This machine operates at stage 1.
1182     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1183                     cache_valids, r0, replace_way,
1184                     req_hit_way, req_same_tag,
1185                     r0_valid, req_op, cache_tags, req_go, ra):
1186
1187         comb = m.d.comb
1188         sync = m.d.sync
1189         wb_in = self.wb_in
1190
1191         req         = MemAccessRequest("mreq_ds")
1192         acks        = Signal(3)
1193         adjust_acks = Signal(3)
1194
1195         req_row = Signal(ROW_BITS)
1196         req_idx = Signal(INDEX_BITS)
1197         req_tag = Signal(TAG_BITS)
1198         comb += req_idx.eq(get_index(req.real_addr))
1199         comb += req_row.eq(get_row(req.real_addr))
1200         comb += req_tag.eq(get_tag(req.real_addr))
1201
1202         sync += r1.use_forward1.eq(use_forward1_next)
1203         sync += r1.forward_sel.eq(0)
1204
1205         with m.If(use_forward1_next):
1206             sync += r1.forward_sel.eq(r1.req.byte_sel)
1207         with m.Elif(use_forward2_next):
1208             sync += r1.forward_sel.eq(r1.forward_sel1)
1209
1210         sync += r1.forward_data2.eq(r1.forward_data1)
1211         with m.If(r1.write_bram):
1212             sync += r1.forward_data1.eq(r1.req.data)
1213             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1214             sync += r1.forward_way1.eq(r1.req.hit_way)
1215             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1216             sync += r1.forward_valid1.eq(1)
1217         with m.Else():
1218             with m.If(r1.dcbz):
1219                 sync += r1.forward_data1.eq(0)
1220             with m.Else():
1221                 sync += r1.forward_data1.eq(wb_in.dat)
1222             sync += r1.forward_sel1.eq(~0) # all 1s
1223             sync += r1.forward_way1.eq(replace_way)
1224             sync += r1.forward_row1.eq(r1.store_row)
1225             sync += r1.forward_valid1.eq(0)
1226
1227         # One cycle pulses reset
1228         sync += r1.slow_valid.eq(0)
1229         sync += r1.write_bram.eq(0)
1230         sync += r1.inc_acks.eq(0)
1231         sync += r1.dec_acks.eq(0)
1232
1233         sync += r1.ls_valid.eq(0)
1234         # complete tlbies and TLB loads in the third cycle
1235         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1236
1237         with m.If((req_op == Op.OP_LOAD_HIT)
1238                   | (req_op == Op.OP_STCX_FAIL)):
1239             with m.If(~r0.mmu_req):
1240                 sync += r1.ls_valid.eq(1)
1241             with m.Else():
1242                 sync += r1.mmu_done.eq(1)
1243
1244         with m.If(r1.write_tag):
1245             # Store new tag in selected way
1246             for i in range(NUM_WAYS):
1247                 with m.If(i == replace_way):
1248                     ct = Signal(TAG_RAM_WIDTH)
1249                     comb += ct.eq(cache_tags[r1.store_index])
1250                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1251                     sync += cache_tags[r1.store_index].eq(ct)
1252             sync += r1.store_way.eq(replace_way)
1253             sync += r1.write_tag.eq(0)
1254
1255         # Take request from r1.req if there is one there,
1256         # else from req_op, ra, etc.
1257         with m.If(r1.full):
1258             comb += req.eq(r1.req)
1259         with m.Else():
1260             comb += req.op.eq(req_op)
1261             comb += req.valid.eq(req_go)
1262             comb += req.mmu_req.eq(r0.mmu_req)
1263             comb += req.dcbz.eq(r0.req.dcbz)
1264             comb += req.real_addr.eq(ra)
1265
1266             with m.If(~r0.req.dcbz):
1267                 comb += req.data.eq(r0.req.data)
1268             with m.Else():
1269                 comb += req.data.eq(0)
1270
1271             # Select all bytes for dcbz
1272             # and for cacheable loads
1273             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1274                 comb += req.byte_sel.eq(~0) # all 1s
1275             with m.Else():
1276                 comb += req.byte_sel.eq(r0.req.byte_sel)
1277             comb += req.hit_way.eq(req_hit_way)
1278             comb += req.same_tag.eq(req_same_tag)
1279
1280             # Store the incoming request from r0,
1281             # if it is a slow request
1282             # Note that r1.full = 1 implies req_op = OP_NONE
1283             with m.If((req_op == Op.OP_LOAD_MISS)
1284                       | (req_op == Op.OP_LOAD_NC)
1285                       | (req_op == Op.OP_STORE_MISS)
1286                       | (req_op == Op.OP_STORE_HIT)):
1287                 sync += r1.req.eq(req)
1288                 sync += r1.full.eq(1)
1289
1290         # Main state machine
1291         with m.Switch(r1.state):
1292
1293             with m.Case(State.IDLE):
1294                 sync += r1.real_adr.eq(req.real_addr)
1295                 sync += r1.wb.sel.eq(req.byte_sel)
1296                 sync += r1.wb.dat.eq(req.data)
1297                 sync += r1.dcbz.eq(req.dcbz)
1298
1299                 # Keep track of our index and way
1300                 # for subsequent stores.
1301                 sync += r1.store_index.eq(req_idx)
1302                 sync += r1.store_row.eq(req_row)
1303                 sync += r1.end_row_ix.eq(get_row_of_line(req_row))
1304                 sync += r1.reload_tag.eq(req_tag)
1305                 sync += r1.req.same_tag.eq(1)
1306
1307                 with m.If(req.op == Op.OP_STORE_HIT):
1308                     sync += r1.store_way.eq(req.hit_way)
1309
1310                 # Reset per-row valid bits,
1311                 # ready for handling OP_LOAD_MISS
1312                 for i in range(ROW_PER_LINE):
1313                     sync += r1.rows_valid[i].eq(0)
1314
1315                 with m.If(req_op != Op.OP_NONE):
1316                     sync += Display("cache op %d", req.op)
1317
1318                 with m.Switch(req.op):
1319                     with m.Case(Op.OP_LOAD_HIT):
1320                         # stay in IDLE state
1321                         pass
1322
1323                     with m.Case(Op.OP_LOAD_MISS):
1324                         sync += Display("cache miss real addr: %x " \
1325                                 "idx: %x tag: %x",
1326                                 req.real_addr, req_row, req_tag)
1327
1328                         # Start the wishbone cycle
1329                         sync += r1.wb.we.eq(0)
1330                         sync += r1.wb.cyc.eq(1)
1331                         sync += r1.wb.stb.eq(1)
1332
1333                         # Track that we had one request sent
1334                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1335                         sync += r1.write_tag.eq(1)
1336
1337                     with m.Case(Op.OP_LOAD_NC):
1338                         sync += r1.wb.cyc.eq(1)
1339                         sync += r1.wb.stb.eq(1)
1340                         sync += r1.wb.we.eq(0)
1341                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1342
1343                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1344                         with m.If(~req.dcbz):
1345                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1346                             sync += r1.acks_pending.eq(1)
1347                             sync += r1.full.eq(0)
1348                             sync += r1.slow_valid.eq(1)
1349
1350                             with m.If(~req.mmu_req):
1351                                 sync += r1.ls_valid.eq(1)
1352                             with m.Else():
1353                                 sync += r1.mmu_done.eq(1)
1354
1355                             with m.If(req.op == Op.OP_STORE_HIT):
1356                                 sync += r1.write_bram.eq(1)
1357                         with m.Else():
1358                             # dcbz is handled much like a load miss except
1359                             # that we are writing to memory instead of reading
1360                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1361
1362                             with m.If(req.op == Op.OP_STORE_MISS):
1363                                 sync += r1.write_tag.eq(1)
1364
1365                         sync += r1.wb.we.eq(1)
1366                         sync += r1.wb.cyc.eq(1)
1367                         sync += r1.wb.stb.eq(1)
1368
1369                     # OP_NONE and OP_BAD do nothing
1370                     # OP_BAD & OP_STCX_FAIL were
1371                     # handled above already
1372                     with m.Case(Op.OP_NONE):
1373                         pass
1374                     with m.Case(Op.OP_BAD):
1375                         pass
1376                     with m.Case(Op.OP_STCX_FAIL):
1377                         pass
1378
1379             with m.Case(State.RELOAD_WAIT_ACK):
1380                 ld_stbs_done = Signal()
1381                 # Requests are all sent if stb is 0
1382                 comb += ld_stbs_done.eq(~r1.wb.stb)
1383
1384                 with m.If((~wb_in.stall) & r1.wb.stb):
1385                     # That was the last word?
1386                     # We are done sending.
1387                     # Clear stb and set ld_stbs_done
1388                     # so we can handle an eventual
1389                     # last ack on the same cycle.
1390                     with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
1391                         sync += r1.wb.stb.eq(0)
1392                         comb += ld_stbs_done.eq(1)
1393
1394                     # Calculate the next row address in the current cache line
1395                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1396                     comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
1397                     sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)
1398
1399                 # Incoming acks processing
1400                 sync += r1.forward_valid1.eq(wb_in.ack)
1401                 with m.If(wb_in.ack):
1402                     srow = Signal(ROW_LINE_BITS)
1403                     comb += srow.eq(r1.store_row)
1404                     sync += r1.rows_valid[srow].eq(1)
1405
1406                     # If this is the data we were looking for,
1407                     # we can complete the request next cycle.
1408                     # Compare the whole address in case the
1409                     # request in r1.req is not the one that
1410                     # started this refill.
1411                     with m.If(r1.full & r1.req.same_tag &
1412                               ((r1.dcbz & r1.req.dcbz) |
1413                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1414                                 (r1.store_row == get_row(r1.req.real_addr))):
1415                         sync += r1.full.eq(0)
1416                         sync += r1.slow_valid.eq(1)
1417                         with m.If(~r1.mmu_req):
1418                             sync += r1.ls_valid.eq(1)
1419                         with m.Else():
1420                             sync += r1.mmu_done.eq(1)
1421                         sync += r1.forward_sel.eq(~0) # all 1s
1422                         sync += r1.use_forward1.eq(1)
1423
1424                     # Check for completion
1425                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1426                                                       r1.end_row_ix)):
1427                         # Complete wishbone cycle
1428                         sync += r1.wb.cyc.eq(0)
1429
1430                         # Cache line is now valid
1431                         cv = Signal(INDEX_BITS)
1432                         comb += cv.eq(cache_valids[r1.store_index])
1433                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1434                         sync += cache_valids[r1.store_index].eq(cv)
1435                         sync += r1.state.eq(State.IDLE)
1436
1437                     # Increment store row counter
1438                     sync += r1.store_row.eq(next_row(r1.store_row))
1439
1440             with m.Case(State.STORE_WAIT_ACK):
1441                 st_stbs_done = Signal()
1442                 comb += st_stbs_done.eq(~r1.wb.stb)
1443                 comb += acks.eq(r1.acks_pending)
1444
1445                 with m.If(r1.inc_acks != r1.dec_acks):
1446                     with m.If(r1.inc_acks):
1447                         comb += adjust_acks.eq(acks + 1)
1448                     with m.Else():
1449                         comb += adjust_acks.eq(acks - 1)
1450                 with m.Else():
1451                     comb += adjust_acks.eq(acks)
1452
1453                 sync += r1.acks_pending.eq(adjust_acks)
1454
1455                 # Clear stb when slave accepted request
1456                 with m.If(~wb_in.stall):
1457                     # See if there is another store waiting
1458                     # to be done which is in the same real page.
1459                     with m.If(req.valid):
1460                         ra = req.real_addr[0:SET_SIZE_BITS]
1461                         sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
1462                         sync += r1.wb.dat.eq(req.data)
1463                         sync += r1.wb.sel.eq(req.byte_sel)
1464
1465                     with m.Elif((adjust_acks < 7) & req.same_tag &
1466                                 ((req.op == Op.OP_STORE_MISS)
1467                                  | (req.op == Op.OP_STORE_HIT))):
1468                         sync += r1.wb.stb.eq(1)
1469                         comb += st_stbs_done.eq(0)
1470
1471                         with m.If(req.op == Op.OP_STORE_HIT):
1472                             sync += r1.write_bram.eq(1)
1473                         sync += r1.full.eq(0)
1474                         sync += r1.slow_valid.eq(1)
1475
1476                         # Store requests never come from the MMU
1477                         sync += r1.ls_valid.eq(1)
1478                         comb += st_stbs_done.eq(0)
1479                         sync += r1.inc_acks.eq(1)
1480                     with m.Else():
1481                         sync += r1.wb.stb.eq(0)
1482                         comb += st_stbs_done.eq(1)
1483
1484                 # Got ack ? See if complete.
1485                 with m.If(wb_in.ack):
1486                     with m.If(st_stbs_done & (adjust_acks == 1)):
1487                         sync += r1.state.eq(State.IDLE)
1488                         sync += r1.wb.cyc.eq(0)
1489                         sync += r1.wb.stb.eq(0)
1490                     sync += r1.dec_acks.eq(1)
1491
1492             with m.Case(State.NC_LOAD_WAIT_ACK):
1493                 # Clear stb when slave accepted request
1494                 with m.If(~wb_in.stall):
1495                     sync += r1.wb.stb.eq(0)
1496
1497                 # Got ack ? complete.
1498                 with m.If(wb_in.ack):
1499                     sync += r1.state.eq(State.IDLE)
1500                     sync += r1.full.eq(0)
1501                     sync += r1.slow_valid.eq(1)
1502
1503                     with m.If(~r1.mmu_req):
1504                         sync += r1.ls_valid.eq(1)
1505                     with m.Else():
1506                         sync += r1.mmu_done.eq(1)
1507
1508                     sync += r1.forward_sel.eq(~0) # all 1s
1509                     sync += r1.use_forward1.eq(1)
1510                     sync += r1.wb.cyc.eq(0)
1511                     sync += r1.wb.stb.eq(0)
1512
1513     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1514
1515         sync = m.d.sync
1516         d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1517
1518         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1519                                stall_out, req_op[:3], d_out.valid, d_out.error,
1520                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1521                                r1.real_adr[3:6]))
1522
    def elaborate(self, platform):
        """Assemble the dcache logic into a single nmigen Module.

        Declares the storage arrays, the two pipeline register records
        (r0 and r1), the reservation record and the intermediate signals
        shared between the helper methods, then calls each helper in turn
        so that it can add its own statements to ``m``.

        NOTE(review): the Signal() instances below are created without an
        explicit name, so presumably their names are inferred from the
        local variable names; confirm before renaming any local.

        :param platform: not referenced anywhere in this method.
        :return: the populated Module.
        """

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)
        cache_valids = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags       = TLBTagsArray()
        dtlb_ptes       = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        # stage-0 request latch plus its "occupied" flag
        r0      = RegStage0("r0")
        r0_full = Signal()

        # stage-1 state (also carries the wishbone request, see r1.wb below)
        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        # NOTE(review): req_data is not passed to any helper below
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row     = Signal(ROW_BITS)

        cancel_store      = Signal()
        set_rsrv          = Signal()
        clear_rsrv        = Signal()

        r0_valid          = Signal()
        r0_stall          = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out         = CacheRamOut()

        plru_victim       = PLRUOut()
        replace_way       = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        # NOTE(review): bus_sel is not passed to any helper below
        bus_sel           = Signal(8)

        # TLB signals
        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = Signal()
        tlb_hit_way   = Signal(TLB_WAY_BITS)
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        # (r0_stall: r0 occupied while r1 is full; r0_valid: r0 occupied
        # and r1 is not full)
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += r1.wb.adr.eq(r1.real_adr[ROW_OFF_BITS:]) # truncate LSBs
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                           r0_valid, r1, cache_valids, replace_way,
                           use_forward1_next, use_forward2_next,
                           req_hit_way, plru_victim, rc_ok, perm_attr,
                           valid_ra, perm_ok, access_ok, req_op, req_go,
                           tlb_pte_way,
                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                           cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                           r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                           reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                    cache_valids, r0, replace_way,
                    req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # NOTE(review): log hookup is disabled. `stall_out` is not a local
        # name in this method (the port is self.stall_out), so the call
        # below needs fixing before it can be re-enabled.
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m
1653
def dcache_load(dut, addr, nc=0):
    """Simulation process helper: perform one load and return its data.

    The request (load=1, the given ``nc`` and ``addr``, all byte-select
    bits set) is presented with ``valid`` high for one clock, then
    ``valid`` and ``byte_sel`` are cleared.  After one further clock the
    helper polls ``d_out.valid`` once per clock until it is non-zero and
    finally reads ``d_out.data``.

    :param dut: object exposing the ``d_in`` / ``d_out`` ports.
    :param addr: address to load from.
    :param nc: value driven onto ``d_in.nc`` (default 0).
    :return: the value read from ``d_out.data``.
    """
    d_in, d_out = dut.d_in, dut.d_out

    # drive the request, then let one clock pass
    request = (
        d_in.load.eq(1),
        d_in.nc.eq(nc),
        d_in.addr.eq(addr),
        d_in.byte_sel.eq(~0),
        d_in.valid.eq(1),
    )
    for stmt in request:
        yield stmt
    yield

    # withdraw the request, then let one more clock pass
    for stmt in (d_in.valid.eq(0), d_in.byte_sel.eq(0)):
        yield stmt
    yield

    # one poll of d_out.valid per clock until the response is flagged
    while True:
        responded = yield d_out.valid
        if responded:
            break
        yield

    result = yield d_out.data
    return result
1668
1669
def dcache_store(dut, addr, data, nc=0):
    """Simulation process helper: perform one store and wait for completion.

    The request (load=0, the given ``nc``, ``data`` and ``addr``, all
    byte-select bits set) is presented with ``valid`` high for one clock,
    then ``valid`` and ``byte_sel`` are cleared.  After one further clock
    the helper polls ``d_out.valid`` once per clock until it is non-zero.

    :param dut: object exposing the ``d_in`` / ``d_out`` ports.
    :param addr: address to store to.
    :param data: value driven onto ``d_in.data``.
    :param nc: value driven onto ``d_in.nc`` (default 0).
    """
    d_in, d_out = dut.d_in, dut.d_out

    # drive the request, then let one clock pass
    request = (
        d_in.load.eq(0),
        d_in.nc.eq(nc),
        d_in.data.eq(data),
        d_in.byte_sel.eq(~0),
        d_in.addr.eq(addr),
        d_in.valid.eq(1),
    )
    for stmt in request:
        yield stmt
    yield

    # withdraw the request, then let one more clock pass
    for stmt in (d_in.valid.eq(0), d_in.byte_sel.eq(0)):
        yield stmt
    yield

    # one poll of d_out.valid per clock until the store is acknowledged
    while True:
        acknowledged = yield d_out.valid
        if acknowledged:
            break
        yield
1683
1684
def dcache_random_sim(dut):
    """Simulation process: randomised store/load consistency check.

    A Python list mirrors what has been written through the cache.  For
    256 rounds a random 64-bit word index (0..255) is loaded, then
    overwritten with random data, after which another random word is
    loaded and compared with the mirror.  Finally all 256 words are read
    back and compared.

    :param dut: object exposing the ``d_in`` / ``d_out`` / ``m_in`` ports.
    :raises AssertionError: when loaded data differs from the mirror.
    """
    # software mirror of memory contents, one entry per 64-bit word
    sim_mem = [0] * 512

    # bring all request inputs to a known idle state
    d_in, m_in = dut.d_in, dut.m_in
    idle = (
        d_in.valid.eq(0),
        d_in.load.eq(0),
        d_in.priv_mode.eq(1),
        d_in.nc.eq(0),
        d_in.addr.eq(0),
        d_in.data.eq(0),
        m_in.valid.eq(0),
        m_in.addr.eq(0),
        m_in.pte.eq(0),
    )
    for stmt in idle:
        yield stmt
    # wait 4 * clk_period
    for _ in range(4):
        yield

    print ()

    for _ in range(256):
        # pick a word and new contents for it, recording them in the mirror
        wr_word = randint(0, 255)
        wr_data = randint(0, (1<<64)-1)
        sim_mem[wr_word] = wr_data
        wr_addr = wr_word * 8

        print ("testing %x data %x" % (wr_addr, wr_data))

        # load first, then overwrite
        yield from dcache_load(dut, wr_addr)
        yield from dcache_store(dut, wr_addr, wr_data)

        # read back some (possibly different) word and check it
        rd_word = randint(0, 255)
        expected = sim_mem[rd_word]
        rd_addr = rd_word * 8

        got = yield from dcache_load(dut, rd_addr)
        assert got == expected, \
            "check %x data %x != %x" % (rd_addr, got, expected)

    # final sweep over every word that could have been written
    for word in range(256):
        got = yield from dcache_load(dut, word*8)
        assert got == sim_mem[word], \
            "final check %x data %x != %x" % (word*8, got, sim_mem[word])
1731
def dcache_sim(dut):
    """Simulation process: directed load/store checks with fixed values.

    Performs four cacheable loads and one non-cacheable load, two stores
    to 0x530, then two more cacheable loads, asserting the data returned
    by each load.  After every load ``d_in.addr`` is read back so the
    address can be shown in the failure message.

    :param dut: object exposing the ``d_in`` / ``d_out`` / ``m_in`` ports.
    :raises AssertionError: when a load returns unexpected data.
    """
    d_in, m_in = dut.d_in, dut.m_in

    # bring all request inputs to a known idle state
    idle = (
        d_in.valid.eq(0),
        d_in.load.eq(0),
        d_in.priv_mode.eq(1),
        d_in.nc.eq(0),
        d_in.addr.eq(0),
        d_in.data.eq(0),
        m_in.valid.eq(0),
        m_in.addr.eq(0),
        m_in.pte.eq(0),
    )
    for stmt in idle:
        yield stmt
    # wait 4 * clk_period
    for _ in range(4):
        yield

    def run_checks(table):
        # each entry: (address, nc flag, expected data, failure format)
        for target, nc, want, fmt in table:
            got = yield from dcache_load(dut, target, nc=nc)
            seen_addr = yield d_in.addr
            assert got == want, fmt % (seen_addr, got)

    yield from run_checks((
        # cacheable read of 0x58
        (0x58, 0, 0x0000001700000016,
         "data @%x=%x expected 0x0000001700000016"),
        # cacheable read of 0x20
        (0x20, 0, 0x0000000900000008,
         "data @%x=%x expected 0x0000000900000008"),
        # first cacheable read of 0x530
        (0x530, 0, 0x0000014D0000014C,
         "data @%x=%x expected 0000014D0000014C"),
        # second cacheable read of 0x530
        (0x530, 0, 0x0000014D0000014C,
         "data @%x=%x expected 0000014D0000014C"),
        # non-cacheable read of 0x100
        (0x100, 1, 0x0000004100000040,
         "data @%x=%x expected 0000004100000040"),
    ))

    # two stores to 0x530; the second value is the one checked below
    yield from dcache_store(dut, 0x530, 0x121)
    yield from dcache_store(dut, 0x530, 0x12345678)

    yield from run_checks((
        # third cacheable read of 0x530: must see the last store
        (0x530, 0, 0x12345678,
         "data @%x=%x expected 0x12345678"),
        # second cacheable read of 0x20: must be unchanged
        (0x20, 0, 0x0000000900000008,
         "data @%x=%x expected 0x0000000900000008"),
    ))

    # a few trailing clocks before the process ends
    for _ in range(4):
        yield
1801
1802
def test_dcache(mem, test_fn, test_name):
    """Run one simulation of a DCache attached to a wishbone SRAM.

    A fresh DCache and a 64-bit wide, 1024-deep Memory wrapped in an SRAM
    are placed in one Module, the cache's wishbone master outputs are
    connected to the SRAM bus, and ack / read-data are fed back.  The
    given process is then run under a 1 MHz clock while a VCD trace is
    written.

    :param mem: initial memory contents (passed as ``init`` to Memory).
    :param test_fn: generator function taking the dut; run as the
                    simulation's sync process.
    :param test_name: suffix inserted into the VCD file name.
    """
    dut = DCache()

    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.dcache = dut
    m.submodules.sram = sram

    # master -> slave: fields that have the same name on both sides
    for field in ("cyc", "stb", "we", "sel", "adr"):
        m.d.comb += getattr(sram.bus, field).eq(getattr(dut.wb_out, field))
    # write data is called dat_w on the SRAM side
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    # slave -> master
    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(test_fn(dut)))

    vcd_name = 'test_dcache%s.vcd' % test_name
    with sim.write_vcd(vcd_name):
        sim.run()
1830
if __name__ == '__main__':
    # dump the RTLIL of a standalone DCache
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    # 512 words of 64 bits: word i holds 2*i in its low half and 2*i+1
    # in its high half
    mem = [(i*2) | ((i*2+1) << 32) for i in range(512)]

    # directed test against the patterned memory, then the randomised
    # test starting from Memory's default contents (init=None)
    test_dcache(mem, dcache_sim, "")
    test_dcache(None, dcache_random_sim, "random")
1843