cache_valid_idx too large in dcache
1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
10 try:
11 from nmigen.hdl.ast import Display
12 except ImportError:
13 def Display(*args):
14 return []
15
16 from random import randint
17
18 from nmigen.cli import main
19 from nmutil.iocontrol import RecordObject
20 from nmutil.util import wrap
21 from nmigen.utils import log2_int
22 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
23 DCacheToLoadStore1Type,
24 MMUToDCacheType,
25 DCacheToMMUType)
26
27 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
28 WBAddrType, WBDataType, WBSelType,
29 WBMasterOut, WBSlaveOut,
30 WBMasterOutVector, WBSlaveOutVector,
31 WBIOMasterOut, WBIOSlaveOut)
32
33 from soc.experiment.cache_ram import CacheRam
34 from soc.experiment.plru import PLRU
35
36 # for test
37 from nmigen_soc.wishbone.sram import SRAM
38 from nmigen import Memory
39 from nmigen.cli import rtlil
40 if True:
41 from nmigen.back.pysim import Simulator, Delay, Settle
42 else:
43 from nmigen.sim.cxxsim import Simulator, Delay, Settle
44
45
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than WB_DATA_BITS
# at a time so, to save resources, we make the array only
# that wide, and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to a whole number of bytes)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
layout = """\
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |       |---|  |    ROW_LINE_BITS  (3)
  ..         |       |--- - --|  LINE_OFF_BITS  (6)
  ..         |       |- --|      ROW_OFF_BITS   (3)
  ..         |----- ---|         ROW_BITS       (8)
  ..         |-----|             INDEX_BITS     (5)
  .. --------|                   TAG_BITS       (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

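# Worked example (illustrative only, not used by the hardware): decompose
# an address into the fields printed above, mirroring get_index(),
# get_row() and get_tag() below with plain integer arithmetic. With the
# default geometry: LINE_OFF_BITS=6, INDEX_BITS=4, SET_SIZE_BITS=10,
# TAG_BITS=46, and TAG_BITS+INDEX_BITS+LINE_OFF_BITS == REAL_ADDR_BITS.
def _example_address_fields(addr):
    row_off = addr & ((1 << ROW_OFF_BITS) - 1)                 # byte in row
    index = (addr >> LINE_OFF_BITS) & ((1 << INDEX_BITS) - 1)  # cache line
    tag = (addr >> SET_SIZE_BITS) & ((1 << TAG_BITS) - 1)      # tag compare
    return row_off, index, tag
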
TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
                 for x in range(NUM_LINES))

def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                 for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

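# Worked example (illustrative only): how a 64-bit effective address is
# split for the L1 TLB, mirroring the slicing in tlb_read()/tlb_search()
# below. With the defaults, TLB_SET_BITS=6 and TLB_EA_TAG_BITS=46, so an
# EA is page offset [0:12], TLB set index [12:18] and EA tag [18:64].
def _example_tlb_fields(ea):
    page_off = ea & ((1 << TLB_LG_PGSZ) - 1)
    tlb_index = (ea >> TLB_LG_PGSZ) & ((1 << TLB_SET_BITS) - 1)
    ea_tag = ea >> (TLB_LG_PGSZ + TLB_SET_BITS)
    return page_off, tlb_index, ea_tag
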
def ispow2(x):
    return (1 << log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBValidBitsArray():
    return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_NUM_WAYS))

def TLBTagsArray():
    return Array(Signal(TLB_TAG_WAY_BITS) for x in range(TLB_SET_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                 for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS) for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line,
# given the address of the row
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

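# Quick illustration (plain integers, not used by the hardware) of
# next_row()'s wrap-around: only the low ROW_LINE_BITS increment, so with
# the defaults row 0b01_111 steps to 0b01_000, never into the index bits.
def _example_next_row(row):
    low = (row + 1) & ((1 << ROW_LINE_BITS) - 1)
    return (row & ~((1 << ROW_LINE_BITS) - 1)) | low
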
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

# Write a PTE to a TLB PTE memory row
def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)


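# Sketch (plain integers, illustrative only) of the tag-RAM packing that
# read_tag() above assumes: each way's tag sits in a byte-aligned,
# TAG_WIDTH-wide slot of a single TAG_RAM_WIDTH row.
def _example_pack_tags(tags_per_way):
    row = 0
    for way, tag in enumerate(tags_per_way):
        row |= (tag & ((1 << TAG_BITS) - 1)) << (way * TAG_WIDTH)
    return row

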
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    pa.reference = pte[8]
    pa.changed = pte[7]
    pa.nocache = pte[5]
    pa.priv = pte[3]
    pa.rd_perm = pte[2]
    pa.wr_perm = pte[1]
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cachable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
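# For example (a sketch inferred from the description above, not generated
# from the RTL): a load hit followed by a complex (slow) op:
#
#   cycle:     1          2          3
#   stage 0:   load A     store B    store B (stalled)
#   stage 1:              load A     store B
#   d_out:                           load A data valid
#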
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()
        self.doall = Signal()
        self.tlbld = Signal()
        self.mmu_req = Signal() # indicates source of request


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = Signal()
        self.tlb_hit_way = Signal(TLB_WAY_BITS)   # way that hit in the TLB
        self.tlb_hit_index = Signal(TLB_SET_BITS) # TLB set of the hit

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.updated = Signal()
        self.v_updated = Signal()
        self.tlb_hit = Signal()
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_hit_way = Signal(TLB_WAY_BITS)
        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        self.dv = Signal(TLB_NUM_WAYS)          # valid bit per way

        self.tb_out = Signal(TLB_TAG_WAY_BITS)  # tag-per-way output
        self.pb_out = Signal(TLB_PTE_WAY_BITS)  # PTE-per-way output
        self.db_out = Signal(TLB_NUM_WAYS)      # valid bit per way, output

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out

        with m.If(self.tlbie & self.doall):
            pass # clear all back in parent
        with m.Elif(self.tlbie):
            with m.If(self.tlb_hit):
                comb += db_out.eq(self.dv)
                # invalidate: clear the valid bit of the hit way
                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
                comb += self.v_updated.eq(1)

        with m.Elif(self.tlbwe):

            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            comb += tb_out.eq(tagset)

            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            comb += pb_out.eq(pteset)

            # mark the replaced way valid
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += self.updated.eq(1)
            comb += self.v_updated.eq(1)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                 cache_valid_idx, cache_tag_set,
                 req_addr,
                 hit_set):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = Signal()
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way.
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS):
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS):
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS):
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through
    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        self.wb_out = WBMasterOut()
        self.wb_in = WBSlaveOut()

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            sync += r.req.valid.eq(1)
            sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            sync += r.req.dcbz.eq(0)
            sync += r.req.nc.eq(0)
            sync += r.req.reserve.eq(0)
            sync += r.req.virt_mode.eq(1)
            sync += r.req.priv_mode.eq(1)
            sync += r.req.addr.eq(m_in.addr)
            sync += r.req.data.eq(m_in.pte)
            sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            sync += r.tlbie.eq(m_in.tlbie)
            sync += r.doall.eq(m_in.doall)
            sync += r.tlbld.eq(m_in.tlbld)
            sync += r.mmu_req.eq(1)
        with m.Else():
            sync += r.req.eq(d_in)
            sync += r.tlbie.eq(0)
            sync += r.doall.eq(0)
            sync += r.tlbld.eq(0)
            sync += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)

    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return
        for i in range(TLB_SET_SIZE):
            # TLB PLRU interface: one PLRU per set, TLB_NUM_WAYS entries
            tlb_plru = PLRU(TLB_WAY_BITS)
            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
            tlb_plru_acc_en = Signal()

            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
            comb += tlb_plru.acc.eq(r1.tlb_hit_way)
            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal()
            # parentheses needed: == binds more loosely than & in Python
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        with m.Else():
            comb += pte.eq(0)
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.eq(extract_perm_attr(pte))
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))

            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

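    # Worked example (illustrative note, not part of the original code) of
    # the address formation in tlb_search() above. In virtual mode:
    #   ra = Cat(zeros(ROW_OFF_BITS), ea[ROW_OFF_BITS:TLB_LG_PGSZ],
    #            pte[TLB_LG_PGSZ:REAL_ADDR_BITS])
    # i.e. the PTE's real page number replaces everything above the 4k page
    # offset. In real mode the EA below REAL_ADDR_BITS is used directly.
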
    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        m.submodules.tlb_update = d = DTLBUpdate()
        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid_bits[i].eq(0)
        with m.If(d.updated):
            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
        with m.If(d.v_updated):
            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)

        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_hit_way.eq(tlb_hit_way)
        comb += d.tlb_tag_way.eq(tlb_tag_way)
        comb += d.tlb_pte_way.eq(tlb_pte_way)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit):
            comb += d.repl_way.eq(tlb_hit_way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        for i in range(NUM_LINES):
            # PLRU interface
            plru = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc_en = Signal()

            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru.acc.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru.lru_o)

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set,
                       cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valid_bits, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Array(Signal(name="hit_set_%d" % i) \
                        for i in range(TLB_NUM_WAYS))
        cache_valid_idx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_valid_idx.eq(cache_valid_bits[req_index])

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                            tlb_valid_way, tlb_hit_way,
                                            cache_valid_idx, cache_tag_set,
                                            r0.req.addr,
                                            hit_set)

        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            valid = r1.rows_valid[req_row[:ROW_LINE_BITS]]
            comb += is_hit.eq(~r0.req.load | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed)
                         )
        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
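                # opsel is a 3-bit op selector: Cat() packs is_hit into
                # bit 0, nc into bit 1 and load into bit 2, so e.g.
                # 0b101 is a cacheable load that hit, and 0b011 is a
                # store to a non-cacheable address that "hit" (a cache
                # paradox, hence OP_BAD below).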
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                with m.If(~reservation.valid |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

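    # Note (added for clarity): the reservation granule here is one cache
    # line, since only addr[LINE_OFF_BITS:64] is stored and compared; a
    # stcx. anywhere within the reserved line passes the address check.
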
    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out[r1.hit_way])

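        # Merge the forwarded store data into the BRAM read data one byte
        # lane at a time: where forward_sel[i] is set, lane i comes from
        # data_fwd (store data from 1 or 2 cycles ago) instead of the cache.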
        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss data=%x",
                                data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, data=%x",
                                m_out.data)

    def rams(self, m, r1, early_req_row, cache_out, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        wb_in = self.wb_in

        for i in range(NUM_WAYS):
            do_read = Signal(name="do_rd%d" % i)
            rd_addr = Signal(ROW_BITS)
            do_write = Signal(name="do_wr%d" % i)
            wr_addr = Signal(ROW_BITS)
            wr_data = Signal(WB_DATA_BITS)
            wr_sel = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i)

            way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row[:ROW_BITS])
            comb += cache_out[i].eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM. This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wb_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #         f"rc_ok={rc_ok} perm_ok={perm_ok}"
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)

        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valid_bits, r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        wb_in = self.wb_in

        req = MemAccessRequest("mreq_ds")
        acks = Signal(3)
        adjust_acks = Signal(3)

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT)
                  | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index])
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr)
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row))
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                with m.If((~wb_in.stall) & r1.wb.stb):
                    # That was the last word?
                    # We are done sending.
                    # Clear stb and set ld_stbs_done
                    # so we can handle an eventual
                    # last ack on the same cycle.
                    with m.If(is_last_row_addr(r1.wb.adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    rarange = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += rarange.eq(r1.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS]+1)
                    sync += r1.wb.adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    sync += r1.rows_valid[r1.store_row[:ROW_LINE_BITS]].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS) # one valid bit per way
                        comb += cv.eq(cache_valid_bits[r1.store_index])
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_valid_bits[r1.store_index].eq(cv)
                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        ra = req.real_addr[0:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

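                    # Note (added for clarity): acks_pending is a 3-bit
                    # counter, so the (adjust_acks < 7) guard below stops
                    # another store being issued while 7 acks are already
                    # outstanding, preventing the counter from wrapping.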
                    with m.Elif((adjust_acks < 7) & req.same_tag &
                                ((req.op == Op.OP_STORE_MISS)
                                 | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, req_op, stall_out):

        sync = m.d.sync
        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                               r1.wb.adr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valid_bits = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags = TLBTagsArray()
        dtlb_ptes = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out = CacheRamOut()

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = Signal()
        tlb_hit_way = Signal(TLB_WAY_BITS)
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valid_bits, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_pte_way,
                            tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valid_bits, r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, req_op, self.stall_out)

        return m

def dcache_load(dut, addr, nc=0):
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.byte_sel.eq(~0)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.byte_sel.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    return data


def dcache_store(dut, addr, data, nc=0):
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.data.eq(data)
    yield dut.d_in.byte_sel.eq(~0)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.byte_sel.eq(0)
    yield
    while not (yield dut.d_out.valid):
        yield


def dcache_random_sim(dut):

    # start with a memory of zeros
    sim_mem = [0] * 512

    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    print ()

    for i in range(256):
        addr = randint(0, 255)
        data = randint(0, (1<<64)-1)
        sim_mem[addr] = data
        addr *= 8

        print ("testing %x data %x" % (addr, data))

        yield from dcache_load(dut, addr)
        yield from dcache_store(dut, addr, data)

        addr = randint(0, 255)
        sim_data = sim_mem[addr]
        addr *= 8

        data = yield from dcache_load(dut, addr)
        assert data == sim_data, \
            "check %x data %x != %x" % (addr, data, sim_data)

    for addr in range(256):
        data = yield from dcache_load(dut, addr*8)
        assert data == sim_mem[addr], \
            "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])

def dcache_sim(dut):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    # Cacheable read of address 0x58
    data = yield from dcache_load(dut, 0x58)
    addr = yield dut.d_in.addr
    assert data == 0x0000001700000016, \
        "data @%x=%x expected 0x0000001700000016" % (addr, data)

    # Cacheable read of address 0x20
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        "data @%x=%x expected 0x0000000900000008" % (addr, data)

    # Cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        "data @%x=%x expected 0x0000014D0000014C" % (addr, data)

    # 2nd Cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        "data @%x=%x expected 0x0000014D0000014C" % (addr, data)

    # Non-cacheable read of address 0x100
    data = yield from dcache_load(dut, 0x100, nc=1)
    addr = yield dut.d_in.addr
    assert data == 0x0000004100000040, \
        "data @%x=%x expected 0x0000004100000040" % (addr, data)

    # Store at address 0x530
    yield from dcache_store(dut, 0x530, 0x121)

    # Second store at address 0x530
    yield from dcache_store(dut, 0x530, 0x12345678)

    # 3rd Cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x12345678, \
        "data @%x=%x expected 0x12345678" % (addr, data)

    # 4th Cacheable read of address 0x20
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        "data @%x=%x expected 0x0000000900000008" % (addr, data)

    yield
    yield
    yield
    yield


def test_dcache(mem, test_fn, test_name):
    dut = DCache()

    memory = Memory(width=64, depth=16*64, init=mem)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.dcache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr[3:])
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(test_fn(dut)))
    with sim.write_vcd('test_dcache%s.vcd' % test_name):
        sim.run()


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    mem = []
    for i in range(0, 512):
        mem.append((i*2) | ((i*2+1) << 32))

    test_dcache(mem, dcache_sim, "")
    #test_dcache(None, dcache_random_sim, "random")