src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 """
   6
   7 from enum import Enum, unique
   8
   9 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
  10 try:
  11     from nmigen.hdl.ast import Display
  12 except ImportError:
  13     def Display(*args):
  14         return []
  15
  16 from nmigen.cli import main
  17 from nmutil.iocontrol import RecordObject
  18 from nmutil.util import wrap
  19 from nmigen.utils import log2_int
  20 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  21                                      DCacheToLoadStore1Type,
  22                                      MMUToDCacheType,
  23                                      DCacheToMMUType)
  24
  25 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  26                                 WBAddrType, WBDataType, WBSelType,
  27                                 WBMasterOut, WBSlaveOut,
  28                                 WBMasterOutVector, WBSlaveOutVector,
  29                                 WBIOMasterOut, WBIOSlaveOut)
  30
  31 from soc.experiment.cache_ram import CacheRam
  32 from soc.experiment.plru import PLRU
  33
  34 # for test
  35 from nmigen_soc.wishbone.sram import SRAM
  36 from nmigen import Memory
  37 from nmigen.cli import rtlil
  38 if True:
  39     from nmigen.back.pysim import Simulator, Delay, Settle
  40 else:
  41     from nmigen.sim.cxxsim import Simulator, Delay, Settle
  42
  43
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection
  52
  53 # BRAM organisation: We never access more than
  54 #     -- WB_DATA_BITS at a time so to save
  55 #     -- resources we make the array only that wide, and
  56 #     -- use consecutive indices for to make a cache "line"
  57 #     --
  58 #     -- ROW_SIZE is the width in bytes of the BRAM
  59 #     -- (based on WB, so 64-bits)
  60 ROW_SIZE = WB_DATA_BITS // 8;
  61
  62 # ROW_PER_LINE is the number of row (wishbone
  63 # transactions) in a line
  64 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  65
  66 # BRAM_ROWS is the number of rows in BRAM needed
  67 # to represent the full dcache
  68 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  69
  70
  71 # Bit fields counts in the address
  72
  73 # REAL_ADDR_BITS is the number of real address
  74 # bits that we store
  75 REAL_ADDR_BITS = 56
  76
  77 # ROW_BITS is the number of bits to select a row
  78 ROW_BITS = log2_int(BRAM_ROWS)
  79
  80 # ROW_LINE_BITS is the number of bits to select
  81 # a row within a line
  82 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
  83
  84 # LINE_OFF_BITS is the number of bits for
  85 # the offset in a cache line
  86 LINE_OFF_BITS = log2_int(LINE_SIZE)
  87
  88 # ROW_OFF_BITS is the number of bits for
  89 # the offset in a row
  90 ROW_OFF_BITS = log2_int(ROW_SIZE)
  91
  92 # INDEX_BITS is the number if bits to
  93 # select a cache line
  94 INDEX_BITS = log2_int(NUM_LINES)
  95
  96 # SET_SIZE_BITS is the log base 2 of the set size
  97 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
  98
  99 # TAG_BITS is the number of bits of
 100 # the tag part of the address
 101 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 102
 103 # TAG_WIDTH is the width in bits of each way of the tag RAM
 104 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 105
 106 # WAY_BITS is the number of bits to select a way
 107 WAY_BITS = log2_int(NUM_WAYS)
 108
 109 # Example of layout for 32 lines of 64 bytes:
 110 #
 111 # ..  tag    |index|  line  |
 112 # ..         |   row   |    |
 113 # ..         |     |---|    | ROW_LINE_BITS  (3)
 114 # ..         |     |--- - --| LINE_OFF_BITS (6)
 115 # ..         |         |- --| ROW_OFF_BITS  (3)
 116 # ..         |----- ---|    | ROW_BITS      (8)
 117 # ..         |-----|        | INDEX_BITS    (5)
 118 # .. --------|              | TAG_BITS      (45)
 119
 120 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 121
 122 def CacheTagArray():
 123     return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))
 124
 125 def CacheValidBitsArray():
 126     return Array(Signal(INDEX_BITS) for x in range(NUM_LINES))
 127
 128 def RowPerLineValidArray():
 129     return Array(Signal(name="rows_valid%d" % x) for x in range(ROW_PER_LINE))
 130
 131 # L1 TLB
 132 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 133 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 134 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 135 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 136 TLB_PTE_BITS     = 64
 137 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 138
 139 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 140 assert (LINE_SIZE % 2) == 0, "LINE_SIZE not power of 2"
 141 assert (NUM_LINES % 2) == 0, "NUM_LINES not power of 2"
 142 assert (ROW_PER_LINE % 2) == 0, "ROW_PER_LINE not power of 2"
 143 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 144 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 145         "geometry bits don't add up"
 146 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 147         "geometry bits don't add up"
 148 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 149          "geometry bits don't add up"
 150 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 151 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 152
 153
 154 def TLBValidBitsArray():
 155     return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
 156
 157 def TLBTagEAArray():
 158     return Array(Signal(TLB_EA_TAG_BITS) for x in range (TLB_NUM_WAYS))
 159
 160 def TLBTagsArray():
 161     return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
 162
 163 def TLBPtesArray():
 164     return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
 165
 166 def HitWaySet():
 167     return Array(Signal(WAY_BITS) for x in range(TLB_NUM_WAYS))
 168
 169 # Cache RAM interface
 170 def CacheRamOut():
 171     return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))
 172
 173 # PLRU output interface
 174 def PLRUOut():
 175     return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
 176
 177 # TLB PLRU output interface
 178 def TLBPLRUOut():
 179     return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
 180
 181 # Helper functions to decode incoming requests
 182 #
 183 # Return the cache line index (tag index) for an address
 184 def get_index(addr):
 185     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 186
 187 # Return the cache row index (data memory) for an address
 188 def get_row(addr):
 189     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 190
 191 # Return the index of a row within a line
 192 def get_row_of_line(row):
 193     return row[:ROW_LINE_BITS]
 194
 195 # Returns whether this is the last row of a line
 196 def is_last_row_addr(addr, last):
 197     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 198
 199 # Returns whether this is the last row of a line
 200 def is_last_row(row, last):
 201     return get_row_of_line(row) == last
 202
 203 # Return the next row in the current cache line. We use a
 204 # dedicated function in order to limit the size of the
 205 # generated adder to be only the bits within a cache line
 206 # (3 bits with default settings)
 207 def next_row(row):
 208     row_v = row[0:ROW_LINE_BITS] + 1
 209     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 210
 211 # Get the tag value from the address
 212 def get_tag(addr):
 213     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 214
 215 # Read a tag from a tag memory row
 216 def read_tag(way, tagset):
 217     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 218
 219 # Read a TLB tag from a TLB tag memory row
 220 def read_tlb_tag(way, tags):
 221     return tags.word_select(way, TLB_EA_TAG_BITS)
 222
 223 # Write a TLB tag to a TLB tag memory row
 224 def write_tlb_tag(way, tags, tag):
 225     return read_tlb_tag(way, tags).eq(tag)
 226
 227 # Read a PTE from a TLB PTE memory row
 228 def read_tlb_pte(way, ptes):
 229     return ptes.word_select(way, TLB_PTE_BITS)
 230
 231 def write_tlb_pte(way, ptes, newpte):
 232     return read_tlb_pte(way, ptes).eq(newpte)
 233
 234
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Six single-bit fields extracted from a PTE.

    The fields are filled from fixed PTE bit positions by
    extract_perm_attr(), or driven with constants when the request
    is not in virtual mode (see DCache.tlb_search).
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # all fields are 1-bit Signals; declaration order defines the
        # record layout, so it must not be changed casually
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()
 245
 246
 247 def extract_perm_attr(pte):
 248     pa = PermAttr()
 249     pa.reference = pte[8]
 250     pa.changed   = pte[7]
 251     pa.nocache   = pte[5]
 252     pa.priv      = pte[3]
 253     pa.rd_perm   = pte[2]
 254     pa.wr_perm   = pte[1]
 255     return pa;
 256
 257
 258 # Type of operation on a "valid" input
 259 @unique
 260 class Op(Enum):
 261     OP_NONE       = 0
 262     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 263     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 264     OP_LOAD_HIT   = 3 # Cache hit on load
 265     OP_LOAD_MISS  = 4 # Load missing cache
 266     OP_LOAD_NC    = 5 # Non-cachable load
 267     OP_STORE_HIT  = 6 # Store hitting cache
 268     OP_STORE_MISS = 7 # Store missing cache
 269
 270
 271 # Cache state machine
 272 @unique
 273 class State(Enum):
 274     IDLE             = 0 # Normal load hit processing
 275     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 276     STORE_WAIT_ACK   = 2 # Store wait ack
 277     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 278
 279
 280 # Dcache operations:
 281 #
 282 # In order to make timing, we use the BRAMs with
 283 # an output buffer, which means that the BRAM
 284 # output is delayed by an extra cycle.
 285 #
 286 # Thus, the dcache has a 2-stage internal pipeline
 287 # for cache hits with no stalls.
 288 #
 289 # All other operations are handled via stalling
 290 # in the first stage.
 291 #
 292 # The second stage can thus complete a hit at the same
 293 # time as the first stage emits a stall for a complex op.
 294 #
 295 # Stage 0 register, basically contains just the latched request
 296
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus MMU operation flags."""
    def __init__(self, name=None):
        super().__init__(name=name)
        # the request itself, in loadstore1-to-dcache format
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        # copied from the MMU request (m_in.tlbie/doall/tlbld) in
        # DCache.stage_0, zero for loadstore1 requests
        self.tlbie   = Signal()
        self.doall   = Signal()
        self.tlbld   = Signal()
        self.mmu_req = Signal() # indicates source of request
 305
 306
class MemAccessRequest(RecordObject):
    """Decoded memory access request, held in RegStage1.req."""
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)              # decoded operation type
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)  # real address
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)
        self.hit_way   = Signal(WAY_BITS)        # cache way number
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 319
 320
 321 # First stage register, contains state for stage 1 of load hits
 322 # and for the state machine used by all other operations
 323 class RegStage1(RecordObject):
 324     def __init__(self, name=None):
 325         super().__init__(name=name)
 326         # Info about the request
 327         self.full             = Signal() # have uncompleted request
 328         self.mmu_req          = Signal() # request is from MMU
 329         self.req              = MemAccessRequest(name="reqmem")
 330
 331         # Cache hit state
 332         self.hit_way          = Signal(WAY_BITS)
 333         self.hit_load_valid   = Signal()
 334         self.hit_index        = Signal(INDEX_BITS)
 335         self.cache_hit        = Signal()
 336
 337         # TLB hit state
 338         self.tlb_hit          = Signal()
 339         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 340         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 341
 342         # 2-stage data buffer for data forwarded from writes to reads
 343         self.forward_data1    = Signal(64)
 344         self.forward_data2    = Signal(64)
 345         self.forward_sel1     = Signal(8)
 346         self.forward_valid1   = Signal()
 347         self.forward_way1     = Signal(WAY_BITS)
 348         self.forward_row1     = Signal(ROW_BITS)
 349         self.use_forward1     = Signal()
 350         self.forward_sel      = Signal(8)
 351
 352         # Cache miss state (reload state machine)
 353         self.state            = Signal(State)
 354         self.dcbz             = Signal()
 355         self.write_bram       = Signal()
 356         self.write_tag        = Signal()
 357         self.slow_valid       = Signal()
 358         self.wb               = WBMasterOut()
 359         self.reload_tag       = Signal(TAG_BITS)
 360         self.store_way        = Signal(WAY_BITS)
 361         self.store_row        = Signal(ROW_BITS)
 362         self.store_index      = Signal(INDEX_BITS)
 363         self.end_row_ix       = Signal(ROW_LINE_BITS)
 364         self.rows_valid       = RowPerLineValidArray()
 365         self.acks_pending     = Signal(3)
 366         self.inc_acks         = Signal()
 367         self.dec_acks         = Signal()
 368
 369         # Signals to complete (possibly with error)
 370         self.ls_valid         = Signal()
 371         self.ls_error         = Signal()
 372         self.mmu_done         = Signal()
 373         self.mmu_error        = Signal()
 374         self.cache_paradox    = Signal()
 375
 376         # Signal to complete a failed stcx.
 377         self.stcx_fail        = Signal()
 378
 379
# Reservation information
class Reservation(RecordObject):
    """Reservation record: a valid flag and a line-granular address."""
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        # 64-bit address with the LINE_OFF_BITS offset bits dropped
        self.addr  = Signal(64-LINE_OFF_BITS)
 386
 387
 388 class DTLBUpdate(Elaboratable):
 389     def __init__(self):
 390         self.tlbie    = Signal()
 391         self.tlbwe    = Signal()
 392         self.doall    = Signal()
 393         self.updated  = Signal()
 394         self.v_updated  = Signal()
 395         self.tlb_hit    = Signal()
 396         self.tlb_req_index = Signal(TLB_SET_BITS)
 397
 398         self.tlb_hit_way     = Signal(TLB_WAY_BITS)
 399         self.tlb_tag_way     = Signal(TLB_TAG_WAY_BITS)
 400         self.tlb_pte_way     = Signal(TLB_PTE_WAY_BITS)
 401         self.repl_way        = Signal(TLB_WAY_BITS)
 402         self.eatag           = Signal(TLB_EA_TAG_BITS)
 403         self.pte_data        = Signal(TLB_PTE_BITS)
 404
 405         self.dv = Signal(TLB_PTE_WAY_BITS)
 406
 407         self.tb_out = Signal(TLB_TAG_WAY_BITS)
 408         self.pb_out = Signal(TLB_NUM_WAYS)
 409         self.db_out = Signal(TLB_PTE_WAY_BITS)
 410
 411     def elaborate(self, platform):
 412         m = Module()
 413         comb = m.d.comb
 414         sync = m.d.sync
 415
 416         tagset   = Signal(TLB_TAG_WAY_BITS)
 417         pteset   = Signal(TLB_PTE_WAY_BITS)
 418
 419         tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
 420
 421         with m.If(self.tlbie & self.doall):
 422             pass # clear all back in parent
 423         with m.Elif(self.tlbie):
 424             with m.If(self.tlb_hit):
 425                 comb += db_out.eq(self.dv)
 426                 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
 427                 comb += self.v_updated.eq(1)
 428
 429         with m.Elif(self.tlbwe):
 430
 431             comb += tagset.eq(self.tlb_tag_way)
 432             comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
 433             comb += tb_out.eq(tagset)
 434
 435             comb += pteset.eq(self.tlb_pte_way)
 436             comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
 437             comb += pb_out.eq(pteset)
 438
 439             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 440
 441             comb += self.updated.eq(1)
 442             comb += self.v_updated.eq(1)
 443
 444         return m
 445
 446     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 447                        r0_valid, r1, cache_valid_bits, replace_way,
 448                        use_forward1_next, use_forward2_next,
 449                        req_hit_way, plru_victim, rc_ok, perm_attr,
 450                        valid_ra, perm_ok, access_ok, req_op, req_go,
 451                        tlb_pte_way,
 452                        tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
 453                        cancel_store, req_same_tag, r0_stall, early_req_row):
 454         """Cache request parsing and hit detection
 455         """
 456
 457 class DCachePendingHit(Elaboratable):
 458
 459     def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
 460                       cache_valid_idx, cache_tag_set,
 461                     req_addr,
 462                     hit_set):
 463
 464         self.go          = Signal()
 465         self.virt_mode   = Signal()
 466         self.is_hit      = Signal()
 467         self.tlb_hit     = Signal()
 468         self.hit_way     = Signal(WAY_BITS)
 469         self.rel_match   = Signal()
 470         self.req_index   = Signal(INDEX_BITS)
 471         self.reload_tag  = Signal(TAG_BITS)
 472
 473         self.tlb_hit_way = tlb_hit_way
 474         self.tlb_pte_way = tlb_pte_way
 475         self.tlb_valid_way = tlb_valid_way
 476         self.cache_valid_idx = cache_valid_idx
 477         self.cache_tag_set = cache_tag_set
 478         self.req_addr = req_addr
 479         self.hit_set = hit_set
 480
 481     def elaborate(self, platform):
 482         m = Module()
 483         comb = m.d.comb
 484         sync = m.d.sync
 485
 486         go = self.go
 487         virt_mode = self.virt_mode
 488         is_hit = self.is_hit
 489         tlb_pte_way = self.tlb_pte_way
 490         tlb_valid_way = self.tlb_valid_way
 491         cache_valid_idx = self.cache_valid_idx
 492         cache_tag_set = self.cache_tag_set
 493         req_addr = self.req_addr
 494         tlb_hit_way = self.tlb_hit_way
 495         tlb_hit = self.tlb_hit
 496         hit_set = self.hit_set
 497         hit_way = self.hit_way
 498         rel_match = self.rel_match
 499         req_index = self.req_index
 500         reload_tag = self.reload_tag
 501
 502         rel_matches = Array(Signal() for i in range(TLB_NUM_WAYS))
 503         hit_way_set = HitWaySet()
 504
 505         # Test if pending request is a hit on any way
 506         # In order to make timing in virtual mode,
 507         # when we are using the TLB, we compare each
 508         # way with each of the real addresses from each way of
 509         # the TLB, and then decide later which match to use.
 510
 511         with m.If(virt_mode):
 512             for j in range(TLB_NUM_WAYS):
 513                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
 514                 s_hit       = Signal()
 515                 s_pte       = Signal(TLB_PTE_BITS)
 516                 s_ra        = Signal(REAL_ADDR_BITS)
 517                 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
 518                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
 519                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 520                 comb += s_tag.eq(get_tag(s_ra))
 521
 522                 for i in range(NUM_WAYS):
 523                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
 524                     comb += is_tag_hit.eq(go & cache_valid_idx[i] &
 525                                   (read_tag(i, cache_tag_set) == s_tag)
 526                                   & tlb_valid_way[j])
 527                     with m.If(is_tag_hit):
 528                         comb += hit_way_set[j].eq(i)
 529                         comb += s_hit.eq(1)
 530                 comb += hit_set[j].eq(s_hit)
 531                 with m.If(s_tag == reload_tag):
 532                     comb += rel_matches[j].eq(1)
 533             with m.If(tlb_hit):
 534                 comb += is_hit.eq(hit_set[tlb_hit_way])
 535                 comb += hit_way.eq(hit_way_set[tlb_hit_way])
 536                 comb += rel_match.eq(rel_matches[tlb_hit_way])
 537         with m.Else():
 538             s_tag       = Signal(TAG_BITS)
 539             comb += s_tag.eq(get_tag(req_addr))
 540             for i in range(NUM_WAYS):
 541                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
 542                 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
 543                           (read_tag(i, cache_tag_set) == s_tag))
 544                 with m.If(is_tag_hit):
 545                     comb += hit_way.eq(i)
 546                     comb += is_hit.eq(1)
 547             with m.If(s_tag == reload_tag):
 548                 comb += rel_match.eq(1)
 549
 550         return m
 551
 552
 553 class DCache(Elaboratable):
 554     """Set associative dcache write-through
 555     TODO (in no specific order):
 556     * See list in icache.vhdl
 557     * Complete load misses on the cycle when WB data comes instead of
 558       at the end of line (this requires dealing with requests coming in
 559       while not idle...)
 560     """
    def __init__(self):
        """Create the external ports of the dcache."""
        # loadstore1 request (in) and response (out)
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        # MMU request (in) and response (out)
        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # wishbone master port
        self.wb_out    = WBMasterOut()
        self.wb_in     = WBSlaveOut()

        self.log_out   = Signal(20)
 574
 575     def stage_0(self, m, r0, r1, r0_full):
 576         """Latch the request in r0.req as long as we're not stalling
 577         """
 578         comb = m.d.comb
 579         sync = m.d.sync
 580         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 581
 582         r = RegStage0("stage0")
 583
 584         # TODO, this goes in unit tests and formal proofs
 585         with m.If(~(d_in.valid & m_in.valid)):
 586             #sync += Display("request collision loadstore vs MMU")
 587             pass
 588
 589         with m.If(m_in.valid):
 590             sync += r.req.valid.eq(1)
 591             sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
 592             sync += r.req.dcbz.eq(0)
 593             sync += r.req.nc.eq(0)
 594             sync += r.req.reserve.eq(0)
 595             sync += r.req.virt_mode.eq(1)
 596             sync += r.req.priv_mode.eq(1)
 597             sync += r.req.addr.eq(m_in.addr)
 598             sync += r.req.data.eq(m_in.pte)
 599             sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 600             sync += r.tlbie.eq(m_in.tlbie)
 601             sync += r.doall.eq(m_in.doall)
 602             sync += r.tlbld.eq(m_in.tlbld)
 603             sync += r.mmu_req.eq(1)
 604         with m.Else():
 605             sync += r.req.eq(d_in)
 606             sync += r.tlbie.eq(0)
 607             sync += r.doall.eq(0)
 608             sync += r.tlbld.eq(0)
 609             sync += r.mmu_req.eq(0)
 610             with m.If(~(r1.full & r0_full)):
 611                 sync += r0.eq(r)
 612                 sync += r0_full.eq(r.req.valid)
 613
 614     def tlb_read(self, m, r0_stall, tlb_valid_way,
 615                  tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
 616                  dtlb_tags, dtlb_ptes):
 617         """TLB
 618         Operates in the second cycle on the request latched in r0.req.
 619         TLB updates write the entry at the end of the second cycle.
 620         """
 621         comb = m.d.comb
 622         sync = m.d.sync
 623         m_in, d_in = self.m_in, self.d_in
 624
 625         index    = Signal(TLB_SET_BITS)
 626         addrbits = Signal(TLB_SET_BITS)
 627
 628         amin = TLB_LG_PGSZ
 629         amax = TLB_LG_PGSZ + TLB_SET_BITS
 630
 631         with m.If(m_in.valid):
 632             comb += addrbits.eq(m_in.addr[amin : amax])
 633         with m.Else():
 634             comb += addrbits.eq(d_in.addr[amin : amax])
 635         comb += index.eq(addrbits)
 636
 637         # If we have any op and the previous op isn't finished,
 638         # then keep the same output for next cycle.
 639         with m.If(~r0_stall):
 640             sync += tlb_valid_way.eq(dtlb_valid_bits[index])
 641             sync += tlb_tag_way.eq(dtlb_tags[index])
 642             sync += tlb_pte_way.eq(dtlb_ptes[index])
 643
 644     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
 645         """Generate TLB PLRUs
 646         """
 647         comb = m.d.comb
 648         sync = m.d.sync
 649
 650         if TLB_NUM_WAYS == 0:
 651             return
 652         for i in range(TLB_SET_SIZE):
 653             # TLB PLRU interface
 654             tlb_plru        = PLRU(WAY_BITS)
 655             setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 656             tlb_plru_acc_en = Signal()
 657
 658             comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
 659             comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 660             comb += tlb_plru.acc.eq(r1.tlb_hit_way)
 661             comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
 662
 663     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 664                    tlb_valid_way, tlb_tag_way, tlb_hit_way,
 665                    tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
 666
 667         comb = m.d.comb
 668         sync = m.d.sync
 669
 670         hitway = Signal(TLB_WAY_BITS)
 671         hit    = Signal()
 672         eatag  = Signal(TLB_EA_TAG_BITS)
 673
 674         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 675         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 676         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 677
 678         for i in range(TLB_NUM_WAYS):
 679             is_tag_hit = Signal()
 680             comb += is_tag_hit.eq(tlb_valid_way[i]
 681                                   & read_tlb_tag(i, tlb_tag_way) == eatag)
 682             with m.If(is_tag_hit):
 683                 comb += hitway.eq(i)
 684                 comb += hit.eq(1)
 685
 686         comb += tlb_hit.eq(hit & r0_valid)
 687         comb += tlb_hit_way.eq(hitway)
 688
 689         with m.If(tlb_hit):
 690             comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
 691         with m.Else():
 692             comb += pte.eq(0)
 693         comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
 694         with m.If(r0.req.virt_mode):
 695             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 696                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 697                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 698             comb += perm_attr.eq(extract_perm_attr(pte))
 699         with m.Else():
 700             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 701                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 702
 703             comb += perm_attr.reference.eq(1)
 704             comb += perm_attr.changed.eq(1)
 705             comb += perm_attr.nocache.eq(0)
 706             comb += perm_attr.priv.eq(1)
 707             comb += perm_attr.rd_perm.eq(1)
 708             comb += perm_attr.wr_perm.eq(1)
 709
 710     def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
 711                     tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
 712                     dtlb_tags, tlb_pte_way, dtlb_ptes):
 713
 714         comb = m.d.comb
 715         sync = m.d.sync
 716
 717         tlbie    = Signal()
 718         tlbwe    = Signal()
 719
 720         comb += tlbie.eq(r0_valid & r0.tlbie)
 721         comb += tlbwe.eq(r0_valid & r0.tlbld)
 722
 723         m.submodules.tlb_update = d = DTLBUpdate()
 724         with m.If(tlbie & r0.doall):
 725             # clear all valid bits at once
 726             for i in range(TLB_SET_SIZE):
 727                 sync += dtlb_valid_bits[i].eq(0)
 728         with m.If(d.updated):
 729             sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
 730             sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
 731         with m.If(d.v_updated):
 732             sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
 733
 734         comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
 735
 736         comb += d.tlbie.eq(tlbie)
 737         comb += d.tlbwe.eq(tlbwe)
 738         comb += d.doall.eq(r0.doall)
 739         comb += d.tlb_hit.eq(tlb_hit)
 740         comb += d.tlb_hit_way.eq(tlb_hit_way)
 741         comb += d.tlb_tag_way.eq(tlb_tag_way)
 742         comb += d.tlb_pte_way.eq(tlb_pte_way)
 743         comb += d.tlb_req_index.eq(tlb_req_index)
 744
 745         with m.If(tlb_hit):
 746             comb += d.repl_way.eq(tlb_hit_way)
 747         with m.Else():
 748             comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
 749         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 750         comb += d.pte_data.eq(r0.req.data)
 751
 752     def maybe_plrus(self, m, r1, plru_victim):
 753         """Generate PLRUs
 754         """
 755         comb = m.d.comb
 756         sync = m.d.sync
 757
 758         if TLB_NUM_WAYS == 0:
 759             return
 760
 761         for i in range(NUM_LINES):
 762             # PLRU interface
 763             plru        = PLRU(WAY_BITS)
 764             setattr(m.submodules, "plru%d" % i, plru)
 765             plru_acc_en = Signal()
 766
 767             comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
 768             comb += plru.acc_en.eq(plru_acc_en)
 769             comb += plru.acc.eq(r1.hit_way)
 770             comb += plru_victim[i].eq(plru.lru_o)
 771
 772     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 773         """Cache tag RAM read port
 774         """
 775         comb = m.d.comb
 776         sync = m.d.sync
 777         m_in, d_in = self.m_in, self.d_in
 778
 779         index = Signal(INDEX_BITS)
 780
 781         with m.If(r0_stall):
 782             comb += index.eq(req_index)
 783         with m.Elif(m_in.valid):
 784             comb += index.eq(get_index(m_in.addr))
 785         with m.Else():
 786             comb += index.eq(get_index(d_in.addr))
 787         sync += cache_tag_set.eq(cache_tags[index])
 788
 789     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 790                        r0_valid, r1, cache_valid_bits, replace_way,
 791                        use_forward1_next, use_forward2_next,
 792                        req_hit_way, plru_victim, rc_ok, perm_attr,
 793                        valid_ra, perm_ok, access_ok, req_op, req_go,
 794                        tlb_pte_way,
 795                        tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
 796                        cancel_store, req_same_tag, r0_stall, early_req_row):
 797         """Cache request parsing and hit detection
 798         """
 799
 800         comb = m.d.comb
 801         sync = m.d.sync
 802         m_in, d_in = self.m_in, self.d_in
 803
 804         is_hit      = Signal()
 805         hit_way     = Signal(WAY_BITS)
 806         op          = Signal(Op)
 807         opsel       = Signal(3)
 808         go          = Signal()
 809         nc          = Signal()
 810         hit_set     = Array(Signal() for i in range(TLB_NUM_WAYS))
 811         cache_valid_idx = Signal(INDEX_BITS)
 812
 813         # Extract line, row and tag from request
 814         comb += req_index.eq(get_index(r0.req.addr))
 815         comb += req_row.eq(get_row(r0.req.addr))
 816         comb += req_tag.eq(get_tag(ra))
 817
 818         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 819         comb += cache_valid_idx.eq(cache_valid_bits[req_index])
 820
 821         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
 822                                 tlb_valid_way, tlb_hit_way,
 823                                 cache_valid_idx, cache_tag_set,
 824                                 r0.req.addr,
 825                                 hit_set)
 826
 827         comb += dc.tlb_hit.eq(tlb_hit)
 828         comb += dc.reload_tag.eq(r1.reload_tag)
 829         comb += dc.virt_mode.eq(r0.req.virt_mode)
 830         comb += dc.go.eq(go)
 831         comb += dc.req_index.eq(req_index)
 832         comb += is_hit.eq(dc.is_hit)
 833         comb += hit_way.eq(dc.hit_way)
 834         comb += req_same_tag.eq(dc.rel_match)
 835
 836         # See if the request matches the line currently being reloaded
 837         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 838                   (req_index == r1.store_index) & req_same_tag):
 839             # For a store, consider this a hit even if the row isn't
 840             # valid since it will be by the time we perform the store.
 841             # For a load, check the appropriate row valid bit.
 842             valid = r1.rows_valid[req_row % ROW_PER_LINE]
 843             comb += is_hit.eq(~r0.req.load | valid)
 844             comb += hit_way.eq(replace_way)
 845
 846         # Whether to use forwarded data for a load or not
 847         comb += use_forward1_next.eq(0)
 848         with m.If((get_row(r1.req.real_addr) == req_row) &
 849                   (r1.req.hit_way == hit_way)):
 850             # Only need to consider r1.write_bram here, since if we
 851             # are writing refill data here, then we don't have a
 852             # cache hit this cycle on the line being refilled.
 853             # (There is the possibility that the load following the
 854             # load miss that started the refill could be to the old
 855             # contents of the victim line, since it is a couple of
 856             # cycles after the refill starts before we see the updated
 857             # cache tag. In that case we don't use the bypass.)
 858             comb += use_forward1_next.eq(r1.write_bram)
 859         comb += use_forward2_next.eq(0)
 860         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 861             comb += use_forward2_next.eq(r1.forward_valid1)
 862
 863         # The way that matched on a hit
 864         comb += req_hit_way.eq(hit_way)
 865
 866         # The way to replace on a miss
 867         with m.If(r1.write_tag):
 868             comb += replace_way.eq(plru_victim[r1.store_index])
 869         with m.Else():
 870             comb += replace_way.eq(r1.store_way)
 871
 872         # work out whether we have permission for this access
 873         # NB we don't yet implement AMR, thus no KUAP
 874         comb += rc_ok.eq(perm_attr.reference
 875                          & (r0.req.load | perm_attr.changed)
 876                 )
 877         comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv) &
 878                            (perm_attr.wr_perm |
 879                               (r0.req.load & perm_attr.rd_perm)))
 880         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
 881         # Combine the request and cache hit status to decide what
 882         # operation needs to be done
 883         comb += nc.eq(r0.req.nc | perm_attr.nocache)
 884         comb += op.eq(Op.OP_NONE)
 885         with m.If(go):
 886             with m.If(~access_ok):
 887                 comb += op.eq(Op.OP_BAD)
 888             with m.Elif(cancel_store):
 889                 comb += op.eq(Op.OP_STCX_FAIL)
 890             with m.Else():
 891                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
 892                 with m.Switch(opsel):
 893                     with m.Case(0b101):
 894                         comb += op.eq(Op.OP_LOAD_HIT)
 895                     with m.Case(0b100):
 896                         comb += op.eq(Op.OP_LOAD_MISS)
 897                     with m.Case(0b110):
 898                         comb += op.eq(Op.OP_LOAD_NC)
 899                     with m.Case(0b001):
 900                         comb += op.eq(Op.OP_STORE_HIT)
 901                     with m.Case(0b000):
 902                         comb += op.eq(Op.OP_STORE_MISS)
 903                     with m.Case(0b010):
 904                         comb += op.eq(Op.OP_STORE_MISS)
 905                     with m.Case(0b011):
 906                         comb += op.eq(Op.OP_BAD)
 907                     with m.Case(0b111):
 908                         comb += op.eq(Op.OP_BAD)
 909                     with m.Default():
 910                         comb += op.eq(Op.OP_NONE)
 911         comb += req_op.eq(op)
 912         comb += req_go.eq(go)
 913
 914         # Version of the row number that is valid one cycle earlier
 915         # in the cases where we need to read the cache data BRAM.
 916         # If we're stalling then we need to keep reading the last
 917         # row requested.
 918         with m.If(~r0_stall):
 919             with m.If(m_in.valid):
 920                 comb += early_req_row.eq(get_row(m_in.addr))
 921             with m.Else():
 922                 comb += early_req_row.eq(get_row(d_in.addr))
 923         with m.Else():
 924             comb += early_req_row.eq(req_row)
 925
 926     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 927                          r0_valid, r0, reservation):
 928         """Handle load-with-reservation and store-conditional instructions
 929         """
 930         comb = m.d.comb
 931         sync = m.d.sync
 932
 933         with m.If(r0_valid & r0.req.reserve):
 934
 935             # XXX generate alignment interrupt if address
 936             # is not aligned XXX or if r0.req.nc = '1'
 937             with m.If(r0.req.load):
 938                 comb += set_rsrv.eq(1) # load with reservation
 939             with m.Else():
 940                 comb += clear_rsrv.eq(1) # store conditional
 941                 with m.If(~reservation.valid | r0.req.addr[LINE_OFF_BITS:64]):
 942                     comb += cancel_store.eq(1)
 943
 944     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
 945                         reservation, r0):
 946
 947         comb = m.d.comb
 948         sync = m.d.sync
 949
 950         with m.If(r0_valid & access_ok):
 951             with m.If(clear_rsrv):
 952                 sync += reservation.valid.eq(0)
 953             with m.Elif(set_rsrv):
 954                 sync += reservation.valid.eq(1)
 955                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 956
 957     def writeback_control(self, m, r1, cache_out):
 958         """Return data for loads & completion control logic
 959         """
 960         comb = m.d.comb
 961         sync = m.d.sync
 962         d_out, m_out = self.d_out, self.m_out
 963
 964         data_out = Signal(64)
 965         data_fwd = Signal(64)
 966
 967         # Use the bypass if are reading the row that was
 968         # written 1 or 2 cycles ago, including for the
 969         # slow_valid = 1 case (i.e. completing a load
 970         # miss or a non-cacheable load).
 971         with m.If(r1.use_forward1):
 972             comb += data_fwd.eq(r1.forward_data1)
 973         with m.Else():
 974             comb += data_fwd.eq(r1.forward_data2)
 975
 976         comb += data_out.eq(cache_out[r1.hit_way])
 977
 978         for i in range(8):
 979             with m.If(r1.forward_sel[i]):
 980                 dsel = data_fwd.word_select(i, 8)
 981                 comb += data_out.word_select(i, 8).eq(dsel)
 982
 983         comb += d_out.valid.eq(r1.ls_valid)
 984         comb += d_out.data.eq(data_out)
 985         comb += d_out.store_done.eq(~r1.stcx_fail)
 986         comb += d_out.error.eq(r1.ls_error)
 987         comb += d_out.cache_paradox.eq(r1.cache_paradox)
 988
 989         # Outputs to MMU
 990         comb += m_out.done.eq(r1.mmu_done)
 991         comb += m_out.err.eq(r1.mmu_error)
 992         comb += m_out.data.eq(data_out)
 993
 994         # We have a valid load or store hit or we just completed
 995         # a slow op such as a load miss, a NC load or a store
 996         #
 997         # Note: the load hit is delayed by one cycle. However it
 998         # can still not collide with r.slow_valid (well unless I
 999         # miscalculated) because slow_valid can only be set on a
1000         # subsequent request and not on its first cycle (the state
1001         # machine must have advanced), which makes slow_valid
1002         # at least 2 cycles from the previous hit_load_valid.
1003
1004         # Sanity: Only one of these must be set in any given cycle
1005
1006         if False: # TODO: need Display to get this to work
1007             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1008             "unexpected slow_valid collision with stcx_fail"
1009
1010             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1011              "unexpected hit_load_delayed collision with slow_valid"
1012
1013         with m.If(~r1.mmu_req):
1014             # Request came from loadstore1...
1015             # Load hit case is the standard path
1016             with m.If(r1.hit_load_valid):
1017                 sync += Display("completing load hit data=%x", data_out)
1018
1019             # error cases complete without stalling
1020             with m.If(r1.ls_error):
1021                 sync += Display("completing ld/st with error")
1022
1023             # Slow ops (load miss, NC, stores)
1024             with m.If(r1.slow_valid):
1025                 sync += Display("completing store or load miss data=%x",
1026                                 data_out)
1027
1028         with m.Else():
1029             # Request came from MMU
1030             with m.If(r1.hit_load_valid):
1031                 sync += Display("completing load hit to MMU, data=%x",
1032                                 m_out.data)
1033             # error cases complete without stalling
1034             with m.If(r1.mmu_error):
1035                 sync += Display("combpleting MMU ld with error")
1036
1037             # Slow ops (i.e. load miss)
1038             with m.If(r1.slow_valid):
1039                 sync += Display("completing MMU load miss, data=%x",
1040                                 m_out.data)
1041
1042     def rams(self, m, r1, early_req_row, cache_out, replace_way):
1043         """rams
1044         Generate a cache RAM for each way. This handles the normal
1045         reads, writes from reloads and the special store-hit update
1046         path as well.
1047
1048         Note: the BRAMs have an extra read buffer, meaning the output
1049         is pipelined an extra cycle. This differs from the
1050         icache. The writeback logic needs to take that into
1051         account by using 1-cycle delayed signals for load hits.
1052         """
1053         comb = m.d.comb
1054         wb_in = self.wb_in
1055
1056         for i in range(NUM_WAYS):
1057             do_read  = Signal()
1058             rd_addr  = Signal(ROW_BITS)
1059             do_write = Signal()
1060             wr_addr  = Signal(ROW_BITS)
1061             wr_data  = Signal(WB_DATA_BITS)
1062             wr_sel   = Signal(ROW_SIZE)
1063             wr_sel_m = Signal(ROW_SIZE)
1064             _d_out   = Signal(WB_DATA_BITS)
1065
1066             way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
1067             setattr(m.submodules, "cacheram_%d" % i, way)
1068
1069             comb += way.rd_en.eq(do_read)
1070             comb += way.rd_addr.eq(rd_addr)
1071             comb += _d_out.eq(way.rd_data_o)
1072             comb += way.wr_sel.eq(wr_sel_m)
1073             comb += way.wr_addr.eq(wr_addr)
1074             comb += way.wr_data.eq(wr_data)
1075
1076             # Cache hit reads
1077             comb += do_read.eq(1)
1078             comb += rd_addr.eq(early_req_row)
1079             comb += cache_out[i].eq(_d_out)
1080
1081             # Write mux:
1082             #
1083             # Defaults to wishbone read responses (cache refill)
1084             #
1085             # For timing, the mux on wr_data/sel/addr is not
1086             # dependent on anything other than the current state.
1087
1088             with m.If(r1.write_bram):
1089                 # Write store data to BRAM.  This happens one
1090                 # cycle after the store is in r0.
1091                 comb += wr_data.eq(r1.req.data)
1092                 comb += wr_sel.eq(r1.req.byte_sel)
1093                 comb += wr_addr.eq(get_row(r1.req.real_addr))
1094
1095                 with m.If(i == r1.req.hit_way):
1096                     comb += do_write.eq(1)
1097             with m.Else():
1098                 # Otherwise, we might be doing a reload or a DCBZ
1099                 with m.If(r1.dcbz):
1100                     comb += wr_data.eq(0)
1101                 with m.Else():
1102                     comb += wr_data.eq(wb_in.dat)
1103                 comb += wr_addr.eq(r1.store_row)
1104                 comb += wr_sel.eq(~0) # all 1s
1105
1106             with m.If((r1.state == State.RELOAD_WAIT_ACK)
1107                       & wb_in.ack & (replace_way == i)):
1108                 comb += do_write.eq(1)
1109
1110             # Mask write selects with do_write since BRAM
1111             # doesn't have a global write-enable
1112             with m.If(do_write):
1113                 comb += wr_sel_m.eq(wr_sel)
1114
1115     # Cache hit synchronous machine for the easy case.
1116     # This handles load hits.
1117     # It also handles error cases (TLB miss, cache paradox)
1118     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1119                         req_hit_way, req_index, req_tag, access_ok,
1120                         tlb_hit, tlb_hit_way, tlb_req_index):
1121
1122         comb = m.d.comb
1123         sync = m.d.sync
1124
1125         with m.If(req_op != Op.OP_NONE):
1126             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1127                     req_op, r0.req.addr, r0.req.nc,
1128                     req_index, req_tag, req_hit_way)
1129
1130         with m.If(r0_valid):
1131             sync += r1.mmu_req.eq(r0.mmu_req)
1132
1133         # Fast path for load/store hits.
1134         # Set signals for the writeback controls.
1135         sync += r1.hit_way.eq(req_hit_way)
1136         sync += r1.hit_index.eq(req_index)
1137
1138         with m.If(req_op == Op.OP_LOAD_HIT):
1139             sync += r1.hit_load_valid.eq(1)
1140         with m.Else():
1141             sync += r1.hit_load_valid.eq(0)
1142
1143         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1144             sync += r1.cache_hit.eq(1)
1145         with m.Else():
1146             sync += r1.cache_hit.eq(0)
1147
1148         with m.If(req_op == Op.OP_BAD):
1149             # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1150             #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
1151             sync += r1.ls_error.eq(~r0.mmu_req)
1152             sync += r1.mmu_error.eq(r0.mmu_req)
1153             sync += r1.cache_paradox.eq(access_ok)
1154
1155             with m.Else():
1156                 sync += r1.ls_error.eq(0)
1157                 sync += r1.mmu_error.eq(0)
1158                 sync += r1.cache_paradox.eq(0)
1159
1160         with m.If(req_op == Op.OP_STCX_FAIL):
1161             r1.stcx_fail.eq(1)
1162         with m.Else():
1163             sync += r1.stcx_fail.eq(0)
1164
1165         # Record TLB hit information for updating TLB PLRU
1166         sync += r1.tlb_hit.eq(tlb_hit)
1167         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1168         sync += r1.tlb_hit_index.eq(tlb_req_index)
1169
    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cacheable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone request generation is done here.
    # This machine operates at stage 1.
1178     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1179                     cache_valid_bits, r0, replace_way,
1180                     req_hit_way, req_same_tag,
1181                     r0_valid, req_op, cache_tag, req_go, ra):
1182
1183         comb = m.d.comb
1184         sync = m.d.sync
1185         wb_in = self.wb_in
1186
1187         req         = MemAccessRequest("mreq_ds")
1188         acks        = Signal(3)
1189         adjust_acks = Signal(3)
1190
1191         sync += r1.use_forward1.eq(use_forward1_next)
1192         sync += r1.forward_sel.eq(0)
1193
1194         with m.If(use_forward1_next):
1195             sync += r1.forward_sel.eq(r1.req.byte_sel)
1196         with m.Elif(use_forward2_next):
1197             sync += r1.forward_sel.eq(r1.forward_sel1)
1198
1199         sync += r1.forward_data2.eq(r1.forward_data1)
1200         with m.If(r1.write_bram):
1201             sync += r1.forward_data1.eq(r1.req.data)
1202             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1203             sync += r1.forward_way1.eq(r1.req.hit_way)
1204             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1205             sync += r1.forward_valid1.eq(1)
1206         with m.Else():
1207             with m.If(r1.dcbz):
1208                 sync += r1.forward_data1.eq(0)
1209             with m.Else():
1210                 sync += r1.forward_data1.eq(wb_in.dat)
1211             sync += r1.forward_sel1.eq(~0) # all 1s
1212             sync += r1.forward_way1.eq(replace_way)
1213             sync += r1.forward_row1.eq(r1.store_row)
1214             sync += r1.forward_valid1.eq(0)
1215
1216         # One cycle pulses reset
1217         sync += r1.slow_valid.eq(0)
1218         sync += r1.write_bram.eq(0)
1219         sync += r1.inc_acks.eq(0)
1220         sync += r1.dec_acks.eq(0)
1221
1222         sync += r1.ls_valid.eq(0)
1223         # complete tlbies and TLB loads in the third cycle
1224         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1225
1226         with m.If((req_op == Op.OP_LOAD_HIT)
1227                   | (req_op == Op.OP_STCX_FAIL)):
1228             with m.If(~r0.mmu_req):
1229                 sync += r1.ls_valid.eq(1)
1230             with m.Else():
1231                 sync += r1.mmu_done.eq(1)
1232
1233         with m.If(r1.write_tag):
1234             # Store new tag in selected way
1235             for i in range(NUM_WAYS):
1236                 with m.If(i == replace_way):
1237                     ct = Signal(TAG_RAM_WIDTH)
1238                     comb += ct.eq(cache_tag[r1.store_index])
1239                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1240                     sync += cache_tag[r1.store_index].eq(ct)
1241             sync += r1.store_way.eq(replace_way)
1242             sync += r1.write_tag.eq(0)
1243
1244         # Take request from r1.req if there is one there,
1245         # else from req_op, ra, etc.
1246         with m.If(r1.full):
1247             comb += req.eq(r1.req)
1248         with m.Else():
1249             comb += req.op.eq(req_op)
1250             comb += req.valid.eq(req_go)
1251             comb += req.mmu_req.eq(r0.mmu_req)
1252             comb += req.dcbz.eq(r0.req.dcbz)
1253             comb += req.real_addr.eq(ra)
1254
1255             with m.If(~r0.req.dcbz):
1256                 comb += req.data.eq(r0.req.data)
1257             with m.Else():
1258                 comb += req.data.eq(0)
1259
1260             # Select all bytes for dcbz
1261             # and for cacheable loads
1262             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1263                 comb += req.byte_sel.eq(~0) # all 1s
1264             with m.Else():
1265                 comb += req.byte_sel.eq(r0.req.byte_sel)
1266             comb += req.hit_way.eq(req_hit_way)
1267             comb += req.same_tag.eq(req_same_tag)
1268
1269             # Store the incoming request from r0,
1270             # if it is a slow request
1271             # Note that r1.full = 1 implies req_op = OP_NONE
1272             with m.If((req_op == Op.OP_LOAD_MISS)
1273                       | (req_op == Op.OP_LOAD_NC)
1274                       | (req_op == Op.OP_STORE_MISS)
1275                       | (req_op == Op.OP_STORE_HIT)):
1276                 sync += r1.req.eq(req)
1277                 sync += r1.full.eq(1)
1278
1279         # Main state machine
1280         with m.Switch(r1.state):
1281
1282             with m.Case(State.IDLE):
1283                 # XXX check 'left downto.  probably means len(r1.wb.adr)
1284                 #                     r1.wb.adr <= req.real_addr(
1285                 #                                   r1.wb.adr'left downto 0
1286                 #                                  );
1287                 sync += r1.wb.adr.eq(req.real_addr)
1288                 sync += r1.wb.sel.eq(req.byte_sel)
1289                 sync += r1.wb.dat.eq(req.data)
1290                 sync += r1.dcbz.eq(req.dcbz)
1291
1292                 # Keep track of our index and way
1293                 # for subsequent stores.
1294                 sync += r1.store_index.eq(get_index(req.real_addr))
1295                 sync += r1.store_row.eq(get_row(req.real_addr))
1296                 sync += r1.end_row_ix.eq(
1297                          get_row_of_line(get_row(req.real_addr))
1298                         )
1299                 sync += r1.reload_tag.eq(get_tag(req.real_addr))
1300                 sync += r1.req.same_tag.eq(1)
1301
1302                 with m.If(req.op == Op.OP_STORE_HIT):
1303                     sync += r1.store_way.eq(req.hit_way)
1304
1305                 # Reset per-row valid bits,
1306                 # ready for handling OP_LOAD_MISS
1307                 for i in range(ROW_PER_LINE):
1308                     sync += r1.rows_valid[i].eq(0)
1309
1310                 with m.If(req_op != Op.OP_NONE):
1311                     sync += Display("cache op %d", req.op)
1312
1313                 with m.Switch(req.op):
1314                     with m.Case(Op.OP_LOAD_HIT):
1315                         # stay in IDLE state
1316                         pass
1317
1318                     with m.Case(Op.OP_LOAD_MISS):
1319                         #Display(f"cache miss real addr:" \
1320                         #      f"{req_real_addr}" \
1321                         #      f" idx:{get_index(req_real_addr)}" \
1322                         #      f" tag:{get_tag(req.real_addr)}")
1323                         pass
1324
1325                         # Start the wishbone cycle
1326                         sync += r1.wb.we.eq(0)
1327                         sync += r1.wb.cyc.eq(1)
1328                         sync += r1.wb.stb.eq(1)
1329
1330                         # Track that we had one request sent
1331                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1332                         sync += r1.write_tag.eq(1)
1333
1334                     with m.Case(Op.OP_LOAD_NC):
1335                         sync += r1.wb.cyc.eq(1)
1336                         sync += r1.wb.stb.eq(1)
1337                         sync += r1.wb.we.eq(0)
1338                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1339
1340                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1341                         with m.If(~req.dcbz):
1342                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1343                             sync += r1.acks_pending.eq(1)
1344                             sync += r1.full.eq(0)
1345                             sync += r1.slow_valid.eq(1)
1346
1347                             with m.If(~req.mmu_req):
1348                                 sync += r1.ls_valid.eq(1)
1349                             with m.Else():
1350                                 sync += r1.mmu_done.eq(1)
1351
1352                             with m.If(req.op == Op.OP_STORE_HIT):
1353                                 sync += r1.write_bram.eq(1)
1354                         with m.Else():
1355                             # dcbz is handled much like a load miss except
1356                             # that we are writing to memory instead of reading
1357                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1358
1359                             with m.If(req.op == Op.OP_STORE_MISS):
1360                                 sync += r1.write_tag.eq(1)
1361
1362                         sync += r1.wb.we.eq(1)
1363                         sync += r1.wb.cyc.eq(1)
1364                         sync += r1.wb.stb.eq(1)
1365
1366                     # OP_NONE and OP_BAD do nothing
1367                     # OP_BAD & OP_STCX_FAIL were
1368                     # handled above already
1369                     with m.Case(Op.OP_NONE):
1370                         pass
1371                     with m.Case(Op.OP_BAD):
1372                         pass
1373                     with m.Case(Op.OP_STCX_FAIL):
1374                         pass
1375
1376             with m.Case(State.RELOAD_WAIT_ACK):
1377                 ld_stbs_done = Signal()
1378                 # Requests are all sent if stb is 0
1379                 comb += ld_stbs_done.eq(~r1.wb.stb)
1380
1381                 with m.If((~wb_in.stall) & r1.wb.stb):
1382                     # That was the last word?
1383                     # We are done sending.
1384                     # Clear stb and set ld_stbs_done
1385                     # so we can handle an eventual
1386                     # last ack on the same cycle.
1387                     with m.If(is_last_row_addr(r1.wb.adr, r1.end_row_ix)):
1388                         sync += r1.wb.stb.eq(0)
1389                         comb += ld_stbs_done.eq(1)
1390
1391                     # Calculate the next row address in the current cache line
1392                     rarange = r1.wb.adr[ROW_OFF_BITS : LINE_OFF_BITS]
1393                     sync += rarange.eq(rarange + 1)
1394
1395                 # Incoming acks processing
1396                 sync += r1.forward_valid1.eq(wb_in.ack)
1397                 with m.If(wb_in.ack):
1398                     # XXX needs an Array bit-accessor here
1399                     sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
1400
1401                     # If this is the data we were looking for,
1402                     # we can complete the request next cycle.
1403                     # Compare the whole address in case the
1404                     # request in r1.req is not the one that
1405                     # started this refill.
1406                     with m.If(r1.full & r1.req.same_tag &
1407                               ((r1.dcbz & r1.req.dcbz) |
1408                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1409                                 (r1.store_row == get_row(r1.req.real_addr))):
1410                         sync += r1.full.eq(0)
1411                         sync += r1.slow_valid.eq(1)
1412                         with m.If(~r1.mmu_req):
1413                             sync += r1.ls_valid.eq(1)
1414                         with m.Else():
1415                             sync += r1.mmu_done.eq(1)
1416                         sync += r1.forward_sel.eq(~0) # all 1s
1417                         sync += r1.use_forward1.eq(1)
1418
1419                     # Check for completion
1420                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1421                                                       r1.end_row_ix)):
1422                         # Complete wishbone cycle
1423                         sync += r1.wb.cyc.eq(0)
1424
1425                         # Cache line is now valid
1426                         cv = Signal(INDEX_BITS)
1427                         sync += cv.eq(cache_valid_bits[r1.store_index])
1428                         sync += cv.bit_select(r1.store_way, 1).eq(1)
1429                         sync += r1.state.eq(State.IDLE)
1430
1431                     # Increment store row counter
1432                     sync += r1.store_row.eq(next_row(r1.store_row))
1433
1434             with m.Case(State.STORE_WAIT_ACK):
1435                 st_stbs_done = Signal()
1436                 comb += st_stbs_done.eq(~r1.wb.stb)
1437                 comb += acks.eq(r1.acks_pending)
1438
1439                 with m.If(r1.inc_acks != r1.dec_acks):
1440                     with m.If(r1.inc_acks):
1441                         comb += adjust_acks.eq(acks + 1)
1442                     with m.Else():
1443                         comb += adjust_acks.eq(acks - 1)
1444                 with m.Else():
1445                     comb += adjust_acks.eq(acks)
1446
1447                 sync += r1.acks_pending.eq(adjust_acks)
1448
1449                 # Clear stb when slave accepted request
1450                 with m.If(~wb_in.stall):
1451                     # See if there is another store waiting
1452                     # to be done which is in the same real page.
1453                     with m.If(req.valid):
1454                         ra = req.real_addr[0:SET_SIZE_BITS]
1455                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
1456                         sync += r1.wb.dat.eq(req.data)
1457                         sync += r1.wb.sel.eq(req.byte_sel)
1458
1459                     with m.Elif((adjust_acks < 7) & req.same_tag &
1460                                 ((req.op == Op.OP_STORE_MISS)
1461                                  | (req.op == Op.OP_STORE_HIT))):
1462                         sync += r1.wb.stb.eq(1)
1463                         comb += st_stbs_done.eq(0)
1464
1465                         with m.If(req.op == Op.OP_STORE_HIT):
1466                             sync += r1.write_bram.eq(1)
1467                         sync += r1.full.eq(0)
1468                         sync += r1.slow_valid.eq(1)
1469
1470                         # Store requests never come from the MMU
1471                         sync += r1.ls_valid.eq(1)
1472                         comb += st_stbs_done.eq(0)
1473                         sync += r1.inc_acks.eq(1)
1474                     with m.Else():
1475                         sync += r1.wb.stb.eq(0)
1476                         comb += st_stbs_done.eq(1)
1477
1478                 # Got ack ? See if complete.
1479                 with m.If(wb_in.ack):
1480                     with m.If(st_stbs_done & (adjust_acks == 1)):
1481                         sync += r1.state.eq(State.IDLE)
1482                         sync += r1.wb.cyc.eq(0)
1483                         sync += r1.wb.stb.eq(0)
1484                     sync += r1.dec_acks.eq(1)
1485
1486             with m.Case(State.NC_LOAD_WAIT_ACK):
1487                 # Clear stb when slave accepted request
1488                 with m.If(~wb_in.stall):
1489                     sync += r1.wb.stb.eq(0)
1490
1491                 # Got ack ? complete.
1492                 with m.If(wb_in.ack):
1493                     sync += r1.state.eq(State.IDLE)
1494                     sync += r1.full.eq(0)
1495                     sync += r1.slow_valid.eq(1)
1496
1497                     with m.If(~r1.mmu_req):
1498                         sync += r1.ls_valid.eq(1)
1499                     with m.Else():
1500                         sync += r1.mmu_done.eq(1)
1501
1502                     sync += r1.forward_sel.eq(~0) # all 1s
1503                     sync += r1.use_forward1.eq(1)
1504                     sync += r1.wb.cyc.eq(0)
1505                     sync += r1.wb.stb.eq(0)
1506
1507     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):
1508
1509         sync = m.d.sync
1510         d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1511
1512         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1513                                stall_out, req_op[:3], d_out.valid, d_out.error,
1514                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1515                                r1.wb.adr[3:6]))
1516
    def elaborate(self, platform):
        """Assemble the DCache as an nmigen Module and return it.

        Creates the storage arrays (cache tags / valid bits, DTLB tags,
        PTEs and valid bits), the stage-0 and stage-1 registers (r0, r1),
        the reservation record and the shared request / TLB signals, then
        hands them to each helper method in turn so that every helper
        adds its own logic to the same Module.

        ``platform`` is not referenced by this method.

        NOTE(review): nmigen normally derives Signal names from the local
        variable they are assigned to, so renaming the locals below would
        presumably change names in generated RTL / VCD output - confirm
        before renaming anything.
        """

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags       = CacheTagArray()
        cache_tag_set    = Signal(TAG_RAM_WIDTH)
        cache_valid_bits = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # DTLB storage: valid bits, tags and PTEs
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags       = TLBTagsArray()
        dtlb_ptes       = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        # stage-0 register and its "occupied" flag
        r0      = RegStage0("r0")
        r0_full = Signal()

        # stage-1 register (also holds the wishbone request, see r1.wb below)
        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        # NOTE(review): req_data is created here but is not passed to any
        # of the helper methods called below
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row     = Signal(ROW_BITS)

        # reservation control signals
        cancel_store      = Signal()
        set_rsrv          = Signal()
        clear_rsrv        = Signal()

        # stage-0 valid / stall, derived from r0_full and r1.full below
        r0_valid          = Signal()
        r0_stall          = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out         = CacheRamOut()

        plru_victim       = PLRUOut()
        replace_way       = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        # NOTE(review): bus_sel is created here but is not passed to any
        # of the helper methods called below
        bus_sel           = Signal(8)

        # TLB signals
        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = Signal()
        tlb_hit_way   = Signal(TLB_WAY_BITS)
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request:
        # r0 stalls while both r0 and r1 are full, and is valid when r0
        # is full and r1 is not.  The stall is exported on stall_out.
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += self.wb_out.eq(r1.wb)

        # call sub-functions putting everything together, using shared
        # signals established above
        # NOTE(review): the call order is left exactly as written; in
        # nmigen a later assignment to the same signal takes priority, so
        # confirm there is no overlap before rearranging these calls.
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                           r0_valid, r1, cache_valid_bits, replace_way,
                           use_forward1_next, use_forward2_next,
                           req_hit_way, plru_victim, rc_ok, perm_attr,
                           valid_ra, perm_ok, access_ok, req_op, req_go,
                           tlb_pte_way,
                           tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                           cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                           r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                           reservation, r0)
        self.writeback_control(m, r1, cache_out)
        self.rams(m, r1, early_req_row, cache_out, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                    cache_valid_bits, r0, replace_way,
                    req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        # Logging is currently disabled.
        # NOTE(review): the call below passes `stall_out`, which is not a
        # local in this method (the port is self.stall_out), so it would
        # need adjusting before being re-enabled.
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m
1646
def dcache_load(dut, addr, nc=0):
    """Simulation helper: issue one load to the dcache and return the data.

    This is a generator intended to be driven by the simulator (use it
    with ``yield from``).  The request fields load/nc/addr/valid are set
    on ``dut.d_in``, held for one clock, then ``valid`` is dropped.
    After one further clock, ``dut.d_out.valid`` is polled once per
    clock until it reads non-zero.

    * dut  - the DCache under test
    * addr - address placed on d_in.addr
    * nc   - value placed on d_in.nc (defaults to 0)

    Returns the value read from ``dut.d_out.data``.
    """
    request = dut.d_in

    # present the request
    for field, value in (("load", 1), ("nc", nc), ("addr", addr),
                         ("valid", 1)):
        yield getattr(request, field).eq(value)
    yield

    # request is only valid for a single clock
    yield request.valid.eq(0)
    yield

    # poll for the response, one clock per unsuccessful check
    while True:
        responded = yield dut.d_out.valid
        if responded:
            break
        yield

    result = yield dut.d_out.data
    return result
1659
1660
def dcache_store(dut, addr, data, nc=0):
    """Simulation helper: issue one store to the dcache and wait for it.

    This is a generator intended to be driven by the simulator (use it
    with ``yield from``).  The request fields on ``dut.d_in`` are set
    with every byte_sel bit enabled (~0), held for one clock, then
    ``valid`` and ``byte_sel`` are cleared.  After one further clock,
    ``dut.d_out.valid`` is polled once per clock until it reads non-zero.

    * dut  - the DCache under test
    * addr - address placed on d_in.addr
    * data - value placed on d_in.data
    * nc   - value placed on d_in.nc (defaults to 0)
    """
    request = dut.d_in

    # present the store request, all byte lanes selected
    setup = (("load", 0), ("nc", nc), ("data", data), ("byte_sel", ~0),
             ("addr", addr), ("valid", 1))
    for field, value in setup:
        yield getattr(request, field).eq(value)
    yield

    # withdraw the request after a single clock
    for field in ("valid", "byte_sel"):
        yield getattr(request, field).eq(0)
    yield

    # poll for completion, one clock per unsuccessful check
    while True:
        completed = yield dut.d_out.valid
        if completed:
            break
        yield
1674
1675
def dcache_sim(dut):
    """Simulation stimulus: exercise loads and stores through the dcache.

    Generator to be run as a simulator process.  It first parks all
    d_in / m_in request inputs (priv_mode is set to 1, everything else
    to 0) and waits four clocks, then performs, in order:

    * cacheable load of 0x4
    * two cacheable loads of 0x30
    * non-cacheable load of 0x100
    * two stores to 0x30 (0x121, then 0x12345678)
    * a third cacheable load of 0x30, expecting the last stored value

    Each load result is checked with an assert; the message reports the
    address currently on d_in.addr and the data actually returned.
    Four idle clocks are run at the end.
    """
    d_in, m_in = dut.d_in, dut.m_in

    def load_and_check(load_addr, expected, message, nc=0):
        # one load, followed by a check of the returned data
        data = yield from dcache_load(dut, load_addr, nc=nc)
        addr = yield dut.d_in.addr
        assert data == expected, message % (addr, data)

    # clear stuff
    idle_values = (
        (d_in.valid, 0),
        (d_in.load, 0),
        (d_in.priv_mode, 1),
        (d_in.nc, 0),
        (d_in.addr, 0),
        (d_in.data, 0),
        (m_in.valid, 0),
        (m_in.addr, 0),
        (m_in.pte, 0),
    )
    for port, value in idle_values:
        yield port.eq(value)

    # wait 4 * clk_period
    for _ in range(4):
        yield

    # Cacheable read of address 4
    yield from load_and_check(
        0x4, 0x0000000100000000,
        "data @%x=%x expected 0x0000000100000000")

    # Cacheable read of address 30, performed twice
    for _ in range(2):
        yield from load_and_check(
            0x30, 0x0000000D0000000C,
            "data @%x=%x expected 0000000D0000000C")

    # Non-cacheable read of address 100
    yield from load_and_check(
        0x100, 0x0000004100000040,
        "data @%x=%x expected 0000004100000040", nc=1)

    # Two back-to-back stores at address 30; the second value must win
    for store_value in (0x121, 0x12345678):
        yield from dcache_store(dut, 0x30, store_value)

    # 3rd Cacheable read of address 30
    yield from load_and_check(
        0x30, 0x12345678,
        "data @%x=%x expected 0x12345678")

    # let the simulation run on for a few more clocks
    for _ in range(4):
        yield
1733
1734
def test_dcache():
    """Build a DCache, dump its RTLIL, and simulate it against an SRAM.

    Writes "test_dcache.il" (RTLIL of the DCache alone) and
    "test_dcache.vcd" (waveform of the simulation run).  The DCache
    wishbone master port is wired to a wishbone SRAM whose 128 64-bit
    words are initialised so that word i holds 2*i in its low 32 bits
    and 2*i+1 in its high 32 bits.  dcache_sim provides the stimulus.
    """
    dut = DCache()

    # dump the design on its own before wrapping it in the test harness
    il_text = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as il_file:
        il_file.write(il_text)

    # backing memory image (the local stays named "memory": nmigen
    # takes the Memory's name from the variable it is assigned to)
    init_words = [(i*2) | ((i*2+1) << 32) for i in range(128)]
    memory = Memory(width=64, depth=16*8, init=init_words)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.dcache = dut
    m.submodules.sram = sram

    m.d.comb += [
        # dcache wishbone master -> SRAM slave; the SRAM is addressed in
        # 64-bit words, so the low 3 address bits are dropped
        sram.bus.cyc.eq(dut.wb_out.cyc),
        sram.bus.stb.eq(dut.wb_out.stb),
        sram.bus.we.eq(dut.wb_out.we),
        sram.bus.sel.eq(dut.wb_out.sel),
        sram.bus.adr.eq(dut.wb_out.adr[3:]),
        sram.bus.dat_w.eq(dut.wb_out.dat),
        # SRAM slave -> dcache
        dut.wb_in.ack.eq(sram.bus.ack),
        dut.wb_in.dat.eq(sram.bus.dat_r),
    ]

    # nmigen Simulation
    sim = Simulator(m)
    sim.add_clock(1e-6)
    sim.add_sync_process(wrap(dcache_sim(dut)))

    with sim.write_vcd('test_dcache.vcd'):
        sim.run()
1768
# Run the dcache simulation test when this file is executed as a script.
if __name__ == '__main__':
    test_dcache()
1771