"""DCache

based on Anton Blanchard microwatt dcache.vhdl

note that the microwatt dcache wishbone interface expects "stall".
for simplicity at the moment this is hard-coded to cyc & ~ack.
see WB4 spec, p84, section 5.2.1
"""
import sys
sys.setrecursionlimit(1000000)

from enum import Enum, unique

from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
from nmutil.util import Display

from copy import deepcopy
from random import randint, seed

from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int

from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
#from soc.experiment.plru import PLRU
from nmutil.plru import PLRU

from soc.bus.sram import SRAM
from nmigen import Memory
from nmigen.cli import rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator

from nmutil.util import wrap
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of sets
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
#     -- WB_DATA_BITS at a time so to save
#     -- resources we make the array only that wide, and
#     -- use consecutive indices to make a cache "line"
#
#     -- ROW_SIZE is the width in bytes of the BRAM
#     -- (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)
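
# With the default geometry above these work out to:
#   ROW_SIZE     = 64 // 8 = 8 bytes
#   ROW_PER_LINE = 64 // 8 = 8 rows per line
#   BRAM_ROWS    = 16 * 8  = 128 rows
# giving a total cache capacity of
#   NUM_LINES * NUM_WAYS * LINE_SIZE = 16 * 4 * 64 = 4 KiB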
# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |---|    | ROW_LINE_BITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (45)
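
# For the geometry actually configured above (16 lines, not the 32 in
# the example) the fields work out to: ROW_LINE_BITS=3, LINE_OFF_BITS=6,
# ROW_OFF_BITS=3, ROW_BITS=7, INDEX_BITS=4, SET_SIZE_BITS=10,
# TAG_BITS=46, TAG_WIDTH=48, WAY_BITS=2 (these match the prints below).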
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
def CacheTagArray():
    return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
                 for x in range(NUM_LINES))

def CacheValidBitsArray():
    return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
                 for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                 for x in range(ROW_PER_LINE))
# L1 TLB
TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS     = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS


def ispow2(x):
    return (1<<log2_int(x, False)) == x


assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
def TLBValidBitsArray():
    return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
                 for x in range(TLB_SET_SIZE))

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))

def TLBTagsArray():
    return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
                 for x in range (TLB_SET_SIZE))

def TLBPtesArray():
    return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
                 for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                 for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                 for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                 for x in range(TLB_SET_SIZE))
# Helper functions to decode incoming requests

# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
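
# (Illustration, not from the original source: with ROW_LINE_BITS = 3,
# next_row(0b0100_111) increments only the low 3 bits and wraps to
# 0b0100_000, so the adder never propagates past the line boundary.
# Similarly read_tag(1, tagset) picks bits [TAG_WIDTH:2*TAG_WIDTH) of a
# tag RAM row and truncates the byte-padded value back to TAG_BITS.)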
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    return pa
# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
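#
# (Paraphrase, not from the original file: a hit therefore flows through
# two registered stages back-to-back, while anything more complicated
# parks in stage 0 and asserts stall until the stage-1 state machine
# has finished with the wishbone.)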
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie   = Signal()
        self.doall   = Signal()
        self.tlbld   = Signal()
        self.mmu_req = Signal() # indicates source of request
class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)
        self.hit_way   = Signal(WAY_BITS)
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full             = Signal() # have uncompleted request
        self.mmu_req          = Signal() # request is from MMU
        self.req              = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way          = Signal(WAY_BITS)
        self.hit_load_valid   = Signal()
        self.hit_index        = Signal(INDEX_BITS)
        self.cache_hit        = Signal()

        # TLB hit state
        self.tlb_hit          = Signal()
        self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
        self.tlb_hit_index    = Signal(TLB_WAY_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1    = Signal(64)
        self.forward_data2    = Signal(64)
        self.forward_sel1     = Signal(8)
        self.forward_valid1   = Signal()
        self.forward_way1     = Signal(WAY_BITS)
        self.forward_row1     = Signal(ROW_BITS)
        self.use_forward1     = Signal()
        self.forward_sel      = Signal(8)

        # Cache miss state (reload state machine)
        self.state            = Signal(State)
        self.dcbz             = Signal()
        self.write_bram       = Signal()
        self.write_tag        = Signal()
        self.slow_valid       = Signal()
        self.real_adr         = Signal(REAL_ADDR_BITS)
        self.wb               = WBMasterOut("wb")
        self.reload_tag       = Signal(TAG_BITS)
        self.store_way        = Signal(WAY_BITS)
        self.store_row        = Signal(ROW_BITS)
        self.store_index      = Signal(INDEX_BITS)
        self.end_row_ix       = Signal(ROW_LINE_BITS)
        self.rows_valid       = RowPerLineValidArray()
        self.acks_pending     = Signal(3)
        self.inc_acks         = Signal()
        self.dec_acks         = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid         = Signal()
        self.ls_error         = Signal()
        self.mmu_done         = Signal()
        self.mmu_error        = Signal()
        self.cache_paradox    = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail        = Signal()
# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr  = Signal(64-LINE_OFF_BITS)
class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie         = Signal()
        self.tlbwe         = Signal()
        self.doall         = Signal()
        self.updated       = Signal()
        self.v_updated     = Signal()
        self.tlb_hit       = Signal()
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_hit_way   = Signal(TLB_WAY_BITS)
        self.tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        self.repl_way      = Signal(TLB_WAY_BITS)
        self.eatag         = Signal(TLB_EA_TAG_BITS)
        self.pte_data      = Signal(TLB_PTE_BITS)

        self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t

        self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        self.pb_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        self.db_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)

        tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
        comb += db_out.eq(self.dv)

        with m.If(self.tlbie & self.doall):
            pass # clear all back in parent
        with m.Elif(self.tlbie):
            with m.If(self.tlb_hit):
                comb += db_out.bit_select(self.tlb_hit_way, 1).eq(1)
                comb += self.v_updated.eq(1)

        with m.Elif(self.tlbwe):

            comb += tagset.eq(self.tlb_tag_way)
            comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
            comb += tb_out.eq(tagset)

            comb += pteset.eq(self.tlb_pte_way)
            comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
            comb += pb_out.eq(pteset)

            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += self.updated.eq(1)
            comb += self.v_updated.eq(1)

        return m
class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
                       cache_valid_idx, cache_tag_set,
                       req_addr,
                       hit_set):

        self.go          = Signal()
        self.virt_mode   = Signal()
        self.is_hit      = Signal()
        self.tlb_hit     = Signal()
        self.hit_way     = Signal(WAY_BITS)
        self.rel_match   = Signal()
        self.req_index   = Signal(INDEX_BITS)
        self.reload_tag  = Signal(TAG_BITS)

        self.tlb_hit_way = tlb_hit_way
        self.tlb_pte_way = tlb_pte_way
        self.tlb_valid_way = tlb_valid_way
        self.cache_valid_idx = cache_valid_idx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_pte_way = self.tlb_pte_way
        tlb_valid_way = self.tlb_valid_way
        cache_valid_idx = self.cache_valid_idx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit_way = self.tlb_hit_way
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag      = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit      = Signal(name="s_hit%d" % j)
                s_pte      = Signal(TLB_PTE_BITS)
                s_ra       = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & tlb_valid_way[j])
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit):
                comb += is_hit.eq(hit_set[tlb_hit_way])
                comb += hit_way.eq(hit_way_set[tlb_hit_way])
                comb += rel_match.eq(rel_matches[tlb_hit_way])
        with m.Else():
            s_tag      = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_valid_idx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
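
# (Observation, not from the original file: with TLB_NUM_WAYS=2 and
# NUM_WAYS=4 the virtual-mode path above instantiates 2*4 = 8 parallel
# tag comparators and only selects between them by tlb_hit_way
# afterwards, trading comparator area for a shorter critical path.)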
class DCache(Elaboratable):
    """Set associative dcache write-through
    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in      = LoadStore1ToDCacheType("d_in")
        self.d_out     = DCacheToLoadStore1Type("d_out")

        self.m_in      = MMUToDCacheType("m_in")
        self.m_out     = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        self.wb_out    = WBMasterOut()
        self.wb_in     = WBSlaveOut()

        self.log_out   = Signal(20)
    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If(~(r1.full & r0_full)):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
    def tlb_read(self, m, r0_stall, tlb_valid_way,
                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                 dtlb_tags, dtlb_ptes):
        """reads the TLB.
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index    = Signal(TLB_SET_BITS)
        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])
        comb += index.eq(addrbits)

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        with m.If(~r0_stall):
            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
            sync += tlb_tag_way.eq(dtlb_tags[index])
            sync += tlb_pte_way.eq(dtlb_ptes[index])
    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return
        for i in range(TLB_SET_SIZE):
            # TLB PLRU interface
            tlb_plru        = PLRU(TLB_WAY_BITS)
            setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
            tlb_plru_acc_en = Signal()

            comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
            comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
            comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
            comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_valid_way, tlb_tag_way, tlb_hit_way,
                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb
        sync = m.d.sync

        hitway = Signal(TLB_WAY_BITS)
        hit    = Signal()
        eatag  = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal()
            comb += is_tag_hit.eq(tlb_valid_way[i]
                                  & (read_tlb_tag(i, tlb_tag_way) == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.eq(hit & r0_valid)
        comb += tlb_hit_way.eq(hitway)

        with m.If(tlb_hit):
            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)
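
    # (For reference, and an assumption on my part rather than something
    # stated in this file: the PTE bit positions read above appear to
    # follow the radix PTE layout, i.e. bit 8 = R (reference), bit 7 = C
    # (changed), bit 5 = an ATT attribute bit used here as "no-cache",
    # and bits 3/2/1 = the EAA privilege/read/write permission bits.)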
    def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                   tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                   dtlb_tags, tlb_pte_way, dtlb_ptes):

        dtlb_valids = TLBValidBitsArray()

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        m.submodules.tlb_update = d = DTLBUpdate()
        with m.If(tlbie & r0.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid_bits[i].eq(0)
        with m.If(d.updated):
            sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
            sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
        with m.If(d.v_updated):
            sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)

        comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_hit_way.eq(tlb_hit_way)
        comb += d.tlb_tag_way.eq(tlb_tag_way)
        comb += d.tlb_pte_way.eq(tlb_pte_way)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit):
            comb += d.repl_way.eq(tlb_hit_way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)
    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        for i in range(NUM_LINES):
            # PLRU interface
            plru        = PLRU(WAY_BITS)
            setattr(m.submodules, "plru%d" % i, plru)
            plru_acc_en = Signal()

            comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
            comb += plru.acc_en.eq(plru_acc_en)
            comb += plru.acc_i.eq(r1.hit_way)
            comb += plru_victim[i].eq(plru.lru_o)
    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set,
                       cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index])
    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_valids, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_pte_way,
                       tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        is_hit      = Signal()
        hit_way     = Signal(WAY_BITS)
        op          = Signal(Op)
        opsel       = Signal(3)
        go          = Signal()
        nc          = Signal()
        hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        cache_valid_idx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_valid_idx.eq(cache_valids[req_index])

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
                                tlb_valid_way, tlb_hit_way,
                                cache_valid_idx, cache_tag_set,
                                r0.req.addr,
                                hit_set)

        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)
        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) &
                  (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim[r1.store_index])
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
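                # (opsel is Cat(is_hit, nc, r0.req.load), so bit 0 = hit,
                # bit 1 = non-cacheable, bit 2 = load.  For example 0b101
                # is a cacheable load hit, while 0b011 and 0b111, an NC
                # store "hit" and an NC load "hit", are nonsensical
                # combinations and map to OP_BAD.)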
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)
    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(1) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(1) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] !=
                           reservation.addr)):
                    comb += cancel_store.eq(1)
    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if we are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)
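
        # (Aside, not from the original file: the loop above merges
        # forwarded store data into the BRAM read data one byte lane at
        # a time, using r1.forward_sel as a per-byte strobe, so partial
        # stores forward correctly to later loads of the same row.)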
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) |
                    r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)

            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, data=%x",
                                m_out.data)
    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        wb_in = self.wb_in

        for i in range(NUM_WAYS):
            do_read  = Signal(name="do_rd%d" % i)
            rd_addr  = Signal(ROW_BITS)
            do_write = Signal(name="do_wr%d" % i)
            wr_addr  = Signal(ROW_BITS)
            wr_data  = Signal(WB_DATA_BITS)
            wr_sel   = Signal(ROW_SIZE)
            wr_sel_m = Signal(ROW_SIZE)
            _d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += _d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            comb += do_read.eq(1)
            comb += rd_addr.eq(early_req_row)
            with m.If(r1.hit_way == i):
                comb += cache_out_row.eq(_d_out)

            # Write mux:
            #
            # Defaults to wishbone read responses (cache refill)
            #
            # For timing, the mux on wr_data/sel/addr is not
            # dependent on anything other than the current state.

            with m.If(r1.write_bram):
                # Write store data to BRAM.  This happens one
                # cycle after the store is in r0.
                comb += wr_data.eq(r1.req.data)
                comb += wr_sel.eq(r1.req.byte_sel)
                comb += wr_addr.eq(get_row(r1.req.real_addr))

                with m.If(i == r1.req.hit_way):
                    comb += do_write.eq(1)
            with m.Else():
                # Otherwise, we might be doing a reload or a DCBZ
                with m.If(r1.dcbz):
                    comb += wr_data.eq(0)
                with m.Else():
                    comb += wr_data.eq(wb_in.dat)
                comb += wr_addr.eq(r1.store_row)
                comb += wr_sel.eq(~0) # all 1s

                with m.If((r1.state == State.RELOAD_WAIT_ACK)
                          & wb_in.ack & (replace_way == i)):
                    comb += do_write.eq(1)

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)
    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_hit_way, tlb_req_index):

        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        with m.If(req_op == Op.OP_LOAD_HIT):
            sync += r1.hit_load_valid.eq(1)
        with m.Else():
            sync += r1.hit_load_valid.eq(0)

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
            sync += r1.cache_hit.eq(1)
        with m.Else():
            sync += r1.cache_hit.eq(0)

        with m.If(req_op == Op.OP_BAD):
            # Display(f"Signalling ld/st error valid_ra={valid_ra}"
            #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        with m.If(req_op == Op.OP_STCX_FAIL):
            sync += r1.stcx_fail.eq(1)
        with m.Else():
            sync += r1.stcx_fail.eq(0)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_way.eq(tlb_hit_way)
        sync += r1.tlb_hit_index.eq(tlb_req_index)
    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    cache_valids, r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        wb_in = self.wb_in

        req = MemAccessRequest("mreq_ds")

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(wb_in.dat)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            for i in range(NUM_WAYS):
                with m.If(i == replace_way):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index])
                    # original VHDL, for reference:
                    # cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1
                    #                            downto i * TAG_WIDTH) <=
                    #     (TAG_WIDTH - 1 downto TAG_BITS => '0')
                    #      & r1.reload_tag;
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(~r0.req.dcbz):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(0)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.real_adr.eq(req.real_addr)
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                "idx: %x tag: %x",
                                req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                with m.If((~wb_in.stall) & r1.wb.stb):
                    # That was the last word? We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    with m.If(is_last_row_addr(r1.real_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.real_adr[ROW_OFF_BITS:])
                    sync += r1.real_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(wb_in.ack)
                with m.If(wb_in.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               ((~r1.dcbz) &
                                (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(INDEX_BITS)
                        comb += cv.eq(cache_valids[r1.store_index])
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_valids[r1.store_index].eq(cv)

                        sync += r1.state.eq(State.IDLE)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                acks         = Signal(3)
                adjust_acks  = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        ra = req.real_addr[0:SET_SIZE_BITS]
                        sync += r1.real_adr[0:SET_SIZE_BITS].eq(ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(wb_in.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~wb_in.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(wb_in.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)
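
    # (Summary, not from the original file: IDLE issues the first wishbone
    # access and moves to RELOAD_WAIT_ACK for load misses and dcbz, to
    # STORE_WAIT_ACK for ordinary stores, or to NC_LOAD_WAIT_ACK for
    # non-cacheable loads; each WAIT_ACK state drops stb once the slave
    # stops stalling and returns to IDLE when the final ack arrives.)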
    def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out):

        sync = m.d.sync
        d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
                               r1.real_adr[3:6]))
    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags    = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)
        cache_valids  = CacheValidBitsArray()

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        dtlb_valid_bits = TLBValidBitsArray()
        dtlb_tags       = TLBTagsArray()
        dtlb_ptes       = TLBPtesArray()
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0      = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index    = Signal(INDEX_BITS)
        req_row      = Signal(ROW_BITS)
        req_hit_way  = Signal(WAY_BITS)
        req_tag      = Signal(TAG_BITS)
        req_op       = Signal(Op)
        req_data     = Signal(64)
        req_same_tag = Signal()
        req_go       = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv     = Signal()
        clear_rsrv   = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = PLRUOut()
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals

        # TLB signals
        tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
        tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
        tlb_valid_way = Signal(TLB_NUM_WAYS)
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit       = Signal()
        tlb_hit_way   = Signal(TLB_WAY_BITS)
        pte           = Signal(TLB_PTE_BITS)
        ra            = Signal(REAL_ADDR_BITS)
        valid_ra      = Signal()
        perm_attr     = PermAttr("dc_perms")
        rc_ok         = Signal()
        perm_ok       = Signal()
        access_ok     = Signal()

        tlb_plru_victim = TLBPLRUOut()

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & r1.full)
        comb += r0_valid.eq(r0_full & ~r1.full)
        comb += self.stall_out.eq(r0_stall)

        # Wire up wishbone request latch out of stage 1
        comb += r1.wb.adr.eq(r1.real_adr)
        comb += self.wb_out.eq(r1.wb)
        comb += self.wb_out.adr.eq(r1.wb.adr[3:]) # truncate LSBs

        # deal with litex not doing wishbone pipeline mode
        comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
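        # (Aside, not from the original: driving stall as cyc & ~ack makes
        # the slave look like a classic, one-request-at-a-time wishbone
        # slave, which is exactly the hard-coded simplification described
        # in the module docstring; a true WB4 pipelined slave would drive
        # stall itself, per the spec section cited there.)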
        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_valid_way,
                      tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
                      dtlb_tags, dtlb_ptes)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_valid_way, tlb_tag_way, tlb_hit_way,
                        tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
                        tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
                        dtlb_tags, tlb_pte_way, dtlb_ptes)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_valids, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_pte_way,
                            tlb_hit, tlb_hit_way, tlb_valid_way,
                            cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_hit_way, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         cache_valids, r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit_way, stall_out)

        return m
def dcache_load(dut, addr, nc=0):
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.byte_sel.eq(~0)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.byte_sel.eq(0)
    while not (yield dut.d_out.valid):
        yield
    data = yield dut.d_out.data
    return data


def dcache_store(dut, addr, data, nc=0):
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.data.eq(data)
    yield dut.d_in.byte_sel.eq(~0)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.valid.eq(1)
    yield
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.byte_sel.eq(0)
    while not (yield dut.d_out.valid):
        yield
def dcache_random_sim(dut, mem):

    # start copy of mem
    sim_mem = deepcopy(mem)
    memsize = len(sim_mem)
    print ("mem len", memsize)

    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    #for i in range(1024):
    #    sim_mem[i] = i

    for i in range(1024):
        addr = randint(0, memsize-1)
        data = randint(0, (1<<64)-1)
        sim_mem[addr] = data
        row = addr
        addr *= 8

        print ("random testing %d 0x%x row %d data 0x%x" % (i, addr, row, data))

        yield from dcache_load(dut, addr)
        yield from dcache_store(dut, addr, data)

        addr = randint(0, memsize-1)
        sim_data = sim_mem[addr]
        row = addr
        addr *= 8

        print ("    load 0x%x row %d expect data 0x%x" % (addr, row, sim_data))
        data = yield from dcache_load(dut, addr)
        assert data == sim_data, \
            "check addr 0x%x row %d data %x != %x" % (addr, row, data, sim_data)

    for addr in range(memsize):
        data = yield from dcache_load(dut, addr*8)
        assert data == sim_mem[addr], \
            "final check %x data %x != %x" % (addr*8, data, sim_mem[addr])
def dcache_sim(dut, mem):
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.priv_mode.eq(1)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period
    yield
    yield
    yield
    yield

    # Cacheable read of address 0x58
    data = yield from dcache_load(dut, 0x58)
    addr = yield dut.d_in.addr
    assert data == 0x0000001700000016, \
        f"data @%x=%x expected 0x0000001700000016" % (addr, data)

    # Cacheable read of address 0x20
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        f"data @%x=%x expected 0x0000000900000008" % (addr, data)

    # Cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        f"data @%x=%x expected 0000014D0000014C" % (addr, data)

    # 2nd Cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x0000014D0000014C, \
        f"data @%x=%x expected 0000014D0000014C" % (addr, data)

    # Non-cacheable read of address 0x100
    data = yield from dcache_load(dut, 0x100, nc=1)
    addr = yield dut.d_in.addr
    assert data == 0x0000004100000040, \
        f"data @%x=%x expected 0000004100000040" % (addr, data)

    # Store at address 0x530
    yield from dcache_store(dut, 0x530, 0x121)

    # Second store at address 0x530
    yield from dcache_store(dut, 0x530, 0x12345678)

    # 3rd Cacheable read of address 0x530
    data = yield from dcache_load(dut, 0x530)
    addr = yield dut.d_in.addr
    assert data == 0x12345678, \
        f"data @%x=%x expected 0x12345678" % (addr, data)

    # 4th Cacheable read of address 0x20
    data = yield from dcache_load(dut, 0x20)
    addr = yield dut.d_in.addr
    assert data == 0x0000000900000008, \
        f"data @%x=%x expected 0x0000000900000008" % (addr, data)
def test_dcache(mem, test_fn, test_name):
    dut = DCache()

    memory = Memory(width=64, depth=len(mem), init=mem, simulate=True)
    sram = SRAM(memory=memory, granularity=8)

    m = Module()
    m.submodules.dcache = dut
    m.submodules.sram = sram

    m.d.comb += sram.bus.cyc.eq(dut.wb_out.cyc)
    m.d.comb += sram.bus.stb.eq(dut.wb_out.stb)
    m.d.comb += sram.bus.we.eq(dut.wb_out.we)
    m.d.comb += sram.bus.sel.eq(dut.wb_out.sel)
    m.d.comb += sram.bus.adr.eq(dut.wb_out.adr)
    m.d.comb += sram.bus.dat_w.eq(dut.wb_out.dat)

    m.d.comb += dut.wb_in.ack.eq(sram.bus.ack)
    m.d.comb += dut.wb_in.dat.eq(sram.bus.dat_r)

    sim = Simulator(m)
    sim.add_clock(1e-6)

    sim.add_sync_process(wrap(test_fn(dut, mem)))
    with sim.write_vcd('test_dcache%s.vcd' % test_name):
        sim.run()
if __name__ == '__main__':
    seed(0)
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    mem = []
    for i in range(1024):
        mem.append((i*2)| ((i*2+1)<<32))

    test_dcache(mem, dcache_sim, "")

    mem = []
    memsize = 256
    for i in range(memsize):
        mem.append(i)

    test_dcache(mem, dcache_random_sim, "random")