dcache: improve debug output
src/soc/experiment/dcache.py (soc.git)
1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11
12 Links:
13
14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
16
17 """
18
19 import sys
20
21 from nmutil.gtkw import write_gtkw
22
23 sys.setrecursionlimit(1000000)
24
25 from enum import Enum, unique
26
27 from nmigen import Module, Signal, Elaboratable, Cat, Repl, Array, Const
28 from nmutil.util import Display
29
30 from copy import deepcopy
31 from random import randint, seed
32
33 from nmigen.cli import main
34 from nmutil.iocontrol import RecordObject
35 from nmigen.utils import log2_int
36 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
37 DCacheToLoadStore1Type,
38 MMUToDCacheType,
39 DCacheToMMUType)
40
41 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
42 WBAddrType, WBDataType, WBSelType,
43 WBMasterOut, WBSlaveOut,
44 WBMasterOutVector, WBSlaveOutVector,
45 WBIOMasterOut, WBIOSlaveOut)
46
47 from soc.experiment.cache_ram import CacheRam
48 #from soc.experiment.plru import PLRU
49 from nmutil.plru import PLRU
50
51 # for test
52 from soc.bus.sram import SRAM
53 from nmigen import Memory
54 from nmigen.cli import rtlil
55
56 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
57 # Also, check out the cxxsim nmigen branch, and latest yosys from git
58 from nmutil.sim_tmp_alternative import Simulator
59
60 from nmutil.util import wrap
61
62
63 # TODO: make these parameters of DCache at some point
64 LINE_SIZE = 64 # Line size in bytes
65 NUM_LINES = 16 # Number of lines in a set
66 NUM_WAYS = 4 # Number of ways
67 TLB_SET_SIZE = 64    # L1 DTLB number of sets
68 TLB_NUM_WAYS = 2     # L1 DTLB number of ways (entries per set)
69 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
70 LOG_LENGTH = 0 # Non-zero to enable log data collection
71
72 # BRAM organisation: We never access more than
73 # WB_DATA_BITS at a time so to save
74 # resources we make the array only that wide, and
75 # use consecutive indices to make a cache "line"
76 #
77 # ROW_SIZE is the width in bytes of the BRAM
78 # (based on WB, so 64 bits)
79 ROW_SIZE = WB_DATA_BITS // 8
80
81 # ROW_PER_LINE is the number of rows (wishbone
82 # transactions) in a line
83 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
84
85 # BRAM_ROWS is the number of rows in BRAM needed
86 # to represent the full dcache
87 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
88
89 print ("ROW_SIZE", ROW_SIZE)
90 print ("ROW_PER_LINE", ROW_PER_LINE)
91 print ("BRAM_ROWS", BRAM_ROWS)
92 print ("NUM_WAYS", NUM_WAYS)
93
94 # Bit fields counts in the address
95
96 # REAL_ADDR_BITS is the number of real address
97 # bits that we store
98 REAL_ADDR_BITS = 56
99
100 # ROW_BITS is the number of bits to select a row
101 ROW_BITS = log2_int(BRAM_ROWS)
102
103 # ROW_LINE_BITS is the number of bits to select
104 # a row within a line
105 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
106
107 # LINE_OFF_BITS is the number of bits for
108 # the offset in a cache line
109 LINE_OFF_BITS = log2_int(LINE_SIZE)
110
111 # ROW_OFF_BITS is the number of bits for
112 # the offset in a row
113 ROW_OFF_BITS = log2_int(ROW_SIZE)
114
115 # INDEX_BITS is the number of bits to
116 # select a cache line
117 INDEX_BITS = log2_int(NUM_LINES)
118
119 # SET_SIZE_BITS is the log base 2 of the set size
120 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
121
122 # TAG_BITS is the number of bits of
123 # the tag part of the address
124 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
125
126 # TAG_WIDTH is the width in bits of each way of the tag RAM
127 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
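# (i.e. TAG_BITS rounded up to the next multiple of 8 - 46 -> 48 with the
# defaults above - so each way's tag starts on a byte boundary of the tag RAM)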
128
129 # WAY_BITS is the number of bits to select a way
130 WAY_BITS = log2_int(NUM_WAYS)
131
132 # Example of layout for 32 lines of 64 bytes:
133 layout = """\
134 .. tag |index| line |
135 .. | row | |
136 .. | |---| | ROW_LINE_BITS (3)
137 .. | |--- - --| LINE_OFF_BITS (6)
138 .. | |- --| ROW_OFF_BITS (3)
139 .. |----- ---| | ROW_BITS (8)
140 .. |-----| | INDEX_BITS (5)
141 .. --------| | TAG_BITS (45)
142 """
143 print (layout)
144 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
145 (TAG_BITS, INDEX_BITS, ROW_BITS,
146 ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
147 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
148 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
149 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
150
151 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
152
153 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
154
155 def CacheTagArray():
156 return Array(Signal(TAG_RAM_WIDTH, name="cachetag_%d" % x) \
157 for x in range(NUM_LINES))
158
159 def CacheValidBitsArray():
160 return Array(Signal(NUM_WAYS, name="cachevalid_%d" % x) \
161 for x in range(NUM_LINES))
162
163 def RowPerLineValidArray():
164 return Array(Signal(name="rows_valid%d" % x) \
165 for x in range(ROW_PER_LINE))
166
167 # L1 TLB
168 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
169 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
170 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
171 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
172 TLB_PTE_BITS = 64
173 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
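# With the defaults above: TLB_SET_BITS=6, TLB_WAY_BITS=1,
# TLB_EA_TAG_BITS=46 (64-12-6), TLB_TAG_WAY_BITS=92, TLB_PTE_WAY_BITS=128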
174
175 def ispow2(x):
176 return (1<<log2_int(x, False)) == x
177
178 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
179 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
180 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
181 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
182 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
183 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
184 "geometry bits don't add up"
185 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
186 "geometry bits don't add up"
187 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
188 "geometry bits don't add up"
189 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
190 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
191
192
193 def TLBValidBitsArray():
194 return Array(Signal(TLB_NUM_WAYS, name="tlbvalid%d" % x) \
195 for x in range(TLB_SET_SIZE))
196
197 def TLBTagEAArray():
198 return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
199 for x in range (TLB_NUM_WAYS))
200
201 def TLBTagsArray():
202 return Array(Signal(TLB_TAG_WAY_BITS, name="tlbtags%d" % x) \
203 for x in range (TLB_SET_SIZE))
204
205 def TLBPtesArray():
206 return Array(Signal(TLB_PTE_WAY_BITS, name="tlbptes%d" % x) \
207 for x in range(TLB_SET_SIZE))
208
209 def HitWaySet():
210 return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
211 for x in range(TLB_NUM_WAYS))
212
213 # Cache RAM interface
214 def CacheRamOut():
215 return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
216 for x in range(NUM_WAYS))
217
218 # PLRU output interface
219 def PLRUOut():
220 return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
221 for x in range(NUM_LINES))
222
223 # TLB PLRU output interface
224 def TLBPLRUOut():
225 return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
226 for x in range(TLB_SET_SIZE))
227
228 # Helper functions to decode incoming requests
229 #
230 # Return the cache line index (tag index) for an address
231 def get_index(addr):
232 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
233
234 # Return the cache row index (data memory) for an address
235 def get_row(addr):
236 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
237
238 # Return the index of a row within a line
239 def get_row_of_line(row):
240 return row[:ROW_LINE_BITS]
241
242 # Returns whether this is the last row of a line
243 def is_last_row_addr(addr, last):
244 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
245
246 # Returns whether this is the last row of a line
247 def is_last_row(row, last):
248 return get_row_of_line(row) == last
249
250 # Return the next row in the current cache line. We use a
251 # dedicated function in order to limit the size of the
252 # generated adder to be only the bits within a cache line
253 # (3 bits with default settings)
254 def next_row(row):
255 row_v = row[0:ROW_LINE_BITS] + 1
256 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
257
258 # Get the tag value from the address
259 def get_tag(addr):
260 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
261
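# A minimal plain-integer sketch (illustrative only, not used by the dcache
# itself) of how the Signal slices above decompose a real address with the
# default geometry; the helper name and the example address are arbitrary:
#
#   _demo_addr_decode(0x1234) == (8, 70, 4)
#
# Note also that next_row() only increments the low ROW_LINE_BITS, so row 71
# (the last row of its line, row-of-line 7) wraps back to row 64 of the
# same line rather than advancing to row 72.
def _demo_addr_decode(addr=0x1234):
    index = (addr >> LINE_OFF_BITS) & ((1 << INDEX_BITS) - 1)   # get_index
    row   = (addr >> ROW_OFF_BITS)  & ((1 << ROW_BITS) - 1)     # get_row
    tag   = (addr >> SET_SIZE_BITS) & ((1 << TAG_BITS) - 1)     # get_tag
    return index, row, tag
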
262 # Read a tag from a tag memory row
263 def read_tag(way, tagset):
264 return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
265
266 # Read a TLB tag from a TLB tag memory row
267 def read_tlb_tag(way, tags):
268 return tags.word_select(way, TLB_EA_TAG_BITS)
269
270 # Write a TLB tag to a TLB tag memory row
271 def write_tlb_tag(way, tags, tag):
272 return read_tlb_tag(way, tags).eq(tag)
273
274 # Read a PTE from a TLB PTE memory row
275 def read_tlb_pte(way, ptes):
276 return ptes.word_select(way, TLB_PTE_BITS)
277
278 def write_tlb_pte(way, ptes, newpte):
279 return read_tlb_pte(way, ptes).eq(newpte)
280
281
282 # Record for storing permission, attribute, etc. bits from a PTE
283 class PermAttr(RecordObject):
284 def __init__(self, name=None):
285 super().__init__(name=name)
286 self.reference = Signal()
287 self.changed = Signal()
288 self.nocache = Signal()
289 self.priv = Signal()
290 self.rd_perm = Signal()
291 self.wr_perm = Signal()
292
293
294 def extract_perm_attr(pte):
295 pa = PermAttr()  # XXX stub - the PTE bits are currently decoded inline in tlb_search()
296 return pa
297
298
299 # Type of operation on a "valid" input
300 @unique
301 class Op(Enum):
302 OP_NONE = 0
303 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
304 OP_STCX_FAIL = 2 # conditional store w/o reservation
305 OP_LOAD_HIT = 3 # Cache hit on load
306 OP_LOAD_MISS = 4 # Load missing cache
307 OP_LOAD_NC = 5 # Non-cachable load
308 OP_STORE_HIT = 6 # Store hitting cache
309 OP_STORE_MISS = 7 # Store missing cache
310
311
312 # Cache state machine
313 @unique
314 class State(Enum):
315 IDLE = 0 # Normal load hit processing
316 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
317 STORE_WAIT_ACK = 2 # Store wait ack
318 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
319
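# State transitions (see dcache_slow below):
#   IDLE            -> RELOAD_WAIT_ACK   cacheable load miss, or dcbz
#   IDLE            -> STORE_WAIT_ACK    store hit or miss (non-dcbz)
#   IDLE            -> NC_LOAD_WAIT_ACK  non-cachable load
#   (each WAIT_ACK) -> IDLE              once the final wishbone ack arrives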
320
321 # Dcache operations:
322 #
323 # In order to make timing, we use the BRAMs with
324 # an output buffer, which means that the BRAM
325 # output is delayed by an extra cycle.
326 #
327 # Thus, the dcache has a 2-stage internal pipeline
328 # for cache hits with no stalls.
329 #
330 # All other operations are handled via stalling
331 # in the first stage.
332 #
333 # The second stage can thus complete a hit at the same
334 # time as the first stage emits a stall for a complex op.
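#
# Roughly, then, for a load hit with no stalls (a sketch of the intent
# described above, not a cycle-accurate specification):
#   cycle 0: request presented; BRAM read address set up (early_req_row)
#   cycle 1: request in r0; TLB lookup, tag compare, hit detection
#   cycle 2: buffered BRAM output valid; load data returned to loadstore1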
335 #
336 # Stage 0 register, basically contains just the latched request
337
338 class RegStage0(RecordObject):
339 def __init__(self, name=None):
340 super().__init__(name=name)
341 self.req = LoadStore1ToDCacheType(name="lsmem")
342 self.tlbie = Signal() # indicates a tlbie request (from MMU)
343 self.doall = Signal() # with tlbie, indicates flush whole TLB
344 self.tlbld = Signal() # indicates a TLB load request (from MMU)
345 self.mmu_req = Signal() # indicates source of request
346 self.d_valid = Signal() # indicates req.data is valid now
347
348
349 class MemAccessRequest(RecordObject):
350 def __init__(self, name=None):
351 super().__init__(name=name)
352 self.op = Signal(Op)
353 self.valid = Signal()
354 self.dcbz = Signal()
355 self.real_addr = Signal(REAL_ADDR_BITS)
356 self.data = Signal(64)
357 self.byte_sel = Signal(8)
358 self.hit_way = Signal(WAY_BITS)
359 self.same_tag = Signal()
360 self.mmu_req = Signal()
361
362
363 # First stage register, contains state for stage 1 of load hits
364 # and for the state machine used by all other operations
365 class RegStage1(RecordObject):
366 def __init__(self, name=None):
367 super().__init__(name=name)
368 # Info about the request
369 self.full = Signal() # have uncompleted request
370 self.mmu_req = Signal() # request is from MMU
371 self.req = MemAccessRequest(name="reqmem")
372
373 # Cache hit state
374 self.hit_way = Signal(WAY_BITS)
375 self.hit_load_valid = Signal()
376 self.hit_index = Signal(INDEX_BITS)
377 self.cache_hit = Signal()
378
379 # TLB hit state
380 self.tlb_hit = Signal()
381 self.tlb_hit_way = Signal(TLB_NUM_WAYS)
382 self.tlb_hit_index = Signal(TLB_WAY_BITS)
383
384 # 2-stage data buffer for data forwarded from writes to reads
385 self.forward_data1 = Signal(64)
386 self.forward_data2 = Signal(64)
387 self.forward_sel1 = Signal(8)
388 self.forward_valid1 = Signal()
389 self.forward_way1 = Signal(WAY_BITS)
390 self.forward_row1 = Signal(ROW_BITS)
391 self.use_forward1 = Signal()
392 self.forward_sel = Signal(8)
393
394 # Cache miss state (reload state machine)
395 self.state = Signal(State)
396 self.dcbz = Signal()
397 self.write_bram = Signal()
398 self.write_tag = Signal()
399 self.slow_valid = Signal()
400 self.wb = WBMasterOut("wb")
401 self.reload_tag = Signal(TAG_BITS)
402 self.store_way = Signal(WAY_BITS)
403 self.store_row = Signal(ROW_BITS)
404 self.store_index = Signal(INDEX_BITS)
405 self.end_row_ix = Signal(ROW_LINE_BITS)
406 self.rows_valid = RowPerLineValidArray()
407 self.acks_pending = Signal(3)
408 self.inc_acks = Signal()
409 self.dec_acks = Signal()
410
411 # Signals to complete (possibly with error)
412 self.ls_valid = Signal()
413 self.ls_error = Signal()
414 self.mmu_done = Signal()
415 self.mmu_error = Signal()
416 self.cache_paradox = Signal()
417
418 # Signal to complete a failed stcx.
419 self.stcx_fail = Signal()
420
421
422 # Reservation information
423 class Reservation(RecordObject):
424 def __init__(self):
425 super().__init__()
426 self.valid = Signal()
427 self.addr = Signal(64-LINE_OFF_BITS)
428
429
430 class DTLBUpdate(Elaboratable):
431 def __init__(self):
432 self.tlbie = Signal()
433 self.tlbwe = Signal()
434 self.doall = Signal()
435 self.updated = Signal()
436 self.v_updated = Signal()
437 self.tlb_hit = Signal()
438 self.tlb_req_index = Signal(TLB_SET_BITS)
439
440 self.tlb_hit_way = Signal(TLB_WAY_BITS)
441 self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
442 self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
443 self.repl_way = Signal(TLB_WAY_BITS)
444 self.eatag = Signal(TLB_EA_TAG_BITS)
445 self.pte_data = Signal(TLB_PTE_BITS)
446
447 self.dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
448
449 self.tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
450 self.db_out = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
451 self.pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
452
453 def elaborate(self, platform):
454 m = Module()
455 comb = m.d.comb
456 sync = m.d.sync
457
458 tagset = Signal(TLB_TAG_WAY_BITS)
459 pteset = Signal(TLB_PTE_WAY_BITS)
460
461 tb_out, pb_out, db_out = self.tb_out, self.pb_out, self.db_out
462 comb += db_out.eq(self.dv)
463
464 with m.If(self.tlbie & self.doall):
465 pass # clear all back in parent
466 with m.Elif(self.tlbie):
467 with m.If(self.tlb_hit):
468 comb += db_out.bit_select(self.tlb_hit_way, 1).eq(0)
469 comb += self.v_updated.eq(1)
470
471 with m.Elif(self.tlbwe):
472
473 comb += tagset.eq(self.tlb_tag_way)
474 comb += write_tlb_tag(self.repl_way, tagset, self.eatag)
475 comb += tb_out.eq(tagset)
476
477 comb += pteset.eq(self.tlb_pte_way)
478 comb += write_tlb_pte(self.repl_way, pteset, self.pte_data)
479 comb += pb_out.eq(pteset)
480
481 comb += db_out.bit_select(self.repl_way, 1).eq(1)
482
483 comb += self.updated.eq(1)
484 comb += self.v_updated.eq(1)
485
486 return m
487
488
489 class DCachePendingHit(Elaboratable):
490
491 def __init__(self, tlb_pte_way, tlb_valid_way, tlb_hit_way,
492 cache_valid_idx, cache_tag_set,
493 req_addr,
494 hit_set):
495
496 self.go = Signal()
497 self.virt_mode = Signal()
498 self.is_hit = Signal()
499 self.tlb_hit = Signal()
500 self.hit_way = Signal(WAY_BITS)
501 self.rel_match = Signal()
502 self.req_index = Signal(INDEX_BITS)
503 self.reload_tag = Signal(TAG_BITS)
504
505 self.tlb_hit_way = tlb_hit_way
506 self.tlb_pte_way = tlb_pte_way
507 self.tlb_valid_way = tlb_valid_way
508 self.cache_valid_idx = cache_valid_idx
509 self.cache_tag_set = cache_tag_set
510 self.req_addr = req_addr
511 self.hit_set = hit_set
512
513 def elaborate(self, platform):
514 m = Module()
515 comb = m.d.comb
516 sync = m.d.sync
517
518 go = self.go
519 virt_mode = self.virt_mode
520 is_hit = self.is_hit
521 tlb_pte_way = self.tlb_pte_way
522 tlb_valid_way = self.tlb_valid_way
523 cache_valid_idx = self.cache_valid_idx
524 cache_tag_set = self.cache_tag_set
525 req_addr = self.req_addr
526 tlb_hit_way = self.tlb_hit_way
527 tlb_hit = self.tlb_hit
528 hit_set = self.hit_set
529 hit_way = self.hit_way
530 rel_match = self.rel_match
531 req_index = self.req_index
532 reload_tag = self.reload_tag
533
534 rel_matches = Array(Signal(name="rel_matches_%d" % i) \
535 for i in range(TLB_NUM_WAYS))
536 hit_way_set = HitWaySet()
537
538 # Test if the pending request is a hit on any way.
539 # In order to make timing in virtual mode, when we
540 # are using the TLB, we compare each cache way against
541 # the real address formed from each TLB way, and only
542 # decide later which match to use.
543
544 with m.If(virt_mode):
545 for j in range(TLB_NUM_WAYS): # tlb_num_way_t
546 s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
547 s_hit = Signal()
548 s_pte = Signal(TLB_PTE_BITS)
549 s_ra = Signal(REAL_ADDR_BITS)
550 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
551 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
552 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
553 comb += s_tag.eq(get_tag(s_ra))
554
555 for i in range(NUM_WAYS): # way_t
556 is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
557 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
558 (read_tag(i, cache_tag_set) == s_tag)
559 & tlb_valid_way[j])
560 with m.If(is_tag_hit):
561 comb += hit_way_set[j].eq(i)
562 comb += s_hit.eq(1)
563 comb += hit_set[j].eq(s_hit)
564 with m.If(s_tag == reload_tag):
565 comb += rel_matches[j].eq(1)
566 with m.If(tlb_hit):
567 comb += is_hit.eq(hit_set[tlb_hit_way])
568 comb += hit_way.eq(hit_way_set[tlb_hit_way])
569 comb += rel_match.eq(rel_matches[tlb_hit_way])
570 with m.Else():
571 s_tag = Signal(TAG_BITS)
572 comb += s_tag.eq(get_tag(req_addr))
573 for i in range(NUM_WAYS): # way_t
574 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
575 comb += is_tag_hit.eq(go & cache_valid_idx[i] &
576 (read_tag(i, cache_tag_set) == s_tag))
577 with m.If(is_tag_hit):
578 comb += hit_way.eq(i)
579 comb += is_hit.eq(1)
580 with m.If(s_tag == reload_tag):
581 comb += rel_match.eq(1)
582
583 return m
584
585
586 class DCache(Elaboratable):
587 """Set associative dcache write-through
588
589 TODO (in no specific order):
590 * See list in icache.vhdl
591 * Complete load misses on the cycle when WB data comes instead of
592 at the end of line (this requires dealing with requests coming in
593 while not idle...)
594 """
595 def __init__(self):
596 self.d_in = LoadStore1ToDCacheType("d_in")
597 self.d_out = DCacheToLoadStore1Type("d_out")
598
599 self.m_in = MMUToDCacheType("m_in")
600 self.m_out = DCacheToMMUType("m_out")
601
602 self.stall_out = Signal()
603
604 self.wb_out = WBMasterOut("wb_out")
605 self.wb_in = WBSlaveOut("wb_in")
606
607 self.log_out = Signal(20)
608
609 def stage_0(self, m, r0, r1, r0_full):
610 """Latch the request in r0.req as long as we're not stalling
611 """
612 comb = m.d.comb
613 sync = m.d.sync
614 d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
615
616 r = RegStage0("stage0")
617
618 # TODO, this goes in unit tests and formal proofs
619 with m.If(d_in.valid & m_in.valid):
620 sync += Display("request collision loadstore vs MMU")
621
622 with m.If(m_in.valid):
623 comb += r.req.valid.eq(1)
624 comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # tlbie/tlbld are not loads
625 comb += r.req.dcbz.eq(0)
626 comb += r.req.nc.eq(0)
627 comb += r.req.reserve.eq(0)
628 comb += r.req.virt_mode.eq(0)
629 comb += r.req.priv_mode.eq(1)
630 comb += r.req.addr.eq(m_in.addr)
631 comb += r.req.data.eq(m_in.pte)
632 comb += r.req.byte_sel.eq(~0) # Const -1 sets all byte-select bits to 1
633 comb += r.tlbie.eq(m_in.tlbie)
634 comb += r.doall.eq(m_in.doall)
635 comb += r.tlbld.eq(m_in.tlbld)
636 comb += r.mmu_req.eq(1)
637 m.d.sync += Display(" DCACHE req mmu addr %x pte %x ld %d",
638 m_in.addr, m_in.pte, r.req.load)
639
640 with m.Else():
641 comb += r.req.eq(d_in)
642 comb += r.req.data.eq(0)
643 comb += r.tlbie.eq(0)
644 comb += r.doall.eq(0)
645 comb += r.tlbld.eq(0)
646 comb += r.mmu_req.eq(0)
647 with m.If((~r1.full & ~d_in.hold) | ~r0_full):
648 sync += r0.eq(r)
649 sync += r0_full.eq(r.req.valid)
650 # Sample data the cycle after a request comes in from loadstore1.
651 # If another request has come in already then the data will get
652 # put directly into req.data below.
653 with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
654 ~r0.mmu_req):
655 sync += r0.req.data.eq(d_in.data)
656 sync += r0.d_valid.eq(1)
657 with m.If(d_in.valid):
658 m.d.sync += Display(" DCACHE req cache "
659 "virt %d addr %x data %x ld %d",
660 r.req.virt_mode, r.req.addr,
661 r.req.data, r.req.load)
662
663 def tlb_read(self, m, r0_stall, tlb_valid_way,
664 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
665 dtlb_tags, dtlb_ptes):
666 """TLB
667 Operates in the second cycle on the request latched in r0.req.
668 TLB updates write the entry at the end of the second cycle.
669 """
670 comb = m.d.comb
671 sync = m.d.sync
672 m_in, d_in = self.m_in, self.d_in
673
674 index = Signal(TLB_SET_BITS)
675 addrbits = Signal(TLB_SET_BITS)
676
677 amin = TLB_LG_PGSZ
678 amax = TLB_LG_PGSZ + TLB_SET_BITS
679
680 with m.If(m_in.valid):
681 comb += addrbits.eq(m_in.addr[amin : amax])
682 with m.Else():
683 comb += addrbits.eq(d_in.addr[amin : amax])
684 comb += index.eq(addrbits)
685
686 # If we have any op and the previous op isn't finished,
687 # then keep the same output for next cycle.
688 with m.If(~r0_stall):
689 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
690 sync += tlb_tag_way.eq(dtlb_tags[index])
691 sync += tlb_pte_way.eq(dtlb_ptes[index])
692
693 def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
694 """Generate TLB PLRUs
695 """
696 comb = m.d.comb
697 sync = m.d.sync
698
699 if TLB_NUM_WAYS == 0:
700 return
701 for i in range(TLB_SET_SIZE):
702 # TLB PLRU interface
703 tlb_plru = PLRU(TLB_WAY_BITS)
704 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
705 tlb_plru_acc_en = Signal()
706
707 comb += tlb_plru_acc_en.eq(r1.tlb_hit & (r1.tlb_hit_index == i))
708 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
709 comb += tlb_plru.acc_i.eq(r1.tlb_hit_way)
710 comb += tlb_plru_victim[i].eq(tlb_plru.lru_o)
711
712 def tlb_search(self, m, tlb_req_index, r0, r0_valid,
713 tlb_valid_way, tlb_tag_way, tlb_hit_way,
714 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
715
716 comb = m.d.comb
717
718 hitway = Signal(TLB_WAY_BITS)
719 hit = Signal()
720 eatag = Signal(TLB_EA_TAG_BITS)
721
722 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
723 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
724 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
725
726 for i in range(TLB_NUM_WAYS):
727 is_tag_hit = Signal(name="is_tag_hit%d" % i)
728 tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
729 comb += tlb_tag.eq(read_tlb_tag(i, tlb_tag_way))
730 comb += is_tag_hit.eq(tlb_valid_way[i] & (tlb_tag == eatag))
731 with m.If(is_tag_hit):
732 comb += hitway.eq(i)
733 comb += hit.eq(1)
734
735 comb += tlb_hit.eq(hit & r0_valid)
736 comb += tlb_hit_way.eq(hitway)
737
738 with m.If(tlb_hit):
739 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
740 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
741
742 with m.If(r0.req.virt_mode):
743 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
744 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
745 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
746 comb += perm_attr.reference.eq(pte[8])
747 comb += perm_attr.changed.eq(pte[7])
748 comb += perm_attr.nocache.eq(pte[5])
749 comb += perm_attr.priv.eq(pte[3])
750 comb += perm_attr.rd_perm.eq(pte[2])
751 comb += perm_attr.wr_perm.eq(pte[1])
752 with m.Else():
753 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
754 r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
755 comb += perm_attr.reference.eq(1)
756 comb += perm_attr.changed.eq(1)
757 comb += perm_attr.nocache.eq(0)
758 comb += perm_attr.priv.eq(1)
759 comb += perm_attr.rd_perm.eq(1)
760 comb += perm_attr.wr_perm.eq(1)
761
762 with m.If(valid_ra):
763 m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
764 r0.req.virt_mode, tlb_hit, ra, pte)
765 m.d.sync += Display(" perm ref=%d", perm_attr.reference)
766 m.d.sync += Display(" perm chg=%d", perm_attr.changed)
767 m.d.sync += Display(" perm noc=%d", perm_attr.nocache)
768 m.d.sync += Display(" perm prv=%d", perm_attr.priv)
769 m.d.sync += Display(" perm rdp=%d", perm_attr.rd_perm)
770 m.d.sync += Display(" perm wrp=%d", perm_attr.wr_perm)
771
772 def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
773 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
774 dtlb_tags, tlb_pte_way, dtlb_ptes):
775
776 dtlb_valids = TLBValidBitsArray()
777
778 comb = m.d.comb
779 sync = m.d.sync
780
781 tlbie = Signal()
782 tlbwe = Signal()
783
784 comb += tlbie.eq(r0_valid & r0.tlbie)
785 comb += tlbwe.eq(r0_valid & r0.tlbld)
786
787 m.submodules.tlb_update = d = DTLBUpdate()
788 with m.If(tlbie & r0.doall):
789 # clear all valid bits at once
790 for i in range(TLB_SET_SIZE):
791 sync += dtlb_valid_bits[i].eq(0)
792 with m.If(d.updated):
793 sync += dtlb_tags[tlb_req_index].eq(d.tb_out)
794 sync += dtlb_ptes[tlb_req_index].eq(d.pb_out)
795 with m.If(d.v_updated):
796 sync += dtlb_valid_bits[tlb_req_index].eq(d.db_out)
797
798 comb += d.dv.eq(dtlb_valid_bits[tlb_req_index])
799
800 comb += d.tlbie.eq(tlbie)
801 comb += d.tlbwe.eq(tlbwe)
802 comb += d.doall.eq(r0.doall)
803 comb += d.tlb_hit.eq(tlb_hit)
804 comb += d.tlb_hit_way.eq(tlb_hit_way)
805 comb += d.tlb_tag_way.eq(tlb_tag_way)
806 comb += d.tlb_pte_way.eq(tlb_pte_way)
807 comb += d.tlb_req_index.eq(tlb_req_index)
808
809 with m.If(tlb_hit):
810 comb += d.repl_way.eq(tlb_hit_way)
811 with m.Else():
812 comb += d.repl_way.eq(tlb_plru_victim[tlb_req_index])
813 comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
814 comb += d.pte_data.eq(r0.req.data)
815
816 def maybe_plrus(self, m, r1, plru_victim):
817 """Generate PLRUs
818 """
819 comb = m.d.comb
820 sync = m.d.sync
821
822 if NUM_WAYS == 0:
823 return
824
825 for i in range(NUM_LINES):
826 # PLRU interface
827 plru = PLRU(WAY_BITS)
828 setattr(m.submodules, "plru%d" % i, plru)
829 plru_acc_en = Signal()
830
831 comb += plru_acc_en.eq(r1.cache_hit & (r1.hit_index == i))
832 comb += plru.acc_en.eq(plru_acc_en)
833 comb += plru.acc_i.eq(r1.hit_way)
834 comb += plru_victim[i].eq(plru.lru_o)
835
836 def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
837 """Cache tag RAM read port
838 """
839 comb = m.d.comb
840 sync = m.d.sync
841 m_in, d_in = self.m_in, self.d_in
842
843 index = Signal(INDEX_BITS)
844
845 with m.If(r0_stall):
846 comb += index.eq(req_index)
847 with m.Elif(m_in.valid):
848 comb += index.eq(get_index(m_in.addr))
849 with m.Else():
850 comb += index.eq(get_index(d_in.addr))
851 sync += cache_tag_set.eq(cache_tags[index])
852
853 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
854 r0_valid, r1, cache_valids, replace_way,
855 use_forward1_next, use_forward2_next,
856 req_hit_way, plru_victim, rc_ok, perm_attr,
857 valid_ra, perm_ok, access_ok, req_op, req_go,
858 tlb_pte_way,
859 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
860 cancel_store, req_same_tag, r0_stall, early_req_row):
861 """Cache request parsing and hit detection
862 """
863
864 comb = m.d.comb
865 m_in, d_in = self.m_in, self.d_in
866
867 is_hit = Signal()
868 hit_way = Signal(WAY_BITS)
869 op = Signal(Op)
870 opsel = Signal(3)
871 go = Signal()
872 nc = Signal()
873 hit_set = Array(Signal(name="hit_set_%d" % i) \
874 for i in range(TLB_NUM_WAYS))
875 cache_valid_idx = Signal(NUM_WAYS)
876
877 # Extract line, row and tag from request
878 comb += req_index.eq(get_index(r0.req.addr))
879 comb += req_row.eq(get_row(r0.req.addr))
880 comb += req_tag.eq(get_tag(ra))
881
882 if False: # display on comb is a bit... busy.
883 comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
884 r0.req.addr, ra, req_index, req_tag, req_row)
885
886 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
887 comb += cache_valid_idx.eq(cache_valids[req_index])
888
889 m.submodules.dcache_pend = dc = DCachePendingHit(tlb_pte_way,
890 tlb_valid_way, tlb_hit_way,
891 cache_valid_idx, cache_tag_set,
892 r0.req.addr,
893 hit_set)
894
895 comb += dc.tlb_hit.eq(tlb_hit)
896 comb += dc.reload_tag.eq(r1.reload_tag)
897 comb += dc.virt_mode.eq(r0.req.virt_mode)
898 comb += dc.go.eq(go)
899 comb += dc.req_index.eq(req_index)
900 comb += is_hit.eq(dc.is_hit)
901 comb += hit_way.eq(dc.hit_way)
902 comb += req_same_tag.eq(dc.rel_match)
903
904 # See if the request matches the line currently being reloaded
905 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
906 (req_index == r1.store_index) & req_same_tag):
907 # For a store, consider this a hit even if the row isn't
908 # valid since it will be by the time we perform the store.
909 # For a load, check the appropriate row valid bit.
910 rrow = Signal(ROW_LINE_BITS)
911 comb += rrow.eq(req_row)
912 valid = r1.rows_valid[rrow]
913 comb += is_hit.eq((~r0.req.load) | valid)
914 comb += hit_way.eq(replace_way)
915
916 # Whether to use forwarded data for a load or not
917 with m.If((get_row(r1.req.real_addr) == req_row) &
918 (r1.req.hit_way == hit_way)):
919 # Only need to consider r1.write_bram here, since if we
920 # are writing refill data here, then we don't have a
921 # cache hit this cycle on the line being refilled.
922 # (There is the possibility that the load following the
923 # load miss that started the refill could be to the old
924 # contents of the victim line, since it is a couple of
925 # cycles after the refill starts before we see the updated
926 # cache tag. In that case we don't use the bypass.)
927 comb += use_forward1_next.eq(r1.write_bram)
928 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
929 comb += use_forward2_next.eq(r1.forward_valid1)
930
931 # The way that matched on a hit
932 comb += req_hit_way.eq(hit_way)
933
934 # The way to replace on a miss
935 with m.If(r1.write_tag):
936 comb += replace_way.eq(plru_victim[r1.store_index])
937 with m.Else():
938 comb += replace_way.eq(r1.store_way)
939
940 # work out whether we have permission for this access
941 # NB we don't yet implement AMR, thus no KUAP
942 comb += rc_ok.eq(perm_attr.reference
943 & (r0.req.load | perm_attr.changed))
944 comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
945 (perm_attr.wr_perm |
946 (r0.req.load & perm_attr.rd_perm)))
947 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
948 # Combine the request and cache hit status to decide what
949 # operation needs to be done
950 comb += nc.eq(r0.req.nc | perm_attr.nocache)
951 comb += op.eq(Op.OP_NONE)
952 with m.If(go):
953 with m.If(~access_ok):
954 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
955 valid_ra, perm_ok, rc_ok)
956 comb += op.eq(Op.OP_BAD)
957 with m.Elif(cancel_store):
958 m.d.sync += Display("DCACHE cancel store")
959 comb += op.eq(Op.OP_STCX_FAIL)
960 with m.Else():
961 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
962 valid_ra, nc, r0.req.load)
963 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
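# opsel bits, LSB first (Cat order): bit 0 = is_hit, bit 1 = nc,
# bit 2 = load.  e.g. 0b101 = cacheable load that hit; 0b011 and
# 0b111 (non-cachable yet present in the cache) are paradoxes,
# hence OP_BAD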
964 with m.Switch(opsel):
965 with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
966 with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
967 with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
968 with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
969 with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
970 with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
971 with m.Case(0b011): comb += op.eq(Op.OP_BAD)
972 with m.Case(0b111): comb += op.eq(Op.OP_BAD)
973 comb += req_op.eq(op)
974 comb += req_go.eq(go)
975
976 # Version of the row number that is valid one cycle earlier
977 # in the cases where we need to read the cache data BRAM.
978 # If we're stalling then we need to keep reading the last
979 # row requested.
980 with m.If(~r0_stall):
981 with m.If(m_in.valid):
982 comb += early_req_row.eq(get_row(m_in.addr))
983 with m.Else():
984 comb += early_req_row.eq(get_row(d_in.addr))
985 with m.Else():
986 comb += early_req_row.eq(req_row)
987
988 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
989 r0_valid, r0, reservation):
990 """Handle load-with-reservation and store-conditional instructions
991 """
992 comb = m.d.comb
993
994 with m.If(r0_valid & r0.req.reserve):
995 # XXX generate alignment interrupt if address
996 # is not aligned XXX or if r0.req.nc = '1'
997 with m.If(r0.req.load):
998 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
999 with m.Else():
1000 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1001 with m.If((~reservation.valid) |
1002 (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1003 comb += cancel_store.eq(1)
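# e.g. a larx (load with reservation) records the reservation address;
# a later stcx. to a different cache line, or with no valid reservation,
# raises cancel_store and is turned into OP_STCX_FAIL by dcache_request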
1004
1005 def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1006 reservation, r0):
1007
1008 comb = m.d.comb
1009 sync = m.d.sync
1010
1011 with m.If(r0_valid & access_ok):
1012 with m.If(clear_rsrv):
1013 sync += reservation.valid.eq(0)
1014 with m.Elif(set_rsrv):
1015 sync += reservation.valid.eq(1)
1016 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1017
1018 def writeback_control(self, m, r1, cache_out_row):
1019 """Return data for loads & completion control logic
1020 """
1021 comb = m.d.comb
1022 sync = m.d.sync
1023 d_out, m_out = self.d_out, self.m_out
1024
1025 data_out = Signal(64)
1026 data_fwd = Signal(64)
1027
1028 # Use the bypass if we are reading the row that was
1029 # written 1 or 2 cycles ago, including for the
1030 # slow_valid = 1 case (i.e. completing a load
1031 # miss or a non-cacheable load).
1032 with m.If(r1.use_forward1):
1033 comb += data_fwd.eq(r1.forward_data1)
1034 with m.Else():
1035 comb += data_fwd.eq(r1.forward_data2)
1036
1037 comb += data_out.eq(cache_out_row)
1038
1039 for i in range(8):
1040 with m.If(r1.forward_sel[i]):
1041 dsel = data_fwd.word_select(i, 8)
1042 comb += data_out.word_select(i, 8).eq(dsel)
1043
1044 comb += d_out.valid.eq(r1.ls_valid)
1045 comb += d_out.data.eq(data_out)
1046 comb += d_out.store_done.eq(~r1.stcx_fail)
1047 comb += d_out.error.eq(r1.ls_error)
1048 comb += d_out.cache_paradox.eq(r1.cache_paradox)
1049
1050 # Outputs to MMU
1051 comb += m_out.done.eq(r1.mmu_done)
1052 comb += m_out.err.eq(r1.mmu_error)
1053 comb += m_out.data.eq(data_out)
1054
1055 # We have a valid load or store hit or we just completed
1056 # a slow op such as a load miss, a NC load or a store
1057 #
1058 # Note: the load hit is delayed by one cycle. However it
1059 # can still not collide with r.slow_valid (well unless I
1060 # miscalculated) because slow_valid can only be set on a
1061 # subsequent request and not on its first cycle (the state
1062 # machine must have advanced), which makes slow_valid
1063 # at least 2 cycles from the previous hit_load_valid.
1064
1065 # Sanity: Only one of these must be set in any given cycle
1066
1067 if False: # TODO: need Display to get this to work
1068 assert (r1.slow_valid & r1.stcx_fail) != 1, \
1069 "unexpected slow_valid collision with stcx_fail"
1070
1071 assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1072 "unexpected hit_load_delayed collision with slow_valid"
1073
1074 with m.If(~r1.mmu_req):
1075 # Request came from loadstore1...
1076 # Load hit case is the standard path
1077 with m.If(r1.hit_load_valid):
1078 sync += Display("completing load hit data=%x", data_out)
1079
1080 # error cases complete without stalling
1081 with m.If(r1.ls_error):
1082 with m.If(r1.dcbz):
1083 sync += Display("completing dcbz with error")
1084 with m.Else():
1085 sync += Display("completing ld/st with error")
1086
1087 # Slow ops (load miss, NC, stores)
1088 with m.If(r1.slow_valid):
1089 sync += Display("completing store or load miss adr=%x data=%x",
1090 r1.req.real_addr, data_out)
1091
1092 with m.Else():
1093 # Request came from MMU
1094 with m.If(r1.hit_load_valid):
1095 sync += Display("completing load hit to MMU, data=%x",
1096 m_out.data)
1097 # error cases complete without stalling
1098 with m.If(r1.mmu_error):
1099 sync += Display("combpleting MMU ld with error")
1100
1101 # Slow ops (i.e. load miss)
1102 with m.If(r1.slow_valid):
1103 sync += Display("completing MMU load miss, adr=%x data=%x",
1104 r1.req.real_addr, m_out.data)
1105
1106 def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1107 """rams
1108 Generate a cache RAM for each way. This handles the normal
1109 reads, writes from reloads and the special store-hit update
1110 path as well.
1111
1112 Note: the BRAMs have an extra read buffer, meaning the output
1113 is pipelined an extra cycle. This differs from the
1114 icache. The writeback logic needs to take that into
1115 account by using 1-cycle delayed signals for load hits.
1116 """
1117 comb = m.d.comb
1118 wb_in = self.wb_in
1119
1120 for i in range(NUM_WAYS):
1121 do_read = Signal(name="do_rd%d" % i)
1122 rd_addr = Signal(ROW_BITS, name="rd_addr_%d" % i)
1123 do_write = Signal(name="do_wr%d" % i)
1124 wr_addr = Signal(ROW_BITS, name="wr_addr_%d" % i)
1125 wr_data = Signal(WB_DATA_BITS, name="din_%d" % i)
1126 wr_sel = Signal(ROW_SIZE)
1127 wr_sel_m = Signal(ROW_SIZE)
1128 _d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1129
1130 way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1131 setattr(m.submodules, "cacheram_%d" % i, way)
1132
1133 comb += way.rd_en.eq(do_read)
1134 comb += way.rd_addr.eq(rd_addr)
1135 comb += _d_out.eq(way.rd_data_o)
1136 comb += way.wr_sel.eq(wr_sel_m)
1137 comb += way.wr_addr.eq(wr_addr)
1138 comb += way.wr_data.eq(wr_data)
1139
1140 # Cache hit reads
1141 comb += do_read.eq(1)
1142 comb += rd_addr.eq(early_req_row)
1143 with m.If(r1.hit_way == i):
1144 comb += cache_out_row.eq(_d_out)
1145
1146 # Write mux:
1147 #
1148 # Defaults to wishbone read responses (cache refill)
1149 #
1150 # For timing, the mux on wr_data/sel/addr is not
1151 # dependent on anything other than the current state.
1152
1153 with m.If(r1.write_bram):
1154 # Write store data to BRAM. This happens one
1155 # cycle after the store is in r0.
1156 comb += wr_data.eq(r1.req.data)
1157 comb += wr_sel.eq(r1.req.byte_sel)
1158 comb += wr_addr.eq(get_row(r1.req.real_addr))
1159
1160 with m.If(i == r1.req.hit_way):
1161 comb += do_write.eq(1)
1162 with m.Else():
1163 # Otherwise, we might be doing a reload or a DCBZ
1164 with m.If(r1.dcbz):
1165 comb += wr_data.eq(0)
1166 with m.Else():
1167 comb += wr_data.eq(wb_in.dat)
1168 comb += wr_addr.eq(r1.store_row)
1169 comb += wr_sel.eq(~0) # all 1s
1170
1171 with m.If((r1.state == State.RELOAD_WAIT_ACK)
1172 & wb_in.ack & (replace_way == i)):
1173 comb += do_write.eq(1)
1174
1175 # Mask write selects with do_write since BRAM
1176 # doesn't have a global write-enable
1177 with m.If(do_write):
1178 comb += wr_sel_m.eq(wr_sel)
1179
1180 # Cache hit synchronous machine for the easy case.
1181 # This handles load hits.
1182 # It also handles error cases (TLB miss, cache paradox)
1183 def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1184 req_hit_way, req_index, req_tag, access_ok,
1185 tlb_hit, tlb_hit_way, tlb_req_index):
1186
1187 comb = m.d.comb
1188 sync = m.d.sync
1189
1190 with m.If(req_op != Op.OP_NONE):
1191 sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1192 req_op, r0.req.addr, r0.req.nc,
1193 req_index, req_tag, req_hit_way)
1194
1195 with m.If(r0_valid):
1196 sync += r1.mmu_req.eq(r0.mmu_req)
1197
1198 # Fast path for load/store hits.
1199 # Set signals for the writeback controls.
1200 sync += r1.hit_way.eq(req_hit_way)
1201 sync += r1.hit_index.eq(req_index)
1202
1203 with m.If(req_op == Op.OP_LOAD_HIT):
1204 sync += r1.hit_load_valid.eq(1)
1205 with m.Else():
1206 sync += r1.hit_load_valid.eq(0)
1207
1208 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1209 sync += r1.cache_hit.eq(1)
1210 with m.Else():
1211 sync += r1.cache_hit.eq(0)
1212
1213 with m.If(req_op == Op.OP_BAD):
1214 sync += Display("Signalling ld/st error "
1215 "ls_error=%i mmu_error=%i cache_paradox=%i",
1216 ~r0.mmu_req, r0.mmu_req, access_ok)
1217 sync += r1.ls_error.eq(~r0.mmu_req)
1218 sync += r1.mmu_error.eq(r0.mmu_req)
1219 sync += r1.cache_paradox.eq(access_ok)
1220
1221 with m.Else():
1222 sync += r1.ls_error.eq(0)
1223 sync += r1.mmu_error.eq(0)
1224 sync += r1.cache_paradox.eq(0)
1225
1226 with m.If(req_op == Op.OP_STCX_FAIL):
1227 sync += r1.stcx_fail.eq(1)
1228 with m.Else():
1229 sync += r1.stcx_fail.eq(0)
1230
1231 # Record TLB hit information for updating TLB PLRU
1232 sync += r1.tlb_hit.eq(tlb_hit)
1233 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1234 sync += r1.tlb_hit_index.eq(tlb_req_index)
1235
1236 # Memory accesses are handled by this state machine:
1237 #
1238 # * Cache load miss/reload (in conjunction with "rams")
1239 # * Load hits for non-cachable forms
1240 # * Stores (the collision case is handled in "rams")
1241 #
1242 # All wishbone requests generation is done here.
1243 # This machine operates at stage 1.
1244 def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1245 cache_valids, r0, replace_way,
1246 req_hit_way, req_same_tag,
1247 r0_valid, req_op, cache_tags, req_go, ra):
1248
1249 comb = m.d.comb
1250 sync = m.d.sync
1251 wb_in = self.wb_in
1252 d_in = self.d_in
1253
1254 req = MemAccessRequest("mreq_ds")
1255
1256 req_row = Signal(ROW_BITS)
1257 req_idx = Signal(INDEX_BITS)
1258 req_tag = Signal(TAG_BITS)
1259 comb += req_idx.eq(get_index(req.real_addr))
1260 comb += req_row.eq(get_row(req.real_addr))
1261 comb += req_tag.eq(get_tag(req.real_addr))
1262
1263 sync += r1.use_forward1.eq(use_forward1_next)
1264 sync += r1.forward_sel.eq(0)
1265
1266 with m.If(use_forward1_next):
1267 sync += r1.forward_sel.eq(r1.req.byte_sel)
1268 with m.Elif(use_forward2_next):
1269 sync += r1.forward_sel.eq(r1.forward_sel1)
1270
1271 sync += r1.forward_data2.eq(r1.forward_data1)
1272 with m.If(r1.write_bram):
1273 sync += r1.forward_data1.eq(r1.req.data)
1274 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1275 sync += r1.forward_way1.eq(r1.req.hit_way)
1276 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1277 sync += r1.forward_valid1.eq(1)
1278 with m.Else():
1279 with m.If(r1.dcbz):
1280 sync += r1.forward_data1.eq(0)
1281 with m.Else():
1282 sync += r1.forward_data1.eq(wb_in.dat)
1283 sync += r1.forward_sel1.eq(~0) # all 1s
1284 sync += r1.forward_way1.eq(replace_way)
1285 sync += r1.forward_row1.eq(r1.store_row)
1286 sync += r1.forward_valid1.eq(0)
1287
1288 # One-cycle pulses: cleared by default every cycle
1289 sync += r1.slow_valid.eq(0)
1290 sync += r1.write_bram.eq(0)
1291 sync += r1.inc_acks.eq(0)
1292 sync += r1.dec_acks.eq(0)
1293
1294 sync += r1.ls_valid.eq(0)
1295 # complete tlbies and TLB loads in the third cycle
1296 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1297
1298 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1299 with m.If(~r0.mmu_req):
1300 sync += r1.ls_valid.eq(1)
1301 with m.Else():
1302 sync += r1.mmu_done.eq(1)
1303
1304 with m.If(r1.write_tag):
1305 # Store new tag in selected way
1306 for i in range(NUM_WAYS):
1307 with m.If(i == replace_way):
1308 ct = Signal(TAG_RAM_WIDTH)
1309 comb += ct.eq(cache_tags[r1.store_index])
1310 """
1311 TODO: check this
1312 cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
1313 (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
1314 """
1315 comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1316 sync += cache_tags[r1.store_index].eq(ct)
1317 sync += r1.store_way.eq(replace_way)
1318 sync += r1.write_tag.eq(0)
1319
1320 # Take request from r1.req if there is one there,
1321 # else from req_op, ra, etc.
1322 with m.If(r1.full):
1323 comb += req.eq(r1.req)
1324 with m.Else():
1325 comb += req.op.eq(req_op)
1326 comb += req.valid.eq(req_go)
1327 comb += req.mmu_req.eq(r0.mmu_req)
1328 comb += req.dcbz.eq(r0.req.dcbz)
1329 comb += req.real_addr.eq(ra)
1330
1331 with m.If(r0.req.dcbz):
1332 # force data to 0 for dcbz
1333 comb += req.data.eq(0)
1334 with m.Elif(r0.d_valid):
1335 comb += req.data.eq(r0.req.data)
1336 with m.Else():
1337 comb += req.data.eq(d_in.data)
1338
1339 # Select all bytes for dcbz
1340 # and for cacheable loads
1341 with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1342 comb += req.byte_sel.eq(~0) # all 1s
1343 with m.Else():
1344 comb += req.byte_sel.eq(r0.req.byte_sel)
1345 comb += req.hit_way.eq(req_hit_way)
1346 comb += req.same_tag.eq(req_same_tag)
1347
1348 # Store the incoming request from r0,
1349 # if it is a slow request
1350 # Note that r1.full = 1 implies req_op = OP_NONE
1351 with m.If((req_op == Op.OP_LOAD_MISS)
1352 | (req_op == Op.OP_LOAD_NC)
1353 | (req_op == Op.OP_STORE_MISS)
1354 | (req_op == Op.OP_STORE_HIT)):
1355 sync += r1.req.eq(req)
1356 sync += r1.full.eq(1)
1357
1358 # Main state machine
1359 with m.Switch(r1.state):
1360
1361 with m.Case(State.IDLE):
1362 sync += r1.wb.adr.eq(req.real_addr[ROW_OFF_BITS:]) # wb.adr is a row address
1363 sync += r1.wb.sel.eq(req.byte_sel)
1364 sync += r1.wb.dat.eq(req.data)
1365 sync += r1.dcbz.eq(req.dcbz)
1366
1367 # Keep track of our index and way
1368 # for subsequent stores.
1369 sync += r1.store_index.eq(req_idx)
1370 sync += r1.store_row.eq(req_row)
1371 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1372 sync += r1.reload_tag.eq(req_tag)
1373 sync += r1.req.same_tag.eq(1)
1374
1375 with m.If(req.op == Op.OP_STORE_HIT):
1376 sync += r1.store_way.eq(req.hit_way)
1377
1378 # Reset per-row valid bits,
1379 # ready for handling OP_LOAD_MISS
1380 for i in range(ROW_PER_LINE):
1381 sync += r1.rows_valid[i].eq(0)
1382
1383 with m.If(req_op != Op.OP_NONE):
1384 sync += Display("cache op %d", req.op)
1385
1386 with m.Switch(req.op):
1387 with m.Case(Op.OP_LOAD_HIT):
1388 # stay in IDLE state
1389 pass
1390
1391 with m.Case(Op.OP_LOAD_MISS):
1392 sync += Display("cache miss real addr: %x " \
1393 "idx: %x tag: %x",
1394 req.real_addr, req_row, req_tag)
1395
1396 # Start the wishbone cycle
1397 sync += r1.wb.we.eq(0)
1398 sync += r1.wb.cyc.eq(1)
1399 sync += r1.wb.stb.eq(1)
1400
1401 # Track that we had one request sent
1402 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1403 sync += r1.write_tag.eq(1)
1404
1405 with m.Case(Op.OP_LOAD_NC):
1406 sync += r1.wb.cyc.eq(1)
1407 sync += r1.wb.stb.eq(1)
1408 sync += r1.wb.we.eq(0)
1409 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1410
1411 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1412 with m.If(~req.dcbz):
1413 sync += r1.state.eq(State.STORE_WAIT_ACK)
1414 sync += r1.acks_pending.eq(1)
1415 sync += r1.full.eq(0)
1416 sync += r1.slow_valid.eq(1)
1417
1418 with m.If(~req.mmu_req):
1419 sync += r1.ls_valid.eq(1)
1420 with m.Else():
1421 sync += r1.mmu_done.eq(1)
1422
1423 with m.If(req.op == Op.OP_STORE_HIT):
1424 sync += r1.write_bram.eq(1)
1425 with m.Else():
1426 # dcbz is handled much like a load miss except
1427 # that we are writing to memory instead of reading
1428 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1429
1430 with m.If(req.op == Op.OP_STORE_MISS):
1431 sync += r1.write_tag.eq(1)
1432
1433 sync += r1.wb.we.eq(1)
1434 sync += r1.wb.cyc.eq(1)
1435 sync += r1.wb.stb.eq(1)
1436
1437 # OP_NONE and OP_BAD do nothing
1438 # OP_BAD & OP_STCX_FAIL were
1439 # handled above already
1440 with m.Case(Op.OP_NONE):
1441 pass
1442 with m.Case(Op.OP_BAD):
1443 pass
1444 with m.Case(Op.OP_STCX_FAIL):
1445 pass
1446
1447 with m.Case(State.RELOAD_WAIT_ACK):
1448 ld_stbs_done = Signal()
1449 # Requests are all sent if stb is 0
1450 comb += ld_stbs_done.eq(~r1.wb.stb)
1451
1452 # If we are still sending requests, was one accepted?
1453 with m.If((~wb_in.stall) & r1.wb.stb):
1454 # That was the last word? We are done sending.
1455 # Clear stb and set ld_stbs_done so we can handle an
1456 # eventual last ack on the same cycle.
1457 # sigh - reconstruct wb adr with 3 extra 0s at front
1458 wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1459 with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1460 sync += r1.wb.stb.eq(0)
1461 comb += ld_stbs_done.eq(1)
1462
1463 # Calculate the next row address in the current cache line
1464 row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1465 comb += row.eq(r1.wb.adr)
1466 sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1467
1468 # Incoming acks processing
1469 sync += r1.forward_valid1.eq(wb_in.ack)
1470 with m.If(wb_in.ack):
1471 srow = Signal(ROW_LINE_BITS)
1472 comb += srow.eq(r1.store_row)
1473 sync += r1.rows_valid[srow].eq(1)
1474
1475 # If this is the data we were looking for,
1476 # we can complete the request next cycle.
1477 # Compare the whole address in case the
1478 # request in r1.req is not the one that
1479 # started this refill.
1480 with m.If(req.valid & r1.req.same_tag &
1481 ((r1.dcbz & r1.req.dcbz) |
1482 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1483 (r1.store_row == get_row(req.real_addr))):
1484 sync += r1.full.eq(0)
1485 sync += r1.slow_valid.eq(1)
1486 with m.If(~r1.mmu_req):
1487 sync += r1.ls_valid.eq(1)
1488 with m.Else():
1489 sync += r1.mmu_done.eq(1)
1490 sync += r1.forward_sel.eq(~0) # all 1s
1491 sync += r1.use_forward1.eq(1)
1492
1493 # Check for completion
1494 with m.If(ld_stbs_done & is_last_row(r1.store_row,
1495 r1.end_row_ix)):
1496 # Complete wishbone cycle
1497 sync += r1.wb.cyc.eq(0)
1498
1499 # Cache line is now valid
1500 cv = Signal(NUM_WAYS) # one valid bit per way
1501 comb += cv.eq(cache_valids[r1.store_index])
1502 comb += cv.bit_select(r1.store_way, 1).eq(1)
1503 sync += cache_valids[r1.store_index].eq(cv)
1504
1505 sync += r1.state.eq(State.IDLE)
1506 sync += Display("cache valid set %x "
1507 "idx %d way %d",
1508 cv, r1.store_index, r1.store_way)
1509
1510 # Increment store row counter
1511 sync += r1.store_row.eq(next_row(r1.store_row))
1512
1513 with m.Case(State.STORE_WAIT_ACK):
1514 st_stbs_done = Signal()
1515 acks = Signal(3)
1516 adjust_acks = Signal(3)
1517
1518 comb += st_stbs_done.eq(~r1.wb.stb)
1519 comb += acks.eq(r1.acks_pending)
1520
1521 with m.If(r1.inc_acks != r1.dec_acks):
1522 with m.If(r1.inc_acks):
1523 comb += adjust_acks.eq(acks + 1)
1524 with m.Else():
1525 comb += adjust_acks.eq(acks - 1)
1526 with m.Else():
1527 comb += adjust_acks.eq(acks)
1528
1529 sync += r1.acks_pending.eq(adjust_acks)
1530
1531 # Clear stb when slave accepted request
1532 with m.If(~wb_in.stall):
1533 # See if there is another store waiting
1534 # to be done which is in the same real page.
1535 with m.If(req.valid):
1536 _ra = req.real_addr[ROW_OFF_BITS:SET_SIZE_BITS]
1537 sync += r1.wb.adr[:SET_SIZE_BITS-ROW_OFF_BITS].eq(_ra)
1538 sync += r1.wb.dat.eq(req.data)
1539 sync += r1.wb.sel.eq(req.byte_sel)
1540
1541 with m.If((adjust_acks < 7) & req.same_tag &
1542 ((req.op == Op.OP_STORE_MISS)
1543 | (req.op == Op.OP_STORE_HIT))):
1544 sync += r1.wb.stb.eq(1)
1545 comb += st_stbs_done.eq(0)
1546
1547 with m.If(req.op == Op.OP_STORE_HIT):
1548 sync += r1.write_bram.eq(1)
1549 sync += r1.full.eq(0)
1550 sync += r1.slow_valid.eq(1)
1551
1552 # Store requests never come from the MMU
1553 sync += r1.ls_valid.eq(1)
1554 comb += st_stbs_done.eq(0)
1555 sync += r1.inc_acks.eq(1)
1556 with m.Else():
1557 sync += r1.wb.stb.eq(0)
1558 comb += st_stbs_done.eq(1)
1559
1560 # Got ack ? See if complete.
1561 with m.If(wb_in.ack):
1562 with m.If(st_stbs_done & (adjust_acks == 1)):
1563 sync += r1.state.eq(State.IDLE)
1564 sync += r1.wb.cyc.eq(0)
1565 sync += r1.wb.stb.eq(0)
1566 sync += r1.dec_acks.eq(1)
1567
1568 with m.Case(State.NC_LOAD_WAIT_ACK):
1569 # Clear stb when slave accepted request
1570 with m.If(~wb_in.stall):
1571 sync += r1.wb.stb.eq(0)
1572
1573 # Got ack ? complete.
1574 with m.If(wb_in.ack):
1575 sync += r1.state.eq(State.IDLE)
1576 sync += r1.full.eq(0)
1577 sync += r1.slow_valid.eq(1)
1578
1579 with m.If(~r1.mmu_req):
1580 sync += r1.ls_valid.eq(1)
1581 with m.Else():
1582 sync += r1.mmu_done.eq(1)
1583
1584 sync += r1.forward_sel.eq(~0) # all 1s
1585 sync += r1.use_forward1.eq(1)
1586 sync += r1.wb.cyc.eq(0)
1587 sync += r1.wb.stb.eq(0)
1588
1589 def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out, req_op):
1590
1591 sync = m.d.sync
1592 d_out, wb_in, log_out = self.d_out, self.wb_in, self.log_out
1593
1594 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1595 stall_out, req_op[:3], d_out.valid, d_out.error,
1596 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1597 r1.req.real_addr[3:6]))
1598
1599 def elaborate(self, platform):
1600
1601 m = Module()
1602 comb = m.d.comb
1603 d_in = self.d_in
1604
1605 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1606 cache_tags = CacheTagArray()
1607 cache_tag_set = Signal(TAG_RAM_WIDTH)
1608 cache_valids = CacheValidBitsArray()
1609
1610 # TODO attribute ram_style : string;
1611 # TODO attribute ram_style of cache_tags : signal is "distributed";
1612
1613 """note: these are passed to nmigen.hdl.Memory as "attributes".
1614 don't know how, just that they are.
1615 """
1616 dtlb_valid_bits = TLBValidBitsArray()
1617 dtlb_tags = TLBTagsArray()
1618 dtlb_ptes = TLBPtesArray()
1619 # TODO attribute ram_style of
1620 # dtlb_tags : signal is "distributed";
1621 # TODO attribute ram_style of
1622 # dtlb_ptes : signal is "distributed";
1623
1624 r0 = RegStage0("r0")
1625 r0_full = Signal()
1626
1627 r1 = RegStage1("r1")
1628
1629 reservation = Reservation()
1630
1631 # Async signals on incoming request
1632 req_index = Signal(INDEX_BITS)
1633 req_row = Signal(ROW_BITS)
1634 req_hit_way = Signal(WAY_BITS)
1635 req_tag = Signal(TAG_BITS)
1636 req_op = Signal(Op)
1637 req_data = Signal(64)
1638 req_same_tag = Signal()
1639 req_go = Signal()
1640
1641 early_req_row = Signal(ROW_BITS)
1642
1643 cancel_store = Signal()
1644 set_rsrv = Signal()
1645 clear_rsrv = Signal()
1646
1647 r0_valid = Signal()
1648 r0_stall = Signal()
1649
1650 use_forward1_next = Signal()
1651 use_forward2_next = Signal()
1652
1653 cache_out_row = Signal(WB_DATA_BITS)
1654
1655 plru_victim = PLRUOut()
1656 replace_way = Signal(WAY_BITS)
1657
1658 # Wishbone read/write/cache write formatting signals
1659 bus_sel = Signal(8)
1660
1661 # TLB signals
1662 tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
1663 tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
1664 tlb_valid_way = Signal(TLB_NUM_WAYS)
1665 tlb_req_index = Signal(TLB_SET_BITS)
1666 tlb_hit = Signal()
1667 tlb_hit_way = Signal(TLB_WAY_BITS)
1668 pte = Signal(TLB_PTE_BITS)
1669 ra = Signal(REAL_ADDR_BITS)
1670 valid_ra = Signal()
1671 perm_attr = PermAttr("dc_perms")
1672 rc_ok = Signal()
1673 perm_ok = Signal()
1674 access_ok = Signal()
1675
1676 tlb_plru_victim = TLBPLRUOut()
1677
1678 # we don't yet handle collisions between loadstore1 requests
1679 # and MMU requests
1680 comb += self.m_out.stall.eq(0)
1681
1682 # Hold off the request in r0 when r1 has an uncompleted request
1683 comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1684 comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1685 comb += self.stall_out.eq(r0_stall)
1686
1687 # Wire up wishbone request latch out of stage 1
1688 comb += self.wb_out.eq(r1.wb)
1689
1690 # deal with litex not doing wishbone pipeline mode
1691 # XXX in wrong way. FIFOs are needed in the SRAM test
1692 # so that stb/ack match up
1693 comb += self.wb_in.stall.eq(self.wb_out.cyc & ~self.wb_in.ack)
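# (with a single outstanding access this makes pipelined mode look like
# a classic cycle: stall whenever a cycle is in progress and the ack has
# not yet arrived, per the note at the top of this file)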
1694
1695 # call sub-functions putting everything together, using shared
1696 # signals established above
1697 self.stage_0(m, r0, r1, r0_full)
1698 self.tlb_read(m, r0_stall, tlb_valid_way,
1699 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
1700 dtlb_tags, dtlb_ptes)
1701 self.tlb_search(m, tlb_req_index, r0, r0_valid,
1702 tlb_valid_way, tlb_tag_way, tlb_hit_way,
1703 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra)
1704 self.tlb_update(m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
1705 tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
1706 dtlb_tags, tlb_pte_way, dtlb_ptes)
1707 self.maybe_plrus(m, r1, plru_victim)
1708 self.maybe_tlb_plrus(m, r1, tlb_plru_victim)
1709 self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1710 self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1711 r0_valid, r1, cache_valids, replace_way,
1712 use_forward1_next, use_forward2_next,
1713 req_hit_way, plru_victim, rc_ok, perm_attr,
1714 valid_ra, perm_ok, access_ok, req_op, req_go,
1715 tlb_pte_way,
1716 tlb_hit, tlb_hit_way, tlb_valid_way, cache_tag_set,
1717 cancel_store, req_same_tag, r0_stall, early_req_row)
1718 self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1719 r0_valid, r0, reservation)
1720 self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1721 reservation, r0)
1722 self.writeback_control(m, r1, cache_out_row)
1723 self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1724 self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1725 req_hit_way, req_index, req_tag, access_ok,
1726 tlb_hit, tlb_hit_way, tlb_req_index)
1727 self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1728 cache_valids, r0, replace_way,
1729 req_hit_way, req_same_tag,
1730 r0_valid, req_op, cache_tags, req_go, ra)
1731 #self.dcache_log(m, r1, valid_ra, tlb_hit_way, self.stall_out, req_op)
1732
1733 return m
1734
1735
1736 if __name__ == '__main__':
1737 dut = DCache()
1738 vl = rtlil.convert(dut, ports=[])
1739 with open("test_dcache.il", "w") as f:
1740 f.write(vl)
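
# The test_dcache.il written above is yosys RTLIL; assuming a reasonably
# recent yosys it can be inspected with, for example:
#   yosys -p 'read_rtlil test_dcache.il; stat'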