src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8
   9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  10 is raised.  sigh
  11
  12 Links:
  13
  14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  16 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
  17   (discussion about brams for ECP5)
  18
  19 """
  20
  21 import sys
  22
  23 from nmutil.gtkw import write_gtkw
  24
  25 sys.setrecursionlimit(1000000)
  26
  27 from enum import Enum, unique
  28
  29 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
  30                     Record, Memory)
  31 from nmutil.util import Display
  32 from nmigen.lib.coding import Decoder
  33
  34 from copy import deepcopy
  35 from random import randint, seed
  36
  37 from nmigen_soc.wishbone.bus import Interface
  38
  39 from nmigen.cli import main
  40 from nmutil.iocontrol import RecordObject
  41 from nmigen.utils import log2_int
  42 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  43                                      DCacheToLoadStore1Type,
  44                                      MMUToDCacheType,
  45                                      DCacheToMMUType)
  46
  47 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  48                                 WBAddrType, WBDataType, WBSelType,
  49                                 WBMasterOut, WBSlaveOut,
  50                                 WBMasterOutVector, WBSlaveOutVector,
  51                                 WBIOMasterOut, WBIOSlaveOut)
  52
  53 from soc.experiment.cache_ram import CacheRam
  54 from soc.experiment.plru import PLRU, PLRUs
  55 #from nmutil.plru import PLRU, PLRUs
  56
  57 # for test
  58 from soc.bus.sram import SRAM
  59 from nmigen import Memory
  60 from nmigen.cli import rtlil
  61
  62 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  63 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  64 from nmutil.sim_tmp_alternative import Simulator
  65
  66 from nmutil.util import wrap
  67
  68
# Cache and L1 DTLB geometry parameters.
# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 64    # Number of lines in a set
NUM_WAYS = 2      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of ways (TLB_WAY_BITS is derived from it)
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection
  77
  78 # BRAM organisation: We never access more than
  79 #     -- WB_DATA_BITS at a time so to save
  80 #     -- resources we make the array only that wide, and
  81 #     -- use consecutive indices to make a cache "line"
  82 #     --
  83 #     -- ROW_SIZE is the width in bytes of the BRAM
  84 #     -- (based on WB, so 64-bits)
  85 ROW_SIZE = WB_DATA_BITS // 8;
  86
  87 # ROW_PER_LINE is the number of row (wishbone
  88 # transactions) in a line
  89 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  90
  91 # BRAM_ROWS is the number of rows in BRAM needed
  92 # to represent the full dcache
  93 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  94
  95 print ("ROW_SIZE", ROW_SIZE)
  96 print ("ROW_PER_LINE", ROW_PER_LINE)
  97 print ("BRAM_ROWS", BRAM_ROWS)
  98 print ("NUM_WAYS", NUM_WAYS)
  99
# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to a whole number of bytes)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
 137
 138 # Example of layout for 32 lines of 64 bytes:
 139 layout = f"""\
 140   DCache Layout:
 141  |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
 142   ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
 143   ..  tag    |index|  line  |
 144   ..         |   row   |    |
 145   ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
 146   ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
 147   ..         |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
 148   ..         |----- ---|    | ROW_BITS      ({ROW_BITS})
 149   ..         |-----|        | INDEX_BITS    ({INDEX_BITS})
 150   .. --------|              | TAG_BITS      ({TAG_BITS})
 151 """
 152 print (layout)
 153 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 154             (TAG_BITS, INDEX_BITS, ROW_BITS,
 155              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 156 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 157 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 158 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 159
 160 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 161
 162 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 163 print ("    TAG_WIDTH", TAG_WIDTH)
 164 print ("     NUM_WAYS", NUM_WAYS)
 165 print ("    NUM_LINES", NUM_LINES)
 166
 167
 168 def CacheTag(name=None):
 169     tag_layout = [('valid', NUM_WAYS),
 170                   ('tag', TAG_RAM_WIDTH),
 171                  ]
 172     return Record(tag_layout, name=name)
 173
 174
 175 def CacheTagArray():
 176     return Array(CacheTag(name="tag%d" % x) for x in range(NUM_LINES))
 177
 178
 179 def RowPerLineValidArray():
 180     return Array(Signal(name="rows_valid%d" % x) \
 181                         for x in range(ROW_PER_LINE))
 182
 183
 184 # L1 TLB
 185 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 186 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 187 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 188 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 189 TLB_PTE_BITS     = 64
 190 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 191
 192 def ispow2(x):
 193     return (1<<log2_int(x, False)) == x
 194
 195 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 196 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 197 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 198 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 199 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 200 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 201         "geometry bits don't add up"
 202 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 203         "geometry bits don't add up"
 204 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 205          "geometry bits don't add up"
 206 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 207 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 208
 209
 210 def TLBHit(name):
 211     return Record([('valid', 1),
 212                    ('way', TLB_WAY_BITS)], name=name)
 213
 214 def TLBTagEAArray():
 215     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 216                 for x in range (TLB_NUM_WAYS))
 217
 218 def TLBRecord(name):
 219     tlb_layout = [('valid', TLB_NUM_WAYS),
 220                   ('tag', TLB_TAG_WAY_BITS),
 221                   ('pte', TLB_PTE_WAY_BITS)
 222                  ]
 223     return Record(tlb_layout, name=name)
 224
 225 def TLBValidArray():
 226     return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
 227                         for x in range(TLB_SET_SIZE))
 228
 229 def HitWaySet():
 230     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 231                         for x in range(TLB_NUM_WAYS))
 232
 233 # Cache RAM interface
 234 def CacheRamOut():
 235     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 236                  for x in range(NUM_WAYS))
 237
 238 # PLRU output interface
 239 def PLRUOut():
 240     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 241                 for x in range(NUM_LINES))
 242
 243 # TLB PLRU output interface
 244 def TLBPLRUOut():
 245     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 246                 for x in range(TLB_SET_SIZE))
 247
 248 # Helper functions to decode incoming requests
 249 #
 250 # Return the cache line index (tag index) for an address
 251 def get_index(addr):
 252     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 253
 254 # Return the cache row index (data memory) for an address
 255 def get_row(addr):
 256     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 257
 258 # Return the index of a row within a line
 259 def get_row_of_line(row):
 260     return row[:ROW_BITS][:ROW_LINE_BITS]
 261
 262 # Returns whether this is the last row of a line
 263 def is_last_row_addr(addr, last):
 264     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 265
 266 # Returns whether this is the last row of a line
 267 def is_last_row(row, last):
 268     return get_row_of_line(row) == last
 269
 270 # Return the next row in the current cache line. We use a
 271 # dedicated function in order to limit the size of the
 272 # generated adder to be only the bits within a cache line
 273 # (3 bits with default settings)
 274 def next_row(row):
 275     row_v = row[0:ROW_LINE_BITS] + 1
 276     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 277
 278 # Get the tag value from the address
 279 def get_tag(addr):
 280     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 281
 282 # Read a tag from a tag memory row
 283 def read_tag(way, tagset):
 284     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 285
 286 # Read a TLB tag from a TLB tag memory row
 287 def read_tlb_tag(way, tags):
 288     return tags.word_select(way, TLB_EA_TAG_BITS)
 289
 290 # Write a TLB tag to a TLB tag memory row
 291 def write_tlb_tag(way, tags, tag):
 292     return read_tlb_tag(way, tags).eq(tag)
 293
 294 # Read a PTE from a TLB PTE memory row
 295 def read_tlb_pte(way, ptes):
 296     return ptes.word_select(way, TLB_PTE_BITS)
 297
 298 def write_tlb_pte(way, ptes, newpte):
 299     return read_tlb_pte(way, ptes).eq(newpte)
 300
 301
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Single-bit permission/attribute flags associated with a PTE.

    Every field is a one-bit Signal.  Which PTE bit feeds each field is
    not decided here: this class only declares the fields.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed   = Signal()
        self.nocache   = Signal()
        self.priv      = Signal()
        self.rd_perm   = Signal()
        self.wr_perm   = Signal()
 312
 313
 314 def extract_perm_attr(pte):
 315     pa = PermAttr()
 316     return pa;
 317
 318
 319 # Type of operation on a "valid" input
 320 @unique
 321 class Op(Enum):
 322     OP_NONE       = 0
 323     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 324     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 325     OP_LOAD_HIT   = 3 # Cache hit on load
 326     OP_LOAD_MISS  = 4 # Load missing cache
 327     OP_LOAD_NC    = 5 # Non-cachable load
 328     OP_STORE_HIT  = 6 # Store hitting cache
 329     OP_STORE_MISS = 7 # Store missing cache
 330
 331
 332 # Cache state machine
 333 @unique
 334 class State(Enum):
 335     IDLE             = 0 # Normal load hit processing
 336     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 337     STORE_WAIT_ACK   = 2 # Store wait ack
 338     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 339
 340
 341 # Dcache operations:
 342 #
 343 # In order to make timing, we use the BRAMs with
 344 # an output buffer, which means that the BRAM
 345 # output is delayed by an extra cycle.
 346 #
 347 # Thus, the dcache has a 2-stage internal pipeline
 348 # for cache hits with no stalls.
 349 #
 350 # All other operations are handled via stalling
 351 # in the first stage.
 352 #
 353 # The second stage can thus complete a hit at the same
 354 # time as the first stage emits a stall for a complex op.
 355 #
 356 # Stage 0 register, basically contains just the latched request
 357
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus its qualifier flags.

    All flags are single-bit Signals.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # the request itself, in loadstore1-to-dcache format
        self.req     = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie   = Signal() # indicates a tlbie request (from MMU)
        self.doall   = Signal() # with tlbie, indicates flush whole TLB
        self.tlbld   = Signal() # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now
 367
 368
class MemAccessRequest(RecordObject):
    """Decoded memory access request (used as RegStage1.req)."""
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op        = Signal(Op)              # operation, an Op enum value
        self.valid     = Signal()
        self.dcbz      = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)  # real address of the access
        self.data      = Signal(64)
        self.byte_sel  = Signal(8)  # presumably one bit per data byte - confirm
        self.hit_way   = Signal(WAY_BITS)        # cache way selector
        self.same_tag  = Signal()
        self.mmu_req   = Signal()
 381
 382
 383 # First stage register, contains state for stage 1 of load hits
 384 # and for the state machine used by all other operations
class RegStage1(RecordObject):
    """Stage 1 register.

    Groups the pending request, cache/TLB hit state, the write-to-read
    forwarding buffer, the reload state machine registers and the
    completion/error flags into one record.
    """
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full             = Signal() # have uncompleted request
        self.mmu_req          = Signal() # request is from MMU
        self.req              = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way          = Signal(WAY_BITS)
        self.hit_load_valid   = Signal()
        self.hit_index        = Signal(INDEX_BITS)
        self.cache_hit        = Signal()

        # TLB hit state
        self.tlb_hit          = TLBHit("tlb_hit")
        self.tlb_hit_index    = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1    = Signal(64)
        self.forward_data2    = Signal(64)
        self.forward_sel1     = Signal(8)
        self.forward_valid1   = Signal()
        self.forward_way1     = Signal(WAY_BITS)
        self.forward_row1     = Signal(ROW_BITS)
        self.use_forward1     = Signal()
        self.forward_sel      = Signal(8)

        # Cache miss state (reload state machine)
        self.state            = Signal(State)  # holds a State enum value
        self.dcbz             = Signal()
        self.write_bram       = Signal()
        self.write_tag        = Signal()
        self.slow_valid       = Signal()
        self.wb               = WBMasterOut("wb")
        self.reload_tag       = Signal(TAG_BITS)
        self.store_way        = Signal(WAY_BITS)
        self.store_row        = Signal(ROW_BITS)
        self.store_index      = Signal(INDEX_BITS)
        self.end_row_ix       = Signal(ROW_LINE_BITS)
        self.rows_valid       = RowPerLineValidArray()  # one bit per row
        self.acks_pending     = Signal(3)
        self.inc_acks         = Signal()
        self.dec_acks         = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid         = Signal()
        self.ls_error         = Signal()
        self.mmu_done         = Signal()
        self.mmu_error        = Signal()
        self.cache_paradox    = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail        = Signal()
 439
 440
 441 # Reservation information
class Reservation(RecordObject):
    """Reservation information: a valid flag and a reserved address."""
    def __init__(self, name=None):
        super().__init__(name=name)
        self.valid = Signal()
        # 64 address bits minus the in-line offset bits, i.e. presumably
        # the reservation is tracked per cache line - confirm
        self.addr  = Signal(64-LINE_OFF_BITS)
 447
 448
class DTLBUpdate(Elaboratable):
    """L1 DTLB storage with invalidate, write and read handling.

    Tags and PTEs are kept in two Memories, TLB_SET_SIZE deep, each entry
    holding all TLB_NUM_WAYS ways side by side.  The per-way valid bits
    are kept in plain registers so that they can all be cleared at once.

    Control inputs, in priority order:

    * tlbie & doall: clear every valid bit
    * tlbie:         clear the valid bit of tlb_hit.way (if tlb_hit.valid)
                     in set tlb_req_index
    * tlbwe:         write eatag / pte_data into way repl_way of set
                     tlb_req_index and set its valid bit

    Read side: tlb_read / tlb_read_index select a set, whose contents are
    presented on the tlb_way record.
    """
    def __init__(self):
        # update (invalidate / write) controls
        self.tlbie    = Signal()
        self.tlbwe    = Signal()
        self.doall    = Signal()
        self.tlb_hit     = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        # way to replace and the new tag / PTE to store there (tlbwe)
        self.repl_way        = Signal(TLB_WAY_BITS)
        self.eatag           = Signal(TLB_EA_TAG_BITS)
        self.pte_data        = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read       = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way        = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        """Build the TLB memories, valid registers and update/read logic."""
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        # there are 3 parts to this:
        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
        # be a Memory because they can all be cleared (tlbie, doall), i mean,
        # we _could_, in theory, by overriding the Reset Signal of the Memory,
        # hmmm....

        dtlb_valid = TLBValidArray()
        tlb_req_index = self.tlb_req_index

        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        # (write granularity is one way, so there is one enable bit per way)
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)

        # commented out for now, can be put in if Memory.reset can be
        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
        #m.submodules.rd_valid = rd_valid = validm.read_port()
        #m.submodules.wr_valid = wr_valid = validm.write_port(
                                    #granularity=1)

        # connect up read and write addresses to Valid/PTE/TAG SRAMs
        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
        #m.d.comb += wr_valid.addr.eq(tlb_req_index)

        updated  = Signal()   # tag and PTE are to be written this cycle
        v_updated  = Signal() # valid bits are to be written this cycle
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t

        # start from the current valid bits of the addressed set; the
        # branches below modify individual bits of db_out
        comb += dv.eq(dtlb_valid[tlb_req_index])
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            # XXX hmmm, validm _could_ use Memory reset here...
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid[i].eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        # above, sometimes valid is requested to be updated but data not
        # therefore split them out, here.  note the granularity thing matches
        # with the shift-up of the eatag/pte_data into the correct TLB way.
        # thus is it not necessary to write the entire lot, just the portion
        # being altered: hence writing the *old* copy of the row is not needed
        with m.If(updated): # PTE and TAG to be written
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
        with m.If(v_updated): # Valid to be written
            sync += dtlb_valid[tlb_req_index].eq(db_out)
            #comb += wr_valid.data.eq(db_out)
            #comb += wr_valid.en.eq(1<<self.repl_way)

        # select one TLB way, use a register here
        # (r_delay is tlb_read delayed by one clock)
        r_delay = Signal()
        sync += r_delay.eq(self.tlb_read)
        # first deal with the valids, which are not in a Memory.
        # tlb way valid is output on a 1 clock delay with sync,
        # but have to explicitly deal with "forwarding" here
        # NOTE(review): the forwarding branch does not compare
        # tlb_read_index with tlb_req_index - confirm that a read and a
        # valid-update in the same cycle always address the same set.
        with m.If(self.tlb_read):
            with m.If(v_updated): # write *and* read in same cycle: forward
                sync += self.tlb_way.valid.eq(db_out)
            with m.Else():
                sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
        # now deal with the Memory-read case. the output must remain
        # valid (stable) even when a read-request is not made, but stable
        # on a one-clock delay, hence the register
        r_tlb_way        = TLBRecord("r_tlb_way")
        with m.If(r_delay):
            # on one clock delay, capture the contents of the read port(s)
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            # ... so that the register can output it when no read is requested
            # it's rather overkill but better to be safe than sorry
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
            #comb += self.tlb_way.eq(r_tlb_way)

        return m
 589
 590
class DCachePendingHit(Elaboratable):
    """Combinatorial hit detection for the pending request.

    Inputs passed to the constructor (stored, not copied):

    * tlb_way:          TLB set contents (valid / pte fields are read)
    * cache_i_validdx:  per-way valid bits of the indexed cache line
    * cache_tag_set:    the tag memory row of the indexed cache line
    * req_addr:         address of the pending request

    Inputs set by the user of this module: go, virt_mode, tlb_hit and
    reload_tag.  Outputs: is_hit, hit_way and rel_match.
    """

    def __init__(self, tlb_way,
                      cache_i_validdx, cache_tag_set,
                    req_addr):

        self.go          = Signal()
        self.virt_mode   = Signal()
        self.is_hit      = Signal()
        self.tlb_hit      = TLBHit("tlb_hit")
        self.hit_way     = Signal(WAY_BITS)
        self.rel_match   = Signal()
        self.req_index   = Signal(INDEX_BITS)
        self.reload_tag  = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr

    def elaborate(self, platform):
        """Build the tag comparison logic (comb domain only)."""
        m = Module()
        comb = m.d.comb
        sync = m.d.sync  # not used below

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index  # not used below
        reload_tag = self.reload_tag

        # per-TLB-way intermediate results, selected by tlb_hit.way later
        hit_set     = Array(Signal(name="hit_set_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                    for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit       = Signal(name="s_hit%d" % j)
                s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
                s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
                # read the PTE, calc the Real Address, get the tag:
                # page offset comes from the request address, the upper
                # real address bits come from the PTE
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))
                # for each way check the tag against the cache tag set
                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                comb += rel_matches[j].eq(s_tag == reload_tag)
            # pick the results belonging to the TLB way that actually hit
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            # real mode: the tag comes straight from the request address
            s_tag       = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m
 681
 682
 683 class DCache(Elaboratable):
 684     """Set associative dcache write-through
 685
 686     TODO (in no specific order):
 687     * See list in icache.vhdl
 688     * Complete load misses on the cycle when WB data comes instead of
 689       at the end of line (this requires dealing with requests coming in
 690       while not idle...)
 691     """
 692     def __init__(self, pspec=None):
 693         self.d_in      = LoadStore1ToDCacheType("d_in")
 694         self.d_out     = DCacheToLoadStore1Type("d_out")
 695
 696         self.m_in      = MMUToDCacheType("m_in")
 697         self.m_out     = DCacheToMMUType("m_out")
 698
 699         self.stall_out = Signal()
 700         self.any_stall_out = Signal()
 701         self.dreq_when_stall = Signal()
 702         self.mreq_when_stall = Signal()
 703
 704         # standard naming (wired to non-standard for compatibility)
 705         self.bus = Interface(addr_width=32,
 706                             data_width=64,
 707                             granularity=8,
 708                             features={'stall'},
 709                             alignment=0,
 710                             name="dcache")
 711
 712         self.log_out   = Signal(20)
 713
 714         # test if microwatt compatibility is to be enabled
 715         self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
 716                                  (pspec.microwatt_compat == True))
 717
    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling

        A candidate RegStage0 record "r" is built combinatorially from
        the MMU port (when m_in.valid) or else from the loadstore port,
        and is then registered into r0.

        m:        the Module being built
        r0:       stage 0 register, written here
        r1:       stage 1 register, only r1.full is read here
        r0_full:  flag registered alongside r0, set from r.req.valid
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            # MMU request: treated as a privileged real-mode access,
            # a load unless it is a tlbie or tlbld
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            comb += r.d_valid.eq(1) # the PTE data is available immediately
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                 m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            # loadstore request: copy it, but zero the data and mark it
            # not-yet-valid (it is sampled one cycle later, below)
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
            comb += r.d_valid.eq(0)

        # default; overridden just below whenever a new request is latched
        # NOTE(review): when the latch condition below is false this
        # default still clears r0_full while r0 itself is held - confirm
        # that is intended.
        sync += r0_full.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        with m.Elif(~r0.d_valid):
            # Sample data the cycle after a request comes in from loadstore1.
            # If another request has come in already then the data will get
            # put directly into req.data below.
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                 r.req.virt_mode, r.req.addr,
                                 r.req.data, r.req.load)
 774
 775     def tlb_read(self, m, r0_stall, tlb_way):
 776         """TLB
 777         Operates in the second cycle on the request latched in r0.req.
 778         TLB updates write the entry at the end of the second cycle.
 779         """
 780         comb = m.d.comb
 781         sync = m.d.sync
 782         m_in, d_in = self.m_in, self.d_in
 783
 784         addrbits = Signal(TLB_SET_BITS)
 785
 786         amin = TLB_LG_PGSZ
 787         amax = TLB_LG_PGSZ + TLB_SET_BITS
 788
 789         with m.If(m_in.valid):
 790             comb += addrbits.eq(m_in.addr[amin : amax])
 791         with m.Else():
 792             comb += addrbits.eq(d_in.addr[amin : amax])
 793
 794         # If we have any op and the previous op isn't finished,
 795         # then keep the same output for next cycle.
 796         d = self.dtlb_update
 797         comb += d.tlb_read_index.eq(addrbits)
 798         comb += d.tlb_read.eq(~r0_stall)
 799         comb += tlb_way.eq(d.tlb_way)
 800
 801     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
 802         """Generate TLB PLRUs
 803         """
 804         comb = m.d.comb
 805         sync = m.d.sync
 806
 807         if TLB_NUM_WAYS == 0:
 808             return
 809
 810         # suite of PLRUs with a selection and output mechanism
 811         tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
 812         m.submodules.tlb_plrus = tlb_plrus
 813         comb += tlb_plrus.way.eq(r1.tlb_hit.way)
 814         comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
 815         comb += tlb_plrus.index.eq(r1.tlb_hit_index)
 816         comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
 817         comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 818
    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):
        """TLB lookup and address translation for the request held in r0

        Inputs read here:

        * r0: stage-0 request register (r0.req.addr, r0.req.virt_mode)
        * r0_valid: qualifies the request held in r0
        * tlb_way: the TLB set to search (.valid bits, .tag and .pte)

        Outputs driven here (all in the comb domain):

        * tlb_req_index: TLB set index sliced out of the request address
        * tlb_hit: .valid is set when a way matched and r0_valid is set,
          .way is the matching way
        * pte: PTE of the hit way (only assigned when tlb_hit.valid)
        * valid_ra: set on a TLB hit, or whenever virt_mode is clear
        * ra: real address with the low ROW_OFF_BITS bits forced to zero
        * perm_attr: taken from individual PTE bits in virtual mode,
          fixed fully-permissive values otherwise

        The values are also reported through Display whenever valid_ra
        is set.
        """

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit    = Signal()
        eatag  = Signal(TLB_EA_TAG_BITS)

        # split the effective address: set index just above the page
        # offset, and the EA tag is everything above the set index
        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        # compare the tag of every way against the EA tag.  the If blocks
        # are emitted in way order, so should more than one way match, the
        # assignment for the highest-numbered way is the one that applies.
        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        # a tag match only counts as a hit if the request in r0 is valid
        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        # requests that are not in virtual mode need no translation
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            # translated: in-page offset from the request address, the
            # bits from TLB_LG_PGSZ upwards come from the PTE
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            # untranslated: the request address is used directly and
            # every permission/attribute check is made to pass
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        # debug output only
        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 878
 879     def tlb_update(self, m, r0_valid, r0, tlb_req_index,
 880                     tlb_hit, tlb_plru_victim):
 881
 882         comb = m.d.comb
 883         sync = m.d.sync
 884
 885         tlbie    = Signal()
 886         tlbwe    = Signal()
 887
 888         comb += tlbie.eq(r0_valid & r0.tlbie)
 889         comb += tlbwe.eq(r0_valid & r0.tlbld)
 890
 891         d = self.dtlb_update
 892
 893         comb += d.tlbie.eq(tlbie)
 894         comb += d.tlbwe.eq(tlbwe)
 895         comb += d.doall.eq(r0.doall)
 896         comb += d.tlb_hit.eq(tlb_hit)
 897         comb += d.tlb_req_index.eq(tlb_req_index)
 898
 899         with m.If(tlb_hit.valid):
 900             comb += d.repl_way.eq(tlb_hit.way)
 901         with m.Else():
 902             comb += d.repl_way.eq(tlb_plru_victim)
 903         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 904         comb += d.pte_data.eq(r0.req.data)
 905
 906     def maybe_plrus(self, m, r1, plru_victim):
 907         """Generate PLRUs
 908         """
 909         comb = m.d.comb
 910         sync = m.d.sync
 911
 912         if TLB_NUM_WAYS == 0:
 913             return
 914
 915         # suite of PLRUs with a selection and output mechanism
 916         m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
 917         comb += plrus.way.eq(r1.hit_way)
 918         comb += plrus.valid.eq(r1.cache_hit)
 919         comb += plrus.index.eq(r1.hit_index)
 920         comb += plrus.isel.eq(r1.store_index) # select victim
 921         comb += plru_victim.eq(plrus.o_index) # selected victim
 922
 923     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 924         """Cache tag RAM read port
 925         """
 926         comb = m.d.comb
 927         sync = m.d.sync
 928         m_in, d_in = self.m_in, self.d_in
 929
 930         index = Signal(INDEX_BITS)
 931
 932         with m.If(r0_stall):
 933             comb += index.eq(req_index)
 934         with m.Elif(m_in.valid):
 935             comb += index.eq(get_index(m_in.addr))
 936         with m.Else():
 937             comb += index.eq(get_index(d_in.addr))
 938         sync += cache_tag_set.eq(cache_tags[index].tag)
 939
    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection

        Decodes the request held in r0 and drives, in the comb domain
        (the only sync statements here are debug Displays):

        * req_index, req_row: line index and row of the request address
        * req_tag: tag taken from the real address (ra)
        * req_hit_way, req_same_tag: hit way / match against the tag
          being reloaded, as reported by the DCachePendingHit submodule
          (the hit way is overridden when the request falls in the line
          currently being reloaded)
        * use_forward1_next, use_forward2_next: whether data written one
          or two cycles ago has to be bypassed to this request
        * replace_way: way to replace on a miss
        * rc_ok, perm_ok, access_ok: permission check results
        * req_op, req_go: the decoded operation and its "go" qualifier
        * early_req_row: row used to address the cache RAM read port
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit      = Signal()
        hit_way     = Signal(WAY_BITS)
        op          = Signal(Op)
        opsel       = Signal(3)
        go          = Signal()
        nc          = Signal()
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                    r0.req.addr, ra, req_index, req_tag, req_row)

        # a request "goes" when r0 is valid, it is not a TLB operation
        # (tlbie / tlbld), and no loadstore error is flagged in r1
        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        # per-way valid bits of the line being accessed
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        # the actual tag comparison is delegated to a submodule
        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        # (these assignments come later than the ones above, so they
        # take priority over the submodule's is_hit / hit_way)
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            # rrow is ROW_LINE_BITS wide: it indexes the per-row valid
            # bits (the assignment truncates if req_row is wider)
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        # same row and way as the write recorded in forward_row1 /
        # forward_way1: use that data if it was flagged valid
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        # (while the tag is being written the PLRU victim is used,
        # afterwards the way stored in r1.store_way)
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                              (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        # default when the request does not "go"
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                 valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                 valid_ra, nc, r0.req.load)
                # opsel bit 0 = is_hit, bit 1 = nc, bit 2 = load
                # (Cat places its first argument in the lowest bit)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            # not stalled: take the row from the incoming request,
            # with the MMU port checked first
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)
1070
1071     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
1072                          r0_valid, r0, reservation):
1073         """Handle load-with-reservation and store-conditional instructions
1074         """
1075         comb = m.d.comb
1076
1077         with m.If(r0_valid & r0.req.reserve):
1078             # XXX generate alignment interrupt if address
1079             # is not aligned XXX or if r0.req.nc = '1'
1080             with m.If(r0.req.load):
1081                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
1082             with m.Else():
1083                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1084                 with m.If((~reservation.valid) |
1085                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1086                     comb += cancel_store.eq(1)
1087
1088     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1089                         reservation, r0):
1090         comb = m.d.comb
1091         sync = m.d.sync
1092
1093         with m.If(r0_valid & access_ok):
1094             with m.If(clear_rsrv):
1095                 sync += reservation.valid.eq(0)
1096             with m.Elif(set_rsrv):
1097                 sync += reservation.valid.eq(1)
1098                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1099
1100     def writeback_control(self, m, r1, cache_out_row):
1101         """Return data for loads & completion control logic
1102         """
1103         comb = m.d.comb
1104         sync = m.d.sync
1105         d_out, m_out = self.d_out, self.m_out
1106
1107         data_out = Signal(64)
1108         data_fwd = Signal(64)
1109
1110         # Use the bypass if are reading the row that was
1111         # written 1 or 2 cycles ago, including for the
1112         # slow_valid = 1 case (i.e. completing a load
1113         # miss or a non-cacheable load).
1114         with m.If(r1.use_forward1):
1115             comb += data_fwd.eq(r1.forward_data1)
1116         with m.Else():
1117             comb += data_fwd.eq(r1.forward_data2)
1118
1119         comb += data_out.eq(cache_out_row)
1120
1121         for i in range(8):
1122             with m.If(r1.forward_sel[i]):
1123                 dsel = data_fwd.word_select(i, 8)
1124                 comb += data_out.word_select(i, 8).eq(dsel)
1125
1126         # DCache output to LoadStore
1127         comb += d_out.valid.eq(r1.ls_valid)
1128         comb += d_out.data.eq(data_out)
1129         comb += d_out.store_done.eq(~r1.stcx_fail)
1130         comb += d_out.error.eq(r1.ls_error)
1131         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1132
1133         # Outputs to MMU
1134         comb += m_out.done.eq(r1.mmu_done)
1135         comb += m_out.err.eq(r1.mmu_error)
1136         comb += m_out.data.eq(data_out)
1137
1138         # We have a valid load or store hit or we just completed
1139         # a slow op such as a load miss, a NC load or a store
1140         #
1141         # Note: the load hit is delayed by one cycle. However it
1142         # can still not collide with r.slow_valid (well unless I
1143         # miscalculated) because slow_valid can only be set on a
1144         # subsequent request and not on its first cycle (the state
1145         # machine must have advanced), which makes slow_valid
1146         # at least 2 cycles from the previous hit_load_valid.
1147
1148         # Sanity: Only one of these must be set in any given cycle
1149
1150         if False: # TODO: need Display to get this to work
1151             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1152             "unexpected slow_valid collision with stcx_fail"
1153
1154             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1155              "unexpected hit_load_delayed collision with slow_valid"
1156
1157         with m.If(~r1.mmu_req):
1158             # Request came from loadstore1...
1159             # Load hit case is the standard path
1160             with m.If(r1.hit_load_valid):
1161                 sync += Display("completing load hit data=%x", data_out)
1162
1163             # error cases complete without stalling
1164             with m.If(r1.ls_error):
1165                 with m.If(r1.dcbz):
1166                     sync += Display("completing dcbz with error")
1167                 with m.Else():
1168                     sync += Display("completing ld/st with error")
1169
1170             # Slow ops (load miss, NC, stores)
1171             with m.If(r1.slow_valid):
1172                 sync += Display("completing store or load miss adr=%x data=%x",
1173                                 r1.req.real_addr, data_out)
1174
1175         with m.Else():
1176             # Request came from MMU
1177             with m.If(r1.hit_load_valid):
1178                 sync += Display("completing load hit to MMU, data=%x",
1179                                 m_out.data)
1180             # error cases complete without stalling
1181             with m.If(r1.mmu_error):
1182                 sync += Display("combpleting MMU ld with error")
1183
1184             # Slow ops (i.e. load miss)
1185             with m.If(r1.slow_valid):
1186                 sync += Display("completing MMU load miss, adr=%x data=%x",
1187                                 r1.req.real_addr, m_out.data)
1188
    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.

        Arguments:

        * m: the Module being built
        * r1: stage-1 register (state, write_bram, hit_way, req, dcbz,
          store_row)
        * early_req_row: row address applied to the read port of every way
        * cache_out_row: driven with the read data of the way selected by
          r1.hit_way
        * replace_way: way written by reload data
        """
        comb = m.d.comb
        bus = self.bus

        # Binary-to-unary (one-hot) decoders.  The replace-way one-hot is
        # gated (enabled) by bus.ack, not-write-bram, and state
        # RELOAD_WAIT_ACK (the Decoder "n" input is inverted, hence the ~)
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                   ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        # one-hot of the hit way: selects which way's read data is output
        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time.  that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals
        do_read  = Signal()
        wr_addr  = Signal(ROW_BITS)
        wr_data  = Signal(WB_DATA_BITS)
        wr_sel   = Signal(ROW_SIZE)
        rd_addr  = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams: every way shares the read address and the
        # write address/data, only the (masked) write select differs
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            # (wr_sel_m is left unassigned when this way is not written)
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)
1280
1281     # Cache hit synchronous machine for the easy case.
1282     # This handles load hits.
1283     # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        """Register the hit / error status of the request into r1

        Everything assigned here is in the sync domain:

        * r1.mmu_req follows r0.mmu_req, but only when r0_valid is set
        * r1.hit_way / r1.hit_index record the hit way and line index
        * r1.hit_load_valid is set for OP_LOAD_HIT, r1.cache_hit for
          either OP_LOAD_HIT or OP_STORE_HIT
        * on OP_BAD, r1.ls_error is raised for a request that did not
          come from the MMU and r1.mmu_error for one that did, and
          r1.cache_paradox records access_ok; all three are cleared for
          any other operation
        * r1.stcx_fail is set for OP_STCX_FAIL
        * r1.tlb_hit / r1.tlb_hit_index record the TLB hit

        req_tag and r0.req are only used for the debug Display output.
        """
        comb = m.d.comb
        sync = m.d.sync

        # debug output only
        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                    req_op, r0.req.addr, r0.req.nc,
                    req_index, req_tag, req_hit_way)

        # the request source is only updated for a valid request
        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req,r0.mmu_req,access_ok)
            # route the error to whichever side issued the request
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            # OP_BAD although the access checks passed
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)
1324
1325     # Memory accesses are handled by this state machine:
1326     #
1327     #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cacheable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone request generation is done here.
1332     # This machine operates at stage 1.
1333     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1334                     r0, replace_way,
1335                     req_hit_way, req_same_tag,
1336                     r0_valid, req_op, cache_tags, req_go, ra):
1337
1338         comb = m.d.comb
1339         sync = m.d.sync
1340         bus = self.bus
1341         d_in = self.d_in
1342
1343         req         = MemAccessRequest("mreq_ds")
1344
1345         r1_next_cycle = Signal()
1346         req_row = Signal(ROW_BITS)
1347         req_idx = Signal(INDEX_BITS)
1348         req_tag = Signal(TAG_BITS)
1349         comb += req_idx.eq(get_index(req.real_addr))
1350         comb += req_row.eq(get_row(req.real_addr))
1351         comb += req_tag.eq(get_tag(req.real_addr))
1352
1353         sync += r1.use_forward1.eq(use_forward1_next)
1354         sync += r1.forward_sel.eq(0)
1355
1356         with m.If(use_forward1_next):
1357             sync += r1.forward_sel.eq(r1.req.byte_sel)
1358         with m.Elif(use_forward2_next):
1359             sync += r1.forward_sel.eq(r1.forward_sel1)
1360
1361         sync += r1.forward_data2.eq(r1.forward_data1)
1362         with m.If(r1.write_bram):
1363             sync += r1.forward_data1.eq(r1.req.data)
1364             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1365             sync += r1.forward_way1.eq(r1.req.hit_way)
1366             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1367             sync += r1.forward_valid1.eq(1)
1368         with m.Else():
1369             with m.If(r1.dcbz):
1370                 sync += r1.forward_data1.eq(0)
1371             with m.Else():
1372                 sync += r1.forward_data1.eq(bus.dat_r)
1373             sync += r1.forward_sel1.eq(~0) # all 1s
1374             sync += r1.forward_way1.eq(replace_way)
1375             sync += r1.forward_row1.eq(r1.store_row)
1376             sync += r1.forward_valid1.eq(0)
1377
1378         # One cycle pulses reset
1379         sync += r1.slow_valid.eq(0)
1380         sync += r1.write_bram.eq(0)
1381         sync += r1.inc_acks.eq(0)
1382         sync += r1.dec_acks.eq(0)
1383
1384         sync += r1.ls_valid.eq(0)
1385         # complete tlbies and TLB loads in the third cycle
1386         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1387
1388         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1389             with m.If(r0.mmu_req):
1390                 sync += r1.mmu_done.eq(1)
1391             with m.Else():
1392                 sync += r1.ls_valid.eq(1)
1393
1394         with m.If(r1.write_tag):
1395             # Store new tag in selected way
1396             replace_way_onehot = Signal(NUM_WAYS)
1397             comb += replace_way_onehot.eq(1<<replace_way)
1398             for i in range(NUM_WAYS):
1399                 with m.If(replace_way_onehot[i]):
1400                     ct = Signal(TAG_RAM_WIDTH)
1401                     comb += ct.eq(cache_tags[r1.store_index].tag)
1402                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1403                     sync += cache_tags[r1.store_index].tag.eq(ct)
1404             sync += r1.store_way.eq(replace_way)
1405             sync += r1.write_tag.eq(0)
1406
1407         # Take request from r1.req if there is one there,
1408         # else from req_op, ra, etc.
1409         with m.If(r1.full):
1410             comb += req.eq(r1.req)
1411         with m.Else():
1412             comb += req.op.eq(req_op)
1413             comb += req.valid.eq(req_go)
1414             comb += req.mmu_req.eq(r0.mmu_req)
1415             comb += req.dcbz.eq(r0.req.dcbz)
1416             comb += req.real_addr.eq(ra)
1417
1418             with m.If(r0.req.dcbz):
1419                 # force data to 0 for dcbz
1420                 comb += req.data.eq(0)
1421             with m.Elif(r0.d_valid):
1422                 comb += req.data.eq(r0.req.data)
1423             with m.Else():
1424                 comb += req.data.eq(d_in.data)
1425
1426             # Select all bytes for dcbz
1427             # and for cacheable loads
1428             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1429                 comb += req.byte_sel.eq(~0) # all 1s
1430             with m.Else():
1431                 comb += req.byte_sel.eq(r0.req.byte_sel)
1432             comb += req.hit_way.eq(req_hit_way)
1433             comb += req.same_tag.eq(req_same_tag)
1434
1435             # Store the incoming request from r0,
1436             # if it is a slow request
1437             # Note that r1.full = 1 implies req_op = OP_NONE
1438             with m.If((req_op == Op.OP_LOAD_MISS)
1439                       | (req_op == Op.OP_LOAD_NC)
1440                       | (req_op == Op.OP_STORE_MISS)
1441                       | (req_op == Op.OP_STORE_HIT)):
1442                 sync += r1.req.eq(req)
1443                 sync += r1.full.eq(1)
1444                 # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
1445                 # destroy r1.req by overwriting r1.full back to zero
1446                 comb += r1_next_cycle.eq(1)
1447
1448         # Main state machine
1449         with m.Switch(r1.state):
1450
1451             with m.Case(State.IDLE):
1452                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1453                 sync += r1.wb.sel.eq(req.byte_sel)
1454                 sync += r1.wb.dat.eq(req.data)
1455                 sync += r1.dcbz.eq(req.dcbz)
1456
1457                 # Keep track of our index and way
1458                 # for subsequent stores.
1459                 sync += r1.store_index.eq(req_idx)
1460                 sync += r1.store_row.eq(req_row)
1461                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1462                 sync += r1.reload_tag.eq(req_tag)
1463                 sync += r1.req.same_tag.eq(1)
1464
1465                 with m.If(req.op == Op.OP_STORE_HIT):
1466                     sync += r1.store_way.eq(req.hit_way)
1467
1468                 #with m.If(r1.dec_acks):
1469                 #    sync += r1.acks_pending.eq(r1.acks_pending - 1)
1470
1471                 # Reset per-row valid bits,
1472                 # ready for handling OP_LOAD_MISS
1473                 for i in range(ROW_PER_LINE):
1474                     sync += r1.rows_valid[i].eq(0)
1475
1476                 with m.If(req_op != Op.OP_NONE):
1477                     sync += Display("cache op %d", req.op)
1478
1479                 with m.Switch(req.op):
1480                     with m.Case(Op.OP_LOAD_HIT):
1481                         # stay in IDLE state
1482                         pass
1483
1484                     with m.Case(Op.OP_LOAD_MISS):
1485                         sync += Display("cache miss real addr: %x " \
1486                                 "idx: %x tag: %x",
1487                                 req.real_addr, req_row, req_tag)
1488
1489                         # Start the wishbone cycle
1490                         sync += r1.wb.we.eq(0)
1491                         sync += r1.wb.cyc.eq(1)
1492                         sync += r1.wb.stb.eq(1)
1493
1494                         # Track that we had one request sent
1495                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1496                         sync += r1.write_tag.eq(1)
1497
1498                     with m.Case(Op.OP_LOAD_NC):
1499                         sync += r1.wb.cyc.eq(1)
1500                         sync += r1.wb.stb.eq(1)
1501                         sync += r1.wb.we.eq(0)
1502                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1503
1504                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1505                         with m.If(~req.dcbz):
1506                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1507                             sync += r1.acks_pending.eq(1)
1508                             sync += r1.full.eq(0)
1509                             comb += r1_next_cycle.eq(0)
1510                             sync += r1.slow_valid.eq(1)
1511
1512                             with m.If(req.mmu_req):
1513                                 sync += r1.mmu_done.eq(1)
1514                             with m.Else():
1515                                 sync += r1.ls_valid.eq(1)
1516
1517                             with m.If(req.op == Op.OP_STORE_HIT):
1518                                 sync += r1.write_bram.eq(1)
1519                         with m.Else():
1520                             # dcbz is handled much like a load miss except
1521                             # that we are writing to memory instead of reading
1522                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1523
1524                             with m.If(req.op == Op.OP_STORE_MISS):
1525                                 sync += r1.write_tag.eq(1)
1526
1527                         sync += r1.wb.we.eq(1)
1528                         sync += r1.wb.cyc.eq(1)
1529                         sync += r1.wb.stb.eq(1)
1530
1531                     # OP_NONE and OP_BAD do nothing
1532                     # OP_BAD & OP_STCX_FAIL were
1533                     # handled above already
1534                     with m.Case(Op.OP_NONE):
1535                         pass
1536                     with m.Case(Op.OP_BAD):
1537                         pass
1538                     with m.Case(Op.OP_STCX_FAIL):
1539                         pass
1540
1541             with m.Case(State.RELOAD_WAIT_ACK):
1542                 ld_stbs_done = Signal()
1543                 # Requests are all sent if stb is 0
1544                 comb += ld_stbs_done.eq(~r1.wb.stb)
1545
1546                 # If we are still sending requests, was one accepted?
1547                 with m.If((~bus.stall) & r1.wb.stb):
1548                     # That was the last word?  We are done sending.
1549                     # Clear stb and set ld_stbs_done so we can handle an
1550                     # eventual last ack on the same cycle.
1551                     # sigh - reconstruct wb adr with 3 extra 0s at front
1552                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1553                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1554                         sync += r1.wb.stb.eq(0)
1555                         comb += ld_stbs_done.eq(1)
1556
1557                     # Calculate the next row address in the current cache line
1558                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1559                     comb += row.eq(r1.wb.adr)
1560                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1561
1562                 # Incoming acks processing
1563                 sync += r1.forward_valid1.eq(bus.ack)
1564                 with m.If(bus.ack):
1565                     srow = Signal(ROW_LINE_BITS)
1566                     comb += srow.eq(r1.store_row)
1567                     sync += r1.rows_valid[srow].eq(1)
1568
1569                     # If this is the data we were looking for,
1570                     # we can complete the request next cycle.
1571                     # Compare the whole address in case the
1572                     # request in r1.req is not the one that
1573                     # started this refill.
1574                     with m.If(r1.full & r1.req.same_tag &
1575                               ((r1.dcbz & req.dcbz) |
1576                                (r1.req.op == Op.OP_LOAD_MISS)) &
1577                                 (r1.store_row == get_row(r1.req.real_addr))):
1578                         sync += r1.full.eq(r1_next_cycle)
1579                         sync += r1.slow_valid.eq(1)
1580                         with m.If(r1.mmu_req):
1581                             sync += r1.mmu_done.eq(1)
1582                         with m.Else():
1583                             sync += r1.ls_valid.eq(1)
1584                         sync += r1.forward_sel.eq(~0) # all 1s
1585                         sync += r1.use_forward1.eq(1)
1586
1587                     # Check for completion
1588                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1589                                                       r1.end_row_ix)):
1590                         # Complete wishbone cycle
1591                         sync += r1.wb.cyc.eq(0)
1592
1593                         # Cache line is now valid
1594                         cv = Signal(INDEX_BITS)
1595                         comb += cv.eq(cache_tags[r1.store_index].valid)
1596                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1597                         sync += cache_tags[r1.store_index].valid.eq(cv)
1598
1599                         sync += r1.state.eq(State.IDLE)
1600                         sync += Display("cache valid set %x "
1601                                         "idx %d way %d",
1602                                          cv, r1.store_index, r1.store_way)
1603
1604                     # Increment store row counter
1605                     sync += r1.store_row.eq(next_row(r1.store_row))
1606
1607             with m.Case(State.STORE_WAIT_ACK):
1608                 st_stbs_done = Signal()
1609                 adjust_acks = Signal(3)
1610
1611                 comb += st_stbs_done.eq(~r1.wb.stb)
1612
1613                 with m.If(r1.inc_acks != r1.dec_acks):
1614                     with m.If(r1.inc_acks):
1615                         comb += adjust_acks.eq(r1.acks_pending + 1)
1616                     with m.Else():
1617                         comb += adjust_acks.eq(r1.acks_pending - 1)
1618                 with m.Else():
1619                     comb += adjust_acks.eq(r1.acks_pending)
1620
1621                 sync += r1.acks_pending.eq(adjust_acks)
1622
1623                 # Clear stb when slave accepted request
1624                 with m.If(~bus.stall):
1625                     # See if there is another store waiting
1626                     # to be done which is in the same real page.
1627                     with m.If(req.valid):
1628                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1629                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1630                         sync += r1.wb.dat.eq(req.data)
1631                         sync += r1.wb.sel.eq(req.byte_sel)
1632
1633                     with m.If((adjust_acks < 7) & req.same_tag &
1634                                 ((req.op == Op.OP_STORE_MISS) |
1635                                  (req.op == Op.OP_STORE_HIT))):
1636                         sync += r1.wb.stb.eq(1)
1637                         comb += st_stbs_done.eq(0)
1638                         sync += r1.store_way.eq(req.hit_way)
1639                         sync += r1.store_row.eq(get_row(req.real_addr))
1640
1641                         with m.If(req.op == Op.OP_STORE_HIT):
1642                             sync += r1.write_bram.eq(1)
1643                         sync += r1.full.eq(r1_next_cycle)
1644                         sync += r1.slow_valid.eq(1)
1645
1646                         # Store requests never come from the MMU
1647                         sync += r1.ls_valid.eq(1)
1648                         comb += st_stbs_done.eq(0)
1649                         sync += r1.inc_acks.eq(1)
1650                     with m.Else():
1651                         sync += r1.wb.stb.eq(0)
1652                         comb += st_stbs_done.eq(1)
1653
1654                 # Got ack ? See if complete.
1655                 sync += Display("got ack %d %d stbs %d adjust_acks %d",
1656                                 bus.ack, bus.ack, st_stbs_done, adjust_acks)
1657                 with m.If(bus.ack):
1658                     with m.If(st_stbs_done & (adjust_acks == 1)):
1659                         sync += r1.state.eq(State.IDLE)
1660                         sync += r1.wb.cyc.eq(0)
1661                         sync += r1.wb.stb.eq(0)
1662                     sync += r1.dec_acks.eq(1)
1663
1664             with m.Case(State.NC_LOAD_WAIT_ACK):
1665                 # Clear stb when slave accepted request
1666                 with m.If(~bus.stall):
1667                     sync += r1.wb.stb.eq(0)
1668
1669                 # Got ack ? complete.
1670                 with m.If(bus.ack):
1671                     sync += r1.state.eq(State.IDLE)
1672                     sync += r1.full.eq(r1_next_cycle)
1673                     sync += r1.slow_valid.eq(1)
1674
1675                     with m.If(r1.mmu_req):
1676                         sync += r1.mmu_done.eq(1)
1677                     with m.Else():
1678                         sync += r1.ls_valid.eq(1)
1679
1680                     sync += r1.forward_sel.eq(~0) # all 1s
1681                     sync += r1.use_forward1.eq(1)
1682                     sync += r1.wb.cyc.eq(0)
1683                     sync += r1.wb.stb.eq(0)
1684
1685     def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
1686
1687         sync = m.d.sync
1688         d_out, bus, log_out = self.d_out, self.bus, self.log_out
1689
1690         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1691                                stall_out, req_op[:3], d_out.valid, d_out.error,
1692                                r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1693                                r1.real_adr[3:6]))
1694
1695     def elaborate(self, platform):
1696
1697         m = Module()
1698         comb, sync = m.d.comb, m.d.sync
1699         m_in, d_in = self.m_in, self.d_in
1700
1701         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1702         cache_tags       = CacheTagArray()
1703         cache_tag_set    = Signal(TAG_RAM_WIDTH)
1704
1705         # TODO attribute ram_style : string;
1706         # TODO attribute ram_style of cache_tags : signal is "distributed";
1707
1708         """note: these are passed to nmigen.hdl.Memory as "attributes".
1709            don't know how, just that they are.
1710         """
1711         # TODO attribute ram_style of
1712         #  dtlb_tags : signal is "distributed";
1713         # TODO attribute ram_style of
1714         #  dtlb_ptes : signal is "distributed";
1715
1716         r0      = RegStage0("r0")
1717         r0_full = Signal()
1718
1719         r1 = RegStage1("r1")
1720
1721         reservation = Reservation("rsrv")
1722
1723         # Async signals on incoming request
1724         req_index    = Signal(INDEX_BITS)
1725         req_row      = Signal(ROW_BITS)
1726         req_hit_way  = Signal(WAY_BITS)
1727         req_tag      = Signal(TAG_BITS)
1728         req_op       = Signal(Op)
1729         req_data     = Signal(64)
1730         req_same_tag = Signal()
1731         req_go       = Signal()
1732
1733         early_req_row     = Signal(ROW_BITS)
1734
1735         cancel_store      = Signal()
1736         set_rsrv          = Signal()
1737         clear_rsrv        = Signal()
1738
1739         r0_valid          = Signal()
1740         r0_stall          = Signal()
1741
1742         use_forward1_next = Signal()
1743         use_forward2_next = Signal()
1744
1745         cache_out_row     = Signal(WB_DATA_BITS)
1746
1747         plru_victim       = Signal(WAY_BITS)
1748         replace_way       = Signal(WAY_BITS)
1749
1750         # Wishbone read/write/cache write formatting signals
1751         bus_sel           = Signal(8)
1752
1753         # TLB signals
1754         tlb_way       = TLBRecord("tlb_way")
1755         tlb_req_index = Signal(TLB_SET_BITS)
1756         tlb_hit       = TLBHit("tlb_hit")
1757         pte           = Signal(TLB_PTE_BITS)
1758         ra            = Signal(REAL_ADDR_BITS)
1759         valid_ra      = Signal()
1760         perm_attr     = PermAttr("dc_perms")
1761         rc_ok         = Signal()
1762         perm_ok       = Signal()
1763         access_ok     = Signal()
1764
1765         tlb_plru_victim = Signal(TLB_WAY_BITS)
1766
1767         # we don't yet handle collisions between loadstore1 requests
1768         # and MMU requests
1769         comb += self.m_out.stall.eq(0)
1770
1771         # Hold off the request in r0 when r1 has an uncompleted request
1772         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1773         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1774         comb += self.stall_out.eq(r0_stall)
1775         # debugging: detect if any stall ever requested, which is fine,
1776         # but if a request comes in when stall requested, that's bad.
1777         with m.If(r0_stall):
1778             sync += self.any_stall_out.eq(1)
1779             with m.If(d_in.valid):
1780                 sync += self.dreq_when_stall.eq(1)
1781             with m.If(m_in.valid):
1782                 sync += self.mreq_when_stall.eq(1)
1783
1784         # deal with litex not doing wishbone pipeline mode
1785         # XXX in wrong way.  FIFOs are needed in the SRAM test
1786         # so that stb/ack match up. same thing done in icache.py
1787         if not self.microwatt_compat:
1788             comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
1789
1790         # Wire up wishbone request latch out of stage 1
1791         comb += self.bus.we.eq(r1.wb.we)
1792         comb += self.bus.adr.eq(r1.wb.adr)
1793         comb += self.bus.sel.eq(r1.wb.sel)
1794         comb += self.bus.stb.eq(r1.wb.stb)
1795         comb += self.bus.dat_w.eq(r1.wb.dat)
1796         comb += self.bus.cyc.eq(r1.wb.cyc)
1797
1798         # create submodule TLBUpdate
1799         m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
1800
1801         # call sub-functions putting everything together, using shared
1802         # signals established above
1803         self.stage_0(m, r0, r1, r0_full)
1804         self.tlb_read(m, r0_stall, tlb_way)
1805         self.tlb_search(m, tlb_req_index, r0, r0_valid,
1806                         tlb_way,
1807                         pte, tlb_hit, valid_ra, perm_attr, ra)
1808         self.tlb_update(m, r0_valid, r0, tlb_req_index,
1809                         tlb_hit, tlb_plru_victim)
1810         self.maybe_plrus(m, r1, plru_victim)
1811         self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
1812         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1813         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1814                            r0_valid, r1, cache_tags, replace_way,
1815                            use_forward1_next, use_forward2_next,
1816                            req_hit_way, plru_victim, rc_ok, perm_attr,
1817                            valid_ra, perm_ok, access_ok, req_op, req_go,
1818                            tlb_hit, tlb_way, cache_tag_set,
1819                            cancel_store, req_same_tag, r0_stall, early_req_row)
1820         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1821                            r0_valid, r0, reservation)
1822         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1823                            reservation, r0)
1824         self.writeback_control(m, r1, cache_out_row)
1825         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1826         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1827                         req_hit_way, req_index, req_tag, access_ok,
1828                         tlb_hit, tlb_req_index)
1829         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1830                     r0, replace_way,
1831                     req_hit_way, req_same_tag,
1832                          r0_valid, req_op, cache_tags, req_go, ra)
1833         #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
1834
1835         return m
1836
1837
if __name__ == '__main__':
    # Script entry point: elaborate a DCache and dump it as RTLIL text
    # into "test_dcache.il" in the current directory.
    #
    # NOTE(review): none of the imports visible at the top of the file
    # bring "rtlil" into scope, so it is imported here, where it is
    # needed, to make sure the name is bound.  If the file already
    # imports it elsewhere this is a harmless re-import.
    from nmigen.back import rtlil

    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)