1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11
12 Links:
13
14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
16 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
17 (discussion about brams for ECP5)
18
19 """

import sys

from nmutil.gtkw import write_gtkw

sys.setrecursionlimit(1000000)

from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
                    Record, Memory)
from nmutil.util import Display
from nmigen.lib.coding import Decoder

from copy import deepcopy
from random import randint, seed

from nmigen_soc.wishbone.bus import Interface

from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
from soc.experiment.plru import PLRU, PLRUs
#from nmutil.plru import PLRU, PLRUs

# for test
from soc.bus.sram import SRAM
from nmigen.cli import rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator

from nmutil.util import wrap


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 32    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of sets
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
# -- WB_DATA_BITS at a time so to save
# -- resources we make the array only that wide, and
# -- use consecutive indices to make a cache "line"
# --
# -- ROW_SIZE is the width in bytes of the BRAM
# -- (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE
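
# Worked example with the default settings above (our annotation):
#   ROW_SIZE     = 64 // 8 = 8 bytes per BRAM row
#   ROW_PER_LINE = 64 // 8 = 8 wishbone transfers per cache line
#   BRAM_ROWS    = 32 * 8  = 256 BRAM rows for the whole dcache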

print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)
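
# With the default geometry these evaluate to (our annotation):
#   ROW_BITS=8, ROW_LINE_BITS=3, LINE_OFF_BITS=6, ROW_OFF_BITS=3,
#   INDEX_BITS=5, SET_SIZE_BITS=11, TAG_BITS=45, WAY_BITS=2, and
#   TAG_WIDTH=48 (TAG_BITS rounded up to a whole number of bytes).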

# Example of layout for 32 lines of 64 bytes:
layout = f"""\
  DCache Layout:
 |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
  ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
  .. tag     |index| line |
  ..         | row |      |
  ..         |     |---|  |  ROW_LINE_BITS ({ROW_LINE_BITS})
  ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
  ..         |     |- --|    ROW_OFF_BITS ({ROW_OFF_BITS})
  ..         |----- ---|  |  ROW_BITS ({ROW_BITS})
  ..         |-----|      |  INDEX_BITS ({INDEX_BITS})
  .. --------|            |  TAG_BITS ({TAG_BITS})
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print ("    TAG_WIDTH", TAG_WIDTH)
print ("     NUM_WAYS", NUM_WAYS)
print ("    NUM_LINES", NUM_LINES)


def CacheTag(name=None):
    tag_layout = [('valid', NUM_WAYS),
                  ('tag', TAG_RAM_WIDTH),
                  ]
    return Record(tag_layout, name=name)


def CacheTagArray():
    return Array(CacheTag(name="tag%d" % x) for x in range(NUM_LINES))


def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                        for x in range(ROW_PER_LINE))


# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
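
# With the defaults: TLB_SET_BITS=6, TLB_WAY_BITS=1,
# TLB_EA_TAG_BITS=46 (64 - 12 - 6), TLB_TAG_WAY_BITS=92,
# TLB_PTE_WAY_BITS=128 (our annotation).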

def ispow2(x):
    return (1 << log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBHit(name):
    return Record([('valid', 1),
                   ('way', TLB_WAY_BITS)], name=name)

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))

def TLBRecord(name):
    tlb_layout = [('valid', TLB_NUM_WAYS),
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                  ]
    return Record(tlb_layout, name=name)

def TLBValidArray():
    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
                        for x in range(TLB_SET_SIZE))

def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                        for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                 for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                 for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

# Write a PTE to a TLB PTE memory row
def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)
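
# Illustrative sketch only (our addition, not used by the design): the
# nmigen Signal slices above correspond to this plain-integer arithmetic,
# splitting a real address into tag | index | offset fields.
def _demo_decompose(addr):
    """Return (tag, index, row) for an integer real address (demo only)."""
    index = (addr >> LINE_OFF_BITS) & ((1 << INDEX_BITS) - 1)
    row = (addr >> ROW_OFF_BITS) & ((1 << ROW_BITS) - 1)
    tag = (addr >> SET_SIZE_BITS) & ((1 << TAG_BITS) - 1)
    return tag, index, row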


# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


def extract_perm_attr(pte):
    pa = PermAttr()
    return pa  # NOTE: stub - not referenced anywhere in this file


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE = 0
    OP_BAD = 1           # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL = 2     # conditional store w/o reservation
    OP_LOAD_HIT = 3      # Cache hit on load
    OP_LOAD_MISS = 4     # Load missing cache
    OP_LOAD_NC = 5       # Non-cachable load
    OP_STORE_HIT = 6     # Store hitting cache
    OP_STORE_MISS = 7    # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE = 0             # Normal load hit processing
    RELOAD_WAIT_ACK = 1  # Cache reload wait ack
    STORE_WAIT_ACK = 2   # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
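#
# Illustrative load-hit timing (our annotation, inferred from the code
# below rather than taken from microwatt):
#   cycle 0: request latched into r0 (stage_0)
#   cycle 1: TLB and cache-tag lookup; BRAM read issued (stage 1)
#   cycle 2: buffered BRAM output valid; data returned (writeback_control)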
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()    # indicates a tlbie request (from MMU)
        self.doall = Signal()    # with tlbie, indicates flush whole TLB
        self.tlbld = Signal()    # indicates a TLB load request (from MMU)
        self.mmu_req = Signal()  # indicates source of request
        self.d_valid = Signal()  # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal()     # have uncompleted request
        self.mmu_req = Signal()  # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
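# (note: the low LINE_OFF_BITS bits of the address are not stored, so a
# reservation is effectively tracked at cache-line granularity - 64 bytes
# with the default settings; see reservation_comb below)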
class Reservation(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)


class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        # there are 3 parts to this:
        # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
        # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
        # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
        # be a Memory because they can all be cleared (tlbie, doall), i mean,
        # we _could_, in theory, by overriding the Reset Signal of the Memory,
        # hmmm....

        dtlb_valid = TLBValidArray()
        tlb_req_index = self.tlb_req_index

        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print (" TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)

        # commented out for now, can be put in if Memory.reset can be
        # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
        #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
        #m.submodules.rd_valid = rd_valid = validm.read_port()
        #m.submodules.wr_valid = wr_valid = validm.write_port(
        #                            granularity=1)

        # connect up read and write addresses to Valid/PTE/TAG SRAMs
        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)
        #m.d.comb += wr_valid.addr.eq(tlb_req_index)

        updated = Signal()
        v_updated = Signal()
        tb_out = Signal(TLB_TAG_WAY_BITS)  # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)      # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS)  # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS)          # tlb_way_valids_t

        comb += dv.eq(dtlb_valid[tlb_req_index])
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            # XXX hmmm, validm _could_ use Memory reset here...
            for i in range(TLB_SET_SIZE):
                sync += dtlb_valid[i].eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        # above, sometimes valid is requested to be updated but data not
        # therefore split them out, here.  note the granularity thing matches
        # with the shift-up of the eatag/pte_data into the correct TLB way.
        # thus is it not necessary to write the entire lot, just the portion
        # being altered: hence writing the *old* copy of the row is not needed
        with m.If(updated):  # PTE and TAG to be written
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
        with m.If(v_updated):  # Valid to be written
            sync += dtlb_valid[tlb_req_index].eq(db_out)
            #comb += wr_valid.data.eq(db_out)
            #comb += wr_valid.en.eq(1<<self.repl_way)

        # select one TLB way, use a register here
        r_tlb_way = TLBRecord("r_tlb_way")
        r_delay = Signal()
        sync += r_delay.eq(self.tlb_read)
        with m.If(self.tlb_read):
            sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
        with m.If(r_delay):
            # on one clock delay, output the contents of the read port(s)
            # comb += self.tlb_way.valid.eq(rd_valid.data)
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            # and also capture the (delayed) output...
            #sync += r_tlb_way.valid.eq(rd_valid.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            # ... so that the register can output it when no read is requested
            # it's rather overkill but better to be safe than sorry
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)
            #comb += self.tlb_way.eq(r_tlb_way)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_way,
                      cache_i_validdx, cache_tag_set,
                      req_addr):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        hit_set = Array(Signal(name="hit_set_%d" % i) \
                              for i in range(TLB_NUM_WAYS))
        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                                  for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS):  # tlb_num_way_t
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal(name="s_hit%d" % j)
                s_pte = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
                s_ra = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
                # read the PTE, calc the Real Address, get the tag
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))
                # for each way check the tag against the cache tag set
                for i in range(NUM_WAYS):  # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                comb += rel_matches[j].eq(s_tag == reload_tag)
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS):  # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                      (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self, pspec=None):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="dcache")

        self.log_out = Signal(20)

        # test if microwatt compatibility is to be enabled
        self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
                                 (pspec.microwatt_compat == True))

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))  # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0)  # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            comb += r.d_valid.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
            comb += r.d_valid.eq(0)

        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        with m.Elif(~r0.d_valid):
            # Sample data the cycle after a request comes in from loadstore1.
            # If another request has come in already then the data will get
            # put directly into req.data below.
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                r.req.virt_mode, r.req.addr,
                                r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_way):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        # suite of PLRUs with a selection and output mechanism
        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
        m.submodules.tlb_plrus = tlb_plrus
        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
        comb += tlb_plrus.isel.eq(tlb_req_index)      # select victim
        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, tlb_req_index,
                   tlb_hit, tlb_plru_victim):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        d = self.dtlb_update

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim)
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:  # (bugfix: was TLB_NUM_WAYS, a copy-paste slip)
            return

        # suite of PLRUs with a selection and output mechanism
        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
        comb += plrus.way.eq(r1.hit_way)
        comb += plrus.valid.eq(r1.cache_hit)
        comb += plrus.index.eq(r1.hit_index)
        comb += plrus.isel.eq(r1.store_index)     # select victim
        comb += plru_victim.eq(plrus.o_index)     # selected victim

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index].tag)

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                    valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                    valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
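                # opsel bit order, LSB first: [0]=is_hit, [1]=nc, [2]=load;
                # e.g. 0b101 means load, cacheable, hit -> OP_LOAD_HIT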
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # DCache output to LoadStore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
                "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
                "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
        # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                           ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time.  that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals
        do_read = Signal()
        wr_addr = Signal(ROW_BITS)
        wr_data = Signal(WB_DATA_BITS)
        wr_sel = Signal(ROW_SIZE)
        rd_addr = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            m.submodules["cacheram_%d" % i] = way

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        req = MemAccessRequest("mreq_ds")

        r1_next_cycle = Signal()
        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(r0.mmu_req):
                sync += r1.mmu_done.eq(1)
            with m.Else():
                sync += r1.ls_valid.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            replace_way_onehot = Signal(NUM_WAYS)
            comb += replace_way_onehot.eq(1<<replace_way)
            for i in range(NUM_WAYS):
                with m.If(replace_way_onehot[i]):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index].tag)
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].tag.eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)
            # do not let r1.state RELOAD_WAIT_ACK or STORE_WAIT_ACK
            # destroy r1.req by overwriting r1.full back to zero
            comb += r1_next_cycle.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                #with m.If(r1.dec_acks):
                #    sync += r1.acks_pending.eq(r1.acks_pending - 1)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            comb += r1_next_cycle.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(req.mmu_req):
                                sync += r1.mmu_done.eq(1)
                            with m.Else():
                                sync += r1.ls_valid.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word?  We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(r1.full & r1.req.same_tag &
                              ((r1.dcbz & req.dcbz) |
                               (r1.req.op == Op.OP_LOAD_MISS)) &
                              (r1.store_row == get_row(r1.req.real_addr))):
                        sync += r1.full.eq(r1_next_cycle)
                        sync += r1.slow_valid.eq(1)
                        with m.If(r1.mmu_req):
                            sync += r1.mmu_done.eq(1)
                        with m.Else():
                            sync += r1.ls_valid.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(INDEX_BITS)
                        comb += cv.eq(cache_tags[r1.store_index].valid)
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_tags[r1.store_index].valid.eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                adjust_acks = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(r1.acks_pending + 1)
                    with m.Else():
                        comb += adjust_acks.eq(r1.acks_pending - 1)
                with m.Else():
                    comb += adjust_acks.eq(r1.acks_pending)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS) |
                               (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.store_way.eq(req.hit_way)
                        sync += r1.store_row.eq(get_row(req.real_addr))

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(r1_next_cycle)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                sync += Display("got ack %d %d stbs %d adjust_acks %d",
                                bus.ack, bus.ack, st_stbs_done, adjust_acks)
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(r1_next_cycle)
                    sync += r1.slow_valid.eq(1)

                    with m.If(r1.mmu_req):
                        sync += r1.mmu_done.eq(1)
                    with m.Else():
                        sync += r1.ls_valid.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
        # NOTE: currently unused (the call in elaborate is commented out);
        # as written, req_op and r1.real_adr are not in scope here.

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               r1.real_adr[3:6]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation("rsrv")

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_way = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = TLBHit("tlb_hit")
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        if not self.microwatt_compat:
            comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
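            # (this is the hard-coded "stall = cyc & ~ack" mentioned in the
            # module docstring; see WB4 spec, p84, section 5.2.1)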

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule TLBUpdate
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, tlb_req_index,
                        tlb_hit, tlb_plru_victim)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall, early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)

        return m


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)
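
# Usage sketch (our annotation): running this file directly elaborates
# DCache and writes test_dcache.il.  The RTLIL can then be inspected with
# yosys, e.g. "yosys -p 'read_ilang test_dcache.il; stat'" (read_ilang is
# the older command name; newer yosys versions call it read_rtlil).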