src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 note that the microwatt dcache wishbone interface expects "stall".
   6 for simplicity at the moment this is hard-coded to cyc & ~ack.
   7 see WB4 spec, p84, section 5.2.1
   8
   9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
  10 is raised.  sigh
  11
  12 Links:
  13
  14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
  15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
  16 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
  17   (discussion about brams for ECP5)
  18
  19 """
  20
  21 import sys
  22
  23 from nmutil.gtkw import write_gtkw
  24
  25 sys.setrecursionlimit(1000000)
  26
  27 from enum import Enum, unique
  28
  29 from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
  30                     Record, Memory)
  31 from nmutil.util import Display
  32 from nmigen.lib.coding import Decoder
  33
  34 from copy import deepcopy
  35 from random import randint, seed
  36
  37 from nmigen_soc.wishbone.bus import Interface
  38
  39 from nmigen.cli import main
  40 from nmutil.iocontrol import RecordObject
  41 from nmigen.utils import log2_int
  42 from soc.experiment.mem_types import (LoadStore1ToDCacheType,
  43                                      DCacheToLoadStore1Type,
  44                                      MMUToDCacheType,
  45                                      DCacheToMMUType)
  46
  47 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
  48                                 WBAddrType, WBDataType, WBSelType,
  49                                 WBMasterOut, WBSlaveOut,
  50                                 WBMasterOutVector, WBSlaveOutVector,
  51                                 WBIOMasterOut, WBIOSlaveOut)
  52
  53 from soc.experiment.cache_ram import CacheRam
  54 from soc.experiment.plru import PLRU, PLRUs
  55 #from nmutil.plru import PLRU, PLRUs
  56
  57 # for test
  58 from soc.bus.sram import SRAM
  59 from nmigen import Memory
  60 from nmigen.cli import rtlil
  61
  62 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  63 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  64 from nmutil.sim_tmp_alternative import Simulator
  65
  66 from nmutil.util import wrap
  67
  68
  69 # TODO: make these parameters of DCache at some point
  70 LINE_SIZE = 64    # Line size in bytes
  71 NUM_LINES = 32    # Number of lines in a set
  72 NUM_WAYS = 4      # Number of ways
  73 TLB_SET_SIZE = 64 # L1 DTLB entries per set
  74 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
  75 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  76 LOG_LENGTH = 0    # Non-zero to enable log data collection
  77
  78 # BRAM organisation: We never access more than
  79 #     -- WB_DATA_BITS at a time so to save
  80 #     -- resources we make the array only that wide, and
  81 #     -- use consecutive indices to make a cache "line"
  82 #     --
  83 #     -- ROW_SIZE is the width in bytes of the BRAM
  84 #     -- (based on WB, so 64-bits)
  85 ROW_SIZE = WB_DATA_BITS // 8;
  86
  87 # ROW_PER_LINE is the number of row (wishbone
  88 # transactions) in a line
  89 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  90
  91 # BRAM_ROWS is the number of rows in BRAM needed
  92 # to represent the full dcache
  93 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  94
  95 print ("ROW_SIZE", ROW_SIZE)
  96 print ("ROW_PER_LINE", ROW_PER_LINE)
  97 print ("BRAM_ROWS", BRAM_ROWS)
  98 print ("NUM_WAYS", NUM_WAYS)
  99
 100 # Bit fields counts in the address
 101
 102 # REAL_ADDR_BITS is the number of real address
 103 # bits that we store
 104 REAL_ADDR_BITS = 56
 105
 106 # ROW_BITS is the number of bits to select a row
 107 ROW_BITS = log2_int(BRAM_ROWS)
 108
 109 # ROW_LINE_BITS is the number of bits to select
 110 # a row within a line
 111 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
 112
 113 # LINE_OFF_BITS is the number of bits for
 114 # the offset in a cache line
 115 LINE_OFF_BITS = log2_int(LINE_SIZE)
 116
 117 # ROW_OFF_BITS is the number of bits for
 118 # the offset in a row
 119 ROW_OFF_BITS = log2_int(ROW_SIZE)
 120
 121 # INDEX_BITS is the number if bits to
 122 # select a cache line
 123 INDEX_BITS = log2_int(NUM_LINES)
 124
 125 # SET_SIZE_BITS is the log base 2 of the set size
 126 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
 127
 128 # TAG_BITS is the number of bits of
 129 # the tag part of the address
 130 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
 131
 132 # TAG_WIDTH is the width in bits of each way of the tag RAM
 133 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 134
 135 # WAY_BITS is the number of bits to select a way
 136 WAY_BITS = log2_int(NUM_WAYS)
 137
 138 # Example of layout for 32 lines of 64 bytes:
 139 layout = f"""\
 140   DCache Layout:
 141  |.. -----------------------| REAL_ADDR_BITS ({REAL_ADDR_BITS})
 142   ..         |--------------| SET_SIZE_BITS ({SET_SIZE_BITS})
 143   ..  tag    |index|  line  |
 144   ..         |   row   |    |
 145   ..         |     |---|    | ROW_LINE_BITS ({ROW_LINE_BITS})
 146   ..         |     |--- - --| LINE_OFF_BITS ({LINE_OFF_BITS})
 147   ..         |         |- --| ROW_OFF_BITS  ({ROW_OFF_BITS})
 148   ..         |----- ---|    | ROW_BITS      ({ROW_BITS})
 149   ..         |-----|        | INDEX_BITS    ({INDEX_BITS})
 150   .. --------|              | TAG_BITS      ({TAG_BITS})
 151 """
 152 print (layout)
 153 print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
 154             (TAG_BITS, INDEX_BITS, ROW_BITS,
 155              ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
 156 print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
 157 print ("row @: %d-%d" % (LINE_OFF_BITS, ROW_OFF_BITS))
 158 print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))
 159
 160 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 161
 162 print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
 163 print ("    TAG_WIDTH", TAG_WIDTH)
 164 print ("     NUM_WAYS", NUM_WAYS)
 165 print ("    NUM_LINES", NUM_LINES)
 166
 167 def CacheTagArray():
 168     tag_layout = [('valid', NUM_WAYS),
 169                   ('tag', TAG_RAM_WIDTH),
 170                  ]
 171     return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
 172
 173 def RowPerLineValidArray():
 174     return Array(Signal(name="rows_valid%d" % x) \
 175                         for x in range(ROW_PER_LINE))
 176
 177 # L1 TLB
 178 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 179 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 180 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 181 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 182 TLB_PTE_BITS     = 64
 183 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 184
 185 def ispow2(x):
 186     return (1<<log2_int(x, False)) == x
 187
 188 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 189 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 190 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 191 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 192 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 193 assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
 194         "geometry bits don't add up"
 195 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 196         "geometry bits don't add up"
 197 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 198          "geometry bits don't add up"
 199 assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
 200 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 201
 202
 203 def TLBHit(name):
 204     return Record([('valid', 1),
 205                    ('way', TLB_WAY_BITS)], name=name)
 206
 207 def TLBTagEAArray():
 208     return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
 209                 for x in range (TLB_NUM_WAYS))
 210
 211 def TLBRecord(name):
 212     tlb_layout = [('valid', TLB_NUM_WAYS),
 213                   ('tag', TLB_TAG_WAY_BITS),
 214                   ('pte', TLB_PTE_WAY_BITS)
 215                  ]
 216     return Record(tlb_layout, name=name)
 217
 218 def TLBValidArray():
 219     return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
 220                         for x in range(TLB_SET_SIZE))
 221
 222 def HitWaySet():
 223     return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
 224                         for x in range(TLB_NUM_WAYS))
 225
 226 # Cache RAM interface
 227 def CacheRamOut():
 228     return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
 229                  for x in range(NUM_WAYS))
 230
 231 # PLRU output interface
 232 def PLRUOut():
 233     return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
 234                 for x in range(NUM_LINES))
 235
 236 # TLB PLRU output interface
 237 def TLBPLRUOut():
 238     return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
 239                 for x in range(TLB_SET_SIZE))
 240
 241 # Helper functions to decode incoming requests
 242 #
 243 # Return the cache line index (tag index) for an address
 244 def get_index(addr):
 245     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 246
 247 # Return the cache row index (data memory) for an address
 248 def get_row(addr):
 249     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 250
 251 # Return the index of a row within a line
 252 def get_row_of_line(row):
 253     return row[:ROW_BITS][:ROW_LINE_BITS]
 254
 255 # Returns whether this is the last row of a line
 256 def is_last_row_addr(addr, last):
 257     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 258
 259 # Returns whether this is the last row of a line
 260 def is_last_row(row, last):
 261     return get_row_of_line(row) == last
 262
 263 # Return the next row in the current cache line. We use a
 264 # dedicated function in order to limit the size of the
 265 # generated adder to be only the bits within a cache line
 266 # (3 bits with default settings)
 267 def next_row(row):
 268     row_v = row[0:ROW_LINE_BITS] + 1
 269     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 270
 271 # Get the tag value from the address
 272 def get_tag(addr):
 273     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 274
 275 # Read a tag from a tag memory row
 276 def read_tag(way, tagset):
 277     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 278
 279 # Read a TLB tag from a TLB tag memory row
 280 def read_tlb_tag(way, tags):
 281     return tags.word_select(way, TLB_EA_TAG_BITS)
 282
 283 # Write a TLB tag to a TLB tag memory row
 284 def write_tlb_tag(way, tags, tag):
 285     return read_tlb_tag(way, tags).eq(tag)
 286
 287 # Read a PTE from a TLB PTE memory row
 288 def read_tlb_pte(way, ptes):
 289     return ptes.word_select(way, TLB_PTE_BITS)
 290
 291 def write_tlb_pte(way, ptes, newpte):
 292     return read_tlb_pte(way, ptes).eq(newpte)
 293
 294
 295 # Record for storing permission, attribute, etc. bits from a PTE
 296 class PermAttr(RecordObject):
 297     def __init__(self, name=None):
 298         super().__init__(name=name)
 299         self.reference = Signal()
 300         self.changed   = Signal()
 301         self.nocache   = Signal()
 302         self.priv      = Signal()
 303         self.rd_perm   = Signal()
 304         self.wr_perm   = Signal()
 305
 306
 307 def extract_perm_attr(pte):
 308     pa = PermAttr()
 309     return pa;
 310
 311
 312 # Type of operation on a "valid" input
 313 @unique
 314 class Op(Enum):
 315     OP_NONE       = 0
 316     OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
 317     OP_STCX_FAIL  = 2 # conditional store w/o reservation
 318     OP_LOAD_HIT   = 3 # Cache hit on load
 319     OP_LOAD_MISS  = 4 # Load missing cache
 320     OP_LOAD_NC    = 5 # Non-cachable load
 321     OP_STORE_HIT  = 6 # Store hitting cache
 322     OP_STORE_MISS = 7 # Store missing cache
 323
 324
 325 # Cache state machine
 326 @unique
 327 class State(Enum):
 328     IDLE             = 0 # Normal load hit processing
 329     RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
 330     STORE_WAIT_ACK   = 2 # Store wait ack
 331     NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 332
 333
 334 # Dcache operations:
 335 #
 336 # In order to make timing, we use the BRAMs with
 337 # an output buffer, which means that the BRAM
 338 # output is delayed by an extra cycle.
 339 #
 340 # Thus, the dcache has a 2-stage internal pipeline
 341 # for cache hits with no stalls.
 342 #
 343 # All other operations are handled via stalling
 344 # in the first stage.
 345 #
 346 # The second stage can thus complete a hit at the same
 347 # time as the first stage emits a stall for a complex op.
 348 #
 349 # Stage 0 register, basically contains just the latched request
 350
 351 class RegStage0(RecordObject):
 352     def __init__(self, name=None):
 353         super().__init__(name=name)
 354         self.req     = LoadStore1ToDCacheType(name="lsmem")
 355         self.tlbie   = Signal() # indicates a tlbie request (from MMU)
 356         self.doall   = Signal() # with tlbie, indicates flush whole TLB
 357         self.tlbld   = Signal() # indicates a TLB load request (from MMU)
 358         self.mmu_req = Signal() # indicates source of request
 359         self.d_valid = Signal() # indicates req.data is valid now
 360
 361
 362 class MemAccessRequest(RecordObject):
 363     def __init__(self, name=None):
 364         super().__init__(name=name)
 365         self.op        = Signal(Op)
 366         self.valid     = Signal()
 367         self.dcbz      = Signal()
 368         self.real_addr = Signal(REAL_ADDR_BITS)
 369         self.data      = Signal(64)
 370         self.byte_sel  = Signal(8)
 371         self.hit_way   = Signal(WAY_BITS)
 372         self.same_tag  = Signal()
 373         self.mmu_req   = Signal()
 374
 375
 376 # First stage register, contains state for stage 1 of load hits
 377 # and for the state machine used by all other operations
 378 class RegStage1(RecordObject):
 379     def __init__(self, name=None):
 380         super().__init__(name=name)
 381         # Info about the request
 382         self.full             = Signal() # have uncompleted request
 383         self.mmu_req          = Signal() # request is from MMU
 384         self.req              = MemAccessRequest(name="reqmem")
 385
 386         # Cache hit state
 387         self.hit_way          = Signal(WAY_BITS)
 388         self.hit_load_valid   = Signal()
 389         self.hit_index        = Signal(INDEX_BITS)
 390         self.cache_hit        = Signal()
 391
 392         # TLB hit state
 393         self.tlb_hit          = TLBHit("tlb_hit")
 394         self.tlb_hit_index    = Signal(TLB_SET_BITS)
 395
 396         # 2-stage data buffer for data forwarded from writes to reads
 397         self.forward_data1    = Signal(64)
 398         self.forward_data2    = Signal(64)
 399         self.forward_sel1     = Signal(8)
 400         self.forward_valid1   = Signal()
 401         self.forward_way1     = Signal(WAY_BITS)
 402         self.forward_row1     = Signal(ROW_BITS)
 403         self.use_forward1     = Signal()
 404         self.forward_sel      = Signal(8)
 405
 406         # Cache miss state (reload state machine)
 407         self.state            = Signal(State)
 408         self.dcbz             = Signal()
 409         self.write_bram       = Signal()
 410         self.write_tag        = Signal()
 411         self.slow_valid       = Signal()
 412         self.wb               = WBMasterOut("wb")
 413         self.reload_tag       = Signal(TAG_BITS)
 414         self.store_way        = Signal(WAY_BITS)
 415         self.store_row        = Signal(ROW_BITS)
 416         self.store_index      = Signal(INDEX_BITS)
 417         self.end_row_ix       = Signal(ROW_LINE_BITS)
 418         self.rows_valid       = RowPerLineValidArray()
 419         self.acks_pending     = Signal(3)
 420         self.inc_acks         = Signal()
 421         self.dec_acks         = Signal()
 422
 423         # Signals to complete (possibly with error)
 424         self.ls_valid         = Signal()
 425         self.ls_error         = Signal()
 426         self.mmu_done         = Signal()
 427         self.mmu_error        = Signal()
 428         self.cache_paradox    = Signal()
 429
 430         # Signal to complete a failed stcx.
 431         self.stcx_fail        = Signal()
 432
 433
 434 # Reservation information
 435 class Reservation(RecordObject):
 436     def __init__(self):
 437         super().__init__()
 438         self.valid = Signal()
 439         self.addr  = Signal(64-LINE_OFF_BITS)
 440
 441
 442 class DTLBUpdate(Elaboratable):
 443     def __init__(self):
 444         self.tlbie    = Signal()
 445         self.tlbwe    = Signal()
 446         self.doall    = Signal()
 447         self.tlb_hit     = TLBHit("tlb_hit")
 448         self.tlb_req_index = Signal(TLB_SET_BITS)
 449
 450         self.repl_way        = Signal(TLB_WAY_BITS)
 451         self.eatag           = Signal(TLB_EA_TAG_BITS)
 452         self.pte_data        = Signal(TLB_PTE_BITS)
 453
 454         # read from dtlb array
 455         self.tlb_read       = Signal()
 456         self.tlb_read_index = Signal(TLB_SET_BITS)
 457         self.tlb_way        = TLBRecord("o_tlb_way")
 458
 459     def elaborate(self, platform):
 460         m = Module()
 461         comb = m.d.comb
 462         sync = m.d.sync
 463
 464         # there are 3 parts to this:
 465         # QTY TLB_NUM_WAYs TAGs - of width (say) 46 bits of Effective Address
 466         # QTY TLB_NUM_WAYs PTEs - of width (say) 64 bits
 467         # "Valid" bits, one per "way", of QTY TLB_NUM_WAYs.  these cannot
 468         # be a Memory because they can all be cleared (tlbie, doall), i mean,
 469         # we _could_, in theory, by overriding the Reset Signal of the Memory,
 470         # hmmm....
 471
 472         dtlb_valid = TLBValidArray()
 473         tlb_req_index = self.tlb_req_index
 474
 475         print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
 476         print ("     TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
 477         print ("        TLB_NUM_WAYS", TLB_NUM_WAYS)
 478         print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
 479         print ("    TLB_PTE_BITS", TLB_PTE_BITS)
 480         print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)
 481
 482         # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
 483         tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
 484         m.submodules.rd_tagway = rd_tagway = tagway.read_port()
 485         m.submodules.wr_tagway = wr_tagway = tagway.write_port(
 486                                     granularity=TLB_EA_TAG_BITS)
 487
 488         pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
 489         m.submodules.rd_pteway = rd_pteway = pteway.read_port()
 490         m.submodules.wr_pteway = wr_pteway = pteway.write_port(
 491                                     granularity=TLB_PTE_BITS)
 492
 493         # commented out for now, can be put in if Memory.reset can be
 494         # used for tlbie&doall to reset the entire Memory to zero in 1 cycle
 495         #validm = Memory(depth=TLB_SET_SIZE, width=TLB_NUM_WAYS)
 496         #m.submodules.rd_valid = rd_valid = validm.read_port()
 497         #m.submodules.wr_valid = wr_valid = validm.write_port(
 498                                     #granularity=1)
 499
 500         # connect up read and write addresses to Valid/PTE/TAG SRAMs
 501         m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
 502         m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
 503         #m.d.comb += rd_valid.addr.eq(self.tlb_read_index)
 504         m.d.comb += wr_tagway.addr.eq(tlb_req_index)
 505         m.d.comb += wr_pteway.addr.eq(tlb_req_index)
 506         #m.d.comb += wr_valid.addr.eq(tlb_req_index)
 507
 508         updated  = Signal()
 509         v_updated  = Signal()
 510         tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
 511         db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
 512         pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
 513         dv = Signal(TLB_NUM_WAYS) # tlb_way_valids_t
 514
 515         comb += dv.eq(dtlb_valid[tlb_req_index])
 516         comb += db_out.eq(dv)
 517
 518         with m.If(self.tlbie & self.doall):
 519             # clear all valid bits at once
 520             # XXX hmmm, validm _could_ use Memory reset here...
 521             for i in range(TLB_SET_SIZE):
 522                 sync += dtlb_valid[i].eq(0)
 523         with m.Elif(self.tlbie):
 524             # invalidate just the hit_way
 525             with m.If(self.tlb_hit.valid):
 526                 comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
 527                 comb += v_updated.eq(1)
 528         with m.Elif(self.tlbwe):
 529             # write to the requested tag and PTE
 530             comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
 531             comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
 532             # set valid bit
 533             comb += db_out.bit_select(self.repl_way, 1).eq(1)
 534
 535             comb += updated.eq(1)
 536             comb += v_updated.eq(1)
 537
 538         # above, sometimes valid is requested to be updated but data not
 539         # therefore split them out, here.  note the granularity thing matches
 540         # with the shift-up of the eatag/pte_data into the correct TLB way.
 541         # thus is it not necessary to write the entire lot, just the portion
 542         # being altered: hence writing the *old* copy of the row is not needed
 543         with m.If(updated): # PTE and TAG to be written
 544             comb += wr_pteway.data.eq(pb_out)
 545             comb += wr_pteway.en.eq(1<<self.repl_way)
 546             comb += wr_tagway.data.eq(tb_out)
 547             comb += wr_tagway.en.eq(1<<self.repl_way)
 548         with m.If(v_updated): # Valid to be written
 549             sync += dtlb_valid[tlb_req_index].eq(db_out)
 550             #comb += wr_valid.data.eq(db_out)
 551             #comb += wr_valid.en.eq(1<<self.repl_way)
 552
 553         # select one TLB way, use a register here
 554         r_tlb_way        = TLBRecord("r_tlb_way")
 555         r_delay = Signal()
 556         sync += r_delay.eq(self.tlb_read)
 557         with m.If(self.tlb_read):
 558             sync += self.tlb_way.valid.eq(dtlb_valid[self.tlb_read_index])
 559         with m.If(r_delay):
 560             # on one clock delay, output the contents of the read port(s)
 561             # comb += self.tlb_way.valid.eq(rd_valid.data)
 562             comb += self.tlb_way.tag.eq(rd_tagway.data)
 563             comb += self.tlb_way.pte.eq(rd_pteway.data)
 564             # and also capture the (delayed) output...
 565             #sync += r_tlb_way.valid.eq(rd_valid.data)
 566             sync += r_tlb_way.tag.eq(rd_tagway.data)
 567             sync += r_tlb_way.pte.eq(rd_pteway.data)
 568         with m.Else():
 569             # ... so that the register can output it when no read is requested
 570             # it's rather overkill but better to be safe than sorry
 571             comb += self.tlb_way.tag.eq(r_tlb_way.tag)
 572             comb += self.tlb_way.pte.eq(r_tlb_way.pte)
 573             #comb += self.tlb_way.eq(r_tlb_way)
 574
 575         return m
 576
 577
 578 class DCachePendingHit(Elaboratable):
 579
 580     def __init__(self, tlb_way,
 581                       cache_i_validdx, cache_tag_set,
 582                     req_addr):
 583
 584         self.go          = Signal()
 585         self.virt_mode   = Signal()
 586         self.is_hit      = Signal()
 587         self.tlb_hit      = TLBHit("tlb_hit")
 588         self.hit_way     = Signal(WAY_BITS)
 589         self.rel_match   = Signal()
 590         self.req_index   = Signal(INDEX_BITS)
 591         self.reload_tag  = Signal(TAG_BITS)
 592
 593         self.tlb_way = tlb_way
 594         self.cache_i_validdx = cache_i_validdx
 595         self.cache_tag_set = cache_tag_set
 596         self.req_addr = req_addr
 597
 598     def elaborate(self, platform):
 599         m = Module()
 600         comb = m.d.comb
 601         sync = m.d.sync
 602
 603         go = self.go
 604         virt_mode = self.virt_mode
 605         is_hit = self.is_hit
 606         tlb_way = self.tlb_way
 607         cache_i_validdx = self.cache_i_validdx
 608         cache_tag_set = self.cache_tag_set
 609         req_addr = self.req_addr
 610         tlb_hit = self.tlb_hit
 611         hit_way = self.hit_way
 612         rel_match = self.rel_match
 613         req_index = self.req_index
 614         reload_tag = self.reload_tag
 615
 616         hit_set     = Array(Signal(name="hit_set_%d" % i) \
 617                                   for i in range(TLB_NUM_WAYS))
 618         rel_matches = Array(Signal(name="rel_matches_%d" % i) \
 619                                     for i in range(TLB_NUM_WAYS))
 620         hit_way_set = HitWaySet()
 621
 622         # Test if pending request is a hit on any way
 623         # In order to make timing in virtual mode,
 624         # when we are using the TLB, we compare each
 625         # way with each of the real addresses from each way of
 626         # the TLB, and then decide later which match to use.
 627
 628         with m.If(virt_mode):
 629             for j in range(TLB_NUM_WAYS): # tlb_num_way_t
 630                 s_tag       = Signal(TAG_BITS, name="s_tag%d" % j)
 631                 s_hit       = Signal(name="s_hit%d" % j)
 632                 s_pte       = Signal(TLB_PTE_BITS, name="s_pte%d" % j)
 633                 s_ra        = Signal(REAL_ADDR_BITS, name="s_ra%d" % j)
 634                 # read the PTE, calc the Real Address, get tge tag
 635                 comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
 636                 comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
 637                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 638                 comb += s_tag.eq(get_tag(s_ra))
 639                 # for each way check tge tag against the cache tag set
 640                 for i in range(NUM_WAYS): # way_t
 641                     is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
 642                     comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 643                                   (read_tag(i, cache_tag_set) == s_tag)
 644                                   & (tlb_way.valid[j]))
 645                     with m.If(is_tag_hit):
 646                         comb += hit_way_set[j].eq(i)
 647                         comb += s_hit.eq(1)
 648                 comb += hit_set[j].eq(s_hit)
 649                 comb += rel_matches[j].eq(s_tag == reload_tag)
 650             with m.If(tlb_hit.valid):
 651                 comb += is_hit.eq(hit_set[tlb_hit.way])
 652                 comb += hit_way.eq(hit_way_set[tlb_hit.way])
 653                 comb += rel_match.eq(rel_matches[tlb_hit.way])
 654         with m.Else():
 655             s_tag       = Signal(TAG_BITS)
 656             comb += s_tag.eq(get_tag(req_addr))
 657             for i in range(NUM_WAYS): # way_t
 658                 is_tag_hit = Signal(name="is_tag_hit_%d" % i)
 659                 comb += is_tag_hit.eq(go & cache_i_validdx[i] &
 660                           (read_tag(i, cache_tag_set) == s_tag))
 661                 with m.If(is_tag_hit):
 662                     comb += hit_way.eq(i)
 663                     comb += is_hit.eq(1)
 664             with m.If(s_tag == reload_tag):
 665                 comb += rel_match.eq(1)
 666
 667         return m
 668
 669
 670 class DCache(Elaboratable):
 671     """Set associative dcache write-through
 672
 673     TODO (in no specific order):
 674     * See list in icache.vhdl
 675     * Complete load misses on the cycle when WB data comes instead of
 676       at the end of line (this requires dealing with requests coming in
 677       while not idle...)
 678     """
 679     def __init__(self, pspec=None):
 680         self.d_in      = LoadStore1ToDCacheType("d_in")
 681         self.d_out     = DCacheToLoadStore1Type("d_out")
 682
 683         self.m_in      = MMUToDCacheType("m_in")
 684         self.m_out     = DCacheToMMUType("m_out")
 685
 686         self.stall_out = Signal()
 687
 688         # standard naming (wired to non-standard for compatibility)
 689         self.bus = Interface(addr_width=32,
 690                             data_width=64,
 691                             granularity=8,
 692                             features={'stall'},
 693                             alignment=0,
 694                             name="dcache")
 695
 696         self.log_out   = Signal(20)
 697
 698         # test if microwatt compatibility is to be enabled
 699         self.microwatt_compat = (hasattr(pspec, "microwatt_compat") and
 700                                  (pspec.microwatt_compat == True))
 701
 702     def stage_0(self, m, r0, r1, r0_full):
 703         """Latch the request in r0.req as long as we're not stalling
 704         """
 705         comb = m.d.comb
 706         sync = m.d.sync
 707         d_in, d_out, m_in = self.d_in, self.d_out, self.m_in
 708
 709         r = RegStage0("stage0")
 710
 711         # TODO, this goes in unit tests and formal proofs
 712         with m.If(d_in.valid & m_in.valid):
 713             sync += Display("request collision loadstore vs MMU")
 714
 715         with m.If(m_in.valid):
 716             comb += r.req.valid.eq(1)
 717             comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))# no invalidate
 718             comb += r.req.dcbz.eq(0)
 719             comb += r.req.nc.eq(0)
 720             comb += r.req.reserve.eq(0)
 721             comb += r.req.virt_mode.eq(0)
 722             comb += r.req.priv_mode.eq(1)
 723             comb += r.req.addr.eq(m_in.addr)
 724             comb += r.req.data.eq(m_in.pte)
 725             comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 726             comb += r.tlbie.eq(m_in.tlbie)
 727             comb += r.doall.eq(m_in.doall)
 728             comb += r.tlbld.eq(m_in.tlbld)
 729             comb += r.mmu_req.eq(1)
 730             comb += r.d_valid.eq(1)
 731             m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
 732                                  m_in.addr, m_in.pte, r.req.load)
 733
 734         with m.Else():
 735             comb += r.req.eq(d_in)
 736             comb += r.req.data.eq(0)
 737             comb += r.tlbie.eq(0)
 738             comb += r.doall.eq(0)
 739             comb += r.tlbld.eq(0)
 740             comb += r.mmu_req.eq(0)
 741             comb += r.d_valid.eq(0)
 742
 743         with m.If((~r1.full & ~d_in.hold) | ~r0_full):
 744             sync += r0.eq(r)
 745             sync += r0_full.eq(r.req.valid)
 746         with m.Elif(~r0.d_valid):
 747             # Sample data the cycle after a request comes in from loadstore1.
 748             # If another request has come in already then the data will get
 749             # put directly into req.data below.
 750             sync += r0.req.data.eq(d_in.data)
 751             sync += r0.d_valid.eq(1)
 752         with m.If(d_in.valid):
 753             m.d.sync += Display("    DCACHE req cache "
 754                                 "virt %d addr %x data %x ld %d",
 755                                  r.req.virt_mode, r.req.addr,
 756                                  r.req.data, r.req.load)
 757
 758     def tlb_read(self, m, r0_stall, tlb_way):
 759         """TLB
 760         Operates in the second cycle on the request latched in r0.req.
 761         TLB updates write the entry at the end of the second cycle.
 762         """
 763         comb = m.d.comb
 764         sync = m.d.sync
 765         m_in, d_in = self.m_in, self.d_in
 766
 767         addrbits = Signal(TLB_SET_BITS)
 768
 769         amin = TLB_LG_PGSZ
 770         amax = TLB_LG_PGSZ + TLB_SET_BITS
 771
 772         with m.If(m_in.valid):
 773             comb += addrbits.eq(m_in.addr[amin : amax])
 774         with m.Else():
 775             comb += addrbits.eq(d_in.addr[amin : amax])
 776
 777         # If we have any op and the previous op isn't finished,
 778         # then keep the same output for next cycle.
 779         d = self.dtlb_update
 780         comb += d.tlb_read_index.eq(addrbits)
 781         comb += d.tlb_read.eq(~r0_stall)
 782         comb += tlb_way.eq(d.tlb_way)
 783
 784     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
 785         """Generate TLB PLRUs
 786         """
 787         comb = m.d.comb
 788         sync = m.d.sync
 789
 790         if TLB_NUM_WAYS == 0:
 791             return
 792
 793         # suite of PLRUs with a selection and output mechanism
 794         tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
 795         m.submodules.tlb_plrus = tlb_plrus
 796         comb += tlb_plrus.way.eq(r1.tlb_hit.way)
 797         comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
 798         comb += tlb_plrus.index.eq(r1.tlb_hit_index)
 799         comb += tlb_plrus.isel.eq(tlb_req_index) # select victim
 800         comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim
 801
 802     def tlb_search(self, m, tlb_req_index, r0, r0_valid,
 803                    tlb_way,
 804                    pte, tlb_hit, valid_ra, perm_attr, ra):
 805
 806         comb = m.d.comb
 807
 808         hitway = Signal(TLB_WAY_BITS)
 809         hit    = Signal()
 810         eatag  = Signal(TLB_EA_TAG_BITS)
 811
 812         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 813         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 814         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 815
 816         for i in range(TLB_NUM_WAYS):
 817             is_tag_hit = Signal(name="is_tag_hit%d" % i)
 818             tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
 819             comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
 820             comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
 821             with m.If(is_tag_hit):
 822                 comb += hitway.eq(i)
 823                 comb += hit.eq(1)
 824
 825         comb += tlb_hit.valid.eq(hit & r0_valid)
 826         comb += tlb_hit.way.eq(hitway)
 827
 828         with m.If(tlb_hit.valid):
 829             comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
 830         comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)
 831
 832         with m.If(r0.req.virt_mode):
 833             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 834                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 835                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 836             comb += perm_attr.reference.eq(pte[8])
 837             comb += perm_attr.changed.eq(pte[7])
 838             comb += perm_attr.nocache.eq(pte[5])
 839             comb += perm_attr.priv.eq(pte[3])
 840             comb += perm_attr.rd_perm.eq(pte[2])
 841             comb += perm_attr.wr_perm.eq(pte[1])
 842         with m.Else():
 843             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 844                               r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 845             comb += perm_attr.reference.eq(1)
 846             comb += perm_attr.changed.eq(1)
 847             comb += perm_attr.nocache.eq(0)
 848             comb += perm_attr.priv.eq(1)
 849             comb += perm_attr.rd_perm.eq(1)
 850             comb += perm_attr.wr_perm.eq(1)
 851
 852         with m.If(valid_ra):
 853             m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
 854                                 r0.req.virt_mode, tlb_hit.valid, ra, pte)
 855             m.d.sync += Display("       perm ref=%d", perm_attr.reference)
 856             m.d.sync += Display("       perm chg=%d", perm_attr.changed)
 857             m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
 858             m.d.sync += Display("       perm prv=%d", perm_attr.priv)
 859             m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
 860             m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)
 861
 862     def tlb_update(self, m, r0_valid, r0, tlb_req_index,
 863                     tlb_hit, tlb_plru_victim):
 864
 865         comb = m.d.comb
 866         sync = m.d.sync
 867
 868         tlbie    = Signal()
 869         tlbwe    = Signal()
 870
 871         comb += tlbie.eq(r0_valid & r0.tlbie)
 872         comb += tlbwe.eq(r0_valid & r0.tlbld)
 873
 874         d = self.dtlb_update
 875
 876         comb += d.tlbie.eq(tlbie)
 877         comb += d.tlbwe.eq(tlbwe)
 878         comb += d.doall.eq(r0.doall)
 879         comb += d.tlb_hit.eq(tlb_hit)
 880         comb += d.tlb_req_index.eq(tlb_req_index)
 881
 882         with m.If(tlb_hit.valid):
 883             comb += d.repl_way.eq(tlb_hit.way)
 884         with m.Else():
 885             comb += d.repl_way.eq(tlb_plru_victim)
 886         comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 887         comb += d.pte_data.eq(r0.req.data)
 888
 889     def maybe_plrus(self, m, r1, plru_victim):
 890         """Generate PLRUs
 891         """
 892         comb = m.d.comb
 893         sync = m.d.sync
 894
 895         if TLB_NUM_WAYS == 0:
 896             return
 897
 898         # suite of PLRUs with a selection and output mechanism
 899         m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
 900         comb += plrus.way.eq(r1.hit_way)
 901         comb += plrus.valid.eq(r1.cache_hit)
 902         comb += plrus.index.eq(r1.hit_index)
 903         comb += plrus.isel.eq(r1.store_index) # select victim
 904         comb += plru_victim.eq(plrus.o_index) # selected victim
 905
 906     def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
 907         """Cache tag RAM read port
 908         """
 909         comb = m.d.comb
 910         sync = m.d.sync
 911         m_in, d_in = self.m_in, self.d_in
 912
 913         index = Signal(INDEX_BITS)
 914
 915         with m.If(r0_stall):
 916             comb += index.eq(req_index)
 917         with m.Elif(m_in.valid):
 918             comb += index.eq(get_index(m_in.addr))
 919         with m.Else():
 920             comb += index.eq(get_index(d_in.addr))
 921         sync += cache_tag_set.eq(cache_tags[index].tag)
 922
 923     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 924                        r0_valid, r1, cache_tags, replace_way,
 925                        use_forward1_next, use_forward2_next,
 926                        req_hit_way, plru_victim, rc_ok, perm_attr,
 927                        valid_ra, perm_ok, access_ok, req_op, req_go,
 928                        tlb_hit, tlb_way, cache_tag_set,
 929                        cancel_store, req_same_tag, r0_stall, early_req_row):
 930         """Cache request parsing and hit detection
 931         """
 932
 933         comb = m.d.comb
 934         m_in, d_in = self.m_in, self.d_in
 935
 936         is_hit      = Signal()
 937         hit_way     = Signal(WAY_BITS)
 938         op          = Signal(Op)
 939         opsel       = Signal(3)
 940         go          = Signal()
 941         nc          = Signal()
 942         cache_i_validdx = Signal(NUM_WAYS)
 943
 944         # Extract line, row and tag from request
 945         comb += req_index.eq(get_index(r0.req.addr))
 946         comb += req_row.eq(get_row(r0.req.addr))
 947         comb += req_tag.eq(get_tag(ra))
 948
 949         if False: # display on comb is a bit... busy.
 950             comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
 951                     r0.req.addr, ra, req_index, req_tag, req_row)
 952
 953         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 954         comb += cache_i_validdx.eq(cache_tags[req_index].valid)
 955
 956         m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
 957                                             cache_i_validdx, cache_tag_set,
 958                                             r0.req.addr)
 959         comb += dc.tlb_hit.eq(tlb_hit)
 960         comb += dc.reload_tag.eq(r1.reload_tag)
 961         comb += dc.virt_mode.eq(r0.req.virt_mode)
 962         comb += dc.go.eq(go)
 963         comb += dc.req_index.eq(req_index)
 964
 965         comb += is_hit.eq(dc.is_hit)
 966         comb += hit_way.eq(dc.hit_way)
 967         comb += req_same_tag.eq(dc.rel_match)
 968
 969         # See if the request matches the line currently being reloaded
 970         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 971                   (req_index == r1.store_index) & req_same_tag):
 972             # For a store, consider this a hit even if the row isn't
 973             # valid since it will be by the time we perform the store.
 974             # For a load, check the appropriate row valid bit.
 975             rrow = Signal(ROW_LINE_BITS)
 976             comb += rrow.eq(req_row)
 977             valid = r1.rows_valid[rrow]
 978             comb += is_hit.eq((~r0.req.load) | valid)
 979             comb += hit_way.eq(replace_way)
 980
 981         # Whether to use forwarded data for a load or not
 982         with m.If((get_row(r1.req.real_addr) == req_row) &
 983                   (r1.req.hit_way == hit_way)):
 984             # Only need to consider r1.write_bram here, since if we
 985             # are writing refill data here, then we don't have a
 986             # cache hit this cycle on the line being refilled.
 987             # (There is the possibility that the load following the
 988             # load miss that started the refill could be to the old
 989             # contents of the victim line, since it is a couple of
 990             # cycles after the refill starts before we see the updated
 991             # cache tag. In that case we don't use the bypass.)
 992             comb += use_forward1_next.eq(r1.write_bram)
 993         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 994             comb += use_forward2_next.eq(r1.forward_valid1)
 995
 996         # The way that matched on a hit
 997         comb += req_hit_way.eq(hit_way)
 998
 999         # The way to replace on a miss
1000         with m.If(r1.write_tag):
1001             comb += replace_way.eq(plru_victim)
1002         with m.Else():
1003             comb += replace_way.eq(r1.store_way)
1004
1005         # work out whether we have permission for this access
1006         # NB we don't yet implement AMR, thus no KUAP
1007         comb += rc_ok.eq(perm_attr.reference
1008                          & (r0.req.load | perm_attr.changed))
1009         comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
1010                            (perm_attr.wr_perm |
1011                               (r0.req.load & perm_attr.rd_perm)))
1012         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
1013
1014         # Combine the request and cache hit status to decide what
1015         # operation needs to be done
1016         comb += nc.eq(r0.req.nc | perm_attr.nocache)
1017         comb += op.eq(Op.OP_NONE)
1018         with m.If(go):
1019             with m.If(~access_ok):
1020                 m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
1021                                  valid_ra, perm_ok, rc_ok)
1022                 comb += op.eq(Op.OP_BAD)
1023             with m.Elif(cancel_store):
1024                 m.d.sync += Display("DCACHE cancel store")
1025                 comb += op.eq(Op.OP_STCX_FAIL)
1026             with m.Else():
1027                 m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
1028                                  valid_ra, nc, r0.req.load)
1029                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
1030                 with m.Switch(opsel):
1031                     with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
1032                     with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
1033                     with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
1034                     with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
1035                     with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
1036                     with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
1037                     with m.Case(0b011): comb += op.eq(Op.OP_BAD)
1038                     with m.Case(0b111): comb += op.eq(Op.OP_BAD)
1039         comb += req_op.eq(op)
1040         comb += req_go.eq(go)
1041
1042         # Version of the row number that is valid one cycle earlier
1043         # in the cases where we need to read the cache data BRAM.
1044         # If we're stalling then we need to keep reading the last
1045         # row requested.
1046         with m.If(~r0_stall):
1047             with m.If(m_in.valid):
1048                 comb += early_req_row.eq(get_row(m_in.addr))
1049             with m.Else():
1050                 comb += early_req_row.eq(get_row(d_in.addr))
1051         with m.Else():
1052             comb += early_req_row.eq(req_row)
1053
1054     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
1055                          r0_valid, r0, reservation):
1056         """Handle load-with-reservation and store-conditional instructions
1057         """
1058         comb = m.d.comb
1059
1060         with m.If(r0_valid & r0.req.reserve):
1061             # XXX generate alignment interrupt if address
1062             # is not aligned XXX or if r0.req.nc = '1'
1063             with m.If(r0.req.load):
1064                 comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
1065             with m.Else():
1066                 comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
1067                 with m.If((~reservation.valid) |
1068                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
1069                     comb += cancel_store.eq(1)
1070
1071     def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1072                         reservation, r0):
1073         comb = m.d.comb
1074         sync = m.d.sync
1075
1076         with m.If(r0_valid & access_ok):
1077             with m.If(clear_rsrv):
1078                 sync += reservation.valid.eq(0)
1079             with m.Elif(set_rsrv):
1080                 sync += reservation.valid.eq(1)
1081                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
1082
1083     def writeback_control(self, m, r1, cache_out_row):
1084         """Return data for loads & completion control logic
1085         """
1086         comb = m.d.comb
1087         sync = m.d.sync
1088         d_out, m_out = self.d_out, self.m_out
1089
1090         data_out = Signal(64)
1091         data_fwd = Signal(64)
1092
1093         # Use the bypass if are reading the row that was
1094         # written 1 or 2 cycles ago, including for the
1095         # slow_valid = 1 case (i.e. completing a load
1096         # miss or a non-cacheable load).
1097         with m.If(r1.use_forward1):
1098             comb += data_fwd.eq(r1.forward_data1)
1099         with m.Else():
1100             comb += data_fwd.eq(r1.forward_data2)
1101
1102         comb += data_out.eq(cache_out_row)
1103
1104         for i in range(8):
1105             with m.If(r1.forward_sel[i]):
1106                 dsel = data_fwd.word_select(i, 8)
1107                 comb += data_out.word_select(i, 8).eq(dsel)
1108
1109         # DCache output to LoadStore
1110         comb += d_out.valid.eq(r1.ls_valid)
1111         comb += d_out.data.eq(data_out)
1112         comb += d_out.store_done.eq(~r1.stcx_fail)
1113         comb += d_out.error.eq(r1.ls_error)
1114         comb += d_out.cache_paradox.eq(r1.cache_paradox)
1115
1116         # Outputs to MMU
1117         comb += m_out.done.eq(r1.mmu_done)
1118         comb += m_out.err.eq(r1.mmu_error)
1119         comb += m_out.data.eq(data_out)
1120
1121         # We have a valid load or store hit or we just completed
1122         # a slow op such as a load miss, a NC load or a store
1123         #
1124         # Note: the load hit is delayed by one cycle. However it
1125         # can still not collide with r.slow_valid (well unless I
1126         # miscalculated) because slow_valid can only be set on a
1127         # subsequent request and not on its first cycle (the state
1128         # machine must have advanced), which makes slow_valid
1129         # at least 2 cycles from the previous hit_load_valid.
1130
1131         # Sanity: Only one of these must be set in any given cycle
1132
1133         if False: # TODO: need Display to get this to work
1134             assert (r1.slow_valid & r1.stcx_fail) != 1, \
1135             "unexpected slow_valid collision with stcx_fail"
1136
1137             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
1138              "unexpected hit_load_delayed collision with slow_valid"
1139
1140         with m.If(~r1.mmu_req):
1141             # Request came from loadstore1...
1142             # Load hit case is the standard path
1143             with m.If(r1.hit_load_valid):
1144                 sync += Display("completing load hit data=%x", data_out)
1145
1146             # error cases complete without stalling
1147             with m.If(r1.ls_error):
1148                 with m.If(r1.dcbz):
1149                     sync += Display("completing dcbz with error")
1150                 with m.Else():
1151                     sync += Display("completing ld/st with error")
1152
1153             # Slow ops (load miss, NC, stores)
1154             with m.If(r1.slow_valid):
1155                 sync += Display("completing store or load miss adr=%x data=%x",
1156                                 r1.req.real_addr, data_out)
1157
1158         with m.Else():
1159             # Request came from MMU
1160             with m.If(r1.hit_load_valid):
1161                 sync += Display("completing load hit to MMU, data=%x",
1162                                 m_out.data)
1163             # error cases complete without stalling
1164             with m.If(r1.mmu_error):
1165                 sync += Display("combpleting MMU ld with error")
1166
1167             # Slow ops (i.e. load miss)
1168             with m.If(r1.slow_valid):
1169                 sync += Display("completing MMU load miss, adr=%x data=%x",
1170                                 r1.req.real_addr, m_out.data)
1171
1172     def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
1173         """rams
1174         Generate a cache RAM for each way. This handles the normal
1175         reads, writes from reloads and the special store-hit update
1176         path as well.
1177
1178         Note: the BRAMs have an extra read buffer, meaning the output
1179         is pipelined an extra cycle. This differs from the
1180         icache. The writeback logic needs to take that into
1181         account by using 1-cycle delayed signals for load hits.
1182         """
1183         comb = m.d.comb
1184         bus = self.bus
1185
1186         # a Binary-to-Unary one-hots here.  replace-way one-hot is gated
1187         # (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
1188         m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
1189         comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
1190                    ~r1.write_bram))
1191         comb += rwe.i.eq(replace_way)
1192
1193         m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
1194         comb += hwe.i.eq(r1.hit_way)
1195
1196         # this one is gated with write_bram, and replace_way_e can never be
1197         # set at the same time.  that means that do_write can OR the outputs
1198         m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
1199         comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
1200         comb += hre.i.eq(r1.req.hit_way)
1201
1202         # common Signals
1203         do_read  = Signal()
1204         wr_addr  = Signal(ROW_BITS)
1205         wr_data  = Signal(WB_DATA_BITS)
1206         wr_sel   = Signal(ROW_SIZE)
1207         rd_addr  = Signal(ROW_BITS)
1208
1209         comb += do_read.eq(1) # always enable
1210         comb += rd_addr.eq(early_req_row)
1211
1212         # Write mux:
1213         #
1214         # Defaults to wishbone read responses (cache refill)
1215         #
1216         # For timing, the mux on wr_data/sel/addr is not
1217         # dependent on anything other than the current state.
1218
1219         with m.If(r1.write_bram):
1220             # Write store data to BRAM.  This happens one
1221             # cycle after the store is in r0.
1222             comb += wr_data.eq(r1.req.data)
1223             comb += wr_sel.eq(r1.req.byte_sel)
1224             comb += wr_addr.eq(get_row(r1.req.real_addr))
1225
1226         with m.Else():
1227             # Otherwise, we might be doing a reload or a DCBZ
1228             with m.If(r1.dcbz):
1229                 comb += wr_data.eq(0)
1230             with m.Else():
1231                 comb += wr_data.eq(bus.dat_r)
1232             comb += wr_addr.eq(r1.store_row)
1233             comb += wr_sel.eq(~0) # all 1s
1234
1235         # set up Cache Rams
1236         for i in range(NUM_WAYS):
1237             do_write = Signal(name="do_wr%d" % i)
1238             wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
1239             d_out   = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t
1240
1241             way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
1242             m.submodules["cacheram_%d" % i] = way
1243
1244             comb += way.rd_en.eq(do_read)
1245             comb += way.rd_addr.eq(rd_addr)
1246             comb += d_out.eq(way.rd_data_o)
1247             comb += way.wr_sel.eq(wr_sel_m)
1248             comb += way.wr_addr.eq(wr_addr)
1249             comb += way.wr_data.eq(wr_data)
1250
1251             # Cache hit reads
1252             with m.If(hwe.o[i]):
1253                 comb += cache_out_row.eq(d_out)
1254
1255             # these are mutually-exclusive via their Decoder-enablers
1256             # (note: Decoder-enable is inverted)
1257             comb += do_write.eq(hre.o[i] | rwe.o[i])
1258
1259             # Mask write selects with do_write since BRAM
1260             # doesn't have a global write-enable
1261             with m.If(do_write):
1262                 comb += wr_sel_m.eq(wr_sel)
1263
1264     # Cache hit synchronous machine for the easy case.
1265     # This handles load hits.
1266     # It also handles error cases (TLB miss, cache paradox)
1267     def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
1268                         req_hit_way, req_index, req_tag, access_ok,
1269                         tlb_hit, tlb_req_index):
1270         comb = m.d.comb
1271         sync = m.d.sync
1272
1273         with m.If(req_op != Op.OP_NONE):
1274             sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
1275                     req_op, r0.req.addr, r0.req.nc,
1276                     req_index, req_tag, req_hit_way)
1277
1278         with m.If(r0_valid):
1279             sync += r1.mmu_req.eq(r0.mmu_req)
1280
1281         # Fast path for load/store hits.
1282         # Set signals for the writeback controls.
1283         sync += r1.hit_way.eq(req_hit_way)
1284         sync += r1.hit_index.eq(req_index)
1285
1286         sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
1287         sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
1288                                 (req_op == Op.OP_STORE_HIT))
1289
1290         with m.If(req_op == Op.OP_BAD):
1291             sync += Display("Signalling ld/st error "
1292                             "ls_error=%i mmu_error=%i cache_paradox=%i",
1293                             ~r0.mmu_req,r0.mmu_req,access_ok)
1294             sync += r1.ls_error.eq(~r0.mmu_req)
1295             sync += r1.mmu_error.eq(r0.mmu_req)
1296             sync += r1.cache_paradox.eq(access_ok)
1297         with m.Else():
1298             sync += r1.ls_error.eq(0)
1299             sync += r1.mmu_error.eq(0)
1300             sync += r1.cache_paradox.eq(0)
1301
1302         sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)
1303
1304         # Record TLB hit information for updating TLB PLRU
1305         sync += r1.tlb_hit.eq(tlb_hit)
1306         sync += r1.tlb_hit_index.eq(tlb_req_index)
1307
1308     # Memory accesses are handled by this state machine:
1309     #
1310     #   * Cache load miss/reload (in conjunction with "rams")
1311     #   * Load hits for non-cachable forms
1312     #   * Stores (the collision case is handled in "rams")
1313     #
1314     # All wishbone requests generation is done here.
1315     # This machine operates at stage 1.
1316     def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
1317                     r0, replace_way,
1318                     req_hit_way, req_same_tag,
1319                     r0_valid, req_op, cache_tags, req_go, ra):
1320
1321         comb = m.d.comb
1322         sync = m.d.sync
1323         bus = self.bus
1324         d_in = self.d_in
1325
1326         req         = MemAccessRequest("mreq_ds")
1327
1328         req_row = Signal(ROW_BITS)
1329         req_idx = Signal(INDEX_BITS)
1330         req_tag = Signal(TAG_BITS)
1331         comb += req_idx.eq(get_index(req.real_addr))
1332         comb += req_row.eq(get_row(req.real_addr))
1333         comb += req_tag.eq(get_tag(req.real_addr))
1334
1335         sync += r1.use_forward1.eq(use_forward1_next)
1336         sync += r1.forward_sel.eq(0)
1337
1338         with m.If(use_forward1_next):
1339             sync += r1.forward_sel.eq(r1.req.byte_sel)
1340         with m.Elif(use_forward2_next):
1341             sync += r1.forward_sel.eq(r1.forward_sel1)
1342
1343         sync += r1.forward_data2.eq(r1.forward_data1)
1344         with m.If(r1.write_bram):
1345             sync += r1.forward_data1.eq(r1.req.data)
1346             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1347             sync += r1.forward_way1.eq(r1.req.hit_way)
1348             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1349             sync += r1.forward_valid1.eq(1)
1350         with m.Else():
1351             with m.If(r1.dcbz):
1352                 sync += r1.forward_data1.eq(0)
1353             with m.Else():
1354                 sync += r1.forward_data1.eq(bus.dat_r)
1355             sync += r1.forward_sel1.eq(~0) # all 1s
1356             sync += r1.forward_way1.eq(replace_way)
1357             sync += r1.forward_row1.eq(r1.store_row)
1358             sync += r1.forward_valid1.eq(0)
1359
1360         # One cycle pulses reset
1361         sync += r1.slow_valid.eq(0)
1362         sync += r1.write_bram.eq(0)
1363         sync += r1.inc_acks.eq(0)
1364         sync += r1.dec_acks.eq(0)
1365
1366         sync += r1.ls_valid.eq(0)
1367         # complete tlbies and TLB loads in the third cycle
1368         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1369
1370         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
1371             with m.If(r0.mmu_req):
1372                 sync += r1.mmu_done.eq(1)
1373             with m.Else():
1374                 sync += r1.ls_valid.eq(1)
1375
1376         with m.If(r1.write_tag):
1377             # Store new tag in selected way
1378             replace_way_onehot = Signal(NUM_WAYS)
1379             comb += replace_way_onehot.eq(1<<replace_way)
1380             for i in range(NUM_WAYS):
1381                 with m.If(replace_way_onehot[i]):
1382                     ct = Signal(TAG_RAM_WIDTH)
1383                     comb += ct.eq(cache_tags[r1.store_index].tag)
1384                     comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
1385                     sync += cache_tags[r1.store_index].tag.eq(ct)
1386             sync += r1.store_way.eq(replace_way)
1387             sync += r1.write_tag.eq(0)
1388
1389         # Take request from r1.req if there is one there,
1390         # else from req_op, ra, etc.
1391         with m.If(r1.full):
1392             comb += req.eq(r1.req)
1393         with m.Else():
1394             comb += req.op.eq(req_op)
1395             comb += req.valid.eq(req_go)
1396             comb += req.mmu_req.eq(r0.mmu_req)
1397             comb += req.dcbz.eq(r0.req.dcbz)
1398             comb += req.real_addr.eq(ra)
1399
1400             with m.If(r0.req.dcbz):
1401                 # force data to 0 for dcbz
1402                 comb += req.data.eq(0)
1403             with m.Elif(r0.d_valid):
1404                 comb += req.data.eq(r0.req.data)
1405             with m.Else():
1406                 comb += req.data.eq(d_in.data)
1407
1408             # Select all bytes for dcbz
1409             # and for cacheable loads
1410             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1411                 comb += req.byte_sel.eq(~0) # all 1s
1412             with m.Else():
1413                 comb += req.byte_sel.eq(r0.req.byte_sel)
1414             comb += req.hit_way.eq(req_hit_way)
1415             comb += req.same_tag.eq(req_same_tag)
1416
1417             # Store the incoming request from r0,
1418             # if it is a slow request
1419             # Note that r1.full = 1 implies req_op = OP_NONE
1420             with m.If((req_op == Op.OP_LOAD_MISS)
1421                       | (req_op == Op.OP_LOAD_NC)
1422                       | (req_op == Op.OP_STORE_MISS)
1423                       | (req_op == Op.OP_STORE_HIT)):
1424                 sync += r1.req.eq(req)
1425                 sync += r1.full.eq(1)
1426
1427         # Main state machine
1428         with m.Switch(r1.state):
1429
1430             with m.Case(State.IDLE):
1431                 sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
1432                 sync += r1.wb.sel.eq(req.byte_sel)
1433                 sync += r1.wb.dat.eq(req.data)
1434                 sync += r1.dcbz.eq(req.dcbz)
1435
1436                 # Keep track of our index and way
1437                 # for subsequent stores.
1438                 sync += r1.store_index.eq(req_idx)
1439                 sync += r1.store_row.eq(req_row)
1440                 sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
1441                 sync += r1.reload_tag.eq(req_tag)
1442                 sync += r1.req.same_tag.eq(1)
1443
1444                 with m.If(req.op == Op.OP_STORE_HIT):
1445                     sync += r1.store_way.eq(req.hit_way)
1446
1447                 #with m.If(r1.dec_acks):
1448                 #    sync += r1.acks_pending.eq(r1.acks_pending - 1)
1449
1450                 # Reset per-row valid bits,
1451                 # ready for handling OP_LOAD_MISS
1452                 for i in range(ROW_PER_LINE):
1453                     sync += r1.rows_valid[i].eq(0)
1454
1455                 with m.If(req_op != Op.OP_NONE):
1456                     sync += Display("cache op %d", req.op)
1457
1458                 with m.Switch(req.op):
1459                     with m.Case(Op.OP_LOAD_HIT):
1460                         # stay in IDLE state
1461                         pass
1462
1463                     with m.Case(Op.OP_LOAD_MISS):
1464                         sync += Display("cache miss real addr: %x " \
1465                                 "idx: %x tag: %x",
1466                                 req.real_addr, req_row, req_tag)
1467
1468                         # Start the wishbone cycle
1469                         sync += r1.wb.we.eq(0)
1470                         sync += r1.wb.cyc.eq(1)
1471                         sync += r1.wb.stb.eq(1)
1472
1473                         # Track that we had one request sent
1474                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1475                         sync += r1.write_tag.eq(1)
1476
1477                     with m.Case(Op.OP_LOAD_NC):
1478                         sync += r1.wb.cyc.eq(1)
1479                         sync += r1.wb.stb.eq(1)
1480                         sync += r1.wb.we.eq(0)
1481                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1482
1483                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1484                         with m.If(~req.dcbz):
1485                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1486                             sync += r1.acks_pending.eq(1)
1487                             sync += r1.full.eq(0)
1488                             sync += r1.slow_valid.eq(1)
1489
1490                             with m.If(req.mmu_req):
1491                                 sync += r1.mmu_done.eq(1)
1492                             with m.Else():
1493                                 sync += r1.ls_valid.eq(1)
1494
1495                             with m.If(req.op == Op.OP_STORE_HIT):
1496                                 sync += r1.write_bram.eq(1)
1497                         with m.Else():
1498                             # dcbz is handled much like a load miss except
1499                             # that we are writing to memory instead of reading
1500                             sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1501
1502                             with m.If(req.op == Op.OP_STORE_MISS):
1503                                 sync += r1.write_tag.eq(1)
1504
1505                         sync += r1.wb.we.eq(1)
1506                         sync += r1.wb.cyc.eq(1)
1507                         sync += r1.wb.stb.eq(1)
1508
1509                     # OP_NONE and OP_BAD do nothing
1510                     # OP_BAD & OP_STCX_FAIL were
1511                     # handled above already
1512                     with m.Case(Op.OP_NONE):
1513                         pass
1514                     with m.Case(Op.OP_BAD):
1515                         pass
1516                     with m.Case(Op.OP_STCX_FAIL):
1517                         pass
1518
1519             with m.Case(State.RELOAD_WAIT_ACK):
1520                 ld_stbs_done = Signal()
1521                 # Requests are all sent if stb is 0
1522                 comb += ld_stbs_done.eq(~r1.wb.stb)
1523
1524                 # If we are still sending requests, was one accepted?
1525                 with m.If((~bus.stall) & r1.wb.stb):
1526                     # That was the last word?  We are done sending.
1527                     # Clear stb and set ld_stbs_done so we can handle an
1528                     # eventual last ack on the same cycle.
1529                     # sigh - reconstruct wb adr with 3 extra 0s at front
1530                     wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
1531                     with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
1532                         sync += r1.wb.stb.eq(0)
1533                         comb += ld_stbs_done.eq(1)
1534
1535                     # Calculate the next row address in the current cache line
1536                     row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
1537                     comb += row.eq(r1.wb.adr)
1538                     sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)
1539
1540                 # Incoming acks processing
1541                 sync += r1.forward_valid1.eq(bus.ack)
1542                 with m.If(bus.ack):
1543                     srow = Signal(ROW_LINE_BITS)
1544                     comb += srow.eq(r1.store_row)
1545                     sync += r1.rows_valid[srow].eq(1)
1546
1547                     # If this is the data we were looking for,
1548                     # we can complete the request next cycle.
1549                     # Compare the whole address in case the
1550                     # request in r1.req is not the one that
1551                     # started this refill.
1552                     with m.If(req.valid & r1.req.same_tag &
1553                               ((r1.dcbz & r1.req.dcbz) |
1554                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1555                                 (r1.store_row == get_row(req.real_addr))):
1556                         sync += r1.full.eq(0)
1557                         sync += r1.slow_valid.eq(1)
1558                         with m.If(r1.mmu_req):
1559                             sync += r1.mmu_done.eq(1)
1560                         with m.Else():
1561                             sync += r1.ls_valid.eq(1)
1562                         sync += r1.forward_sel.eq(~0) # all 1s
1563                         sync += r1.use_forward1.eq(1)
1564
1565                     # Check for completion
1566                     with m.If(ld_stbs_done & is_last_row(r1.store_row,
1567                                                       r1.end_row_ix)):
1568                         # Complete wishbone cycle
1569                         sync += r1.wb.cyc.eq(0)
1570
1571                         # Cache line is now valid
1572                         cv = Signal(INDEX_BITS)
1573                         comb += cv.eq(cache_tags[r1.store_index].valid)
1574                         comb += cv.bit_select(r1.store_way, 1).eq(1)
1575                         sync += cache_tags[r1.store_index].valid.eq(cv)
1576
1577                         sync += r1.state.eq(State.IDLE)
1578                         sync += Display("cache valid set %x "
1579                                         "idx %d way %d",
1580                                          cv, r1.store_index, r1.store_way)
1581
1582                     # Increment store row counter
1583                     sync += r1.store_row.eq(next_row(r1.store_row))
1584
1585             with m.Case(State.STORE_WAIT_ACK):
1586                 st_stbs_done = Signal()
1587                 adjust_acks = Signal(3)
1588
1589                 comb += st_stbs_done.eq(~r1.wb.stb)
1590
1591                 with m.If(r1.inc_acks != r1.dec_acks):
1592                     with m.If(r1.inc_acks):
1593                         comb += adjust_acks.eq(r1.acks_pending + 1)
1594                     with m.Else():
1595                         comb += adjust_acks.eq(r1.acks_pending - 1)
1596                 with m.Else():
1597                     comb += adjust_acks.eq(r1.acks_pending)
1598
1599                 sync += r1.acks_pending.eq(adjust_acks)
1600
1601                 # Clear stb when slave accepted request
1602                 with m.If(~bus.stall):
1603                     # See if there is another store waiting
1604                     # to be done which is in the same real page.
1605                     with m.If(req.valid):
1606                         _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
1607                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
1608                         sync += r1.wb.dat.eq(req.data)
1609                         sync += r1.wb.sel.eq(req.byte_sel)
1610
1611                     with m.If((adjust_acks < 7) & req.same_tag &
1612                                 ((req.op == Op.OP_STORE_MISS) |
1613                                  (req.op == Op.OP_STORE_HIT))):
1614                         sync += r1.wb.stb.eq(1)
1615                         comb += st_stbs_done.eq(0)
1616                         sync += r1.store_way.eq(req.hit_way)
1617                         sync += r1.store_row.eq(get_row(req.real_addr))
1618
1619                         with m.If(req.op == Op.OP_STORE_HIT):
1620                             sync += r1.write_bram.eq(1)
1621                         sync += r1.full.eq(0)
1622                         sync += r1.slow_valid.eq(1)
1623
1624                         # Store requests never come from the MMU
1625                         sync += r1.ls_valid.eq(1)
1626                         comb += st_stbs_done.eq(0)
1627                         sync += r1.inc_acks.eq(1)
1628                     with m.Else():
1629                         sync += r1.wb.stb.eq(0)
1630                         comb += st_stbs_done.eq(1)
1631
1632                 # Got ack ? See if complete.
1633                 sync += Display("got ack %d %d stbs %d adjust_acks %d",
1634                                 bus.ack, bus.ack, st_stbs_done, adjust_acks)
1635                 with m.If(bus.ack):
1636                     with m.If(st_stbs_done & (adjust_acks == 1)):
1637                         sync += r1.state.eq(State.IDLE)
1638                         sync += r1.wb.cyc.eq(0)
1639                         sync += r1.wb.stb.eq(0)
1640                     sync += r1.dec_acks.eq(1)
1641
1642             with m.Case(State.NC_LOAD_WAIT_ACK):
1643                 # Clear stb when slave accepted request
1644                 with m.If(~bus.stall):
1645                     sync += r1.wb.stb.eq(0)
1646
1647                 # Got ack ? complete.
1648                 with m.If(bus.ack):
1649                     sync += r1.state.eq(State.IDLE)
1650                     sync += r1.full.eq(0)
1651                     sync += r1.slow_valid.eq(1)
1652
1653                     with m.If(r1.mmu_req):
1654                         sync += r1.mmu_done.eq(1)
1655                     with m.Else():
1656                         sync += r1.ls_valid.eq(1)
1657
1658                     sync += r1.forward_sel.eq(~0) # all 1s
1659                     sync += r1.use_forward1.eq(1)
1660                     sync += r1.wb.cyc.eq(0)
1661                     sync += r1.wb.stb.eq(0)
1662
1663     def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out):
1664
1665         sync = m.d.sync
1666         d_out, bus, log_out = self.d_out, self.bus, self.log_out
1667
1668         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
1669                                stall_out, req_op[:3], d_out.valid, d_out.error,
1670                                r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
1671                                r1.real_adr[3:6]))
1672
1673     def elaborate(self, platform):
1674
1675         m = Module()
1676         comb = m.d.comb
1677         d_in = self.d_in
1678
1679         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1680         cache_tags       = CacheTagArray()
1681         cache_tag_set    = Signal(TAG_RAM_WIDTH)
1682
1683         # TODO attribute ram_style : string;
1684         # TODO attribute ram_style of cache_tags : signal is "distributed";
1685
1686         """note: these are passed to nmigen.hdl.Memory as "attributes".
1687            don't know how, just that they are.
1688         """
1689         # TODO attribute ram_style of
1690         #  dtlb_tags : signal is "distributed";
1691         # TODO attribute ram_style of
1692         #  dtlb_ptes : signal is "distributed";
1693
1694         r0      = RegStage0("r0")
1695         r0_full = Signal()
1696
1697         r1 = RegStage1("r1")
1698
1699         reservation = Reservation()
1700
1701         # Async signals on incoming request
1702         req_index    = Signal(INDEX_BITS)
1703         req_row      = Signal(ROW_BITS)
1704         req_hit_way  = Signal(WAY_BITS)
1705         req_tag      = Signal(TAG_BITS)
1706         req_op       = Signal(Op)
1707         req_data     = Signal(64)
1708         req_same_tag = Signal()
1709         req_go       = Signal()
1710
1711         early_req_row     = Signal(ROW_BITS)
1712
1713         cancel_store      = Signal()
1714         set_rsrv          = Signal()
1715         clear_rsrv        = Signal()
1716
1717         r0_valid          = Signal()
1718         r0_stall          = Signal()
1719
1720         use_forward1_next = Signal()
1721         use_forward2_next = Signal()
1722
1723         cache_out_row     = Signal(WB_DATA_BITS)
1724
1725         plru_victim       = Signal(WAY_BITS)
1726         replace_way       = Signal(WAY_BITS)
1727
1728         # Wishbone read/write/cache write formatting signals
1729         bus_sel           = Signal(8)
1730
1731         # TLB signals
1732         tlb_way       = TLBRecord("tlb_way")
1733         tlb_req_index = Signal(TLB_SET_BITS)
1734         tlb_hit       = TLBHit("tlb_hit")
1735         pte           = Signal(TLB_PTE_BITS)
1736         ra            = Signal(REAL_ADDR_BITS)
1737         valid_ra      = Signal()
1738         perm_attr     = PermAttr("dc_perms")
1739         rc_ok         = Signal()
1740         perm_ok       = Signal()
1741         access_ok     = Signal()
1742
1743         tlb_plru_victim = Signal(TLB_WAY_BITS)
1744
1745         # we don't yet handle collisions between loadstore1 requests
1746         # and MMU requests
1747         comb += self.m_out.stall.eq(0)
1748
1749         # Hold off the request in r0 when r1 has an uncompleted request
1750         comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
1751         comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
1752         comb += self.stall_out.eq(r0_stall)
1753
1754         # deal with litex not doing wishbone pipeline mode
1755         # XXX in wrong way.  FIFOs are needed in the SRAM test
1756         # so that stb/ack match up. same thing done in icache.py
1757         if not self.microwatt_compat:
1758             comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
1759
1760         # Wire up wishbone request latch out of stage 1
1761         comb += self.bus.we.eq(r1.wb.we)
1762         comb += self.bus.adr.eq(r1.wb.adr)
1763         comb += self.bus.sel.eq(r1.wb.sel)
1764         comb += self.bus.stb.eq(r1.wb.stb)
1765         comb += self.bus.dat_w.eq(r1.wb.dat)
1766         comb += self.bus.cyc.eq(r1.wb.cyc)
1767
1768         # create submodule TLBUpdate
1769         m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
1770
1771         # call sub-functions putting everything together, using shared
1772         # signals established above
1773         self.stage_0(m, r0, r1, r0_full)
1774         self.tlb_read(m, r0_stall, tlb_way)
1775         self.tlb_search(m, tlb_req_index, r0, r0_valid,
1776                         tlb_way,
1777                         pte, tlb_hit, valid_ra, perm_attr, ra)
1778         self.tlb_update(m, r0_valid, r0, tlb_req_index,
1779                         tlb_hit, tlb_plru_victim)
1780         self.maybe_plrus(m, r1, plru_victim)
1781         self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
1782         self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
1783         self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
1784                            r0_valid, r1, cache_tags, replace_way,
1785                            use_forward1_next, use_forward2_next,
1786                            req_hit_way, plru_victim, rc_ok, perm_attr,
1787                            valid_ra, perm_ok, access_ok, req_op, req_go,
1788                            tlb_hit, tlb_way, cache_tag_set,
1789                            cancel_store, req_same_tag, r0_stall, early_req_row)
1790         self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
1791                            r0_valid, r0, reservation)
1792         self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
1793                            reservation, r0)
1794         self.writeback_control(m, r1, cache_out_row)
1795         self.rams(m, r1, early_req_row, cache_out_row, replace_way)
1796         self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
1797                         req_hit_way, req_index, req_tag, access_ok,
1798                         tlb_hit, tlb_req_index)
1799         self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
1800                     r0, replace_way,
1801                     req_hit_way, req_same_tag,
1802                          r0_valid, req_op, cache_tags, req_go, ra)
1803         #self.dcache_log(m, r1, valid_ra, tlb_hit, stall_out)
1804
1805         return m
1806
1807
1808 if __name__ == '__main__':
1809     dut = DCache()
1810     vl = rtlil.convert(dut, ports=[])
1811     with open("test_dcache.il", "w") as f:
1812         f.write(vl)