src/soc/experiment/dcache.py

   1 """DCache
   2
   3 based on Anton Blanchard microwatt dcache.vhdl
   4
   5 """
   6
from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable,
                    Cat, Repl, Array, Const)
from nmigen.cli import main
from nmigen.iocontrol import RecordObject
# NOTE(review): upstream nmigen ships log2_int in `nmigen.utils` -- confirm
# that this module path resolves in the target environment.
from nmigen.util import log2_int

from experiment.mem_types import (LoadStore1ToDCacheType,
                                  DCacheToLoadStore1Type,
                                  MMUToDCacheType,
                                  DCacheToMMUType)

# NOTE(review): the code below uses `WBMasterOut` and `WishboneMasterOut`,
# while `WbMasterOut` is what is imported here -- confirm the real name.
from experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                 WBAddrType, WBDataType, WBSelType,
                                 WbMasterOut, WBSlaveOut,
                                 WBMasterOutVector, WBSlaveOutVector,
                                 WBIOMasterOut, WBIOSlaveOut)
  25
  26 # TODO: make these parameters of DCache at some point
  27 LINE_SIZE = 64    # Line size in bytes
  28 NUM_LINES = 32    # Number of lines in a set
  29 NUM_WAYS = 4      # Number of ways
  30 TLB_SET_SIZE = 64 # L1 DTLB entries per set
  31 TLB_NUM_WAYS = 2  # L1 DTLB number of sets
  32 TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
  33 LOG_LENGTH = 0    # Non-zero to enable log data collection
  34
  35 # BRAM organisation: We never access more than
  36 #     -- wishbone_data_bits at a time so to save
  37 #     -- resources we make the array only that wide, and
  38 #     -- use consecutive indices for to make a cache "line"
  39 #     --
  40 #     -- ROW_SIZE is the width in bytes of the BRAM
  41 #     -- (based on WB, so 64-bits)
  42 ROW_SIZE = WB_DATA_BITS // 8;
  43
  44 # ROW_PER_LINE is the number of row (wishbone
  45 # transactions) in a line
  46 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
  47
  48 # BRAM_ROWS is the number of rows in BRAM needed
  49 # to represent the full dcache
  50 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
  51
  52
# Bit field counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
# (TAG_BITS rounded up to the next multiple of 8)
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |---|    | ROW_LINE_BITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (45)

# TAG_RAM_WIDTH is the width of one tag RAM row: one TAG_WIDTH slot per way
TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
 103
 104 def CacheTagArray():
 105     return Array(CacheTagSet() for x in range(NUM_LINES))
 106
 107 def CacheValidBitsArray():
 108     return Array(CacheWayValidBits() for x in range(NUM_LINES))
 109
 110 def RowPerLineValidArray():
 111     return Array(Signal() for x in range(ROW_PER_LINE))
 112
 113 # L1 TLB
 114 TLB_SET_BITS     = log2_int(TLB_SET_SIZE)
 115 TLB_WAY_BITS     = log2_int(TLB_NUM_WAYS)
 116 TLB_EA_TAG_BITS  = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
 117 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
 118 TLB_PTE_BITS     = 64
 119 TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS;
 120
 121 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
 122 assert (LINE_SIZE % 2) == 0, "LINE_SIZE not power of 2"
 123 assert (NUM_LINES % 2) == 0, "NUM_LINES not power of 2"
 124 assert (ROW_PER_LINE % 2) == 0, "ROW_PER_LINE not power of 2"
 125 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
 126 assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS), \
 127         "geometry bits don't add up"
 128 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
 129         "geometry bits don't add up"
 130 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
 131          "geometry bits don't add up"
 132 assert 64 == wishbone_data_bits, "Can't yet handle wb width that isn't 64-bits"
 133 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
 134
 135
 136 def TLBValidBitsArray():
 137     return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
 138
 139 def TLBTagsArray():
 140     return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
 141
 142 def TLBPtesArray():
 143     return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
 144
 145 def HitWaySet():
 146     return Array(Signal(NUM_WAYS) for x in range(TLB_NUM_WAYS))
 147
 148 # Cache RAM interface
 149 def CacheRamOut():
 150     return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))
 151
 152 # PLRU output interface
 153 def PLRUOut():
 154     return Array(Signal(WAY_BITS) for x in range(Index()))
 155
 156 # TLB PLRU output interface
 157 def TLBPLRUOut():
 158     return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
 159
 160 # Helper functions to decode incoming requests
 161 #
 162 # Return the cache line index (tag index) for an address
 163 def get_index(addr):
 164     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 165
 166 # Return the cache row index (data memory) for an address
 167 def get_row(addr):
 168     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 169
 170 # Return the index of a row within a line
 171 def get_row_of_line(row):
 172     row_v = Signal(ROW_BITS)
 173     row_v = Signal(row)
 174     return row_v[0:ROW_LINE_BITS]
 175
 176 # Returns whether this is the last row of a line
 177 def is_last_row_addr(addr, last):
 178     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 179
 180 # Returns whether this is the last row of a line
 181 def is_last_row(row, last):
 182     return get_row_of_line(row) == last
 183
 184 # Return the address of the next row in the current cache line
 185 def next_row_addr(addr):
 186     row_idx = Signal(ROW_LINE_BITS)
 187     result  = WBAddrType()
 188     # Is there no simpler way in VHDL to
 189     # generate that 3 bits adder ?
 190     row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS]
 191     row_idx = Signal(row_idx + 1)
 192     result = addr
 193     result[ROW_OFF_BITS:LINE_OFF_BITS] = row_idx
 194     return result
 195
 196 # Return the next row in the current cache line. We use a
 197 # dedicated function in order to limit the size of the
 198 # generated adder to be only the bits within a cache line
 199 # (3 bits with default settings)
 200 def next_row(row)
 201     row_v = row[0:ROW_LINE_BITS] + 1
 202     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 203
 204 # Get the tag value from the address
 205 def get_tag(addr):
 206     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 207
 208 # Read a tag from a tag memory row
 209 def read_tag(way, tagset):
 210     return tagset[way *TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]
 211
 212 # Read a TLB tag from a TLB tag memory row
 213 def read_tlb_tag(way, tags):
 214     j = way * TLB_EA_TAG_BITS
 215     return tags[j:j + TLB_EA_TAG_BITS]
 216
 217 # Write a TLB tag to a TLB tag memory row
 218 def write_tlb_tag(way, tags), tag):
 219     j = way * TLB_EA_TAG_BITS
 220     tags[j:j + TLB_EA_TAG_BITS] = tag
 221
 222 # Read a PTE from a TLB PTE memory row
 223 def read_tlb_pte(way, ptes):
 224     j = way * TLB_PTE_BITS
 225     return ptes[j:j + TLB_PTE_BITS]
 226
 227 def write_tlb_pte(way, ptes,newpte):
 228     j = way * TLB_PTE_BITS
 229     return ptes[j:j + TLB_PTE_BITS].eq(newpte)
 230
 231
# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    """Permission/attribute flags of one PTE, each a 1-bit signal.

    extract_perm_attr() fills these from PTE bits 8, 7, 5, 3, 2 and 1
    respectively (in declaration order).
    """
    def __init__(self):
        super().__init__()
        self.reference = Signal()  # tested by the "rc_ok" check
        self.changed   = Signal()  # tested by the "rc_ok" check for stores
        self.nocache   = Signal()  # OR-ed with the request's nc flag
        self.priv      = Signal()  # tested against the request's priv_mode
        self.rd_perm   = Signal()  # read permission
        self.wr_perm   = Signal()  # write permission
 242
 243
 244 def extract_perm_attr(pte):
 245     pa = PermAttr()
 246     pa.reference = pte[8]
 247     pa.changed   = pte[7]
 248     pa.nocache   = pte[5]
 249     pa.priv      = pte[3]
 250     pa.rd_perm   = pte[2]
 251     pa.wr_perm   = pte[1]
 252     return pa;
 253
 254
# Type of operation on a "valid" input
@unique
class Op(Enum):
    """Operation chosen for a request by DCache.dcache_request.

    The numeric values are what gets encoded when a member is assigned
    to a signal, so they must stay unique (enforced by @unique).
    """
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache
 266
 267
# Cache state machine
@unique
class State(Enum):
    """States of the cache state machine held in RegStage1.state."""
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
 275
 276
 277 # Dcache operations:
 278 #
 279 # In order to make timing, we use the BRAMs with
 280 # an output buffer, which means that the BRAM
 281 # output is delayed by an extra cycle.
 282 #
 283 # Thus, the dcache has a 2-stage internal pipeline
 284 # for cache hits with no stalls.
 285 #
 286 # All other operations are handled via stalling
 287 # in the first stage.
 288 #
 289 # The second stage can thus complete a hit at the same
 290 # time as the first stage emits a stall for a complex op.
 291 #
 292 # Stage 0 register, basically contains just the latched request
 293
class RegStage0(RecordObject):
    """Stage 0 register: the latched request plus its TLB-operation flags.

    DCache.stage_0 fills this either from the MMU port or from the
    load/store port.
    """
    def __init__(self):
        super().__init__()
        self.req     = LoadStore1ToDCacheType()  # the request itself
        self.tlbie   = Signal()  # copied from m_in.tlbie for MMU requests
        self.doall   = Signal()  # copied from m_in.doall for MMU requests
        self.tlbld   = Signal()  # copied from m_in.tlbld for MMU requests
        self.mmu_req = Signal() # indicates source of request
 302
 303
 304 class MemAccessRequest(RecordObject):
 305     def __init__(self):
 306         super().__init__()
 307         self.op        = Op()
 308         self.valid     = Signal()
 309         self.dcbz      = Signal()
 310         self.real_addr = Signal(REAL_ADDR_BITS)
 311         self.data      = Signal(64)
 312         self.byte_sel  = Signal(8)
 313         self.hit_way   = Signal(WAY_BITS)
 314         self.same_tag  = Signal()
 315         self.mmu_req   = Signal()
 316
 317
 318 # First stage register, contains state for stage 1 of load hits
 319 # and for the state machine used by all other operations
 320 class RegStage1(RecordObject):
 321     def __init__(self):
 322         super().__init__()
 323         # Info about the request
 324         self.full             = Signal() # have uncompleted request
 325         self.mmu_req          = Signal() # request is from MMU
 326         self.req              = MemAccessRequest()
 327
 328         # Cache hit state
 329         self.hit_way          = Signal(WAY_BITS)
 330         self.hit_load_valid   = Signal()
 331         self.hit_index        = Signal(NUM_LINES)
 332         self.cache_hit        = Signal()
 333
 334         # TLB hit state
 335         self.tlb_hit          = Signal()
 336         self.tlb_hit_way      = Signal(TLB_NUM_WAYS)
 337         self.tlb_hit_index    = Signal(TLB_WAY_BITS)
 338
 339         # 2-stage data buffer for data forwarded from writes to reads
 340         self.forward_data1    = Signal(64)
 341         self.forward_data2    = Signal(64)
 342         self.forward_sel1     = Signal(8)
 343         self.forward_valid1   = Signal()
 344         self.forward_way1     = Signal(WAY_BITS)
 345         self.forward_row1     = Signal(ROW_BITS)
 346         self.use_forward1     = Signal()
 347         self.forward_sel      = Signal(8)
 348
 349         # Cache miss state (reload state machine)
 350         self.state            = State()
 351         self.dcbz             = Signal()
 352         self.write_bram       = Signal()
 353         self.write_tag        = Signal()
 354         self.slow_valid       = Signal()
 355         self.wb               = WishboneMasterOut()
 356         self.reload_tag       = Signal(TAG_BITS)
 357         self.store_way        = Signal(WAY_BITS)
 358         self.store_row        = Signal(ROW_BITS)
 359         self.store_index      = Signal(INDEX_BITS)
 360         self.end_row_ix       = Signal(log2_int(ROW_LINE_BITS))
 361         self.rows_valid       = RowPerLineValidArray()
 362         self.acks_pending     = Signal(3)
 363         self.inc_acks         = Signal()
 364         self.dec_acks         = Signal()
 365
 366         # Signals to complete (possibly with error)
 367         self.ls_valid         = Signal()
 368         self.ls_error         = Signal()
 369         self.mmu_done         = Signal()
 370         self.mmu_error        = Signal()
 371         self.cache_paradox    = Signal()
 372
 373         # Signal to complete a failed stcx.
 374         self.stcx_fail        = Signal()
 375
 376
# Reservation information
class Reservation(RecordObject):
    """State of the current load-with-reservation, if any."""
    def __init__(self):
        super().__init__()
        self.valid = Signal()  # set/cleared by DCache.reservation_reg
        # line address: a 64-bit address without its LINE_OFF_BITS low bits
        self.addr  = Signal(64-LINE_OFF_BITS)
 383
 384
 385 class DCache(Elaboratable):
 386     """Set associative dcache write-through
 387     TODO (in no specific order):
 388     * See list in icache.vhdl
 389     * Complete load misses on the cycle when WB data comes instead of
 390       at the end of line (this requires dealing with requests coming in
 391       while not idle...)
 392     """
    def __init__(self):
        # load/store unit interface
        self.d_in      = LoadStore1ToDCacheType()
        self.d_out     = DCacheToLoadStore1Type()

        # MMU interface
        self.m_in      = MMUToDCacheType()
        self.m_out     = DCacheToMMUType()

        self.stall_out = Signal()

        # wishbone bus interface
        # NOTE(review): the import list names WbMasterOut, not
        # WBMasterOut -- confirm which spelling wb_types provides.
        self.wb_out    = WBMasterOut()
        self.wb_in     = WBSlaveOut()

        self.log_out   = Signal(20)
 406
 407     def stage_0(self, m, d_in, m_in):
 408         """Latch the request in r0.req as long as we're not stalling
 409         """
 410         comb = m.d.comb
 411         sync = m.d.sync
 412
 413         r = RegStage0()
 414
 415         # TODO, this goes in unit tests and formal proofs
 416         with m.If(~(d_in.valid & m_in.valid)):
 417             #sync += Display("request collision loadstore vs MMU")
 418             pass
 419
 420         with m.If(m_in.valid):
 421             sync += r.req.valid.eq(1)
 422             sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
 423             sync += r.req.dcbz.eq(0)
 424             sync += r.req.nc.eq(0)
 425             sync += r.req.reserve.eq(0)
 426             sync += r.req.virt_mode.eq(1)
 427             sync += r.req.priv_mode.eq(1)
 428             sync += r.req.addr.eq(m_in.addr)
 429             sync += r.req.data.eq(m_in.pte)
 430             sync += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
 431             sync += r.tlbie.eq(m_in.tlbie)
 432             sync += r.doall.eq(m_in.doall)
 433             sync += r.tlbld.eq(m_in.tlbld)
 434             sync += r.mmu_req.eq(1)
 435         with m.Else():
 436             sync += r.req.eq(d_in)
 437             sync += r.req.tlbie.eq(0)
 438             sync += r.req.doall.eq(0)
 439             sync += r.req.tlbd.eq(0)
 440             sync += r.req.mmu_req.eq(0)
 441             with m.If(~(r1.full & r0_full)):
 442                 sync += r0.eq(r)
 443                 sync += r0_full.eq(r.req.valid)
 444
 445     def tlb_read(self, m, m_in, d_in, r0_stall, tlb_valid_way,
 446                  tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
 447                  dtlb_tags, dtlb_ptes):
 448         """TLB
 449         Operates in the second cycle on the request latched in r0.req.
 450         TLB updates write the entry at the end of the second cycle.
 451         """
 452         comb = m.d.comb
 453         sync = m.d.sync
 454
 455         index    = Signal(TLB_SET_BITS)
 456         addrbits = Signal(TLB_SET_BITS)
 457
 458         amin = TLB_LG_PGSZ
 459         amax = TLB_LG_PGSZ + TLB_SET_BITS
 460
 461         with m.If(m_in.valid):
 462             comb += addrbits.eq(m_in.addr[amin : amax])
 463         with m.Else():
 464             comb += addrbits.eq(d_in.addr[amin : amax])
 465         comb += index.eq(addrbits)
 466
 467         # If we have any op and the previous op isn't finished,
 468         # then keep the same output for next cycle.
 469         with m.If(~r0_stall):
 470             sync += tlb_valid_way.eq(dtlb_valid_bits[index])
 471             sync += tlb_tag_way.eq(dtlb_tags[index])
 472             sync += tlb_pte_way.eq(dtlb_ptes[index])
 473
 474     def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, acc, acc_en, lru):
 475         """Generate TLB PLRUs
 476         """
 477         comb = m.d.comb
 478         sync = m.d.sync
 479
 480         with m.If(TLB_NUM_WAYS > 1):
 481             for i in range(TLB_SET_SIZE):
 482                 # TLB PLRU interface
 483                 tlb_plru        = PLRU(TLB_WAY_BITS)
 484                 setattr(m.submodules, "maybe_plru_%d" % i, tlb_plru)
 485                 tlb_plru_acc    = Signal(TLB_WAY_BITS)
 486                 tlb_plru_acc_en = Signal()
 487                 tlb_plru_out    = Signal(TLB_WAY_BITS)
 488
 489                 comb += tlb_plru.acc.eq(tlb_plru_acc)
 490                 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
 491                 comb += tlb_plru.lru.eq(tlb_plru_out)
 492
 493                 # PLRU interface
 494                 with m.If(r1.tlb_hit_index == i):
 495                     comb += tlb_plru.acc_en.eq(r1.tlb_hit)
 496                 with m.Else():
 497                     comb += tlb_plru.acc_en.eq(0)
 498                 comb += tlb_plru.acc.eq(r1.tlb_hit_way)
 499
 500                 comb += tlb_plru_victim[i].eq(tlb_plru.lru)
 501
 502     def tlb_search(self, m, tlb_req_index, r0, tlb_valid_way_ tlb_tag_way,
 503                    tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
 504
 505         comb = m.d.comb
 506         sync = m.d.sync
 507
 508         hitway = Signal(TLB_WAY_BITS)
 509         hit    = Signal()
 510         eatag  = Signal(TLB_EA_TAG_BITS)
 511
 512         TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
 513         comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
 514         comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
 515
 516         for i in range(TLB_NUM_WAYS):
 517             with m.If(tlb_valid_way(i)
 518                       & read_tlb_tag(i, tlb_tag_way) == eatag):
 519                 comb += hitway.eq(i)
 520                 comb += hit.eq(1)
 521
 522         comb += tlb_hit.eq(hit & r0_valid)
 523         comb += tlb_hit_way.eq(hitway)
 524
 525         with m.If(tlb_hit):
 526             comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
 527         with m.Else():
 528             comb += pte.eq(0)
 529         comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
 530         with m.If(r0.req.virt_mode):
 531             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 532                               r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
 533                               pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 534             comb += perm_attr.eq(extract_perm_attr(pte))
 535         with m.Else():
 536             comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
 537                               r0.rq.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
 538
 539             comb += perm_attr.reference.eq(1)
 540             comb += perm_attr.changed.eq(1)
 541             comb += perm_attr.priv.eq(1)
 542             comb += perm_attr.nocache.eq(0)
 543             comb += perm_attr.rd_perm.eq(1)
 544             comb += perm_attr.wr_perm.eq(1)
 545
 546     def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
 547                     tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
 548                     dtlb_tags, tlb_pte_way, dtlb_ptes, dtlb_valid_bits):
 549
 550         comb = m.d.comb
 551         sync = m.d.sync
 552
 553         tlbie    = Signal()
 554         tlbwe    = Signal()
 555         repl_way = Signal(TLB_WAY_BITS)
 556         eatag    = Signal(TLB_EA_TAG_BITS)
 557         tagset   = TLBWayTags()
 558         pteset   = TLBWayPtes()
 559
 560         comb += tlbie.eq(r0_valid & r0.tlbie)
 561         comb += tlbwe.eq(r0_valid & r0.tlbldoi)
 562
 563         with m.If(tlbie & r0.doall):
 564             # clear all valid bits at once
 565             for i in range(TLB_SET_SIZE):
 566                 sync += dtlb_valid_bits[i].eq(0)
 567
 568         with m.Elif(tlbie):
 569             with m.If(tlb_hit):
 570                 sync += dtlb_valid_bits[tlb_req_index][tlb_hit_way].eq(0)
 571         with m.Elif(tlbwe):
 572             with m.If(tlb_hit):
 573                 comb += repl_way.eq(tlb_hit_way)
 574             with m.Else():
 575                 comb += repl_way.eq(tlb_plru_victim[tlb_req_index])
 576             comb += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
 577             comb += tagset.eq(tlb_tag_way)
 578             sync += write_tlb_tag(repl_way, tagset, eatag)
 579             sync += dtlb_tags[tlb_req_index].eq(tagset)
 580             comb += pteset.eq(tlb_pte_way)
 581             sync += write_tlb_pte(repl_way, pteset, r0.req.data)
 582             sync += dtlb_ptes[tlb_req_index].eq(pteset)
 583             sync += dtlb_valid_bits[tlb_req_index][repl_way].eq(1)
 584
 585     def maybe_plrus(self, r1):
 586         """Generate PLRUs
 587         """
 588         comb = m.d.comb
 589         sync = m.d.sync
 590
 591         for i in range(NUM_LINES):
 592             # PLRU interface
 593             plru        = PLRU(TLB_WAY_BITS)
 594             setattr(m.submodules, "plru%d" % i, plru)
 595             plru_acc    = Signal(WAY_BITS)
 596             plru_acc_en = Signal()
 597             plru_out    = Signal(WAY_BITS)
 598
 599             comb += plru.acc.eq(plru_acc)
 600             comb += plru.acc_en.eq(plru_acc_en)
 601             comb += plru.lru.eq(plru_out)
 602
 603             with m.If(r1.hit_index == i):
 604                 comb += plru_acc_en.eq(r1.cache_hit)
 605
 606             comb += plru_acc.eq(r1.hit_way)
 607             comb += plru_victim[i].eq(plru_out)
 608
 609     def cache_tag_read(self, m, r0_stall, req_index, m_in, d_in,
 610                        cache_tag_set, cache_tags):
 611         """Cache tag RAM read port
 612         """
 613         comb = m.d.comb
 614         sync = m.d.sync
 615
 616         index = Signal(INDEX_BITS)
 617
 618         with m.If(r0_stall):
 619             comb += index.eq(req_index)
 620         with m.Elif(m_in.valid):
 621             comb += index.eq(get_index(m_in.addr))
 622         with m.Else():
 623             comb += index.eq(get_index(d_in.addr))
 624         sync += cache_tag_set.eq(cache_tags[index])
 625
 626     def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
 627                        r0_valid, r1, cache_valid_bits, replace_way,
 628                        use_forward1_next, use_forward2_next,
 629                        req_hit_way, plru_victim, rc_ok, perm_attr,
 630                        valid_ra, perm_ok, access_ok, req_op, req_ok,
 631                        r0_stall, m_in, early_req_row, d_in):
 632         """Cache request parsing and hit detection
 633         """
 634
 635         comb = m.d.comb
 636         sync = m.d.sync
 637
 638         is_hit      = Signal()
 639         hit_way     = Signal(WAY_BITS)
 640         op          = Op()
 641         opsel       = Signal(3)
 642         go          = Signal()
 643         nc          = Signal()
 644         s_hit       = Signal()
 645         s_tag       = Signal(TAG_BITS)
 646         s_pte       = Signal(TLB_PTE_BITS)
 647         s_ra        = Signal(REAL_ADDR_BITS)
 648         hit_set     = Signal(TLB_NUM_WAYS)
 649         hit_way_set = HitWaySet()
 650         rel_matches = Signal(TLB_NUM_WAYS)
 651         rel_match   = Signal()
 652
 653         # Extract line, row and tag from request
 654         comb += req_index.eq(get_index(r0.req.addr))
 655         comb += req_row.eq(get_row(r0.req.addr))
 656         comb += req_tag.eq(get_tag(ra))
 657
 658         comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
 659
 660         # Test if pending request is a hit on any way
 661         # In order to make timing in virtual mode,
 662         # when we are using the TLB, we compare each
 663         # way with each of the real addresses from each way of
 664         # the TLB, and then decide later which match to use.
 665
 666         with m.If(r0.req.virt_mode):
 667             comb += rel_matches.eq(0)
 668             for j in range(TLB_NUM_WAYS):
 669                 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
 670                 comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
 671                                     s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
 672                 comb += s_tag.eq(get_tag(s_ra))
 673
 674                 for i in range(NUM_WAYS):
 675                     with m.If(go & cache_valid_bits[req_index][i] &
 676                               read_tag(i, cache_tag_set) == s_tag
 677                               & tlb_valid_way[j]):
 678                         comb += hit_way_set[j].eq(i)
 679                         comb += s_hit.eq(1)
 680                 comb += hit_set[j].eq(s_hit)
 681                 with m.If(s_tag == r1.reload_tag):
 682                     comb += rel_matches[j].eq(1)
 683             with m.If(tlb_hit):
 684                 comb += is_hit.eq(hit_set[tlb_hit_way])
 685                 comb += hit_way.eq(hit_way_set[tlb_hit_way])
 686                 comb += rel_match.eq(rel_matches[tlb_hit_way])
 687         with m.Else():
 688             comb += s_tag.eq(get_tag(r0.req.addr))
 689             for i in range(NUM_WAYS):
 690                 with m.If(go & cache_valid_bits[req_index][i] &
 691                           read_tag(i, cache_tag_set) == s_tag):
 692                     comb += hit_way.eq(i)
 693                     comb += is_hit.eq(1)
 694             with m.If(s_tag == r1.reload_tag):
 695                 comb += rel_match.eq(1)
 696         comb += req_same_tag.eq(rel_match)
 697
 698         # See if the request matches the line currently being reloaded
 699         with m.If((r1.state == State.RELOAD_WAIT_ACK) &
 700                   (req_index == r1.store_index) & rel_match):
 701             # For a store, consider this a hit even if the row isn't
 702             # valid since it will be by the time we perform the store.
 703             # For a load, check the appropriate row valid bit.
 704             valid = r1.rows_valid[req_row % ROW_PER_LINE]
 705             comb += is_hit.eq(~r0.req.load | valid)
 706             comb += hit_way.eq(replace_way)
 707
 708         # Whether to use forwarded data for a load or not
 709         comb += use_forward1_next.eq(0)
 710         with m.If((get_row(r1.req.real_addr) == req_row)
 711                   & (r1.req.hit_way == hit_way))
 712             # Only need to consider r1.write_bram here, since if we
 713             # are writing refill data here, then we don't have a
 714             # cache hit this cycle on the line being refilled.
 715             # (There is the possibility that the load following the
 716             # load miss that started the refill could be to the old
 717             # contents of the victim line, since it is a couple of
 718             # cycles after the refill starts before we see the updated
 719             # cache tag. In that case we don't use the bypass.)
 720             comb += use_forward1_next.eq(r1.write_bram)
 721         comb += use_forward2_next.eq(0)
 722         with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
 723             comb += use_forward2_next.eq(r1.forward_valid1)
 724
 725         # The way that matched on a hit
 726         comb += req_hit_way.eq(hit_way)
 727
 728         # The way to replace on a miss
 729         with m.If(r1.write_tag):
 730             replace_way.eq(plru_victim[r1.store_index])
 731         with m.Else():
 732             comb += replace_way.eq(r1.store_way)
 733
 734         # work out whether we have permission for this access
 735         # NB we don't yet implement AMR, thus no KUAP
 736         comb += rc_ok.eq(perm_attr.reference
 737                          & (r0.req.load | perm_attr.changed)
 738                 )
 739         comb += perm_ok.eq((r0.req.prive_mode | ~perm_attr.priv)
 740                            & perm_attr.wr_perm
 741                            | (r0.req.load & perm_attr.rd_perm)
 742                           )
 743         comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
 744         # Combine the request and cache hit status to decide what
 745         # operation needs to be done
 746         comb += nc.eq(r0.req.nc | perm_attr.nocache)
 747         comb += op.eq(Op.OP_NONE)
 748         with m.If(go):
 749             with m.If(~access_ok):
 750                 comb += op.eq(Op.OP_BAD)
 751             with m.Elif(cancel_store):
 752                 comb += op.eq(Op.OP_STCX_FAIL)
 753             with m.Else():
 754                 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
 755                 with m.Switch(opsel):
 756                     with m.Case(Const(0b101, 3)):
 757                         comb += op.eq(Op.OP_LOAD_HIT)
 758                     with m.Case(Cosnt(0b100, 3)):
 759                         comb += op.eq(Op.OP_LOAD_MISS)
 760                     with m.Case(Const(0b110, 3)):
 761                         comb += op.eq(Op.OP_LOAD_NC)
 762                     with m.Case(Const(0b001, 3)):
 763                         comb += op.eq(Op.OP_STORE_HIT)
 764                     with m.Case(Const(0b000, 3)):
 765                         comb += op.eq(Op.OP_STORE_MISS)
 766                     with m.Case(Const(0b010, 3)):
 767                         comb += op.eq(Op.OP_STORE_MISS)
 768                     with m.Case(Const(0b011, 3)):
 769                         comb += op.eq(Op.OP_BAD)
 770                     with m.Case(Const(0b111, 3)):
 771                         comb += op.eq(Op.OP_BAD)
 772                     with m.Default():
 773                         comb += op.eq(Op.OP_NONE)
 774         comb += req_op.eq(op)
 775         comb += req_go.eq(go)
 776
 777         # Version of the row number that is valid one cycle earlier
 778         # in the cases where we need to read the cache data BRAM.
 779         # If we're stalling then we need to keep reading the last
 780         # row requested.
 781         with m.If(~r0_stall):
 782             with m.If(m_in.valid):
 783                 comb += early_req_row.eq(get_row(m_in.addr))
 784             with m.Else():
 785                 comb += early_req_row.eq(get_row(d_in.addr))
 786         with m.Else():
 787             comb += early_req_row.eq(req_row)
 788
 789     def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
 790                          r0_valid, r0, reservation):
 791         """Handle load-with-reservation and store-conditional instructions
 792         """
 793         comb = m.d.comb
 794         sync = m.d.sync
 795
 796         with m.If(r0_valid & r0.req.reserve):
 797
 798             # XXX generate alignment interrupt if address
 799             # is not aligned XXX or if r0.req.nc = '1'
 800             with m.If(r0.req.load):
 801                 comb += set_rsrv(1) # load with reservation
 802             with m.Else():
 803                 comb += clear_rsrv.eq(1) # store conditional
 804                 with m.If(~reservation.valid | r0.req.addr[LINE_OFF_BITS:64]):
 805                     comb += cancel_store.eq(1)
 806
 807     def reservation_reg(self, m, r0_valid, access_ok, clear_rsrv,
 808                         reservation, r0):
 809
 810         comb = m.d.comb
 811         sync = m.d.sync
 812
 813         with m.If(r0_valid & access_ok):
 814             with m.If(clear_rsrv):
 815                 sync += reservation.valid.eq(0)
 816             with m.Elif(set_rsrv):
 817                 sync += reservation.valid.eq(1)
 818                 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
 819
 820     def writeback_control(self, m, r1, cache_out, d_out, m_out):
 821         """Return data for loads & completion control logic
 822         """
 823         comb = m.d.comb
 824         sync = m.d.sync
 825
 826         data_out = Signal(64)
 827         data_fwd = Signal(64)
 828
 829         # Use the bypass if are reading the row that was
 830         # written 1 or 2 cycles ago, including for the
 831         # slow_valid = 1 case (i.e. completing a load
 832         # miss or a non-cacheable load).
 833         with m.If(r1.use_forward1):
 834             comb += data_fwd.eq(r1.forward_data1)
 835         with m.Else():
 836             comb += data_fwd.eq(r1.forward_data2)
 837
 838         comb += data_out.eq(cache_out[r1.hit_way])
 839
 840         for i in range(8):
 841             with m.If(r1.forward_sel[i]):
 842                 dsel = data_fwd.word_select(i, 8)
 843                 comb += data_out.word_select(i, 8).eq(dsel)
 844
 845         comb += d_out.valid.eq(r1.ls_valid)
 846         comb += d_out.data.eq(data_out)
 847         comb += d_out.store_done.eq(~r1.stcx_fail)
 848         comb += d_out.error.eq(r1.ls_error)
 849         comb += d_out.cache_paradox.eq(r1.cache_paradox)
 850
 851         # Outputs to MMU
 852         comb += m_out.done.eq(r1.mmu_done)
 853         comb += m_out.err.eq(r1.mmu_error)
 854         comb += m_out.data.eq(data_out)
 855
 856         # We have a valid load or store hit or we just completed
 857         # a slow op such as a load miss, a NC load or a store
 858         #
 859         # Note: the load hit is delayed by one cycle. However it
 860         # can still not collide with r.slow_valid (well unless I
 861         # miscalculated) because slow_valid can only be set on a
 862         # subsequent request and not on its first cycle (the state
 863         # machine must have advanced), which makes slow_valid
 864         # at least 2 cycles from the previous hit_load_valid.
 865
 866         # Sanity: Only one of these must be set in any given cycle
 867
 868         if False: # TODO: need Display to get this to work
 869             assert (r1.slow_valid & r1.stcx_fail) != 1 "unexpected" \
 870              "slow_valid collision with stcx_fail -!- severity FAILURE"
 871
 872             assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1
 873              "unexpected hit_load_delayed collision with slow_valid -!-" \
 874              "severity FAILURE"
 875
 876         with m.If(~r1._mmu_req):
 877             # Request came from loadstore1...
 878             # Load hit case is the standard path
 879             with m.If(r1.hit_load_valid):
 880                 #Display(f"completing load hit data={data_out}")
 881                 pass
 882
 883             # error cases complete without stalling
 884             with m.If(r1.ls_error):
 885                 # Display("completing ld/st with error")
 886                 pass
 887
 888             # Slow ops (load miss, NC, stores)
 889             with m.If(r1.slow_valid):
 890                 #Display(f"completing store or load miss data={data_out}")
 891                 pass
 892
 893         with m.Else():
 894             # Request came from MMU
 895             with m.If(r1.hit_load_valid):
 896                 # Display(f"completing load hit to MMU, data={m_out.data}")
 897                 pass
 898             # error cases complete without stalling
 899             with m.If(r1.mmu_error):
 900                 #Display("combpleting MMU ld with error")
 901                 pass
 902
 903             # Slow ops (i.e. load miss)
 904             with m.If(r1.slow_valid):
 905                 #Display("completing MMU load miss, data={m_out.data}")
 906                 pass
 907
 908     def rams(self, m):
 909         """rams
 910         Generate a cache RAM for each way. This handles the normal
 911         reads, writes from reloads and the special store-hit update
 912         path as well.
 913
 914         Note: the BRAMs have an extra read buffer, meaning the output
 915         is pipelined an extra cycle. This differs from the
 916         icache. The writeback logic needs to take that into
 917         account by using 1-cycle delayed signals for load hits.
 918         """
 919         comb = m.d.comb
 920
 921         for i in range(NUM_WAYS):
 922             do_read  = Signal()
 923             rd_addr  = Signal(ROW_BITS)
 924             do_write = Signal()
 925             wr_addr  = Signal(ROW_BITS)
 926             wr_data  = Signal(WB_DATA_BITS)
 927             wr_sel   = Signal(ROW_SIZE)
 928             wr_sel_m = Signal(ROW_SIZE)
 929             _d_out   = Signal(WB_DATA_BITS)
 930
 931             way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
 932             setattr(m.submodules, "cacheram_%d" % i, way)
 933
 934             comb += way.rd_en.eq(do_read)
 935             comb += way.rd_addr.eq(rd_addr)
 936             comb += _d_out.eq(way.rd_data)
 937             comb += way.wr_sel.eq(wr_sel_m)
 938             comb += way.wr_addr.eq(wr_addr)
 939             comb += way.wr_data.eq(wr_data)
 940
 941             # Cache hit reads
 942             comb += do_read.eq(1)
 943             comb += rd_addr.eq(early_req_row)
 944             comb += cache_out[i].eq(_d_out)
 945
 946             # Write mux:
 947             #
 948             # Defaults to wishbone read responses (cache refill)
 949             #
 950             # For timing, the mux on wr_data/sel/addr is not
 951             # dependent on anything other than the current state.
 952
 953             with m.If(r1.write_bram):
 954                 # Write store data to BRAM.  This happens one
 955                 # cycle after the store is in r0.
 956                 comb += wr_data.eq(r1.req.data)
 957                 comb += wr_sel.eq(r1.req.byte_sel)
 958                 comb += wr_addr.eq(get_row(r1.req.real_addr))
 959
 960                 with m.If(i == r1.req.hit_way):
 961                     comb += do_write.eq(1)
 962             with m.Else():
 963                 # Otherwise, we might be doing a reload or a DCBZ
 964                 with m.If(r1.dcbz):
 965                     comb += wr_data.eq(0)
 966                 with m.Else():
 967                     comb += wr_data.eq(wishbone_in.dat)
 968                 comb += wr_addr.eq(r1.store_row)
 969                 comb += wr_sel.eq(~0) # all 1s
 970
 971             with m.If((r1.state == State.RELOAD_WAIT_ACK)
 972                       & wishbone_in.ack & (relpace_way == i)):
 973                 comb += do_write.eq(1)
 974
 975                 # Mask write selects with do_write since BRAM
 976                 # doesn't have a global write-enable
 977                 with m.If(do_write):
 978                     comb += wr_sel_m.eq(wr_sel)
 979
 980     # Cache hit synchronous machine for the easy case.
 981     # This handles load hits.
 982     # It also handles error cases (TLB miss, cache paradox)
 983     def dcache_fast_hit(self, m, req_op, r0_valid, r1):
 984
 985         comb = m.d.comb
 986         sync = m.d.sync
 987
 988         with m.If(req_op != Op.OP_NONE):
 989             #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
 990             #      f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
 991             #     )
 992             pass
 993
 994         with m.If(r0_valid):
 995             sync += r1.mmu_req.eq(r0.mmu_req)
 996
 997         # Fast path for load/store hits.
 998         # Set signals for the writeback controls.
 999         sync += r1.hit_way.eq(req_hit_way)
1000         sync += r1.hit_index.eq(req_index)
1001
1002         with m.If(req_op == Op.OP_LOAD_HIT):
1003             sync += r1.hit_load_valid.eq(1)
1004         with m.Else():
1005             sync += r1.hit_load_valid.eq(0)
1006
1007         with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1008             sync += r1.cache_hit.eq(1)
1009         with m.Else():
1010             sync += r1.cache_hit.eq(0)
1011
1012         with m.If(req_op == Op.OP_BAD):
1013             # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1014             #      f"rc_ok={rc_ok} perm_ok={perm_ok}"
1015             sync += r1.ls_error.eq(~r0.mmu_req)
1016             sync += r1.mmu_error.eq(r0.mmu_req)
1017             sync += r1.cache_paradox.eq(access_ok)
1018
1019             with m.Else():
1020                 sync += r1.ls_error.eq(0)
1021                 sync += r1.mmu_error.eq(0)
1022                 sync += r1.cache_paradox.eq(0)
1023
1024         with m.If(req_op == Op.OP_STCX_FAIL):
1025             r1.stcx_fail.eq(1)
1026         with m.Else():
1027             sync += r1.stcx_fail.eq(0)
1028
1029         # Record TLB hit information for updating TLB PLRU
1030         sync += r1.tlb_hit.eq(tlb_hit)
1031         sync += r1.tlb_hit_way.eq(tlb_hit_way)
1032         sync += r1.tlb_hit_index.eq(tlb_req_index)
1033
1034     # Memory accesses are handled by this state machine:
1035     #
1036     #   * Cache load miss/reload (in conjunction with "rams")
1037     #   * Load hits for non-cachable forms
1038     #   * Stores (the collision case is handled in "rams")
1039     #
1040     # All wishbone requests generation is done here.
1041     # This machine operates at stage 1.
1042     def dcache_slow(self, m, r1, use_forward1_next, cache_valid_bits, r0,
1043                     r0_valid, req_op, cache_tag, req_go, ra, wb_in):
1044
1045         comb = m.d.comb
1046         sync = m.d.sync
1047
1048         req         = MemAccessRequest()
1049         acks        = Signal(3)
1050         adjust_acks = Signal(3)
1051
1052         sync += r1.use_forward1.eq(use_forward1_next)
1053         sync += r1.forward_sel.eq(0)
1054
1055         with m.If(use_forward1_next):
1056             sync += r1.forward_sel.eq(r1.req.byte_sel)
1057         with m.Elif(use_forward2_next):
1058             sync += r1.forward_sel.eq(r1.forward_sel1)
1059
1060         sync += r1.forward_data2.eq(r1.forward_data1)
1061         with m.If(r1.write_bram):
1062             sync += r1.forward_data1.eq(r1.req.data)
1063             sync += r1.forward_sel1.eq(r1.req.byte_sel)
1064             sync += r1.forward_way1.eq(r1.req.hit_way)
1065             sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1066             sync += r1.forward_valid1.eq(1)
1067         with m.Else():
1068             with m.If(r1.bcbz):
1069                 sync += r1.forward_data1.eq(0)
1070             with m.Else():
1071                 sync += r1.forward_data1.eq(wb_in.dat)
1072             sync += r1.forward_sel1.eq(~0) # all 1s
1073             sync += r1.forward_way1.eq(replace_way)
1074             sync += r1.forward_row1.eq(r1.store_row)
1075             sync += r1.forward_valid1.eq(0)
1076
1077         # One cycle pulses reset
1078         sync += r1.slow_valid.eq(0)
1079         sync += r1.write_bram.eq(0)
1080         sync += r1.inc_acks.eq(0)
1081         sync += r1.dec_acks.eq(0)
1082
1083         sync += r1.ls_valid.eq(0)
1084         # complete tlbies and TLB loads in the third cycle
1085         sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1086
1087         with m.If((req_op == Op.OP_LOAD_HIT)
1088                   | (req_op == Op.OP_STCX_FAIL)):
1089             with m.If(~r0.mmu_req):
1090                 sync += r1.ls_valid.eq(1)
1091             with m.Else():
1092                 sync += r1.mmu_done.eq(1)
1093
1094         with m.If(r1.write_tag):
1095             # Store new tag in selected way
1096             for i in range(NUM_WAYS):
1097                 with m.If(i == replace_way):
1098                     idx = r1.store_index
1099                     trange = range(i * TAG_WIDTH, (i+1) * TAG_WIDTH)
1100                     sync += cache_tag[idx][trange].eq(r1.reload_tag)
1101             sync += r1.store_way.eq(replace_way)
1102             sync += r1.write_tag.eq(0)
1103
1104         # Take request from r1.req if there is one there,
1105         # else from req_op, ra, etc.
1106         with m.If(r1.full)
1107             comb += req.eq(r1.req)
1108         with m.Else():
1109             comb += req.op.eq(req_op)
1110             comb += req.valid.eq(req_go)
1111             comb += req.mmu_req.eq(r0.mmu_req)
1112             comb += req.dcbz.eq(r0.req.dcbz)
1113             comb += req.real_addr.eq(ra)
1114
1115             with m.If(~r0.req.dcbz):
1116                 comb += req.data.eq(r0.req.data)
1117             with m.Else():
1118                 comb += req.data.eq(0)
1119
1120             # Select all bytes for dcbz
1121             # and for cacheable loads
1122             with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc):
1123                 comb += req.byte_sel.eq(~0) # all 1s
1124             with m.Else():
1125                 comb += req.byte_sel.eq(r0.req.byte_sel)
1126             comb += req.hit_way.eq(req_hit_way)
1127             comb += req.same_tag.eq(req_same_tag)
1128
1129             # Store the incoming request from r0,
1130             # if it is a slow request
1131             # Note that r1.full = 1 implies req_op = OP_NONE
1132             with m.If((req_op == Op.OP_LOAD_MISS)
1133                       | (req_op == Op.OP_LOAD_NC)
1134                       | (req_op == Op.OP_STORE_MISS)
1135                       | (req_op == Op.OP_STORE_HIT)):
1136                 sync += r1.req(req)
1137                 sync += r1.full.eq(1)
1138
1139         # Main state machine
1140         with m.Switch(r1.state):
1141
1142             with m.Case(State.IDLE)
1143 # XXX check 'left downto.  probably means len(r1.wb.adr)
1144 #                     r1.wb.adr <= req.real_addr(
1145 #                                   r1.wb.adr'left downto 0
1146 #                                  );
1147                 sync += r1.wb.adr.eq(req.real_addr[0:r1.wb.adr])
1148                 sync += r1.wb.sel.eq(req.byte_sel)
1149                 sync += r1.wb.dat.eq(req.data)
1150                 sync += r1.dcbz.eq(req.dcbz)
1151
1152                 # Keep track of our index and way
1153                 # for subsequent stores.
1154                 sync += r1.store_index.eq(get_index(req.real_addr))
1155                 sync += r1.store_row.eq(get_row(req.real_addr))
1156                 sync += r1.end_row_ix.eq(
1157                          get_row_of_line(get_row(req.real_addr))
1158                         )
1159                 sync += r1.reload_tag.eq(get_tag(req.real_addr))
1160                 sync += r1.req.same_tag.eq(1)
1161
1162                 with m.If(req.op == Op.OP_STORE_HIT):
1163                     sync += r1.store_way.eq(req.hit_way)
1164
1165                 # Reset per-row valid bits,
1166                 # ready for handling OP_LOAD_MISS
1167                 for i in range(ROW_PER_LINE):
1168                     sync += r1.rows_valid[i].eq(0)
1169
1170                 with m.Switch(req.op):
1171                     with m.Case(Op.OP_LOAD_HIT):
1172                         # stay in IDLE state
1173                         pass
1174
1175                     with m.Case(Op.OP_LOAD_MISS):
1176                         #Display(f"cache miss real addr:" \
1177                         #      f"{req_real_addr}" \
1178                         #      f" idx:{get_index(req_real_addr)}" \
1179                         #      f" tag:{get_tag(req.real_addr)}")
1180                         pass
1181
1182                         # Start the wishbone cycle
1183                         sync += r1.wb.we.eq(0)
1184                         sync += r1.wb.cyc.eq(1)
1185                         sync += r1.wb.stb.eq(1)
1186
1187                         # Track that we had one request sent
1188                         sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1189                         sync += r1.write_tag.eq(1)
1190
1191                     with m.Case(Op.OP_LOAD_NC):
1192                         sync += r1.wb.cyc.eq(1)
1193                         sync += r1.wb.stb.eq(1)
1194                         sync += r1.wb.we.eq(0)
1195                         sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1196
1197                     with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
1198                         with m.If(~req.bcbz):
1199                             sync += r1.state.eq(State.STORE_WAIT_ACK)
1200                             sync += r1.acks_pending.eq(1)
1201                             sync += r1.full.eq(0)
1202                             sync += r1.slow_valid.eq(1)
1203
1204                             with m.If(~req.mmu_req):
1205                                 sync += r1.ls_valid.eq(1)
1206                             with m.Else():
1207                                 sync += r1.mmu_done.eq(1)
1208
1209                             with m.If(req.op == Op.OP_STORE_HIT):
1210                                 sync += r1.write_bram.eq(1)
1211                         with m.Else():
1212                             sync += r1.state.eq(Op.RELOAD_WAIT_ACK)
1213
1214                             with m.If(req.op == Op.OP_STORE_MISS):
1215                                 sync += r1.write_tag.eq(1)
1216
1217                         sync += r1.wb.we.eq(1)
1218                         sync += r1.wb.cyc.eq(1)
1219                         sync += r1.wb.stb.eq(1)
1220
1221                     # OP_NONE and OP_BAD do nothing
1222                     # OP_BAD & OP_STCX_FAIL were
1223                     # handled above already
1224                     with m.Case(Op.OP_NONE):
1225                         pass
1226                     with m.Case(OP_BAD):
1227                         pass
1228                     with m.Case(OP_STCX_FAIL):
1229                         pass
1230
1231             with m.Case(State.RELOAD_WAIT_ACK):
1232                 # Requests are all sent if stb is 0
1233                 comb += stbs_done.eq(~r1.wb.stb)
1234
1235                 with m.If(~wb_in.stall & ~stbs_done):
1236                     # That was the last word?
1237                     # We are done sending.
1238                     # Clear stb and set stbs_done
1239                     # so we can handle an eventual
1240                     # last ack on the same cycle.
1241                     with m.If(is_last_row_addr(
1242                               r1.wb.adr, r1.end_row_ix)):
1243                         sync += r1.wb.stb.eq(0)
1244                         comb += stbs_done.eq(0)
1245
1246                     # Calculate the next row address
1247                     sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))
1248
1249                 # Incoming acks processing
1250                 sync += r1.forward_valid1.eq(wb_in.ack)
1251                 with m.If(wb_in.ack):
1252                     # XXX needs an Array bit-accessor here
1253                     sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
1254
1255                     # If this is the data we were looking for,
1256                     # we can complete the request next cycle.
1257                     # Compare the whole address in case the
1258                     # request in r1.req is not the one that
1259                     # started this refill.
1260                     with m.If(r1.full & r1.req.same_tag &
1261                               ((r1.dcbz & r1.req.dcbz) |
1262                                (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1263                                 (r1.store_row == get_row(r1.req.real_addr))):
1264                         sync += r1.full.eq(0)
1265                         sync += r1.slow_valid.eq(1)
1266                             with m.If(~r1.mmu_req):
1267                                 sync += r1.ls_valid.eq(1)
1268                             with m.Else():
1269                                 sync += r1.mmu_done.eq(1)
1270                         sync += r1.forward_sel.eq(~0) # all 1s
1271                         sync += r1.use_forward1.eq(1)
1272
1273                     # Check for completion
1274                     with m.If(stbs_done & is_last_row(r1.store_row,
1275                                                       r1.end_row_ix)):
1276                         # Complete wishbone cycle
1277                         sync += r1.wb.cyc.eq(0)
1278
1279                         # Cache line is now valid
1280                         cv = cache_valid_bits[r1.store_index]
1281                         sync += cv[r1.store_way].eq(1)
1282                         sync += r1.state.eq(State.IDLE)
1283
1284                     # Increment store row counter
1285                     sync += r1.store_row.eq(next_row(r1.store_row))
1286
1287             with m.Case(State.STORE_WAIT_ACK):
1288                 comb += stbs_done.eq(~r1.wb.stb)
1289                 comb += acks.eq(r1.acks_pending)
1290
1291                 with m.If(r1.inc_acks != r1.dec_acks):
1292                     with m.If(r1.inc_acks):
1293                         comb += adjust_acks.eq(acks + 1)
1294                     with m.Else():
1295                         comb += adjust_acks.eq(acks - 1)
1296                 with m.Else():
1297                     comb += adjust_acks.eq(acks)
1298
1299                 sync += r1.acks_pending.eq(adjust_acks)
1300
1301                 # Clear stb when slave accepted request
1302                 with m.If(~wb_in.stall):
1303                     # See if there is another store waiting
1304                     # to be done which is in the same real page.
1305                     with m.If(req.valid):
1306                         ra = req.real_addr[0:SET_SIZE_BITS]
1307                         sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
1308                         sync += r1.wb.dat.eq(req.data)
1309                         sync += r1.wb.sel.eq(req.byte_sel)
1310
1311                     with m.Elif((adjust_acks < 7) & req.same_tag &
1312                                 ((req.op == Op.Op_STORE_MISS)
1313                                  | (req.op == Op.OP_SOTRE_HIT))):
1314                         sync += r1.wb.stb.eq(1)
1315                         comb += stbs_done.eq(0)
1316
1317                         with m.If(req.op == Op.OP_STORE_HIT):
1318                             sync += r1.write_bram.eq(1)
1319                         sync += r1.full.eq(0)
1320                         sync += r1.slow_valid.eq(1)
1321
1322                         # Store requests never come from the MMU
1323                         sync += r1.ls_valid.eq(1)
1324                         comb += stbs_done.eq(0)
1325                         sync += r1.inc_acks.eq(1)
1326                     with m.Else():
1327                         sync += r1.wb.stb.eq(0)
1328                         comb += stbs_done.eq(1)
1329
1330                 # Got ack ? See if complete.
1331                 with m.If(wb_in.ack):
1332                     with m.If(stbs_done & (adjust_acks == 1))
1333                         sync += r1.state.eq(State.IDLE)
1334                         sync += r1.wb.cyc.eq(0)
1335                         sync += r1.wb.stb.eq(0)
1336                     sync += r1.dec_acks.eq(1)
1337
1338             with m.Case(State.NC_LOAD_WAIT_ACK):
1339                 # Clear stb when slave accepted request
1340                 with m.If(~wb_in.stall):
1341                     sync += r1.wb.stb.eq(0)
1342
1343                 # Got ack ? complete.
1344                 with m.If(wb_in.ack):
1345                     sync += r1.state.eq(State.IDLE)
1346                     sync += r1.full.eq(0)
1347                     sync += r1.slow_valid.eq(1)
1348
1349                     with m.If(~r1.mmu_req):
1350                         sync += r1.ls_valid.eq(1)
1351                     with m.Else():
1352                         sync += r1.mmu_done.eq(1)
1353
1354                     sync += r1.forward_sel.eq(~0) # all 1s
1355                     sync += r1.use_forward1.eq(1)
1356                     sync += r1.wb.cyc.eq(0)
1357                     sync += r1.wb.stb.eq(0)
1358
1359     def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out,
1360                    d_out, wb_in, log_out):
1361
1362         sync = m.d.sync
1363
1364         sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1365                                stall_out, req_op[:3], d_out.valid, d_out.error,
1366                                r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1367                                r1.wb.adr[3:6]))
1368
1369     def elaborate(self, platform):
1370
1371         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1372         cache_tags       = CacheTagArray()
1373         cache_tag_set    = Signal(TAG_RAM_WIDTH)
1374         cache_valid_bits = CacheValidBitsArray()
1375
1376         # TODO attribute ram_style : string;
1377         # TODO attribute ram_style of cache_tags : signal is "distributed";
1378
1379         """note: these are passed to nmigen.hdl.Memory as "attributes".
1380            don't know how, just that they are.
1381         """
1382         dtlb_valid_bits = TLBValidBitsArray()
1383         dtlb_tags       = TLBTagsArray()
1384         dtlb_ptes       = TLBPtesArray()
1385         # TODO attribute ram_style of
1386         #  dtlb_tags : signal is "distributed";
1387         # TODO attribute ram_style of
1388         #  dtlb_ptes : signal is "distributed";
1389
1390         r0      = RegStage0()
1391         r0_full = Signal()
1392
1393         r1 = RegStage1()
1394
1395         reservation = Reservation()
1396
1397         # Async signals on incoming request
1398         req_index    = Signal(INDEX_BITS)
1399         req_row      = Signal(ROW_BITS)
1400         req_hit_way  = Signal(WAY_BITS)
1401         req_tag      = Signal(TAG_BITS)
1402         req_op       = Op()
1403         req_data     = Signal(64)
1404         req_same_tag = Signal()
1405         req_go       = Signal()
1406
1407         early_req_row     = Signal(ROW_BITS)
1408
1409         cancel_store      = Signal()
1410         set_rsrv          = Signal()
1411         clear_rsrv        = Signal()
1412
1413         r0_valid          = Signal()
1414         r0_stall          = Signal()
1415
1416         use_forward1_next = Signal()
1417         use_forward2_next = Signal()
1418
1419         cache_out         = CacheRamOut()
1420
1421         plru_victim       = PLRUOut()
1422         replace_way       = Signal(WAY_BITS)
1423
1424         # Wishbone read/write/cache write formatting signals
1425         bus_sel           = Signal(8)
1426
1427         # TLB signals
1428         tlb_tag_way   = Signal(TLB_TAG_WAY_BITS)
1429         tlb_pte_way   = Signal(TLB_PTE_WAY_BITS)
1430         tlb_valid_way = Signal(TLB_NUM_WAYS)
1431         tlb_req_index = Signal(TLB_SET_BITS)
1432         tlb_hit       = Signal()
1433         tlb_hit_way   = Signal(TLB_WAY_BITS)
1434         pte           = Signal(TLB_PTE_BITS)
1435         ra            = Signal(REAL_ADDR_BITS)
1436         valid_ra      = Signal()
1437         perm_attr     = PermAttr()
1438         rc_ok         = Signal()
1439         perm_ok       = Signal()
1440         access_ok     = Signal()
1441
1442         tlb_plru_victim = TLBPLRUOut()
1443
1444         # we don't yet handle collisions between loadstore1 requests
1445         # and MMU requests
1446         comb += m_out.stall.eq(0)
1447
1448         # Hold off the request in r0 when r1 has an uncompleted request
1449         comb += r0_stall.eq(r0_full & r1.full)
1450         comb += r0_valid.eq(r0_full & ~r1.full)
1451         comb += stall_out.eq(r0_stall)
1452
1453         # Wire up wishbone request latch out of stage 1
1454         comb += wishbone_out.eq(r1.wb)
1455
1456
1457
1458 # dcache_tb.vhdl
1459 #
1460 # entity dcache_tb is
1461 # end dcache_tb;
1462 #
1463 # architecture behave of dcache_tb is
1464 #     signal clk          : std_ulogic;
1465 #     signal rst          : std_ulogic;
1466 #
1467 #     signal d_in         : Loadstore1ToDcacheType;
1468 #     signal d_out        : DcacheToLoadstore1Type;
1469 #
1470 #     signal m_in         : MmuToDcacheType;
1471 #     signal m_out        : DcacheToMmuType;
1472 #
1473 #     signal wb_bram_in   : wishbone_master_out;
1474 #     signal wb_bram_out  : wishbone_slave_out;
1475 #
1476 #     constant clk_period : time := 10 ns;
1477 # begin
1478 #     dcache0: entity work.dcache
1479 #         generic map(
1480 #
1481 #             LINE_SIZE => 64,
1482 #             NUM_LINES => 4
1483 #             )
1484 #         port map(
1485 #             clk => clk,
1486 #             rst => rst,
1487 #             d_in => d_in,
1488 #             d_out => d_out,
1489 #             m_in => m_in,
1490 #             m_out => m_out,
1491 #             wishbone_out => wb_bram_in,
1492 #             wishbone_in => wb_bram_out
1493 #             );
1494 #
1495 #     -- BRAM Memory slave
1496 #     bram0: entity work.wishbone_bram_wrapper
1497 #         generic map(
1498 #             MEMORY_SIZE   => 1024,
1499 #             RAM_INIT_FILE => "icache_test.bin"
1500 #             )
1501 #         port map(
1502 #             clk => clk,
1503 #             rst => rst,
1504 #             wishbone_in => wb_bram_in,
1505 #             wishbone_out => wb_bram_out
1506 #             );
1507 #
1508 #     clk_process: process
1509 #     begin
1510 #         clk <= '0';
1511 #         wait for clk_period/2;
1512 #         clk <= '1';
1513 #         wait for clk_period/2;
1514 #     end process;
1515 #
1516 #     rst_process: process
1517 #     begin
1518 #         rst <= '1';
1519 #         wait for 2*clk_period;
1520 #         rst <= '0';
1521 #         wait;
1522 #     end process;
1523 #
1524 #     stim: process
1525 #     begin
1526 #     -- Clear stuff
1527 #     d_in.valid <= '0';
1528 #     d_in.load <= '0';
1529 #     d_in.nc <= '0';
1530 #     d_in.addr <= (others => '0');
1531 #     d_in.data <= (others => '0');
1532 #         m_in.valid <= '0';
1533 #         m_in.addr <= (others => '0');
1534 #         m_in.pte <= (others => '0');
1535 #
1536 #         wait for 4*clk_period;
1537 #     wait until rising_edge(clk);
1538 #
1539 #     -- Cacheable read of address 4
1540 #     d_in.load <= '1';
1541 #     d_in.nc <= '0';
1542 #         d_in.addr <= x"0000000000000004";
1543 #         d_in.valid <= '1';
1544 #     wait until rising_edge(clk);
1545 #         d_in.valid <= '0';
1546 #
1547 #     wait until rising_edge(clk) and d_out.valid = '1';
1548 #         assert d_out.data = x"0000000100000000"
1549 #         report "data @" & to_hstring(d_in.addr) &
1550 #         "=" & to_hstring(d_out.data) &
1551 #         " expected 0000000100000000"
1552 #         severity failure;
1553 # --      wait for clk_period;
1554 #
1555 #     -- Cacheable read of address 30
1556 #     d_in.load <= '1';
1557 #     d_in.nc <= '0';
1558 #         d_in.addr <= x"0000000000000030";
1559 #         d_in.valid <= '1';
1560 #     wait until rising_edge(clk);
1561 #         d_in.valid <= '0';
1562 #
1563 #     wait until rising_edge(clk) and d_out.valid = '1';
1564 #         assert d_out.data = x"0000000D0000000C"
1565 #         report "data @" & to_hstring(d_in.addr) &
1566 #         "=" & to_hstring(d_out.data) &
1567 #         " expected 0000000D0000000C"
1568 #         severity failure;
1569 #
1570 #     -- Non-cacheable read of address 100
1571 #     d_in.load <= '1';
1572 #     d_in.nc <= '1';
1573 #         d_in.addr <= x"0000000000000100";
1574 #         d_in.valid <= '1';
1575 #     wait until rising_edge(clk);
1576 #     d_in.valid <= '0';
1577 #     wait until rising_edge(clk) and d_out.valid = '1';
1578 #         assert d_out.data = x"0000004100000040"
1579 #         report "data @" & to_hstring(d_in.addr) &
1580 #         "=" & to_hstring(d_out.data) &
1581 #         " expected 0000004100000040"
1582 #         severity failure;
1583 #
1584 #     wait until rising_edge(clk);
1585 #     wait until rising_edge(clk);
1586 #     wait until rising_edge(clk);
1587 #     wait until rising_edge(clk);
1588 #
1589 #     std.env.finish;
1590 #     end process;
1591 # end;
def _dcache_sim_load(dut, addr, nc, expected):
    """Issue one load request to the dcache and check the returned data.

    Simulation sub-process, meant to be driven with ``yield from``.

    Arguments:
      dut      -- device under test; must expose d_in.load, d_in.nc,
                  d_in.addr, d_in.valid, d_out.valid and d_out.data
      addr     -- address to load from (plain integer)
      nc       -- value driven onto d_in.nc for this request
      expected -- integer that d_out.data must hold once d_out.valid
                  is seen high

    Raises AssertionError if the data read back differs from expected.
    """
    yield dut.d_in.load.eq(1)
    yield dut.d_in.nc.eq(nc)
    yield dut.d_in.addr.eq(addr)
    yield dut.d_in.valid.eq(1)
    # wait-until rising_edge(clk), then drop the request
    yield
    yield dut.d_in.valid.eq(0)
    yield
    # wait until rising_edge(clk) and d_out.valid = '1'
    while not (yield dut.d_out.valid):
        yield
    # read the simulated value: comparing the Signal object itself
    # with == would only build an expression, not check the data
    data = yield dut.d_out.data
    assert data == expected, \
        f"data @{addr:016X}={data:016X} expected {expected:016X}" \
        " -!- severity failure"


def dcache_sim(dut):
    """Stimulus process for the dcache, following the VHDL testbench.

    Clears the request inputs, waits five clocks, then performs two
    cacheable loads (addresses 0x4 and 0x30) and one non-cacheable
    load (address 0x100), checking the data returned for each.
    Finishes with four idle clocks.

    Raises AssertionError on the first load returning unexpected data.
    """
    # clear stuff
    yield dut.d_in.valid.eq(0)
    yield dut.d_in.load.eq(0)
    yield dut.d_in.nc.eq(0)
    yield dut.d_in.addr.eq(0)
    yield dut.d_in.data.eq(0)
    yield dut.m_in.valid.eq(0)
    yield dut.m_in.addr.eq(0)
    yield dut.m_in.pte.eq(0)
    # wait 4 * clk_period, then wait_until rising_edge(clk)
    for _ in range(5):
        yield

    # Cacheable read of address 0x4
    yield from _dcache_sim_load(dut, 0x0000000000000004, 0,
                                0x0000000100000000)

    # Cacheable read of address 0x30
    yield from _dcache_sim_load(dut, 0x0000000000000030, 0,
                                0x0000000D0000000C)

    # Non-cacheable read of address 0x100
    yield from _dcache_sim_load(dut, 0x0000000000000100, 1,
                                0x0000004100000040)

    # a few idle clocks before the simulation ends
    for _ in range(4):
        yield
1657
1658
def test_dcache():
    """Convert a DCache to RTLIL and run the dcache_sim stimulus on it.

    The RTLIL text is written to test_dcache.il in the current
    directory, and the simulator is asked for a VCD trace named
    test_dcache.vcd.
    """
    # imported here because the import block at the top of the file
    # does not bring in rtlil or run_simulation
    from nmigen.back import rtlil
    from nmigen.compat.sim import run_simulation

    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)

    # dcache_sim needs the dut it is going to drive
    run_simulation(dut, dcache_sim(dut), vcd_name='test_dcache.vcd')
1666
# Script entry point: running this file directly calls test_dcache().
if __name__ == '__main__':
    test_dcache()
1669