1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 """
6
7 from enum import Enum, unique
8
from nmigen import (Module, Signal, Elaboratable,
Cat, Repl, Array, Const)
11 from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
14
from experiment.mem_types import (LoadStore1ToDCacheType,
DCacheToLoadStore1Type,
MMUToDCacheType,
DCacheToMMUType)
19
from experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
WBAddrType, WBDataType, WBSelType,
WBMasterOut, WBSlaveOut,
WBMasterOutVector, WBSlaveOutVector,
WBIOMasterOut, WBIOSlaveOut)
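# PLRU and CacheRam are used further down but are defined elsewhere in
# this repository; the import paths below are assumptions that follow the
# convention of the imports above and may need adjusting.
from experiment.plru import PLRU
from experiment.cache_ram import CacheRam

# used only by the convert/simulation helpers at the bottom of this file
from nmigen.back import rtlil
from nmigen.compat.sim import run_simulation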
25
26 # TODO: make these parameters of DCache at some point
27 LINE_SIZE = 64 # Line size in bytes
28 NUM_LINES = 32 # Number of lines in a set
29 NUM_WAYS = 4 # Number of ways
30 TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2 # L1 DTLB number of ways
32 TLB_LG_PGSZ = 12 # L1 DTLB log_2(page_size)
33 LOG_LENGTH = 0 # Non-zero to enable log data collection
34
# BRAM organisation: We never access more than wishbone_data_bits
# at a time so to save resources we make the array only that wide,
# and use consecutive indices to make a cache "line"
#
# ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8
43
44 # ROW_PER_LINE is the number of row (wishbone
45 # transactions) in a line
46 ROW_PER_LINE = LINE_SIZE // ROW_SIZE
47
48 # BRAM_ROWS is the number of rows in BRAM needed
49 # to represent the full dcache
50 BRAM_ROWS = NUM_LINES * ROW_PER_LINE
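# A quick sanity check of the geometry above, assuming the default
# parameters (and WB_DATA_BITS = 64, as asserted further down):
#
# ROW_SIZE     = 64 // 8 = 8 bytes per BRAM row
# ROW_PER_LINE = 64 // 8 = 8 wishbone transfers per cache line
# BRAM_ROWS    = 32 * 8  = 256 rows in the data BRAM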
51
52
53 # Bit fields counts in the address
54
55 # REAL_ADDR_BITS is the number of real address
56 # bits that we store
57 REAL_ADDR_BITS = 56
58
59 # ROW_BITS is the number of bits to select a row
60 ROW_BITS = log2_int(BRAM_ROWS)
61
62 # ROW_LINE_BITS is the number of bits to select
63 # a row within a line
64 ROW_LINE_BITS = log2_int(ROW_PER_LINE)
65
66 # LINE_OFF_BITS is the number of bits for
67 # the offset in a cache line
68 LINE_OFF_BITS = log2_int(LINE_SIZE)
69
70 # ROW_OFF_BITS is the number of bits for
71 # the offset in a row
72 ROW_OFF_BITS = log2_int(ROW_SIZE)
73
# INDEX_BITS is the number of bits to
75 # select a cache line
76 INDEX_BITS = log2_int(NUM_LINES)
77
78 # SET_SIZE_BITS is the log base 2 of the set size
79 SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
80
81 # TAG_BITS is the number of bits of
82 # the tag part of the address
83 TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
84
85 # TAG_WIDTH is the width in bits of each way of the tag RAM
86 TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
87
88 # WAY_BITS is the number of bits to select a way
89 WAY_BITS = log2_int(NUM_WAYS)
90
# Example of layout for 32 lines of 64 bytes:
#
# ..  tag    |index|  line  |
# ..         |   row   |    |
# ..         |     |---|    | ROW_LINE_BITS  (3)
# ..         |     |--- - --| LINE_OFF_BITS (6)
# ..         |         |- --| ROW_OFF_BITS  (3)
# ..         |----- ---|    | ROW_BITS      (8)
# ..         |-----|        | INDEX_BITS    (5)
# .. --------|              | TAG_BITS      (45)
101
102 TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS
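# With the default geometry the derived constants work out as follows
# (matching the layout sketch above):
#
# ROW_BITS      = log2(256) = 8     ROW_LINE_BITS = log2(8) = 3
# LINE_OFF_BITS = log2(64)  = 6     ROW_OFF_BITS  = log2(8) = 3
# INDEX_BITS    = log2(32)  = 5     SET_SIZE_BITS = 6 + 5   = 11
# TAG_BITS      = 56 - 11   = 45    WAY_BITS      = log2(4) = 2
# TAG_WIDTH     = 48 (TAG_BITS rounded up to a whole number of bytes)
# TAG_RAM_WIDTH = 48 * 4    = 192 bits of tag RAM per cache index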
103
def CacheTagArray():
return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))

def CacheValidBitsArray():
return Array(Signal(NUM_WAYS) for x in range(NUM_LINES))
109
110 def RowPerLineValidArray():
111 return Array(Signal() for x in range(ROW_PER_LINE))
112
113 # L1 TLB
114 TLB_SET_BITS = log2_int(TLB_SET_SIZE)
115 TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
116 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
117 TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
118 TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
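# For reference, with the default TLB parameters:
#
# TLB_SET_BITS     = log2(64)      = 6
# TLB_WAY_BITS     = log2(2)       = 1
# TLB_EA_TAG_BITS  = 64 - (12 + 6) = 46
# TLB_TAG_WAY_BITS = 2 * 46        = 92
# TLB_PTE_WAY_BITS = 2 * 64        = 128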
120
121 assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert (LINE_SIZE & (LINE_SIZE - 1)) == 0, "LINE_SIZE not power of 2"
assert (NUM_LINES & (NUM_LINES - 1)) == 0, "NUM_LINES not power of 2"
assert (ROW_PER_LINE & (ROW_PER_LINE - 1)) == 0, "ROW_PER_LINE not power of 2"
125 assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS), \
"geometry bits don't add up"
128 assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
129 "geometry bits don't add up"
130 assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
131 "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
133 assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"
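# The last assertion is what makes the cache virtually-indexed,
# physically-tagged: when the set index fits entirely inside the page
# offset (SET_SIZE_BITS <= TLB_LG_PGSZ) the index bits are the same in
# the effective and real address, so the tag/data RAMs can be read in
# parallel with the TLB lookup.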
134
135
136 def TLBValidBitsArray():
137 return Array(Signal(TLB_NUM_WAYS) for x in range(TLB_SET_SIZE))
138
139 def TLBTagsArray():
140 return Array(Signal(TLB_TAG_WAY_BITS) for x in range (TLB_SET_SIZE))
141
142 def TLBPtesArray():
143 return Array(Signal(TLB_PTE_WAY_BITS) for x in range(TLB_SET_SIZE))
144
145 def HitWaySet():
return Array(Signal(WAY_BITS) for x in range(TLB_NUM_WAYS))
147
148 # Cache RAM interface
149 def CacheRamOut():
150 return Array(Signal(WB_DATA_BITS) for x in range(NUM_WAYS))
151
152 # PLRU output interface
153 def PLRUOut():
return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
155
156 # TLB PLRU output interface
157 def TLBPLRUOut():
158 return Array(Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE))
159
160 # Helper functions to decode incoming requests
161 #
162 # Return the cache line index (tag index) for an address
163 def get_index(addr):
164 return addr[LINE_OFF_BITS:SET_SIZE_BITS]
165
166 # Return the cache row index (data memory) for an address
167 def get_row(addr):
168 return addr[ROW_OFF_BITS:SET_SIZE_BITS]
169
170 # Return the index of a row within a line
def get_row_of_line(row):
return row[0:ROW_LINE_BITS]
175
176 # Returns whether this is the last row of a line
177 def is_last_row_addr(addr, last):
178 return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
179
180 # Returns whether this is the last row of a line
181 def is_last_row(row, last):
182 return get_row_of_line(row) == last
183
184 # Return the address of the next row in the current cache line
def next_row_addr(addr):
# increment only the row-within-line bits of the address
# (a 3-bit adder with the default settings)
row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
return Cat(addr[:ROW_OFF_BITS],
row_idx[:ROW_LINE_BITS],
addr[LINE_OFF_BITS:])
195
196 # Return the next row in the current cache line. We use a
197 # dedicated function in order to limit the size of the
198 # generated adder to be only the bits within a cache line
199 # (3 bits with default settings)
def next_row(row):
201 row_v = row[0:ROW_LINE_BITS] + 1
202 return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
203
204 # Get the tag value from the address
205 def get_tag(addr):
206 return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
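# As an illustration, with the default geometry a 56-bit real address
# is carved up by the helpers above as:
#
# addr[0:3]   byte offset within a wishbone row  (ROW_OFF_BITS)
# addr[3:11]  BRAM row number    -> get_row(addr)
# addr[6:11]  cache line index   -> get_index(addr)
# addr[11:56] tag                -> get_tag(addr)
#
# i.e. the low ROW_LINE_BITS of the row number select the row within a
# line and its upper bits equal the line index.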
207
208 # Read a tag from a tag memory row
209 def read_tag(way, tagset):
return tagset[way * TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]
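# e.g. with TAG_BITS = 45 and TAG_WIDTH = 48, way 1's tag occupies bits
# [48:93] of the tag-set row; the remaining 3 bits per way are padding so
# that each way's tag starts on a byte boundary.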
211
212 # Read a TLB tag from a TLB tag memory row
213 def read_tlb_tag(way, tags):
214 j = way * TLB_EA_TAG_BITS
215 return tags[j:j + TLB_EA_TAG_BITS]
216
217 # Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
j = way * TLB_EA_TAG_BITS
return tags[j:j + TLB_EA_TAG_BITS].eq(tag)
221
222 # Read a PTE from a TLB PTE memory row
223 def read_tlb_pte(way, ptes):
224 j = way * TLB_PTE_BITS
225 return ptes[j:j + TLB_PTE_BITS]
226
def write_tlb_pte(way, ptes, newpte):
228 j = way * TLB_PTE_BITS
229 return ptes[j:j + TLB_PTE_BITS].eq(newpte)
230
231
232 # Record for storing permission, attribute, etc. bits from a PTE
233 class PermAttr(RecordObject):
234 def __init__(self):
235 super().__init__()
236 self.reference = Signal()
237 self.changed = Signal()
238 self.nocache = Signal()
239 self.priv = Signal()
240 self.rd_perm = Signal()
241 self.wr_perm = Signal()
242
243
244 def extract_perm_attr(pte):
245 pa = PermAttr()
246 pa.reference = pte[8]
247 pa.changed = pte[7]
248 pa.nocache = pte[5]
249 pa.priv = pte[3]
250 pa.rd_perm = pte[2]
251 pa.wr_perm = pte[1]
return pa
253
254
255 # Type of operation on a "valid" input
256 @unique
257 class Op(Enum):
258 OP_NONE = 0
259 OP_BAD = 1 # NC cache hit, TLB miss, prot/RC failure
260 OP_STCX_FAIL = 2 # conditional store w/o reservation
261 OP_LOAD_HIT = 3 # Cache hit on load
262 OP_LOAD_MISS = 4 # Load missing cache
263 OP_LOAD_NC = 5 # Non-cachable load
264 OP_STORE_HIT = 6 # Store hitting cache
265 OP_STORE_MISS = 7 # Store missing cache
266
267
268 # Cache state machine
269 @unique
270 class State(Enum):
271 IDLE = 0 # Normal load hit processing
272 RELOAD_WAIT_ACK = 1 # Cache reload wait ack
273 STORE_WAIT_ACK = 2 # Store wait ack
274 NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack
275
276
277 # Dcache operations:
278 #
279 # In order to make timing, we use the BRAMs with
280 # an output buffer, which means that the BRAM
281 # output is delayed by an extra cycle.
282 #
283 # Thus, the dcache has a 2-stage internal pipeline
284 # for cache hits with no stalls.
285 #
286 # All other operations are handled via stalling
287 # in the first stage.
288 #
289 # The second stage can thus complete a hit at the same
290 # time as the first stage emits a stall for a complex op.
291 #
292 # Stage 0 register, basically contains just the latched request
293
294 class RegStage0(RecordObject):
295 def __init__(self):
296 super().__init__()
297 self.req = LoadStore1ToDCacheType()
298 self.tlbie = Signal()
299 self.doall = Signal()
300 self.tlbld = Signal()
301 self.mmu_req = Signal() # indicates source of request
302
303
304 class MemAccessRequest(RecordObject):
305 def __init__(self):
306 super().__init__()
self.op = Signal(Op)
308 self.valid = Signal()
309 self.dcbz = Signal()
310 self.real_addr = Signal(REAL_ADDR_BITS)
311 self.data = Signal(64)
312 self.byte_sel = Signal(8)
313 self.hit_way = Signal(WAY_BITS)
314 self.same_tag = Signal()
315 self.mmu_req = Signal()
316
317
318 # First stage register, contains state for stage 1 of load hits
319 # and for the state machine used by all other operations
320 class RegStage1(RecordObject):
321 def __init__(self):
322 super().__init__()
323 # Info about the request
324 self.full = Signal() # have uncompleted request
325 self.mmu_req = Signal() # request is from MMU
326 self.req = MemAccessRequest()
327
328 # Cache hit state
329 self.hit_way = Signal(WAY_BITS)
330 self.hit_load_valid = Signal()
self.hit_index = Signal(INDEX_BITS)
332 self.cache_hit = Signal()
333
334 # TLB hit state
335 self.tlb_hit = Signal()
self.tlb_hit_way = Signal(TLB_WAY_BITS)
self.tlb_hit_index = Signal(TLB_SET_BITS)

339 # 2-stage data buffer for data forwarded from writes to reads
340 self.forward_data1 = Signal(64)
341 self.forward_data2 = Signal(64)
342 self.forward_sel1 = Signal(8)
343 self.forward_valid1 = Signal()
344 self.forward_way1 = Signal(WAY_BITS)
self.forward_row1 = Signal(ROW_BITS)
346 self.use_forward1 = Signal()
347 self.forward_sel = Signal(8)
348
349 # Cache miss state (reload state machine)
self.state = Signal(State)
351 self.dcbz = Signal()
352 self.write_bram = Signal()
353 self.write_tag = Signal()
354 self.slow_valid = Signal()
self.wb = WBMasterOut()
self.reload_tag = Signal(TAG_BITS)
self.store_way = Signal(WAY_BITS)
self.store_row = Signal(ROW_BITS)
self.store_index = Signal(INDEX_BITS)
self.end_row_ix = Signal(ROW_LINE_BITS)
361 self.rows_valid = RowPerLineValidArray()
362 self.acks_pending = Signal(3)
363 self.inc_acks = Signal()
364 self.dec_acks = Signal()
365
366 # Signals to complete (possibly with error)
367 self.ls_valid = Signal()
368 self.ls_error = Signal()
369 self.mmu_done = Signal()
370 self.mmu_error = Signal()
371 self.cache_paradox = Signal()
372
373 # Signal to complete a failed stcx.
374 self.stcx_fail = Signal()
375
376
377 # Reservation information
378 class Reservation(RecordObject):
379 def __init__(self):
380 super().__init__()
381 self.valid = Signal()
382 self.addr = Signal(64-LINE_OFF_BITS)
383
384
385 class DCache(Elaboratable):
386 """Set associative dcache write-through
387 TODO (in no specific order):
388 * See list in icache.vhdl
389 * Complete load misses on the cycle when WB data comes instead of
390 at the end of line (this requires dealing with requests coming in
391 while not idle...)
392 """
393 def __init__(self):
394 self.d_in = LoadStore1ToDCacheType()
395 self.d_out = DCacheToLoadStore1Type()
396
397 self.m_in = MMUToDCacheType()
398 self.m_out = DCacheToMMUType()
399
400 self.stall_out = Signal()
401
402 self.wb_out = WBMasterOut()
403 self.wb_in = WBSlaveOut()
404
405 self.log_out = Signal(20)
406
407 def stage_0(self, m, d_in, m_in):
408 """Latch the request in r0.req as long as we're not stalling
409 """
410 comb = m.d.comb
411 sync = m.d.sync
412
413 r = RegStage0()
414
415 # TODO, this goes in unit tests and formal proofs
with m.If(d_in.valid & m_in.valid):
#sync += Display("request collision loadstore vs MMU")
pass
419
420 with m.If(m_in.valid):
421 sync += r.req.valid.eq(1)
422 sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
423 sync += r.req.dcbz.eq(0)
424 sync += r.req.nc.eq(0)
425 sync += r.req.reserve.eq(0)
426 sync += r.req.virt_mode.eq(1)
427 sync += r.req.priv_mode.eq(1)
428 sync += r.req.addr.eq(m_in.addr)
429 sync += r.req.data.eq(m_in.pte)
430 sync += r.req.byte_sel.eq(-1) # Const -1 sets all to 0b111....
431 sync += r.tlbie.eq(m_in.tlbie)
432 sync += r.doall.eq(m_in.doall)
433 sync += r.tlbld.eq(m_in.tlbld)
434 sync += r.mmu_req.eq(1)
435 with m.Else():
436 sync += r.req.eq(d_in)
sync += r.tlbie.eq(0)
sync += r.doall.eq(0)
sync += r.tlbld.eq(0)
sync += r.mmu_req.eq(0)
441 with m.If(~(r1.full & r0_full)):
442 sync += r0.eq(r)
443 sync += r0_full.eq(r.req.valid)
444
445 def tlb_read(self, m, m_in, d_in, r0_stall, tlb_valid_way,
446 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
447 dtlb_tags, dtlb_ptes):
448 """TLB
449 Operates in the second cycle on the request latched in r0.req.
450 TLB updates write the entry at the end of the second cycle.
451 """
452 comb = m.d.comb
453 sync = m.d.sync
454
index = Signal(TLB_SET_BITS)
456 addrbits = Signal(TLB_SET_BITS)
457
458 amin = TLB_LG_PGSZ
459 amax = TLB_LG_PGSZ + TLB_SET_BITS
460
461 with m.If(m_in.valid):
462 comb += addrbits.eq(m_in.addr[amin : amax])
463 with m.Else():
464 comb += addrbits.eq(d_in.addr[amin : amax])
465 comb += index.eq(addrbits)
466
467 # If we have any op and the previous op isn't finished,
468 # then keep the same output for next cycle.
469 with m.If(~r0_stall):
470 sync += tlb_valid_way.eq(dtlb_valid_bits[index])
471 sync += tlb_tag_way.eq(dtlb_tags[index])
472 sync += tlb_pte_way.eq(dtlb_ptes[index])
473
def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
475 """Generate TLB PLRUs
476 """
477 comb = m.d.comb
478 sync = m.d.sync
479
if TLB_NUM_WAYS > 1: # elaboration-time constant, not a circuit condition
481 for i in range(TLB_SET_SIZE):
482 # TLB PLRU interface
483 tlb_plru = PLRU(TLB_WAY_BITS)
484 tlb_plru_acc = Signal(TLB_WAY_BITS)
485 tlb_plru_acc_en = Signal()
486 tlb_plru_out = Signal(TLB_WAY_BITS)
487
488 comb += tlb_plru.acc.eq(tlb_plru_acc)
489 comb += tlb_plru.acc_en.eq(tlb_plru_acc_en)
490 comb += tlb_plru.lru.eq(tlb_plru_out)
491
492 # PLRU interface
493 with m.If(r1.tlb_hit_index == i):
494 comb += tlb_plru.acc_en.eq(r1.tlb_hit)
495 with m.Else():
496 comb += tlb_plru.acc_en.eq(0)
497 comb += tlb_plru.acc.eq(r1.tlb_hit_way)
498
499 comb += tlb_plru_victim[i].eq(tlb_plru.lru)
500
def tlb_search(self, m, tlb_req_index, r0, tlb_valid_way, tlb_tag_way,
502 tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):
503
504 comb = m.d.comb
505 sync = m.d.sync
506
507 hitway = Signal(TLB_WAY_BITS)
508 hit = Signal()
eatag = Signal(TLB_EA_TAG_BITS)
510
511 TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
512 comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
513 comb += eatag.eq(r0.req.addr[TLB_LG_END : 64 ])
514
515 for i in range(TLB_NUM_WAYS):
with m.If(tlb_valid_way[i]
& (read_tlb_tag(i, tlb_tag_way) == eatag)):
518 comb += hitway.eq(i)
519 comb += hit.eq(1)
520
521 comb += tlb_hit.eq(hit & r0_valid)
522 comb += tlb_hit_way.eq(hitway)
523
524 with m.If(tlb_hit):
525 comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
526 with m.Else():
527 comb += pte.eq(0)
528 comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
529 with m.If(r0.req.virt_mode):
530 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
531 r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
532 pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
533 comb += perm_attr.eq(extract_perm_attr(pte))
534 with m.Else():
535 comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
537
538 comb += perm_attr.reference.eq(1)
539 comb += perm_attr.changed.eq(1)
540 comb += perm_attr.priv.eq(1)
541 comb += perm_attr.nocache.eq(0)
542 comb += perm_attr.rd_perm.eq(1)
543 comb += perm_attr.wr_perm.eq(1)
544
def tlb_update(self, m, r0_valid, r0, dtlb_valid_bits, tlb_req_index,
tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way,
dtlb_tags, tlb_pte_way, dtlb_ptes):
548
549 comb = m.d.comb
550 sync = m.d.sync
551
552 # variable tlbie : std_ulogic;
553 # variable tlbwe : std_ulogic;
554 # variable repl_way : tlb_way_t;
555 # variable eatag : tlb_tag_t;
556 # variable tagset : tlb_way_tags_t;
557 # variable pteset : tlb_way_ptes_t;
558 #type tlb_tags_t is array(tlb_index_t) of tlb_way_tags_t;
559 # --> Array([Signal(log(way_tags length)) for i in range(number of tlbs)])
560
561 tlbie = Signal()
562 tlbwe = Signal()
563 repl_way = Signal(TLB_WAY_BITS)
564 eatag = Signal(TLB_EA_TAG_BITS)
tagset = Signal(TLB_TAG_WAY_BITS)
pteset = Signal(TLB_PTE_WAY_BITS)
567
568 comb += tlbie.eq(r0_valid & r0.tlbie)
comb += tlbwe.eq(r0_valid & r0.tlbld)
570
571 with m.If(tlbie & r0.doall):
572 # clear all valid bits at once
573 for i in range(TLB_SET_SIZE):
574 sync += dtlb_valid_bits[i].eq(0)
575
576 with m.Elif(tlbie):
577 with m.If(tlb_hit):
578 sync += dtlb_valid_bits[tlb_req_index][tlb_hit_way].eq(0)
579 with m.Elif(tlbwe):
580 with m.If(tlb_hit):
581 comb += repl_way.eq(tlb_hit_way)
582 with m.Else():
583 comb += repl_way.eq(tlb_plru_victim[tlb_req_index])
584 comb += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
585 comb += tagset.eq(tlb_tag_way)
586 sync += write_tlb_tag(repl_way, tagset, eatag)
587 sync += dtlb_tags[tlb_req_index].eq(tagset)
588 comb += pteset.eq(tlb_pte_way)
589 sync += write_tlb_pte(repl_way, pteset, r0.req.data)
590 sync += dtlb_ptes[tlb_req_index].eq(pteset)
591 sync += dtlb_valid_bits[tlb_req_index][repl_way].eq(1)
592
def maybe_plrus(self, m, r1, plru_victim):
594 """Generate PLRUs
595 """
596 comb = m.d.comb
597 sync = m.d.sync
598
599 for i in range(NUM_LINES):
600 # PLRU interface
plru = PLRU(WAY_BITS)
setattr(m.submodules, "plru%d" % i, plru)
plru_acc = Signal(WAY_BITS)
plru_acc_en = Signal()
plru_out = Signal(WAY_BITS)
606
607 comb += plru.acc.eq(plru_acc)
608 comb += plru.acc_en.eq(plru_acc_en)
609 comb += plru.lru.eq(plru_out)
610
611 with m.If(r1.hit_index == i):
612 comb += plru_acc_en.eq(r1.cache_hit)
613
614 comb += plru_acc.eq(r1.hit_way)
615 comb += plru_victim[i].eq(plru_out)
616
617 def cache_tag_read(self, m, r0_stall, req_index, m_in, d_in,
618 cache_tag_set, cache_tags):
619 """Cache tag RAM read port
620 """
621 comb = m.d.comb
622 sync = m.d.sync
623
624 index = Signal(INDEX_BITS)
625
626 with m.If(r0_stall):
627 comb += index.eq(req_index)
628 with m.Elif(m_in.valid):
629 comb += index.eq(get_index(m_in.addr))
630 with m.Else():
631 comb += index.eq(get_index(d_in.addr))
632 sync += cache_tag_set.eq(cache_tags[index])
633
634 def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
635 r0_valid, r1, cache_valid_bits, replace_way,
636 use_forward1_next, use_forward2_next,
637 req_hit_way, plru_victim, rc_ok, perm_attr,
valid_ra, perm_ok, access_ok, req_op, req_go,
639 r0_stall, m_in, early_req_row, d_in):
640 """Cache request parsing and hit detection
641 """
642
643 comb = m.d.comb
644 sync = m.d.sync
645
646 is_hit = Signal()
647 hit_way = Signal(WAY_BITS)
op = Signal(Op)
649 opsel = Signal(3)
650 go = Signal()
651 nc = Signal()
652 s_hit = Signal()
653 s_tag = Signal(TAG_BITS)
654 s_pte = Signal(TLB_PTE_BITS)
655 s_ra = Signal(REAL_ADDR_BITS)
656 hit_set = Signal(TLB_NUM_WAYS)
657 hit_way_set = HitWaySet()
658 rel_matches = Signal(TLB_NUM_WAYS)
659 rel_match = Signal()
660
661 # Extract line, row and tag from request
662 comb += req_index.eq(get_index(r0.req.addr))
663 comb += req_row.eq(get_row(r0.req.addr))
664 comb += req_tag.eq(get_tag(ra))
665
666 comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
667
668 # Test if pending request is a hit on any way
669 # In order to make timing in virtual mode,
670 # when we are using the TLB, we compare each
671 # way with each of the real addresses from each way of
672 # the TLB, and then decide later which match to use.
673
674 with m.If(r0.req.virt_mode):
675 comb += rel_matches.eq(0)
676 for j in range(TLB_NUM_WAYS):
677 comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way))
678 comb += s_ra.eq(Cat(r0.req.addr[0:TLB_LG_PGSZ],
679 s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
680 comb += s_tag.eq(get_tag(s_ra))
681
682 for i in range(NUM_WAYS):
with m.If(go & cache_valid_bits[req_index][i] &
(read_tag(i, cache_tag_set) == s_tag)
& tlb_valid_way[j]):
686 comb += hit_way_set[j].eq(i)
687 comb += s_hit.eq(1)
688 comb += hit_set[j].eq(s_hit)
689 with m.If(s_tag == r1.reload_tag):
690 comb += rel_matches[j].eq(1)
691 with m.If(tlb_hit):
692 comb += is_hit.eq(hit_set[tlb_hit_way])
693 comb += hit_way.eq(hit_way_set[tlb_hit_way])
694 comb += rel_match.eq(rel_matches[tlb_hit_way])
695 with m.Else():
696 comb += s_tag.eq(get_tag(r0.req.addr))
697 for i in range(NUM_WAYS):
with m.If(go & cache_valid_bits[req_index][i] &
(read_tag(i, cache_tag_set) == s_tag)):
700 comb += hit_way.eq(i)
701 comb += is_hit.eq(1)
702 with m.If(s_tag == r1.reload_tag):
703 comb += rel_match.eq(1)
704 comb += req_same_tag.eq(rel_match)
705
706 # See if the request matches the line currently being reloaded
707 with m.If((r1.state == State.RELOAD_WAIT_ACK) &
708 (req_index == r1.store_index) & rel_match):
709 # For a store, consider this a hit even if the row isn't
710 # valid since it will be by the time we perform the store.
711 # For a load, check the appropriate row valid bit.
712 valid = r1.rows_valid[req_row % ROW_PER_LINE]
713 comb += is_hit.eq(~r0.req.load | valid)
714 comb += hit_way.eq(replace_way)
715
716 # Whether to use forwarded data for a load or not
717 comb += use_forward1_next.eq(0)
718 with m.If((get_row(r1.req.real_addr) == req_row)
& (r1.req.hit_way == hit_way)):
720 # Only need to consider r1.write_bram here, since if we
721 # are writing refill data here, then we don't have a
722 # cache hit this cycle on the line being refilled.
723 # (There is the possibility that the load following the
724 # load miss that started the refill could be to the old
725 # contents of the victim line, since it is a couple of
726 # cycles after the refill starts before we see the updated
727 # cache tag. In that case we don't use the bypass.)
728 comb += use_forward1_next.eq(r1.write_bram)
729 comb += use_forward2_next.eq(0)
730 with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
731 comb += use_forward2_next.eq(r1.forward_valid1)
732
733 # The way that matched on a hit
734 comb += req_hit_way.eq(hit_way)
735
736 # The way to replace on a miss
737 with m.If(r1.write_tag):
comb += replace_way.eq(plru_victim[r1.store_index])
739 with m.Else():
740 comb += replace_way.eq(r1.store_way)
741
742 # work out whether we have permission for this access
743 # NB we don't yet implement AMR, thus no KUAP
744 comb += rc_ok.eq(perm_attr.reference
745 & (r0.req.load | perm_attr.changed)
746 )
comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
& (perm_attr.wr_perm
| (r0.req.load & perm_attr.rd_perm))
)
751 comb += access_ok.eq(valid_ra & perm_ok & rc_ok)
752 # Combine the request and cache hit status to decide what
753 # operation needs to be done
754 comb += nc.eq(r0.req.nc | perm_attr.nocache)
755 comb += op.eq(Op.OP_NONE)
756 with m.If(go):
757 with m.If(~access_ok):
758 comb += op.eq(Op.OP_BAD)
759 with m.Elif(cancel_store):
760 comb += op.eq(Op.OP_STCX_FAIL)
761 with m.Else():
762 comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
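# Cat() places is_hit at bit 0, nc at bit 1 and r0.req.load at bit 2,
# so the case values below read as (load, nc, is_hit): e.g. 0b101 is a
# cacheable load that hit and 0b100 is a cacheable load that missed.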
763 with m.Switch(opsel):
764 with m.Case(Const(0b101, 3)):
765 comb += op.eq(Op.OP_LOAD_HIT)
with m.Case(Const(0b100, 3)):
767 comb += op.eq(Op.OP_LOAD_MISS)
768 with m.Case(Const(0b110, 3)):
769 comb += op.eq(Op.OP_LOAD_NC)
770 with m.Case(Const(0b001, 3)):
771 comb += op.eq(Op.OP_STORE_HIT)
772 with m.Case(Const(0b000, 3)):
773 comb += op.eq(Op.OP_STORE_MISS)
774 with m.Case(Const(0b010, 3)):
775 comb += op.eq(Op.OP_STORE_MISS)
776 with m.Case(Const(0b011, 3)):
777 comb += op.eq(Op.OP_BAD)
778 with m.Case(Const(0b111, 3)):
779 comb += op.eq(Op.OP_BAD)
780 with m.Default():
781 comb += op.eq(Op.OP_NONE)
782 comb += req_op.eq(op)
783 comb += req_go.eq(go)
784
785 # Version of the row number that is valid one cycle earlier
786 # in the cases where we need to read the cache data BRAM.
787 # If we're stalling then we need to keep reading the last
788 # row requested.
789 with m.If(~r0_stall):
790 with m.If(m_in.valid):
791 comb += early_req_row.eq(get_row(m_in.addr))
792 with m.Else():
793 comb += early_req_row.eq(get_row(d_in.addr))
794 with m.Else():
795 comb += early_req_row.eq(req_row)
796
797 def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
798 r0_valid, r0, reservation):
799 """Handle load-with-reservation and store-conditional instructions
800 """
801 comb = m.d.comb
802 sync = m.d.sync
803
804 with m.If(r0_valid & r0.req.reserve):
805
806 # XXX generate alignment interrupt if address
807 # is not aligned XXX or if r0.req.nc = '1'
808 with m.If(r0.req.load):
comb += set_rsrv.eq(1) # load with reservation
810 with m.Else():
811 comb += clear_rsrv.eq(1) # store conditional
with m.If(~reservation.valid |
(r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
813 comb += cancel_store.eq(1)
814
def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
reservation, r0):
817
818 comb = m.d.comb
819 sync = m.d.sync
820
821 with m.If(r0_valid & access_ok):
822 with m.If(clear_rsrv):
823 sync += reservation.valid.eq(0)
824 with m.Elif(set_rsrv):
825 sync += reservation.valid.eq(1)
826 sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])
827
828 def writeback_control(self, m, r1, cache_out, d_out, m_out):
829 """Return data for loads & completion control logic
830 """
831 comb = m.d.comb
832 sync = m.d.sync
833
834 data_out = Signal(64)
835 data_fwd = Signal(64)
836 j = Signal()
837
838 # Use the bypass if are reading the row that was
839 # written 1 or 2 cycles ago, including for the
840 # slow_valid = 1 case (i.e. completing a load
841 # miss or a non-cacheable load).
842 with m.If(r1.use_forward1):
843 comb += data_fwd.eq(r1.forward_data1)
844 with m.Else():
845 comb += data_fwd.eq(r1.forward_data2)
846
847 comb += data_out.eq(cache_out[r1.hit_way])
848
849 for i in range(8):
850 with m.If(r1.forward_sel[i]):
851 dsel = data_fwd.word_select(i, 8)
852 comb += data_out.word_select(i, 8).eq(dsel)
853
854 comb += d_out.valid.eq(r1.ls_valid)
855 comb += d_out.data.eq(data_out)
856 comb += d_out.store_done.eq(~r1.stcx_fail)
857 comb += d_out.error.eq(r1.ls_error)
858 comb += d_out.cache_paradox.eq(r1.cache_paradox)
859
860 # Outputs to MMU
861 comb += m_out.done.eq(r1.mmu_done)
862 comb += m_out.err.eq(r1.mmu_error)
863 comb += m_out.data.eq(data_out)
864
865 # We have a valid load or store hit or we just completed
866 # a slow op such as a load miss, a NC load or a store
867 #
868 # Note: the load hit is delayed by one cycle. However it
869 # can still not collide with r.slow_valid (well unless I
870 # miscalculated) because slow_valid can only be set on a
871 # subsequent request and not on its first cycle (the state
872 # machine must have advanced), which makes slow_valid
873 # at least 2 cycles from the previous hit_load_valid.
874
875 # Sanity: Only one of these must be set in any given cycle
876
877 if False: # TODO: need Display to get this to work
assert (r1.slow_valid & r1.stcx_fail) != 1, "unexpected " \
"slow_valid collision with stcx_fail -!- severity FAILURE"

assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
"unexpected hit_load_delayed collision with slow_valid -!- " \
"severity FAILURE"
884
with m.If(~r1.mmu_req):
886 # Request came from loadstore1...
887 # Load hit case is the standard path
888 with m.If(r1.hit_load_valid):
889 #Display(f"completing load hit data={data_out}")
890 pass
891
892 # error cases complete without stalling
893 with m.If(r1.ls_error):
894 # Display("completing ld/st with error")
895 pass
896
897 # Slow ops (load miss, NC, stores)
898 with m.If(r1.slow_valid):
899 #Display(f"completing store or load miss data={data_out}")
900 pass
901
902 with m.Else():
903 # Request came from MMU
904 with m.If(r1.hit_load_valid):
905 # Display(f"completing load hit to MMU, data={m_out.data}")
906 pass
907 # error cases complete without stalling
908 with m.If(r1.mmu_error):
909 #Display("combpleting MMU ld with error")
910 pass
911
912 # Slow ops (i.e. load miss)
913 with m.If(r1.slow_valid):
914 #Display("completing MMU load miss, data={m_out.data}")
915 pass
916
917 def rams(self, m):
918 """rams
919 Generate a cache RAM for each way. This handles the normal
920 reads, writes from reloads and the special store-hit update
921 path as well.
922
923 Note: the BRAMs have an extra read buffer, meaning the output
924 is pipelined an extra cycle. This differs from the
925 icache. The writeback logic needs to take that into
926 account by using 1-cycle delayed signals for load hits.
927 """
928 comb = m.d.comb
929
930 for i in range(NUM_WAYS):
931 do_read = Signal()
932 rd_addr = Signal(ROW_BITS)
933 do_write = Signal()
934 wr_addr = Signal(ROW_BITS)
935 wr_data = Signal(WB_DATA_BITS)
936 wr_sel = Signal(ROW_SIZE)
937 wr_sel_m = Signal(ROW_SIZE)
938 _d_out = Signal(WB_DATA_BITS)
939
940 way = CacheRam(ROW_BITS, WB_DATA_BITS, True)
941 setattr(m.submodules, "cacheram_%d" % i, way)
942
943 comb += way.rd_en.eq(do_read)
944 comb += way.rd_addr.eq(rd_addr)
945 comb += _d_out.eq(way.rd_data)
946 comb += way.wr_sel.eq(wr_sel_m)
947 comb += way.wr_addr.eq(wr_addr)
948 comb += way.wr_data.eq(wr_data)
949
950 # Cache hit reads
951 comb += do_read.eq(1)
952 comb += rd_addr.eq(early_req_row)
953 comb += cache_out[i].eq(_d_out)
954
955 # Write mux:
956 #
957 # Defaults to wishbone read responses (cache refill)
958 #
959 # For timing, the mux on wr_data/sel/addr is not
960 # dependent on anything other than the current state.
961
962 with m.If(r1.write_bram):
963 # Write store data to BRAM. This happens one
964 # cycle after the store is in r0.
965 comb += wr_data.eq(r1.req.data)
966 comb += wr_sel.eq(r1.req.byte_sel)
967 comb += wr_addr.eq(get_row(r1.req.real_addr))
968
969 with m.If(i == r1.req.hit_way):
970 comb += do_write.eq(1)
971 with m.Else():
972 # Otherwise, we might be doing a reload or a DCBZ
973 with m.If(r1.dcbz):
974 comb += wr_data.eq(0)
975 with m.Else():
976 comb += wr_data.eq(wishbone_in.dat)
977 comb += wr_addr.eq(r1.store_row)
978 comb += wr_sel.eq(~0) # all 1s
979
980 with m.If((r1.state == State.RELOAD_WAIT_ACK)
& wishbone_in.ack & (replace_way == i)):
982 comb += do_write.eq(1)
983
984 # Mask write selects with do_write since BRAM
985 # doesn't have a global write-enable
986 with m.If(do_write):
987 comb += wr_sel_m.eq(wr_sel)
988
989 # Cache hit synchronous machine for the easy case.
990 # This handles load hits.
991 # It also handles error cases (TLB miss, cache paradox)
992 def dcache_fast_hit(self, m, req_op, r0_valid, r1, ):
993
994 comb = m.d.comb
995 sync = m.d.sync
996
997 with m.If(req_op != Op.OP_NONE):
998 #Display(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \
999 # f"idx:{req_index} tag:{req_tag} way: {req_hit_way}"
1000 # )
1001 pass
1002
1003 with m.If(r0_valid):
1004 sync += r1.mmu_req.eq(r0.mmu_req)
1005
1006 # Fast path for load/store hits.
1007 # Set signals for the writeback controls.
1008 sync += r1.hit_way.eq(req_hit_way)
1009 sync += r1.hit_index.eq(req_index)
1010
1011 with m.If(req_op == Op.OP_LOAD_HIT):
1012 sync += r1.hit_load_valid.eq(1)
1013 with m.Else():
1014 sync += r1.hit_load_valid.eq(0)
1015
1016 with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STORE_HIT)):
1017 sync += r1.cache_hit.eq(1)
1018 with m.Else():
1019 sync += r1.cache_hit.eq(0)
1020
1021 with m.If(req_op == Op.OP_BAD):
1022 # Display(f"Signalling ld/st error valid_ra={valid_ra}"
1023 # f"rc_ok={rc_ok} perm_ok={perm_ok}"
1024 sync += r1.ls_error.eq(~r0.mmu_req)
1025 sync += r1.mmu_error.eq(r0.mmu_req)
1026 sync += r1.cache_paradox.eq(access_ok)
1027
1028 with m.Else():
1029 sync += r1.ls_error.eq(0)
1030 sync += r1.mmu_error.eq(0)
1031 sync += r1.cache_paradox.eq(0)
1032
1033 with m.If(req_op == Op.OP_STCX_FAIL):
sync += r1.stcx_fail.eq(1)
1035 with m.Else():
1036 sync += r1.stcx_fail.eq(0)
1037
1038 # Record TLB hit information for updating TLB PLRU
1039 sync += r1.tlb_hit.eq(tlb_hit)
1040 sync += r1.tlb_hit_way.eq(tlb_hit_way)
1041 sync += r1.tlb_hit_index.eq(tlb_req_index)
1042
1043 # Memory accesses are handled by this state machine:
1044 #
1045 # * Cache load miss/reload (in conjunction with "rams")
1046 # * Load hits for non-cachable forms
1047 # * Stores (the collision case is handled in "rams")
1048 #
1049 # All wishbone requests generation is done here.
1050 # This machine operates at stage 1.
1051 def dcache_slow(self, m, r1, use_forward1_next, cache_valid_bits, r0,
1052 r0_valid, req_op, cache_tag, req_go, ra, wb_in):
1053
1054 comb = m.d.comb
1055 sync = m.d.sync
1056
req = MemAccessRequest()
acks = Signal(3)
adjust_acks = Signal(3)
stbs_done = Signal()
1060
1061 sync += r1.use_forward1.eq(use_forward1_next)
1062 sync += r1.forward_sel.eq(0)
1063
1064 with m.If(use_forward1_next):
1065 sync += r1.forward_sel.eq(r1.req.byte_sel)
1066 with m.Elif(use_forward2_next):
1067 sync += r1.forward_sel.eq(r1.forward_sel1)
1068
1069 sync += r1.forward_data2.eq(r1.forward_data1)
1070 with m.If(r1.write_bram):
1071 sync += r1.forward_data1.eq(r1.req.data)
1072 sync += r1.forward_sel1.eq(r1.req.byte_sel)
1073 sync += r1.forward_way1.eq(r1.req.hit_way)
1074 sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
1075 sync += r1.forward_valid1.eq(1)
1076 with m.Else():
with m.If(r1.dcbz):
1078 sync += r1.forward_data1.eq(0)
1079 with m.Else():
1080 sync += r1.forward_data1.eq(wb_in.dat)
1081 sync += r1.forward_sel1.eq(~0) # all 1s
1082 sync += r1.forward_way1.eq(replace_way)
1083 sync += r1.forward_row1.eq(r1.store_row)
1084 sync += r1.forward_valid1.eq(0)
1085
1086 # One cycle pulses reset
1087 sync += r1.slow_valid.eq(0)
1088 sync += r1.write_bram.eq(0)
1089 sync += r1.inc_acks.eq(0)
1090 sync += r1.dec_acks.eq(0)
1091
1092 sync += r1.ls_valid.eq(0)
1093 # complete tlbies and TLB loads in the third cycle
1094 sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
1095
1096 with m.If((req_op == Op.OP_LOAD_HIT)
1097 | (req_op == Op.OP_STCX_FAIL)):
1098 with m.If(~r0.mmu_req):
1099 sync += r1.ls_valid.eq(1)
1100 with m.Else():
1101 sync += r1.mmu_done.eq(1)
1102
1103 with m.If(r1.write_tag):
1104 # Store new tag in selected way
1105 for i in range(NUM_WAYS):
1106 with m.If(i == replace_way):
1107 idx = r1.store_index
1108 trange = range(i * TAG_WIDTH, (i+1) * TAG_WIDTH)
1109 sync += cache_tag[idx][trange].eq(r1.reload_tag)
1110 sync += r1.store_way.eq(replace_way)
1111 sync += r1.write_tag.eq(0)
1112
1113 # Take request from r1.req if there is one there,
1114 # else from req_op, ra, etc.
with m.If(r1.full):
1116 comb += req.eq(r1.req)
1117 with m.Else():
1118 comb += req.op.eq(req_op)
1119 comb += req.valid.eq(req_go)
1120 comb += req.mmu_req.eq(r0.mmu_req)
1121 comb += req.dcbz.eq(r0.req.dcbz)
1122 comb += req.real_addr.eq(ra)
1123
1124 with m.If(~r0.req.dcbz):
1125 comb += req.data.eq(r0.req.data)
1126 with m.Else():
1127 comb += req.data.eq(0)
1128
1129 # Select all bytes for dcbz
1130 # and for cacheable loads
with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
1132 comb += req.byte_sel.eq(~0) # all 1s
1133 with m.Else():
1134 comb += req.byte_sel.eq(r0.req.byte_sel)
1135 comb += req.hit_way.eq(req_hit_way)
1136 comb += req.same_tag.eq(req_same_tag)
1137
1138 # Store the incoming request from r0,
1139 # if it is a slow request
1140 # Note that r1.full = 1 implies req_op = OP_NONE
1141 with m.If((req_op == Op.OP_LOAD_MISS)
1142 | (req_op == Op.OP_LOAD_NC)
1143 | (req_op == Op.OP_STORE_MISS)
1144 | (req_op == Op.OP_STORE_HIT)):
sync += r1.req.eq(req)
1146 sync += r1.full.eq(1)
1147
1148 # Main state machine
1149 with m.Switch(r1.state):
1150
with m.Case(State.IDLE):
# r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0) in the
# VHDL: truncate the real address to the wishbone address width
sync += r1.wb.adr.eq(req.real_addr[0:len(r1.wb.adr)])
1157 sync += r1.wb.sel.eq(req.byte_sel)
1158 sync += r1.wb.dat.eq(req.data)
1159 sync += r1.dcbz.eq(req.dcbz)
1160
1161 # Keep track of our index and way
1162 # for subsequent stores.
1163 sync += r1.store_index.eq(get_index(req.real_addr))
1164 sync += r1.store_row.eq(get_row(req.real_addr))
1165 sync += r1.end_row_ix.eq(
1166 get_row_of_line(get_row(req.real_addr))
1167 )
1168 sync += r1.reload_tag.eq(get_tag(req.real_addr))
1169 sync += r1.req.same_tag.eq(1)
1170
1171 with m.If(req.op == Op.OP_STORE_HIT):
1172 sync += r1.store_way.eq(req.hit_way)
1173
1174 # Reset per-row valid bits,
1175 # ready for handling OP_LOAD_MISS
1176 for i in range(ROW_PER_LINE):
1177 sync += r1.rows_valid[i].eq(0)
1178
1179 with m.Switch(req.op):
1180 with m.Case(Op.OP_LOAD_HIT):
1181 # stay in IDLE state
1182 pass
1183
1184 with m.Case(Op.OP_LOAD_MISS):
1185 #Display(f"cache miss real addr:" \
1186 # f"{req_real_addr}" \
1187 # f" idx:{get_index(req_real_addr)}" \
1188 # f" tag:{get_tag(req.real_addr)}")
1189 pass
1190
1191 # Start the wishbone cycle
1192 sync += r1.wb.we.eq(0)
1193 sync += r1.wb.cyc.eq(1)
1194 sync += r1.wb.stb.eq(1)
1195
1196 # Track that we had one request sent
1197 sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1198 sync += r1.write_tag.eq(1)
1199
1200 with m.Case(Op.OP_LOAD_NC):
1201 sync += r1.wb.cyc.eq(1)
1202 sync += r1.wb.stb.eq(1)
1203 sync += r1.wb.we.eq(0)
1204 sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)
1205
1206 with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
with m.If(~req.dcbz):
1208 sync += r1.state.eq(State.STORE_WAIT_ACK)
1209 sync += r1.acks_pending.eq(1)
1210 sync += r1.full.eq(0)
1211 sync += r1.slow_valid.eq(1)
1212
1213 with m.If(~req.mmu_req):
1214 sync += r1.ls_valid.eq(1)
1215 with m.Else():
1216 sync += r1.mmu_done.eq(1)
1217
1218 with m.If(req.op == Op.OP_STORE_HIT):
1219 sync += r1.write_bram.eq(1)
1220 with m.Else():
sync += r1.state.eq(State.RELOAD_WAIT_ACK)
1222
1223 with m.If(req.op == Op.OP_STORE_MISS):
1224 sync += r1.write_tag.eq(1)
1225
1226 sync += r1.wb.we.eq(1)
1227 sync += r1.wb.cyc.eq(1)
1228 sync += r1.wb.stb.eq(1)
1229
1230 # OP_NONE and OP_BAD do nothing
1231 # OP_BAD & OP_STCX_FAIL were
1232 # handled above already
1233 with m.Case(Op.OP_NONE):
1234 pass
with m.Case(Op.OP_BAD):
pass
with m.Case(Op.OP_STCX_FAIL):
pass
1239
1240 with m.Case(State.RELOAD_WAIT_ACK):
1241 # Requests are all sent if stb is 0
1242 comb += stbs_done.eq(~r1.wb.stb)
1243
1244 with m.If(~wb_in.stall & ~stbs_done):
1245 # That was the last word?
1246 # We are done sending.
1247 # Clear stb and set stbs_done
1248 # so we can handle an eventual
1249 # last ack on the same cycle.
1250 with m.If(is_last_row_addr(
1251 r1.wb.adr, r1.end_row_ix)):
1252 sync += r1.wb.stb.eq(0)
1253 comb += stbs_done.eq(0)
1254
1255 # Calculate the next row address
1256 sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))
1257
1258 # Incoming acks processing
1259 sync += r1.forward_valid1.eq(wb_in.ack)
1260 with m.If(wb_in.ack):
1261 # XXX needs an Array bit-accessor here
1262 sync += r1.rows_valid[r1.store_row % ROW_PER_LINE].eq(1)
1263
1264 # If this is the data we were looking for,
1265 # we can complete the request next cycle.
1266 # Compare the whole address in case the
1267 # request in r1.req is not the one that
1268 # started this refill.
1269 with m.If(r1.full & r1.req.same_tag &
1270 ((r1.dcbz & r1.req.dcbz) |
1271 (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
1272 (r1.store_row == get_row(r1.req.real_addr))):
1273 sync += r1.full.eq(0)
1274 sync += r1.slow_valid.eq(1)
1275 with m.If(~r1.mmu_req):
1276 sync += r1.ls_valid.eq(1)
1277 with m.Else():
1278 sync += r1.mmu_done.eq(1)
1279 sync += r1.forward_sel.eq(~0) # all 1s
1280 sync += r1.use_forward1.eq(1)
1281
1282 # Check for completion
1283 with m.If(stbs_done & is_last_row(r1.store_row,
1284 r1.end_row_ix)):
1285 # Complete wishbone cycle
1286 sync += r1.wb.cyc.eq(0)
1287
1288 # Cache line is now valid
1289 cv = cache_valid_bits[r1.store_index]
1290 sync += cv[r1.store_way].eq(1)
1291 sync += r1.state.eq(State.IDLE)
1292
1293 # Increment store row counter
1294 sync += r1.store_row.eq(next_row(r1.store_row))
1295
1296 with m.Case(State.STORE_WAIT_ACK):
1297 comb += stbs_done.eq(~r1.wb.stb)
1298 comb += acks.eq(r1.acks_pending)
1299
1300 with m.If(r1.inc_acks != r1.dec_acks):
1301 with m.If(r1.inc_acks):
1302 comb += adjust_acks.eq(acks + 1)
1303 with m.Else():
1304 comb += adjust_acks.eq(acks - 1)
1305 with m.Else():
1306 comb += adjust_acks.eq(acks)
1307
1308 sync += r1.acks_pending.eq(adjust_acks)
1309
1310 # Clear stb when slave accepted request
1311 with m.If(~wb_in.stall):
1312 # See if there is another store waiting
1313 # to be done which is in the same real page.
1314 with m.If(req.valid):
1315 ra = req.real_addr[0:SET_SIZE_BITS]
1316 sync += r1.wb.adr[0:SET_SIZE_BITS].eq(ra)
1317 sync += r1.wb.dat.eq(req.data)
1318 sync += r1.wb.sel.eq(req.byte_sel)
1319
with m.Elif((adjust_acks < 7) & req.same_tag &
((req.op == Op.OP_STORE_MISS)
| (req.op == Op.OP_STORE_HIT))):
1323 sync += r1.wb.stb.eq(1)
1324 comb += stbs_done.eq(0)
1325
1326 with m.If(req.op == Op.OP_STORE_HIT):
1327 sync += r1.write_bram.eq(1)
1328 sync += r1.full.eq(0)
1329 sync += r1.slow_valid.eq(1)
1330
1331 # Store requests never come from the MMU
1332 sync += r1.ls_valid.eq(1)
1333 comb += stbs_done.eq(0)
1334 sync += r1.inc_acks.eq(1)
1335 with m.Else():
1336 sync += r1.wb.stb.eq(0)
1337 comb += stbs_done.eq(1)
1338
1339 # Got ack ? See if complete.
1340 with m.If(wb_in.ack):
with m.If(stbs_done & (adjust_acks == 1)):
1342 sync += r1.state.eq(State.IDLE)
1343 sync += r1.wb.cyc.eq(0)
1344 sync += r1.wb.stb.eq(0)
1345 sync += r1.dec_acks.eq(1)
1346
1347 with m.Case(State.NC_LOAD_WAIT_ACK):
1348 # Clear stb when slave accepted request
1349 with m.If(~wb_in.stall):
1350 sync += r1.wb.stb.eq(0)
1351
1352 # Got ack ? complete.
1353 with m.If(wb_in.ack):
1354 sync += r1.state.eq(State.IDLE)
1355 sync += r1.full.eq(0)
1356 sync += r1.slow_valid.eq(1)
1357
1358 with m.If(~r1.mmu_req):
1359 sync += r1.ls_valid.eq(1)
1360 with m.Else():
1361 sync += r1.mmu_done.eq(1)
1362
1363 sync += r1.forward_sel.eq(~0) # all 1s
1364 sync += r1.use_forward1.eq(1)
1365 sync += r1.wb.cyc.eq(0)
1366 sync += r1.wb.stb.eq(0)
1367
def dcache_log(self, m, r1, valid_ra, tlb_hit_way, stall_out,
req_op, d_out, wb_in, log_out):
1370
1371 sync = m.d.sync
1372
1373 sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit_way[:3],
1374 stall_out, req_op[:3], d_out.valid, d_out.error,
1375 r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
1376 r1.wb.adr[3:6]))
1377
def elaborate(self, platform):
m = Module()
comb = m.d.comb
sync = m.d.sync

1380 # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
1381 cache_tags = CacheTagArray()
1382 cache_tag_set = Signal(TAG_RAM_WIDTH)
1383 cache_valid_bits = CacheValidBitsArray()
1384
1385 # TODO attribute ram_style : string;
1386 # TODO attribute ram_style of cache_tags : signal is "distributed";
1387
1388 """note: these are passed to nmigen.hdl.Memory as "attributes".
1389 don't know how, just that they are.
1390 """
1391 dtlb_valid_bits = TLBValidBitsArray()
1392 dtlb_tags = TLBTagsArray()
1393 dtlb_ptes = TLBPtesArray()
1394 # TODO attribute ram_style of
1395 # dtlb_tags : signal is "distributed";
1396 # TODO attribute ram_style of
1397 # dtlb_ptes : signal is "distributed";
1398
1399 r0 = RegStage0()
1400 r0_full = Signal()
1401
1402 r1 = RegStage1()
1403
1404 reservation = Reservation()
1405
1406 # Async signals on incoming request
req_index = Signal(INDEX_BITS)
req_row = Signal(ROW_BITS)
req_hit_way = Signal(WAY_BITS)
req_tag = Signal(TAG_BITS)
req_op = Signal(Op)
1412 req_data = Signal(64)
1413 req_same_tag = Signal()
1414 req_go = Signal()
1415
early_req_row = Signal(ROW_BITS)
1417
1418 cancel_store = Signal()
1419 set_rsrv = Signal()
1420 clear_rsrv = Signal()
1421
1422 r0_valid = Signal()
1423 r0_stall = Signal()
1424
1425 use_forward1_next = Signal()
1426 use_forward2_next = Signal()
1427
1428 cache_out = CacheRamOut()
1429
1430 plru_victim = PLRUOut()
1431 replace_way = Signal(WAY_BITS)
1432
1433 # Wishbone read/write/cache write formatting signals
1434 bus_sel = Signal(8)
1435
1436 # TLB signals
tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
tlb_valid_way = Signal(TLB_NUM_WAYS)
tlb_req_index = Signal(TLB_SET_BITS)
tlb_hit = Signal()
tlb_hit_way = Signal(TLB_WAY_BITS)
1443 pte = Signal(TLB_PTE_BITS)
1444 ra = Signal(REAL_ADDR_BITS)
1445 valid_ra = Signal()
1446 perm_attr = PermAttr()
1447 rc_ok = Signal()
1448 perm_ok = Signal()
1449 access_ok = Signal()
1450
1451 tlb_plru_victim = TLBPLRUOut()
1452
1453 # we don't yet handle collisions between loadstore1 requests
1454 # and MMU requests
comb += self.m_out.stall.eq(0)
1456
1457 # Hold off the request in r0 when r1 has an uncompleted request
comb += r0_stall.eq(r0_full & r1.full)
comb += r0_valid.eq(r0_full & ~r1.full)
comb += self.stall_out.eq(r0_stall)
1461
1462 # Wire up wishbone request latch out of stage 1
comb += self.wb_out.eq(r1.wb)

return m
1464
1465
1466
1467 # dcache_tb.vhdl
1468 #
1469 # entity dcache_tb is
1470 # end dcache_tb;
1471 #
1472 # architecture behave of dcache_tb is
1473 # signal clk : std_ulogic;
1474 # signal rst : std_ulogic;
1475 #
1476 # signal d_in : Loadstore1ToDcacheType;
1477 # signal d_out : DcacheToLoadstore1Type;
1478 #
1479 # signal m_in : MmuToDcacheType;
1480 # signal m_out : DcacheToMmuType;
1481 #
1482 # signal wb_bram_in : wishbone_master_out;
1483 # signal wb_bram_out : wishbone_slave_out;
1484 #
1485 # constant clk_period : time := 10 ns;
1486 # begin
1487 # dcache0: entity work.dcache
1488 # generic map(
1489 #
1490 # LINE_SIZE => 64,
1491 # NUM_LINES => 4
1492 # )
1493 # port map(
1494 # clk => clk,
1495 # rst => rst,
1496 # d_in => d_in,
1497 # d_out => d_out,
1498 # m_in => m_in,
1499 # m_out => m_out,
1500 # wishbone_out => wb_bram_in,
1501 # wishbone_in => wb_bram_out
1502 # );
1503 #
1504 # -- BRAM Memory slave
1505 # bram0: entity work.wishbone_bram_wrapper
1506 # generic map(
1507 # MEMORY_SIZE => 1024,
1508 # RAM_INIT_FILE => "icache_test.bin"
1509 # )
1510 # port map(
1511 # clk => clk,
1512 # rst => rst,
1513 # wishbone_in => wb_bram_in,
1514 # wishbone_out => wb_bram_out
1515 # );
1516 #
1517 # clk_process: process
1518 # begin
1519 # clk <= '0';
1520 # wait for clk_period/2;
1521 # clk <= '1';
1522 # wait for clk_period/2;
1523 # end process;
1524 #
1525 # rst_process: process
1526 # begin
1527 # rst <= '1';
1528 # wait for 2*clk_period;
1529 # rst <= '0';
1530 # wait;
1531 # end process;
1532 #
1533 # stim: process
1534 # begin
1535 # -- Clear stuff
1536 # d_in.valid <= '0';
1537 # d_in.load <= '0';
1538 # d_in.nc <= '0';
1539 # d_in.addr <= (others => '0');
1540 # d_in.data <= (others => '0');
1541 # m_in.valid <= '0';
1542 # m_in.addr <= (others => '0');
1543 # m_in.pte <= (others => '0');
1544 #
1545 # wait for 4*clk_period;
1546 # wait until rising_edge(clk);
1547 #
1548 # -- Cacheable read of address 4
1549 # d_in.load <= '1';
1550 # d_in.nc <= '0';
1551 # d_in.addr <= x"0000000000000004";
1552 # d_in.valid <= '1';
1553 # wait until rising_edge(clk);
1554 # d_in.valid <= '0';
1555 #
1556 # wait until rising_edge(clk) and d_out.valid = '1';
1557 # assert d_out.data = x"0000000100000000"
1558 # report "data @" & to_hstring(d_in.addr) &
1559 # "=" & to_hstring(d_out.data) &
1560 # " expected 0000000100000000"
1561 # severity failure;
1562 # -- wait for clk_period;
1563 #
1564 # -- Cacheable read of address 30
1565 # d_in.load <= '1';
1566 # d_in.nc <= '0';
1567 # d_in.addr <= x"0000000000000030";
1568 # d_in.valid <= '1';
1569 # wait until rising_edge(clk);
1570 # d_in.valid <= '0';
1571 #
1572 # wait until rising_edge(clk) and d_out.valid = '1';
1573 # assert d_out.data = x"0000000D0000000C"
1574 # report "data @" & to_hstring(d_in.addr) &
1575 # "=" & to_hstring(d_out.data) &
1576 # " expected 0000000D0000000C"
1577 # severity failure;
1578 #
1579 # -- Non-cacheable read of address 100
1580 # d_in.load <= '1';
1581 # d_in.nc <= '1';
1582 # d_in.addr <= x"0000000000000100";
1583 # d_in.valid <= '1';
1584 # wait until rising_edge(clk);
1585 # d_in.valid <= '0';
1586 # wait until rising_edge(clk) and d_out.valid = '1';
1587 # assert d_out.data = x"0000004100000040"
1588 # report "data @" & to_hstring(d_in.addr) &
1589 # "=" & to_hstring(d_out.data) &
1590 # " expected 0000004100000040"
1591 # severity failure;
1592 #
1593 # wait until rising_edge(clk);
1594 # wait until rising_edge(clk);
1595 # wait until rising_edge(clk);
1596 # wait until rising_edge(clk);
1597 #
1598 # std.env.finish;
1599 # end process;
1600 # end;
1601 def dcache_sim(dut):
1602 # clear stuff
1603 yield dut.d_in.valid.eq(0)
1604 yield dut.d_in.load.eq(0)
1605 yield dut.d_in.nc.eq(0)
yield dut.d_in.addr.eq(0)
1607 yield dut.d_in.data.eq(0)
1608 yield dut.m_in.valid.eq(0)
1609 yield dut.m_in.addr.eq(0)
1610 yield dut.m_in.pte.eq(0)
1611 # wait 4 * clk_period
1612 yield
1613 yield
1614 yield
1615 yield
1616 # wait_until rising_edge(clk)
1617 yield
1618 # Cacheable read of address 4
1619 yield dut.d_in.load.eq(1)
1620 yield dut.d_in.nc.eq(0)
1621 yield dut.d_in.addr.eq(Const(0x0000000000000004, 64))
1622 yield dut.d_in.valid.eq(1)
1623 # wait-until rising_edge(clk)
1624 yield
1625 yield dut.d_in.valid.eq(0)
1626 yield
1627 while not (yield dut.d_out.valid):
1628 yield
data = yield dut.d_out.data
assert data == 0x0000000100000000, \
f"data {data:#x} expected 0x0000000100000000 -!- severity failure"
1632
1633
1634 # Cacheable read of address 30
1635 yield dut.d_in.load.eq(1)
1636 yield dut.d_in.nc.eq(0)
1637 yield dut.d_in.addr.eq(Const(0x0000000000000030, 64))
1638 yield dut.d_in.valid.eq(1)
1639 yield
1640 yield dut.d_in.valid.eq(0)
1641 yield
1642 while not (yield dut.d_out.valid):
1643 yield
data = yield dut.d_out.data
assert data == 0x0000000D0000000C, \
f"data {data:#x} expected 0x0000000D0000000C -!- severity failure"
1647
1648 # Non-cacheable read of address 100
1649 yield dut.d_in.load.eq(1)
1650 yield dut.d_in.nc.eq(1)
1651 yield dut.d_in.addr.eq(Const(0x0000000000000100, 64))
1652 yield dut.d_in.valid.eq(1)
1653 yield
1654 yield dut.d_in.valid.eq(0)
1655 yield
1656 while not (yield dut.d_out.valid):
1657 yield
data = yield dut.d_out.data
assert data == 0x0000004100000040, \
f"data {data:#x} expected 0x0000004100000040 -!- severity failure"
1661
1662 yield
1663 yield
1664 yield
1665 yield
1666
1667
1668 def test_dcache():
1669 dut = DCache()
1670 vl = rtlil.convert(dut, ports=[])
1671 with open("test_dcache.il", "w") as f:
1672 f.write(vl)
1673
run_simulation(dut, dcache_sim(dut), vcd_name='test_dcache.vcd')
1675
1676 if __name__ == '__main__':
1677 test_dcache()
1678