1 """DCache
2
3 based on Anton Blanchard microwatt dcache.vhdl
4
5 note that the microwatt dcache wishbone interface expects "stall".
6 for simplicity at the moment this is hard-coded to cyc & ~ack.
7 see WB4 spec, p84, section 5.2.1
8
9 IMPORTANT: for store, the data is sampled the cycle AFTER the "valid"
10 is raised. sigh
11
12 Links:
13
14 * https://libre-soc.org/3d_gpu/architecture/set_associative_cache.jpg
15 * https://bugs.libre-soc.org/show_bug.cgi?id=469
16
17 """

import sys

from nmutil.gtkw import write_gtkw

sys.setrecursionlimit(1000000)

from enum import Enum, unique

from nmigen import (Module, Signal, Elaboratable, Cat, Repl, Array, Const,
                    Record, Memory)
from nmutil.util import Display
from nmigen.lib.coding import Decoder

from copy import deepcopy
from random import randint, seed

from nmigen_soc.wishbone.bus import Interface

from nmigen.cli import main
from nmutil.iocontrol import RecordObject
from nmigen.utils import log2_int
from soc.experiment.mem_types import (LoadStore1ToDCacheType,
                                      DCacheToLoadStore1Type,
                                      MMUToDCacheType,
                                      DCacheToMMUType)

from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS, WB_SEL_BITS,
                                     WBAddrType, WBDataType, WBSelType,
                                     WBMasterOut, WBSlaveOut,
                                     WBMasterOutVector, WBSlaveOutVector,
                                     WBIOMasterOut, WBIOSlaveOut)

from soc.experiment.cache_ram import CacheRam
#from soc.experiment.plru import PLRU
from nmutil.plru import PLRU, PLRUs

# for test
from soc.bus.sram import SRAM
from nmigen.cli import rtlil

# NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
# Also, check out the cxxsim nmigen branch, and latest yosys from git
from nmutil.sim_tmp_alternative import Simulator

from nmutil.util import wrap


# TODO: make these parameters of DCache at some point
LINE_SIZE = 64    # Line size in bytes
NUM_LINES = 16    # Number of lines in a set
NUM_WAYS = 4      # Number of ways
TLB_SET_SIZE = 64 # L1 DTLB entries per set
TLB_NUM_WAYS = 2  # L1 DTLB number of sets
TLB_LG_PGSZ = 12  # L1 DTLB log_2(page_size)
LOG_LENGTH = 0    # Non-zero to enable log data collection

# BRAM organisation: We never access more than
#     WB_DATA_BITS at a time so to save
#     resources we make the array only that wide, and
#     use consecutive indices to make a cache "line"
#
#     ROW_SIZE is the width in bytes of the BRAM
#     (based on WB, so 64-bits)
ROW_SIZE = WB_DATA_BITS // 8

# ROW_PER_LINE is the number of rows (wishbone
# transactions) in a line
ROW_PER_LINE = LINE_SIZE // ROW_SIZE

# BRAM_ROWS is the number of rows in BRAM needed
# to represent the full dcache
BRAM_ROWS = NUM_LINES * ROW_PER_LINE

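# Worked example with the defaults above (a sanity check, assuming
# WB_DATA_BITS=64): ROW_SIZE = 64//8 = 8 bytes, so one wishbone transfer
# fills exactly one BRAM row; ROW_PER_LINE = 64//8 = 8 transfers per
# cache line; BRAM_ROWS = 16*8 = 128 rows for the whole dcache.
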
print ("ROW_SIZE", ROW_SIZE)
print ("ROW_PER_LINE", ROW_PER_LINE)
print ("BRAM_ROWS", BRAM_ROWS)
print ("NUM_WAYS", NUM_WAYS)

# Bit fields counts in the address

# REAL_ADDR_BITS is the number of real address
# bits that we store
REAL_ADDR_BITS = 56

# ROW_BITS is the number of bits to select a row
ROW_BITS = log2_int(BRAM_ROWS)

# ROW_LINE_BITS is the number of bits to select
# a row within a line
ROW_LINE_BITS = log2_int(ROW_PER_LINE)

# LINE_OFF_BITS is the number of bits for
# the offset in a cache line
LINE_OFF_BITS = log2_int(LINE_SIZE)

# ROW_OFF_BITS is the number of bits for
# the offset in a row
ROW_OFF_BITS = log2_int(ROW_SIZE)

# INDEX_BITS is the number of bits to
# select a cache line
INDEX_BITS = log2_int(NUM_LINES)

# SET_SIZE_BITS is the log base 2 of the set size
SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

# TAG_BITS is the number of bits of
# the tag part of the address
TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

# TAG_WIDTH is the width in bits of each way of the tag RAM
TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

# WAY_BITS is the number of bits to select a way
WAY_BITS = log2_int(NUM_WAYS)

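# With the defaults (REAL_ADDR_BITS=56, NUM_LINES=16, LINE_SIZE=64,
# NUM_WAYS=4) these work out as: ROW_BITS=7, ROW_LINE_BITS=3,
# LINE_OFF_BITS=6, ROW_OFF_BITS=3, INDEX_BITS=4, SET_SIZE_BITS=10,
# TAG_BITS=46, TAG_WIDTH=48 (46 rounded up to a whole number of bytes)
# and WAY_BITS=2.  The diagram below instead shows 32 lines, hence
# INDEX_BITS=5 and TAG_BITS=45 there.
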
# Example of layout for 32 lines of 64 bytes:
layout = """\
  ..  tag    |index|  line  |
  ..         |   row   |    |
  ..         |     |---|    | ROW_LINE_BITS  (3)
  ..         |     |--- - --| LINE_OFF_BITS (6)
  ..         |         |- --| ROW_OFF_BITS  (3)
  ..         |----- ---|    | ROW_BITS      (8)
  ..         |-----|        | INDEX_BITS    (5)
  ..  --------|             | TAG_BITS      (45)
"""
print (layout)
print ("Dcache TAG %d IDX %d ROW_BITS %d ROFF %d LOFF %d RLB %d" % \
            (TAG_BITS, INDEX_BITS, ROW_BITS,
             ROW_OFF_BITS, LINE_OFF_BITS, ROW_LINE_BITS))
print ("index @: %d-%d" % (LINE_OFF_BITS, SET_SIZE_BITS))
print ("row @: %d-%d" % (ROW_OFF_BITS, SET_SIZE_BITS))
print ("tag @: %d-%d width %d" % (SET_SIZE_BITS, REAL_ADDR_BITS, TAG_WIDTH))

TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS

print ("TAG_RAM_WIDTH", TAG_RAM_WIDTH)
print ("    TAG_WIDTH", TAG_WIDTH)
print ("     NUM_WAYS", NUM_WAYS)

def CacheTagArray():
    tag_layout = [('valid', NUM_WAYS), # one valid bit per way
                  ('tag', TAG_RAM_WIDTH),
                 ]
    return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))

def RowPerLineValidArray():
    return Array(Signal(name="rows_valid%d" % x) \
                        for x in range(ROW_PER_LINE))

# L1 TLB
TLB_SET_BITS = log2_int(TLB_SET_SIZE)
TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
TLB_PTE_BITS = 64
TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS

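# With the defaults: TLB_SET_BITS=6, TLB_WAY_BITS=1,
# TLB_EA_TAG_BITS = 64-(12+6) = 46 bits of effective-address tag,
# TLB_TAG_WAY_BITS = 2*46 = 92 and TLB_PTE_WAY_BITS = 2*64 = 128
# (all ways of one TLB set, concatenated side-by-side).
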
def ispow2(x):
    return (1<<log2_int(x, False)) == x

assert (LINE_SIZE % ROW_SIZE) == 0, "LINE_SIZE not multiple of ROW_SIZE"
assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS), "geometry bits don't add up"
assert (LINE_OFF_BITS == ROW_OFF_BITS + ROW_LINE_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
        "geometry bits don't add up"
assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
        "geometry bits don't add up"
assert 64 == WB_DATA_BITS, "Can't yet handle wb width that isn't 64-bits"
assert SET_SIZE_BITS <= TLB_LG_PGSZ, "Set indexed by virtual address"


def TLBHit(name):
    return Record([('valid', 1),
                   ('way', TLB_WAY_BITS)], name=name)

def TLBTagEAArray():
    return Array(Signal(TLB_EA_TAG_BITS, name="tlbtagea%d" % x) \
                 for x in range (TLB_NUM_WAYS))

def TLBRecord(name):
    tlb_layout = [('valid', TLB_NUM_WAYS),
                  ('tag', TLB_TAG_WAY_BITS),
                  ('pte', TLB_PTE_WAY_BITS)
                 ]
    return Record(tlb_layout, name=name)

def TLBValidArray():
    return Array(Signal(TLB_NUM_WAYS, name="tlb_valid%d" % x)
                 for x in range(TLB_SET_SIZE))

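# e.g. with the defaults, TLBValidArray is 64 entries (one per TLB set)
# of 2 bits each: one valid bit per TLB way.
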
def HitWaySet():
    return Array(Signal(WAY_BITS, name="hitway_%d" % x) \
                 for x in range(TLB_NUM_WAYS))

# Cache RAM interface
def CacheRamOut():
    return Array(Signal(WB_DATA_BITS, name="cache_out%d" % x) \
                 for x in range(NUM_WAYS))

# PLRU output interface
def PLRUOut():
    return Array(Signal(WAY_BITS, name="plru_out%d" % x) \
                 for x in range(NUM_LINES))

# TLB PLRU output interface
def TLBPLRUOut():
    return Array(Signal(TLB_WAY_BITS, name="tlbplru_out%d" % x) \
                 for x in range(TLB_SET_SIZE))

# Helper functions to decode incoming requests
#
# Return the cache line index (tag index) for an address
def get_index(addr):
    return addr[LINE_OFF_BITS:SET_SIZE_BITS]

# Return the cache row index (data memory) for an address
def get_row(addr):
    return addr[ROW_OFF_BITS:SET_SIZE_BITS]

# Return the index of a row within a line
def get_row_of_line(row):
    return row[:ROW_BITS][:ROW_LINE_BITS]

# Returns whether this is the last row of a line
def is_last_row_addr(addr, last):
    return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

# Returns whether this is the last row of a line
def is_last_row(row, last):
    return get_row_of_line(row) == last

# Return the next row in the current cache line. We use a
# dedicated function in order to limit the size of the
# generated adder to be only the bits within a cache line
# (3 bits with default settings)
def next_row(row):
    row_v = row[0:ROW_LINE_BITS] + 1
    return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])

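# e.g. with ROW_LINE_BITS=3 only the low 3 bits go through the adder,
# so next_row(0b0101_111) wraps within the line to 0b0101_000.
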
# Get the tag value from the address
def get_tag(addr):
    return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

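# Decode example with the default geometry (the address is illustrative):
# for addr = 0x12345678, get_index(addr) = addr[6:10] = 9 (line 9),
# get_row(addr) = addr[3:10] = 0x4f (BRAM row within the set), and
# get_tag(addr) = addr[10:56] = 0x48d15.
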
# Read a tag from a tag memory row
def read_tag(way, tagset):
    return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]

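# e.g. with TAG_WIDTH=48 and TAG_BITS=46, way 1 occupies tagset bits
# [48:96) and the top two bits of each way are padding, hence the
# [:TAG_BITS] slice above.
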
# Read a TLB tag from a TLB tag memory row
def read_tlb_tag(way, tags):
    return tags.word_select(way, TLB_EA_TAG_BITS)

# Write a TLB tag to a TLB tag memory row
def write_tlb_tag(way, tags, tag):
    return read_tlb_tag(way, tags).eq(tag)

# Read a PTE from a TLB PTE memory row
def read_tlb_pte(way, ptes):
    return ptes.word_select(way, TLB_PTE_BITS)

def write_tlb_pte(way, ptes, newpte):
    return read_tlb_pte(way, ptes).eq(newpte)

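# note that the write helpers return nmigen assignments, so callers use
# them as e.g. "comb += write_tlb_pte(way, ptes, newpte)" below.
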

# Record for storing permission, attribute, etc. bits from a PTE
class PermAttr(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.reference = Signal()
        self.changed = Signal()
        self.nocache = Signal()
        self.priv = Signal()
        self.rd_perm = Signal()
        self.wr_perm = Signal()


# (currently an unused stub: tlb_search() extracts the PTE bits directly)
def extract_perm_attr(pte):
    pa = PermAttr()
    return pa


# Type of operation on a "valid" input
@unique
class Op(Enum):
    OP_NONE       = 0
    OP_BAD        = 1 # NC cache hit, TLB miss, prot/RC failure
    OP_STCX_FAIL  = 2 # conditional store w/o reservation
    OP_LOAD_HIT   = 3 # Cache hit on load
    OP_LOAD_MISS  = 4 # Load missing cache
    OP_LOAD_NC    = 5 # Non-cachable load
    OP_STORE_HIT  = 6 # Store hitting cache
    OP_STORE_MISS = 7 # Store missing cache


# Cache state machine
@unique
class State(Enum):
    IDLE             = 0 # Normal load hit processing
    RELOAD_WAIT_ACK  = 1 # Cache reload wait ack
    STORE_WAIT_ACK   = 2 # Store wait ack
    NC_LOAD_WAIT_ACK = 3 # Non-cachable load wait ack


# Dcache operations:
#
# In order to make timing, we use the BRAMs with
# an output buffer, which means that the BRAM
# output is delayed by an extra cycle.
#
# Thus, the dcache has a 2-stage internal pipeline
# for cache hits with no stalls.
#
# All other operations are handled via stalling
# in the first stage.
#
# The second stage can thus complete a hit at the same
# time as the first stage emits a stall for a complex op.
#
# Stage 0 register, basically contains just the latched request

class RegStage0(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.req = LoadStore1ToDCacheType(name="lsmem")
        self.tlbie = Signal()   # indicates a tlbie request (from MMU)
        self.doall = Signal()   # with tlbie, indicates flush whole TLB
        self.tlbld = Signal()   # indicates a TLB load request (from MMU)
        self.mmu_req = Signal() # indicates source of request
        self.d_valid = Signal() # indicates req.data is valid now


class MemAccessRequest(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.op = Signal(Op)
        self.valid = Signal()
        self.dcbz = Signal()
        self.real_addr = Signal(REAL_ADDR_BITS)
        self.data = Signal(64)
        self.byte_sel = Signal(8)
        self.hit_way = Signal(WAY_BITS)
        self.same_tag = Signal()
        self.mmu_req = Signal()


# First stage register, contains state for stage 1 of load hits
# and for the state machine used by all other operations
class RegStage1(RecordObject):
    def __init__(self, name=None):
        super().__init__(name=name)
        # Info about the request
        self.full = Signal()    # have uncompleted request
        self.mmu_req = Signal() # request is from MMU
        self.req = MemAccessRequest(name="reqmem")

        # Cache hit state
        self.hit_way = Signal(WAY_BITS)
        self.hit_load_valid = Signal()
        self.hit_index = Signal(INDEX_BITS)
        self.cache_hit = Signal()

        # TLB hit state
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_hit_index = Signal(TLB_SET_BITS)

        # 2-stage data buffer for data forwarded from writes to reads
        self.forward_data1 = Signal(64)
        self.forward_data2 = Signal(64)
        self.forward_sel1 = Signal(8)
        self.forward_valid1 = Signal()
        self.forward_way1 = Signal(WAY_BITS)
        self.forward_row1 = Signal(ROW_BITS)
        self.use_forward1 = Signal()
        self.forward_sel = Signal(8)

        # Cache miss state (reload state machine)
        self.state = Signal(State)
        self.dcbz = Signal()
        self.write_bram = Signal()
        self.write_tag = Signal()
        self.slow_valid = Signal()
        self.wb = WBMasterOut("wb")
        self.reload_tag = Signal(TAG_BITS)
        self.store_way = Signal(WAY_BITS)
        self.store_row = Signal(ROW_BITS)
        self.store_index = Signal(INDEX_BITS)
        self.end_row_ix = Signal(ROW_LINE_BITS)
        self.rows_valid = RowPerLineValidArray()
        self.acks_pending = Signal(3)
        self.inc_acks = Signal()
        self.dec_acks = Signal()

        # Signals to complete (possibly with error)
        self.ls_valid = Signal()
        self.ls_error = Signal()
        self.mmu_done = Signal()
        self.mmu_error = Signal()
        self.cache_paradox = Signal()

        # Signal to complete a failed stcx.
        self.stcx_fail = Signal()


# Reservation information
class Reservation(RecordObject):
    def __init__(self):
        super().__init__()
        self.valid = Signal()
        self.addr = Signal(64-LINE_OFF_BITS)

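# note: the reservation is tracked at cache-line granularity, i.e.
# addr holds bits [LINE_OFF_BITS:64] of the effective address.
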

class DTLBUpdate(Elaboratable):
    def __init__(self):
        self.dtlb = TLBValidArray()
        self.tlbie = Signal()
        self.tlbwe = Signal()
        self.doall = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.tlb_req_index = Signal(TLB_SET_BITS)

        self.tlb_tag_way = Signal(TLB_TAG_WAY_BITS)
        self.tlb_pte_way = Signal(TLB_PTE_WAY_BITS)
        self.repl_way = Signal(TLB_WAY_BITS)
        self.eatag = Signal(TLB_EA_TAG_BITS)
        self.pte_data = Signal(TLB_PTE_BITS)

        # read from dtlb array
        self.tlb_read = Signal()
        self.tlb_read_index = Signal(TLB_SET_BITS)
        self.tlb_way = TLBRecord("o_tlb_way")

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        dtlb, tlb_req_index = self.dtlb, self.tlb_req_index

        print ("TLB_TAG_WAY_BITS", TLB_TAG_WAY_BITS)
        print ("  TLB_EA_TAG_BITS", TLB_EA_TAG_BITS)
        print ("     TLB_NUM_WAYS", TLB_NUM_WAYS)
        print ("TLB_PTE_WAY_BITS", TLB_PTE_WAY_BITS)
        print ("    TLB_PTE_BITS", TLB_PTE_BITS)
        print ("    TLB_NUM_WAYS", TLB_NUM_WAYS)

        # TAG and PTE Memory SRAMs. transparent, write-enables are TLB_NUM_WAYS
        tagway = Memory(depth=TLB_SET_SIZE, width=TLB_TAG_WAY_BITS)
        m.submodules.rd_tagway = rd_tagway = tagway.read_port()
        m.submodules.wr_tagway = wr_tagway = tagway.write_port(
                                    granularity=TLB_EA_TAG_BITS)

        pteway = Memory(depth=TLB_SET_SIZE, width=TLB_PTE_WAY_BITS)
        m.submodules.rd_pteway = rd_pteway = pteway.read_port()
        m.submodules.wr_pteway = wr_pteway = pteway.write_port(
                                    granularity=TLB_PTE_BITS)

        m.d.comb += rd_pteway.addr.eq(self.tlb_read_index)
        m.d.comb += rd_tagway.addr.eq(self.tlb_read_index)
        m.d.comb += wr_tagway.addr.eq(tlb_req_index)
        m.d.comb += wr_pteway.addr.eq(tlb_req_index)

        tagset = Signal(TLB_TAG_WAY_BITS)
        pteset = Signal(TLB_PTE_WAY_BITS)
        updated = Signal()
        v_updated = Signal()
        tb_out = Signal(TLB_TAG_WAY_BITS) # tlb_way_tags_t
        db_out = Signal(TLB_NUM_WAYS)     # tlb_way_valids_t
        pb_out = Signal(TLB_PTE_WAY_BITS) # tlb_way_ptes_t
        dv = Signal(TLB_NUM_WAYS)         # tlb_way_valids_t

        comb += dv.eq(dtlb[tlb_req_index])
        comb += db_out.eq(dv)

        with m.If(self.tlbie & self.doall):
            # clear all valid bits at once
            for i in range(TLB_SET_SIZE):
                sync += dtlb[i].eq(0)
        with m.Elif(self.tlbie):
            # invalidate just the hit_way
            with m.If(self.tlb_hit.valid):
                comb += db_out.bit_select(self.tlb_hit.way, 1).eq(0)
                comb += v_updated.eq(1)
        with m.Elif(self.tlbwe):
            # write to the requested tag and PTE
            comb += write_tlb_tag(self.repl_way, tb_out, self.eatag)
            comb += write_tlb_pte(self.repl_way, pb_out, self.pte_data)
            # set valid bit
            comb += db_out.bit_select(self.repl_way, 1).eq(1)

            comb += updated.eq(1)
            comb += v_updated.eq(1)

        with m.If(updated):
            comb += wr_pteway.data.eq(pb_out)
            comb += wr_pteway.en.eq(1<<self.repl_way)
            comb += wr_tagway.data.eq(tb_out)
            comb += wr_tagway.en.eq(1<<self.repl_way)
        with m.If(v_updated):
            sync += dtlb[tlb_req_index].eq(db_out)

        # select one TLB way
        r_tlb_way = TLBRecord("r_tlb_way")
        r_delay = Signal()
        sync += r_delay.eq(self.tlb_read)
        with m.If(self.tlb_read):
            sync += self.tlb_way.valid.eq(dtlb[self.tlb_read_index])
        with m.If(r_delay):
            comb += self.tlb_way.tag.eq(rd_tagway.data)
            comb += self.tlb_way.pte.eq(rd_pteway.data)
            sync += r_tlb_way.tag.eq(rd_tagway.data)
            sync += r_tlb_way.pte.eq(rd_pteway.data)
        with m.Else():
            comb += self.tlb_way.tag.eq(r_tlb_way.tag)
            comb += self.tlb_way.pte.eq(r_tlb_way.pte)

        return m


class DCachePendingHit(Elaboratable):

    def __init__(self, tlb_way,
                 cache_i_validdx, cache_tag_set,
                 req_addr,
                 hit_set):

        self.go = Signal()
        self.virt_mode = Signal()
        self.is_hit = Signal()
        self.tlb_hit = TLBHit("tlb_hit")
        self.hit_way = Signal(WAY_BITS)
        self.rel_match = Signal()
        self.req_index = Signal(INDEX_BITS)
        self.reload_tag = Signal(TAG_BITS)

        self.tlb_way = tlb_way
        self.cache_i_validdx = cache_i_validdx
        self.cache_tag_set = cache_tag_set
        self.req_addr = req_addr
        self.hit_set = hit_set

    def elaborate(self, platform):
        m = Module()
        comb = m.d.comb
        sync = m.d.sync

        go = self.go
        virt_mode = self.virt_mode
        is_hit = self.is_hit
        tlb_way = self.tlb_way
        cache_i_validdx = self.cache_i_validdx
        cache_tag_set = self.cache_tag_set
        req_addr = self.req_addr
        tlb_hit = self.tlb_hit
        hit_set = self.hit_set
        hit_way = self.hit_way
        rel_match = self.rel_match
        req_index = self.req_index
        reload_tag = self.reload_tag

        rel_matches = Array(Signal(name="rel_matches_%d" % i) \
                            for i in range(TLB_NUM_WAYS))
        hit_way_set = HitWaySet()

        # Test if pending request is a hit on any way
        # In order to make timing in virtual mode,
        # when we are using the TLB, we compare each
        # way with each of the real addresses from each way of
        # the TLB, and then decide later which match to use.

        with m.If(virt_mode):
            for j in range(TLB_NUM_WAYS): # tlb_num_way_t
                s_tag = Signal(TAG_BITS, name="s_tag%d" % j)
                s_hit = Signal()
                s_pte = Signal(TLB_PTE_BITS)
                s_ra = Signal(REAL_ADDR_BITS)
                comb += s_pte.eq(read_tlb_pte(j, tlb_way.pte))
                comb += s_ra.eq(Cat(req_addr[0:TLB_LG_PGSZ],
                                    s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
                comb += s_tag.eq(get_tag(s_ra))

                for i in range(NUM_WAYS): # way_t
                    is_tag_hit = Signal(name="is_tag_hit_%d_%d" % (j, i))
                    comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                                  (read_tag(i, cache_tag_set) == s_tag)
                                  & (tlb_way.valid[j]))
                    with m.If(is_tag_hit):
                        comb += hit_way_set[j].eq(i)
                        comb += s_hit.eq(1)
                comb += hit_set[j].eq(s_hit)
                with m.If(s_tag == reload_tag):
                    comb += rel_matches[j].eq(1)
            with m.If(tlb_hit.valid):
                comb += is_hit.eq(hit_set[tlb_hit.way])
                comb += hit_way.eq(hit_way_set[tlb_hit.way])
                comb += rel_match.eq(rel_matches[tlb_hit.way])
        with m.Else():
            s_tag = Signal(TAG_BITS)
            comb += s_tag.eq(get_tag(req_addr))
            for i in range(NUM_WAYS): # way_t
                is_tag_hit = Signal(name="is_tag_hit_%d" % i)
                comb += is_tag_hit.eq(go & cache_i_validdx[i] &
                          (read_tag(i, cache_tag_set) == s_tag))
                with m.If(is_tag_hit):
                    comb += hit_way.eq(i)
                    comb += is_hit.eq(1)
            with m.If(s_tag == reload_tag):
                comb += rel_match.eq(1)

        return m


class DCache(Elaboratable):
    """Set associative dcache write-through

    TODO (in no specific order):
    * See list in icache.vhdl
    * Complete load misses on the cycle when WB data comes instead of
      at the end of line (this requires dealing with requests coming in
      while not idle...)
    """
    def __init__(self):
        self.d_in = LoadStore1ToDCacheType("d_in")
        self.d_out = DCacheToLoadStore1Type("d_out")

        self.m_in = MMUToDCacheType("m_in")
        self.m_out = DCacheToMMUType("m_out")

        self.stall_out = Signal()

        # standard naming (wired to non-standard for compatibility)
        self.bus = Interface(addr_width=32,
                             data_width=64,
                             granularity=8,
                             features={'stall'},
                             alignment=0,
                             name="dcache")

        self.log_out = Signal(20)

    def stage_0(self, m, r0, r1, r0_full):
        """Latch the request in r0.req as long as we're not stalling
        """
        comb = m.d.comb
        sync = m.d.sync
        d_in, d_out, m_in = self.d_in, self.d_out, self.m_in

        r = RegStage0("stage0")

        # TODO, this goes in unit tests and formal proofs
        with m.If(d_in.valid & m_in.valid):
            sync += Display("request collision loadstore vs MMU")

        with m.If(m_in.valid):
            comb += r.req.valid.eq(1)
            comb += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) # no invalidate
            comb += r.req.dcbz.eq(0)
            comb += r.req.nc.eq(0)
            comb += r.req.reserve.eq(0)
            comb += r.req.virt_mode.eq(0)
            comb += r.req.priv_mode.eq(1)
            comb += r.req.addr.eq(m_in.addr)
            comb += r.req.data.eq(m_in.pte)
            comb += r.req.byte_sel.eq(~0) # Const -1 sets all to 0b111....
            comb += r.tlbie.eq(m_in.tlbie)
            comb += r.doall.eq(m_in.doall)
            comb += r.tlbld.eq(m_in.tlbld)
            comb += r.mmu_req.eq(1)
            m.d.sync += Display("    DCACHE req mmu addr %x pte %x ld %d",
                                m_in.addr, m_in.pte, r.req.load)

        with m.Else():
            comb += r.req.eq(d_in)
            comb += r.req.data.eq(0)
            comb += r.tlbie.eq(0)
            comb += r.doall.eq(0)
            comb += r.tlbld.eq(0)
            comb += r.mmu_req.eq(0)
        with m.If((~r1.full & ~d_in.hold) | ~r0_full):
            sync += r0.eq(r)
            sync += r0_full.eq(r.req.valid)
        # Sample data the cycle after a request comes in from loadstore1.
        # If another request has come in already then the data will get
        # put directly into req.data below.
        with m.If(r0.req.valid & ~r.req.valid & ~r0.d_valid &
                  ~r0.mmu_req):
            sync += r0.req.data.eq(d_in.data)
            sync += r0.d_valid.eq(1)
        with m.If(d_in.valid):
            m.d.sync += Display("    DCACHE req cache "
                                "virt %d addr %x data %x ld %d",
                                r.req.virt_mode, r.req.addr,
                                r.req.data, r.req.load)

    def tlb_read(self, m, r0_stall, tlb_way, dtlb):
        """TLB
        Operates in the second cycle on the request latched in r0.req.
        TLB updates write the entry at the end of the second cycle.
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        addrbits = Signal(TLB_SET_BITS)

        amin = TLB_LG_PGSZ
        amax = TLB_LG_PGSZ + TLB_SET_BITS

        with m.If(m_in.valid):
            comb += addrbits.eq(m_in.addr[amin : amax])
        with m.Else():
            comb += addrbits.eq(d_in.addr[amin : amax])

        # If we have any op and the previous op isn't finished,
        # then keep the same output for next cycle.
        d = self.dtlb_update
        comb += d.tlb_read_index.eq(addrbits)
        comb += d.tlb_read.eq(~r0_stall)
        comb += tlb_way.eq(d.tlb_way)

    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim, tlb_req_index):
        """Generate TLB PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if TLB_NUM_WAYS == 0:
            return

        # Binary-to-Unary one-hot, enabled by tlb_hit valid
        tlb_plrus = PLRUs(TLB_SET_SIZE, TLB_WAY_BITS)
        m.submodules.tlb_plrus = tlb_plrus
        comb += tlb_plrus.way.eq(r1.tlb_hit.way)
        comb += tlb_plrus.valid.eq(r1.tlb_hit.valid)
        comb += tlb_plrus.index.eq(r1.tlb_hit_index)
        comb += tlb_plrus.isel.eq(tlb_req_index)      # select victim
        comb += tlb_plru_victim.eq(tlb_plrus.o_index) # selected victim

    def tlb_search(self, m, tlb_req_index, r0, r0_valid,
                   tlb_way,
                   pte, tlb_hit, valid_ra, perm_attr, ra):

        comb = m.d.comb

        hitway = Signal(TLB_WAY_BITS)
        hit = Signal()
        eatag = Signal(TLB_EA_TAG_BITS)

        TLB_LG_END = TLB_LG_PGSZ + TLB_SET_BITS
        comb += tlb_req_index.eq(r0.req.addr[TLB_LG_PGSZ : TLB_LG_END])
        comb += eatag.eq(r0.req.addr[TLB_LG_END : 64])

        for i in range(TLB_NUM_WAYS):
            is_tag_hit = Signal(name="is_tag_hit%d" % i)
            tlb_tag = Signal(TLB_EA_TAG_BITS, name="tlb_tag%d" % i)
            comb += tlb_tag.eq(read_tlb_tag(i, tlb_way.tag))
            comb += is_tag_hit.eq((tlb_way.valid[i]) & (tlb_tag == eatag))
            with m.If(is_tag_hit):
                comb += hitway.eq(i)
                comb += hit.eq(1)

        comb += tlb_hit.valid.eq(hit & r0_valid)
        comb += tlb_hit.way.eq(hitway)

        with m.If(tlb_hit.valid):
            comb += pte.eq(read_tlb_pte(hitway, tlb_way.pte))
        comb += valid_ra.eq(tlb_hit.valid | ~r0.req.virt_mode)

        with m.If(r0.req.virt_mode):
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
                              pte[TLB_LG_PGSZ:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(pte[8])
            comb += perm_attr.changed.eq(pte[7])
            comb += perm_attr.nocache.eq(pte[5])
            comb += perm_attr.priv.eq(pte[3])
            comb += perm_attr.rd_perm.eq(pte[2])
            comb += perm_attr.wr_perm.eq(pte[1])
        with m.Else():
            comb += ra.eq(Cat(Const(0, ROW_OFF_BITS),
                              r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]))
            comb += perm_attr.reference.eq(1)
            comb += perm_attr.changed.eq(1)
            comb += perm_attr.nocache.eq(0)
            comb += perm_attr.priv.eq(1)
            comb += perm_attr.rd_perm.eq(1)
            comb += perm_attr.wr_perm.eq(1)

        with m.If(valid_ra):
            m.d.sync += Display("DCACHE virt mode %d hit %d ra %x pte %x",
                                r0.req.virt_mode, tlb_hit.valid, ra, pte)
            m.d.sync += Display("       perm ref=%d", perm_attr.reference)
            m.d.sync += Display("       perm chg=%d", perm_attr.changed)
            m.d.sync += Display("       perm noc=%d", perm_attr.nocache)
            m.d.sync += Display("       perm prv=%d", perm_attr.priv)
            m.d.sync += Display("       perm rdp=%d", perm_attr.rd_perm)
            m.d.sync += Display("       perm wrp=%d", perm_attr.wr_perm)

    def tlb_update(self, m, r0_valid, r0, dtlb, tlb_req_index,
                   tlb_hit, tlb_plru_victim, tlb_way):

        comb = m.d.comb
        sync = m.d.sync

        tlbie = Signal()
        tlbwe = Signal()

        comb += tlbie.eq(r0_valid & r0.tlbie)
        comb += tlbwe.eq(r0_valid & r0.tlbld)

        d = self.dtlb_update

        comb += d.tlbie.eq(tlbie)
        comb += d.tlbwe.eq(tlbwe)
        comb += d.doall.eq(r0.doall)
        comb += d.tlb_hit.eq(tlb_hit)
        comb += d.tlb_tag_way.eq(tlb_way.tag)
        comb += d.tlb_pte_way.eq(tlb_way.pte)
        comb += d.tlb_req_index.eq(tlb_req_index)

        with m.If(tlb_hit.valid):
            comb += d.repl_way.eq(tlb_hit.way)
        with m.Else():
            comb += d.repl_way.eq(tlb_plru_victim)
        comb += d.eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64])
        comb += d.pte_data.eq(r0.req.data)

    def maybe_plrus(self, m, r1, plru_victim):
        """Generate PLRUs
        """
        comb = m.d.comb
        sync = m.d.sync

        if NUM_WAYS == 0:
            return

        m.submodules.plrus = plrus = PLRUs(NUM_LINES, WAY_BITS)
        comb += plrus.way.eq(r1.hit_way)
        comb += plrus.valid.eq(r1.cache_hit)
        comb += plrus.index.eq(r1.hit_index)
        comb += plrus.isel.eq(r1.store_index)     # select victim
        comb += plru_victim.eq(plrus.o_index)     # selected victim

    def cache_tag_read(self, m, r0_stall, req_index, cache_tag_set, cache_tags):
        """Cache tag RAM read port
        """
        comb = m.d.comb
        sync = m.d.sync
        m_in, d_in = self.m_in, self.d_in

        index = Signal(INDEX_BITS)

        with m.If(r0_stall):
            comb += index.eq(req_index)
        with m.Elif(m_in.valid):
            comb += index.eq(get_index(m_in.addr))
        with m.Else():
            comb += index.eq(get_index(d_in.addr))
        sync += cache_tag_set.eq(cache_tags[index].tag)

    def dcache_request(self, m, r0, ra, req_index, req_row, req_tag,
                       r0_valid, r1, cache_tags, replace_way,
                       use_forward1_next, use_forward2_next,
                       req_hit_way, plru_victim, rc_ok, perm_attr,
                       valid_ra, perm_ok, access_ok, req_op, req_go,
                       tlb_hit, tlb_way, cache_tag_set,
                       cancel_store, req_same_tag, r0_stall, early_req_row):
        """Cache request parsing and hit detection
        """

        comb = m.d.comb
        m_in, d_in = self.m_in, self.d_in

        is_hit = Signal()
        hit_way = Signal(WAY_BITS)
        op = Signal(Op)
        opsel = Signal(3)
        go = Signal()
        nc = Signal()
        hit_set = Array(Signal(name="hit_set_%d" % i) \
                        for i in range(TLB_NUM_WAYS))
        cache_i_validdx = Signal(NUM_WAYS)

        # Extract line, row and tag from request
        comb += req_index.eq(get_index(r0.req.addr))
        comb += req_row.eq(get_row(r0.req.addr))
        comb += req_tag.eq(get_tag(ra))

        if False: # display on comb is a bit... busy.
            comb += Display("dcache_req addr:%x ra: %x idx: %x tag: %x row: %x",
                            r0.req.addr, ra, req_index, req_tag, req_row)

        comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error)
        comb += cache_i_validdx.eq(cache_tags[req_index].valid)

        m.submodules.dcache_pend = dc = DCachePendingHit(tlb_way,
                                            cache_i_validdx, cache_tag_set,
                                            r0.req.addr,
                                            hit_set)
        comb += dc.tlb_hit.eq(tlb_hit)
        comb += dc.reload_tag.eq(r1.reload_tag)
        comb += dc.virt_mode.eq(r0.req.virt_mode)
        comb += dc.go.eq(go)
        comb += dc.req_index.eq(req_index)

        comb += is_hit.eq(dc.is_hit)
        comb += hit_way.eq(dc.hit_way)
        comb += req_same_tag.eq(dc.rel_match)

        # See if the request matches the line currently being reloaded
        with m.If((r1.state == State.RELOAD_WAIT_ACK) &
                  (req_index == r1.store_index) & req_same_tag):
            # For a store, consider this a hit even if the row isn't
            # valid since it will be by the time we perform the store.
            # For a load, check the appropriate row valid bit.
            rrow = Signal(ROW_LINE_BITS)
            comb += rrow.eq(req_row)
            valid = r1.rows_valid[rrow]
            comb += is_hit.eq((~r0.req.load) | valid)
            comb += hit_way.eq(replace_way)

        # Whether to use forwarded data for a load or not
        with m.If((get_row(r1.req.real_addr) == req_row) &
                  (r1.req.hit_way == hit_way)):
            # Only need to consider r1.write_bram here, since if we
            # are writing refill data here, then we don't have a
            # cache hit this cycle on the line being refilled.
            # (There is the possibility that the load following the
            # load miss that started the refill could be to the old
            # contents of the victim line, since it is a couple of
            # cycles after the refill starts before we see the updated
            # cache tag. In that case we don't use the bypass.)
            comb += use_forward1_next.eq(r1.write_bram)
        with m.If((r1.forward_row1 == req_row) & (r1.forward_way1 == hit_way)):
            comb += use_forward2_next.eq(r1.forward_valid1)

        # The way that matched on a hit
        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        with m.If(r1.write_tag):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r1.store_way)

        # work out whether we have permission for this access
        # NB we don't yet implement AMR, thus no KUAP
        comb += rc_ok.eq(perm_attr.reference
                         & (r0.req.load | perm_attr.changed))
        comb += perm_ok.eq((r0.req.priv_mode | (~perm_attr.priv)) &
                           (perm_attr.wr_perm |
                            (r0.req.load & perm_attr.rd_perm)))
        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

        # Combine the request and cache hit status to decide what
        # operation needs to be done
        comb += nc.eq(r0.req.nc | perm_attr.nocache)
        comb += op.eq(Op.OP_NONE)
        with m.If(go):
            with m.If(~access_ok):
                m.d.sync += Display("DCACHE access fail valid_ra=%d p=%d rc=%d",
                                    valid_ra, perm_ok, rc_ok)
                comb += op.eq(Op.OP_BAD)
            with m.Elif(cancel_store):
                m.d.sync += Display("DCACHE cancel store")
                comb += op.eq(Op.OP_STCX_FAIL)
            with m.Else():
                m.d.sync += Display("DCACHE valid_ra=%d nc=%d ld=%d",
                                    valid_ra, nc, r0.req.load)
                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
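                # opsel is Cat(is_hit, nc, load) read LSB-first, e.g.
                # 0b101 = cacheable load that hit, 0b100 = cacheable
                # load miss; the 0b011/0b111 (NC hit) cases are
                # paradoxes and map to OP_BAD below.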
                with m.Switch(opsel):
                    with m.Case(0b101): comb += op.eq(Op.OP_LOAD_HIT)
                    with m.Case(0b100): comb += op.eq(Op.OP_LOAD_MISS)
                    with m.Case(0b110): comb += op.eq(Op.OP_LOAD_NC)
                    with m.Case(0b001): comb += op.eq(Op.OP_STORE_HIT)
                    with m.Case(0b000): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b010): comb += op.eq(Op.OP_STORE_MISS)
                    with m.Case(0b011): comb += op.eq(Op.OP_BAD)
                    with m.Case(0b111): comb += op.eq(Op.OP_BAD)
        comb += req_op.eq(op)
        comb += req_go.eq(go)

        # Version of the row number that is valid one cycle earlier
        # in the cases where we need to read the cache data BRAM.
        # If we're stalling then we need to keep reading the last
        # row requested.
        with m.If(~r0_stall):
            with m.If(m_in.valid):
                comb += early_req_row.eq(get_row(m_in.addr))
            with m.Else():
                comb += early_req_row.eq(get_row(d_in.addr))
        with m.Else():
            comb += early_req_row.eq(req_row)

    def reservation_comb(self, m, cancel_store, set_rsrv, clear_rsrv,
                         r0_valid, r0, reservation):
        """Handle load-with-reservation and store-conditional instructions
        """
        comb = m.d.comb

        with m.If(r0_valid & r0.req.reserve):
            # XXX generate alignment interrupt if address
            # is not aligned XXX or if r0.req.nc = '1'
            with m.If(r0.req.load):
                comb += set_rsrv.eq(r0.req.atomic_last) # load with reservation
            with m.Else():
                comb += clear_rsrv.eq(r0.req.atomic_last) # store conditional
                with m.If((~reservation.valid) |
                          (r0.req.addr[LINE_OFF_BITS:64] != reservation.addr)):
                    comb += cancel_store.eq(1)

    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                        reservation, r0):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(r0_valid & access_ok):
            with m.If(clear_rsrv):
                sync += reservation.valid.eq(0)
            with m.Elif(set_rsrv):
                sync += reservation.valid.eq(1)
                sync += reservation.addr.eq(r0.req.addr[LINE_OFF_BITS:64])

    def writeback_control(self, m, r1, cache_out_row):
        """Return data for loads & completion control logic
        """
        comb = m.d.comb
        sync = m.d.sync
        d_out, m_out = self.d_out, self.m_out

        data_out = Signal(64)
        data_fwd = Signal(64)

        # Use the bypass if are reading the row that was
        # written 1 or 2 cycles ago, including for the
        # slow_valid = 1 case (i.e. completing a load
        # miss or a non-cacheable load).
        with m.If(r1.use_forward1):
            comb += data_fwd.eq(r1.forward_data1)
        with m.Else():
            comb += data_fwd.eq(r1.forward_data2)

        comb += data_out.eq(cache_out_row)

        for i in range(8):
            with m.If(r1.forward_sel[i]):
                dsel = data_fwd.word_select(i, 8)
                comb += data_out.word_select(i, 8).eq(dsel)

        # DCache output to LoadStore
        comb += d_out.valid.eq(r1.ls_valid)
        comb += d_out.data.eq(data_out)
        comb += d_out.store_done.eq(~r1.stcx_fail)
        comb += d_out.error.eq(r1.ls_error)
        comb += d_out.cache_paradox.eq(r1.cache_paradox)

        # Outputs to MMU
        comb += m_out.done.eq(r1.mmu_done)
        comb += m_out.err.eq(r1.mmu_error)
        comb += m_out.data.eq(data_out)

        # We have a valid load or store hit or we just completed
        # a slow op such as a load miss, a NC load or a store
        #
        # Note: the load hit is delayed by one cycle. However it
        # can still not collide with r.slow_valid (well unless I
        # miscalculated) because slow_valid can only be set on a
        # subsequent request and not on its first cycle (the state
        # machine must have advanced), which makes slow_valid
        # at least 2 cycles from the previous hit_load_valid.

        # Sanity: Only one of these must be set in any given cycle

        if False: # TODO: need Display to get this to work
            assert (r1.slow_valid & r1.stcx_fail) != 1, \
             "unexpected slow_valid collision with stcx_fail"

            assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1, \
             "unexpected hit_load_delayed collision with slow_valid"

        with m.If(~r1.mmu_req):
            # Request came from loadstore1...
            # Load hit case is the standard path
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit data=%x", data_out)

            # error cases complete without stalling
            with m.If(r1.ls_error):
                with m.If(r1.dcbz):
                    sync += Display("completing dcbz with error")
                with m.Else():
                    sync += Display("completing ld/st with error")

            # Slow ops (load miss, NC, stores)
            with m.If(r1.slow_valid):
                sync += Display("completing store or load miss adr=%x data=%x",
                                r1.req.real_addr, data_out)

        with m.Else():
            # Request came from MMU
            with m.If(r1.hit_load_valid):
                sync += Display("completing load hit to MMU, data=%x",
                                m_out.data)
            # error cases complete without stalling
            with m.If(r1.mmu_error):
                sync += Display("completing MMU ld with error")

            # Slow ops (i.e. load miss)
            with m.If(r1.slow_valid):
                sync += Display("completing MMU load miss, adr=%x data=%x",
                                r1.req.real_addr, m_out.data)

    def rams(self, m, r1, early_req_row, cache_out_row, replace_way):
        """rams
        Generate a cache RAM for each way. This handles the normal
        reads, writes from reloads and the special store-hit update
        path as well.

        Note: the BRAMs have an extra read buffer, meaning the output
        is pipelined an extra cycle. This differs from the
        icache. The writeback logic needs to take that into
        account by using 1-cycle delayed signals for load hits.
        """
        comb = m.d.comb
        bus = self.bus

        # Binary-to-Unary one-hot decoders here. the replace-way one-hot is
        # gated (enabled) by bus.ack, not-write-bram, and state RELOAD_WAIT_ACK
        m.submodules.rams_replace_way_e = rwe = Decoder(NUM_WAYS)
        comb += rwe.n.eq(~((r1.state == State.RELOAD_WAIT_ACK) & bus.ack &
                           ~r1.write_bram))
        comb += rwe.i.eq(replace_way)

        m.submodules.rams_hit_way_e = hwe = Decoder(NUM_WAYS)
        comb += hwe.i.eq(r1.hit_way)

        # this one is gated with write_bram, and replace_way_e can never be
        # set at the same time.  that means that do_write can OR the outputs
        m.submodules.rams_hit_req_way_e = hre = Decoder(NUM_WAYS)
        comb += hre.n.eq(~r1.write_bram) # Decoder.n is inverted
        comb += hre.i.eq(r1.req.hit_way)

        # common Signals
        do_read = Signal()
        wr_addr = Signal(ROW_BITS)
        wr_data = Signal(WB_DATA_BITS)
        wr_sel = Signal(ROW_SIZE)
        rd_addr = Signal(ROW_BITS)

        comb += do_read.eq(1) # always enable
        comb += rd_addr.eq(early_req_row)

        # Write mux:
        #
        # Defaults to wishbone read responses (cache refill)
        #
        # For timing, the mux on wr_data/sel/addr is not
        # dependent on anything other than the current state.

        with m.If(r1.write_bram):
            # Write store data to BRAM.  This happens one
            # cycle after the store is in r0.
            comb += wr_data.eq(r1.req.data)
            comb += wr_sel.eq(r1.req.byte_sel)
            comb += wr_addr.eq(get_row(r1.req.real_addr))

        with m.Else():
            # Otherwise, we might be doing a reload or a DCBZ
            with m.If(r1.dcbz):
                comb += wr_data.eq(0)
            with m.Else():
                comb += wr_data.eq(bus.dat_r)
            comb += wr_addr.eq(r1.store_row)
            comb += wr_sel.eq(~0) # all 1s

        # set up Cache Rams
        for i in range(NUM_WAYS):
            do_write = Signal(name="do_wr%d" % i)
            wr_sel_m = Signal(ROW_SIZE, name="wr_sel_m_%d" % i)
            d_out = Signal(WB_DATA_BITS, name="dout_%d" % i) # cache_row_t

            way = CacheRam(ROW_BITS, WB_DATA_BITS, ADD_BUF=True, ram_num=i)
            setattr(m.submodules, "cacheram_%d" % i, way)

            comb += way.rd_en.eq(do_read)
            comb += way.rd_addr.eq(rd_addr)
            comb += d_out.eq(way.rd_data_o)
            comb += way.wr_sel.eq(wr_sel_m)
            comb += way.wr_addr.eq(wr_addr)
            comb += way.wr_data.eq(wr_data)

            # Cache hit reads
            with m.If(hwe.o[i]):
                comb += cache_out_row.eq(d_out)

            # these are mutually-exclusive via their Decoder-enablers
            # (note: Decoder-enable is inverted)
            comb += do_write.eq(hre.o[i] | rwe.o[i])

            # Mask write selects with do_write since BRAM
            # doesn't have a global write-enable
            with m.If(do_write):
                comb += wr_sel_m.eq(wr_sel)

    # Cache hit synchronous machine for the easy case.
    # This handles load hits.
    # It also handles error cases (TLB miss, cache paradox)
    def dcache_fast_hit(self, m, req_op, r0_valid, r0, r1,
                        req_hit_way, req_index, req_tag, access_ok,
                        tlb_hit, tlb_req_index):
        comb = m.d.comb
        sync = m.d.sync

        with m.If(req_op != Op.OP_NONE):
            sync += Display("op:%d addr:%x nc: %d idx: %x tag: %x way: %x",
                            req_op, r0.req.addr, r0.req.nc,
                            req_index, req_tag, req_hit_way)

        with m.If(r0_valid):
            sync += r1.mmu_req.eq(r0.mmu_req)

        # Fast path for load/store hits.
        # Set signals for the writeback controls.
        sync += r1.hit_way.eq(req_hit_way)
        sync += r1.hit_index.eq(req_index)

        sync += r1.hit_load_valid.eq(req_op == Op.OP_LOAD_HIT)
        sync += r1.cache_hit.eq((req_op == Op.OP_LOAD_HIT) |
                                (req_op == Op.OP_STORE_HIT))

        with m.If(req_op == Op.OP_BAD):
            sync += Display("Signalling ld/st error "
                            "ls_error=%i mmu_error=%i cache_paradox=%i",
                            ~r0.mmu_req, r0.mmu_req, access_ok)
            sync += r1.ls_error.eq(~r0.mmu_req)
            sync += r1.mmu_error.eq(r0.mmu_req)
            sync += r1.cache_paradox.eq(access_ok)
        with m.Else():
            sync += r1.ls_error.eq(0)
            sync += r1.mmu_error.eq(0)
            sync += r1.cache_paradox.eq(0)

        sync += r1.stcx_fail.eq(req_op == Op.OP_STCX_FAIL)

        # Record TLB hit information for updating TLB PLRU
        sync += r1.tlb_hit.eq(tlb_hit)
        sync += r1.tlb_hit_index.eq(tlb_req_index)

    # Memory accesses are handled by this state machine:
    #
    #   * Cache load miss/reload (in conjunction with "rams")
    #   * Load hits for non-cachable forms
    #   * Stores (the collision case is handled in "rams")
    #
    # All wishbone requests generation is done here.
    # This machine operates at stage 1.
    def dcache_slow(self, m, r1, use_forward1_next, use_forward2_next,
                    r0, replace_way,
                    req_hit_way, req_same_tag,
                    r0_valid, req_op, cache_tags, req_go, ra):

        comb = m.d.comb
        sync = m.d.sync
        bus = self.bus
        d_in = self.d_in

        req = MemAccessRequest("mreq_ds")

        req_row = Signal(ROW_BITS)
        req_idx = Signal(INDEX_BITS)
        req_tag = Signal(TAG_BITS)
        comb += req_idx.eq(get_index(req.real_addr))
        comb += req_row.eq(get_row(req.real_addr))
        comb += req_tag.eq(get_tag(req.real_addr))

        sync += r1.use_forward1.eq(use_forward1_next)
        sync += r1.forward_sel.eq(0)

        with m.If(use_forward1_next):
            sync += r1.forward_sel.eq(r1.req.byte_sel)
        with m.Elif(use_forward2_next):
            sync += r1.forward_sel.eq(r1.forward_sel1)

        sync += r1.forward_data2.eq(r1.forward_data1)
        with m.If(r1.write_bram):
            sync += r1.forward_data1.eq(r1.req.data)
            sync += r1.forward_sel1.eq(r1.req.byte_sel)
            sync += r1.forward_way1.eq(r1.req.hit_way)
            sync += r1.forward_row1.eq(get_row(r1.req.real_addr))
            sync += r1.forward_valid1.eq(1)
        with m.Else():
            with m.If(r1.dcbz):
                sync += r1.forward_data1.eq(0)
            with m.Else():
                sync += r1.forward_data1.eq(bus.dat_r)
            sync += r1.forward_sel1.eq(~0) # all 1s
            sync += r1.forward_way1.eq(replace_way)
            sync += r1.forward_row1.eq(r1.store_row)
            sync += r1.forward_valid1.eq(0)

        # One cycle pulses reset
        sync += r1.slow_valid.eq(0)
        sync += r1.write_bram.eq(0)
        sync += r1.inc_acks.eq(0)
        sync += r1.dec_acks.eq(0)

        sync += r1.ls_valid.eq(0)
        # complete tlbies and TLB loads in the third cycle
        sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))

        with m.If((req_op == Op.OP_LOAD_HIT) | (req_op == Op.OP_STCX_FAIL)):
            with m.If(~r0.mmu_req):
                sync += r1.ls_valid.eq(1)
            with m.Else():
                sync += r1.mmu_done.eq(1)

        with m.If(r1.write_tag):
            # Store new tag in selected way
            replace_way_onehot = Signal(NUM_WAYS)
            comb += replace_way_onehot.eq(1<<replace_way)
            for i in range(NUM_WAYS):
                with m.If(replace_way_onehot[i]):
                    ct = Signal(TAG_RAM_WIDTH)
                    comb += ct.eq(cache_tags[r1.store_index].tag)
                    comb += ct.word_select(i, TAG_WIDTH).eq(r1.reload_tag)
                    sync += cache_tags[r1.store_index].tag.eq(ct)
            sync += r1.store_way.eq(replace_way)
            sync += r1.write_tag.eq(0)

        # Take request from r1.req if there is one there,
        # else from req_op, ra, etc.
        with m.If(r1.full):
            comb += req.eq(r1.req)
        with m.Else():
            comb += req.op.eq(req_op)
            comb += req.valid.eq(req_go)
            comb += req.mmu_req.eq(r0.mmu_req)
            comb += req.dcbz.eq(r0.req.dcbz)
            comb += req.real_addr.eq(ra)

            with m.If(r0.req.dcbz):
                # force data to 0 for dcbz
                comb += req.data.eq(0)
            with m.Elif(r0.d_valid):
                comb += req.data.eq(r0.req.data)
            with m.Else():
                comb += req.data.eq(d_in.data)

            # Select all bytes for dcbz
            # and for cacheable loads
            with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc)):
                comb += req.byte_sel.eq(~0) # all 1s
            with m.Else():
                comb += req.byte_sel.eq(r0.req.byte_sel)
            comb += req.hit_way.eq(req_hit_way)
            comb += req.same_tag.eq(req_same_tag)

        # Store the incoming request from r0,
        # if it is a slow request
        # Note that r1.full = 1 implies req_op = OP_NONE
        with m.If((req_op == Op.OP_LOAD_MISS)
                  | (req_op == Op.OP_LOAD_NC)
                  | (req_op == Op.OP_STORE_MISS)
                  | (req_op == Op.OP_STORE_HIT)):
            sync += r1.req.eq(req)
            sync += r1.full.eq(1)

        # Main state machine
        with m.Switch(r1.state):

            with m.Case(State.IDLE):
                sync += r1.wb.adr.eq(req.real_addr[ROW_LINE_BITS:])
                sync += r1.wb.sel.eq(req.byte_sel)
                sync += r1.wb.dat.eq(req.data)
                sync += r1.dcbz.eq(req.dcbz)

                # Keep track of our index and way
                # for subsequent stores.
                sync += r1.store_index.eq(req_idx)
                sync += r1.store_row.eq(req_row)
                sync += r1.end_row_ix.eq(get_row_of_line(req_row)-1)
                sync += r1.reload_tag.eq(req_tag)
                sync += r1.req.same_tag.eq(1)

                with m.If(req.op == Op.OP_STORE_HIT):
                    sync += r1.store_way.eq(req.hit_way)

                # Reset per-row valid bits,
                # ready for handling OP_LOAD_MISS
                for i in range(ROW_PER_LINE):
                    sync += r1.rows_valid[i].eq(0)

                with m.If(req_op != Op.OP_NONE):
                    sync += Display("cache op %d", req.op)

                with m.Switch(req.op):
                    with m.Case(Op.OP_LOAD_HIT):
                        # stay in IDLE state
                        pass

                    with m.Case(Op.OP_LOAD_MISS):
                        sync += Display("cache miss real addr: %x " \
                                        "idx: %x tag: %x",
                                        req.real_addr, req_row, req_tag)

                        # Start the wishbone cycle
                        sync += r1.wb.we.eq(0)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                        # Track that we had one request sent
                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
                        sync += r1.write_tag.eq(1)

                    with m.Case(Op.OP_LOAD_NC):
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)
                        sync += r1.wb.we.eq(0)
                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

                    with m.Case(Op.OP_STORE_HIT, Op.OP_STORE_MISS):
                        with m.If(~req.dcbz):
                            sync += r1.state.eq(State.STORE_WAIT_ACK)
                            sync += r1.acks_pending.eq(1)
                            sync += r1.full.eq(0)
                            sync += r1.slow_valid.eq(1)

                            with m.If(~req.mmu_req):
                                sync += r1.ls_valid.eq(1)
                            with m.Else():
                                sync += r1.mmu_done.eq(1)

                            with m.If(req.op == Op.OP_STORE_HIT):
                                sync += r1.write_bram.eq(1)
                        with m.Else():
                            # dcbz is handled much like a load miss except
                            # that we are writing to memory instead of reading
                            sync += r1.state.eq(State.RELOAD_WAIT_ACK)

                            with m.If(req.op == Op.OP_STORE_MISS):
                                sync += r1.write_tag.eq(1)

                        sync += r1.wb.we.eq(1)
                        sync += r1.wb.cyc.eq(1)
                        sync += r1.wb.stb.eq(1)

                    # OP_NONE and OP_BAD do nothing
                    # OP_BAD & OP_STCX_FAIL were
                    # handled above already
                    with m.Case(Op.OP_NONE):
                        pass
                    with m.Case(Op.OP_BAD):
                        pass
                    with m.Case(Op.OP_STCX_FAIL):
                        pass

            with m.Case(State.RELOAD_WAIT_ACK):
                ld_stbs_done = Signal()
                # Requests are all sent if stb is 0
                comb += ld_stbs_done.eq(~r1.wb.stb)

                # If we are still sending requests, was one accepted?
                with m.If((~bus.stall) & r1.wb.stb):
                    # That was the last word?  We are done sending.
                    # Clear stb and set ld_stbs_done so we can handle an
                    # eventual last ack on the same cycle.
                    # sigh - reconstruct wb adr with 3 extra 0s at front
                    wb_adr = Cat(Const(0, ROW_OFF_BITS), r1.wb.adr)
                    with m.If(is_last_row_addr(wb_adr, r1.end_row_ix)):
                        sync += r1.wb.stb.eq(0)
                        comb += ld_stbs_done.eq(1)

                    # Calculate the next row address in the current cache line
                    row = Signal(LINE_OFF_BITS-ROW_OFF_BITS)
                    comb += row.eq(r1.wb.adr)
                    sync += r1.wb.adr[:LINE_OFF_BITS-ROW_OFF_BITS].eq(row+1)

                # Incoming acks processing
                sync += r1.forward_valid1.eq(bus.ack)
                with m.If(bus.ack):
                    srow = Signal(ROW_LINE_BITS)
                    comb += srow.eq(r1.store_row)
                    sync += r1.rows_valid[srow].eq(1)

                    # If this is the data we were looking for,
                    # we can complete the request next cycle.
                    # Compare the whole address in case the
                    # request in r1.req is not the one that
                    # started this refill.
                    with m.If(req.valid & r1.req.same_tag &
                              ((r1.dcbz & r1.req.dcbz) |
                               (~r1.dcbz & (r1.req.op == Op.OP_LOAD_MISS))) &
                              (r1.store_row == get_row(req.real_addr))):
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)
                        with m.If(~r1.mmu_req):
                            sync += r1.ls_valid.eq(1)
                        with m.Else():
                            sync += r1.mmu_done.eq(1)
                        sync += r1.forward_sel.eq(~0) # all 1s
                        sync += r1.use_forward1.eq(1)

                    # Check for completion
                    with m.If(ld_stbs_done & is_last_row(r1.store_row,
                                                         r1.end_row_ix)):
                        # Complete wishbone cycle
                        sync += r1.wb.cyc.eq(0)

                        # Cache line is now valid
                        cv = Signal(NUM_WAYS) # per-way valid bits
                        comb += cv.eq(cache_tags[r1.store_index].valid)
                        comb += cv.bit_select(r1.store_way, 1).eq(1)
                        sync += cache_tags[r1.store_index].valid.eq(cv)

                        sync += r1.state.eq(State.IDLE)
                        sync += Display("cache valid set %x "
                                        "idx %d way %d",
                                        cv, r1.store_index, r1.store_way)

                    # Increment store row counter
                    sync += r1.store_row.eq(next_row(r1.store_row))

            with m.Case(State.STORE_WAIT_ACK):
                st_stbs_done = Signal()
                acks = Signal(3)
                adjust_acks = Signal(3)

                comb += st_stbs_done.eq(~r1.wb.stb)
                comb += acks.eq(r1.acks_pending)

                with m.If(r1.inc_acks != r1.dec_acks):
                    with m.If(r1.inc_acks):
                        comb += adjust_acks.eq(acks + 1)
                    with m.Else():
                        comb += adjust_acks.eq(acks - 1)
                with m.Else():
                    comb += adjust_acks.eq(acks)

                sync += r1.acks_pending.eq(adjust_acks)

                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    # See if there is another store waiting
                    # to be done which is in the same real page.
                    with m.If(req.valid):
                        _ra = req.real_addr[ROW_LINE_BITS:SET_SIZE_BITS]
                        sync += r1.wb.adr[0:SET_SIZE_BITS].eq(_ra)
                        sync += r1.wb.dat.eq(req.data)
                        sync += r1.wb.sel.eq(req.byte_sel)

                    with m.If((adjust_acks < 7) & req.same_tag &
                              ((req.op == Op.OP_STORE_MISS)
                               | (req.op == Op.OP_STORE_HIT))):
                        sync += r1.wb.stb.eq(1)
                        comb += st_stbs_done.eq(0)

                        with m.If(req.op == Op.OP_STORE_HIT):
                            sync += r1.write_bram.eq(1)
                        sync += r1.full.eq(0)
                        sync += r1.slow_valid.eq(1)

                        # Store requests never come from the MMU
                        sync += r1.ls_valid.eq(1)
                        comb += st_stbs_done.eq(0)
                        sync += r1.inc_acks.eq(1)
                    with m.Else():
                        sync += r1.wb.stb.eq(0)
                        comb += st_stbs_done.eq(1)

                # Got ack ? See if complete.
                with m.If(bus.ack):
                    with m.If(st_stbs_done & (adjust_acks == 1)):
                        sync += r1.state.eq(State.IDLE)
                        sync += r1.wb.cyc.eq(0)
                        sync += r1.wb.stb.eq(0)
                    sync += r1.dec_acks.eq(1)

            with m.Case(State.NC_LOAD_WAIT_ACK):
                # Clear stb when slave accepted request
                with m.If(~bus.stall):
                    sync += r1.wb.stb.eq(0)

                # Got ack ? complete.
                with m.If(bus.ack):
                    sync += r1.state.eq(State.IDLE)
                    sync += r1.full.eq(0)
                    sync += r1.slow_valid.eq(1)

                    with m.If(~r1.mmu_req):
                        sync += r1.ls_valid.eq(1)
                    with m.Else():
                        sync += r1.mmu_done.eq(1)

                    sync += r1.forward_sel.eq(~0) # all 1s
                    sync += r1.use_forward1.eq(1)
                    sync += r1.wb.cyc.eq(0)
                    sync += r1.wb.stb.eq(0)

    def dcache_log(self, m, r1, valid_ra, tlb_hit, stall_out, req_op):

        sync = m.d.sync
        d_out, bus, log_out = self.d_out, self.bus, self.log_out

        # note: r1.wb.adr is the row address, so its low 3 bits
        # correspond to real-address bits [3:6]
        sync += log_out.eq(Cat(r1.state[:3], valid_ra, tlb_hit.way[:3],
                               stall_out, req_op[:3], d_out.valid, d_out.error,
                               r1.wb.cyc, r1.wb.stb, bus.ack, bus.stall,
                               r1.wb.adr[:3]))

    def elaborate(self, platform):

        m = Module()
        comb = m.d.comb
        d_in = self.d_in

        # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
        cache_tags = CacheTagArray()
        cache_tag_set = Signal(TAG_RAM_WIDTH)

        # TODO attribute ram_style : string;
        # TODO attribute ram_style of cache_tags : signal is "distributed";

        """note: these are passed to nmigen.hdl.Memory as "attributes".
           don't know how, just that they are.
        """
        # TODO attribute ram_style of
        #  dtlb_tags : signal is "distributed";
        # TODO attribute ram_style of
        #  dtlb_ptes : signal is "distributed";

        r0 = RegStage0("r0")
        r0_full = Signal()

        r1 = RegStage1("r1")

        reservation = Reservation()

        # Async signals on incoming request
        req_index = Signal(INDEX_BITS)
        req_row = Signal(ROW_BITS)
        req_hit_way = Signal(WAY_BITS)
        req_tag = Signal(TAG_BITS)
        req_op = Signal(Op)
        req_data = Signal(64)
        req_same_tag = Signal()
        req_go = Signal()

        early_req_row = Signal(ROW_BITS)

        cancel_store = Signal()
        set_rsrv = Signal()
        clear_rsrv = Signal()

        r0_valid = Signal()
        r0_stall = Signal()

        use_forward1_next = Signal()
        use_forward2_next = Signal()

        cache_out_row = Signal(WB_DATA_BITS)

        plru_victim = Signal(WAY_BITS)
        replace_way = Signal(WAY_BITS)

        # Wishbone read/write/cache write formatting signals
        bus_sel = Signal(8)

        # TLB signals
        tlb_way = TLBRecord("tlb_way")
        tlb_req_index = Signal(TLB_SET_BITS)
        tlb_hit = TLBHit("tlb_hit")
        pte = Signal(TLB_PTE_BITS)
        ra = Signal(REAL_ADDR_BITS)
        valid_ra = Signal()
        perm_attr = PermAttr("dc_perms")
        rc_ok = Signal()
        perm_ok = Signal()
        access_ok = Signal()

        tlb_plru_victim = Signal(TLB_WAY_BITS)

        # we don't yet handle collisions between loadstore1 requests
        # and MMU requests
        comb += self.m_out.stall.eq(0)

        # Hold off the request in r0 when r1 has an uncompleted request
        comb += r0_stall.eq(r0_full & (r1.full | d_in.hold))
        comb += r0_valid.eq(r0_full & ~r1.full & ~d_in.hold)
        comb += self.stall_out.eq(r0_stall)

        # deal with litex not doing wishbone pipeline mode
        # XXX in wrong way.  FIFOs are needed in the SRAM test
        # so that stb/ack match up. same thing done in icache.py
        comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)

        # Wire up wishbone request latch out of stage 1
        comb += self.bus.we.eq(r1.wb.we)
        comb += self.bus.adr.eq(r1.wb.adr)
        comb += self.bus.sel.eq(r1.wb.sel)
        comb += self.bus.stb.eq(r1.wb.stb)
        comb += self.bus.dat_w.eq(r1.wb.dat)
        comb += self.bus.cyc.eq(r1.wb.cyc)

        # create submodule TLBUpdate
        m.submodules.dtlb_update = self.dtlb_update = DTLBUpdate()
        dtlb = self.dtlb_update.dtlb

        # call sub-functions putting everything together, using shared
        # signals established above
        self.stage_0(m, r0, r1, r0_full)
        self.tlb_read(m, r0_stall, tlb_way, dtlb)
        self.tlb_search(m, tlb_req_index, r0, r0_valid,
                        tlb_way,
                        pte, tlb_hit, valid_ra, perm_attr, ra)
        self.tlb_update(m, r0_valid, r0, dtlb, tlb_req_index,
                        tlb_hit, tlb_plru_victim,
                        tlb_way)
        self.maybe_plrus(m, r1, plru_victim)
        self.maybe_tlb_plrus(m, r1, tlb_plru_victim, tlb_req_index)
        self.cache_tag_read(m, r0_stall, req_index, cache_tag_set, cache_tags)
        self.dcache_request(m, r0, ra, req_index, req_row, req_tag,
                            r0_valid, r1, cache_tags, replace_way,
                            use_forward1_next, use_forward2_next,
                            req_hit_way, plru_victim, rc_ok, perm_attr,
                            valid_ra, perm_ok, access_ok, req_op, req_go,
                            tlb_hit, tlb_way, cache_tag_set,
                            cancel_store, req_same_tag, r0_stall,
                            early_req_row)
        self.reservation_comb(m, cancel_store, set_rsrv, clear_rsrv,
                              r0_valid, r0, reservation)
        self.reservation_reg(m, r0_valid, access_ok, set_rsrv, clear_rsrv,
                             reservation, r0)
        self.writeback_control(m, r1, cache_out_row)
        self.rams(m, r1, early_req_row, cache_out_row, replace_way)
        self.dcache_fast_hit(m, req_op, r0_valid, r0, r1,
                             req_hit_way, req_index, req_tag, access_ok,
                             tlb_hit, tlb_req_index)
        self.dcache_slow(m, r1, use_forward1_next, use_forward2_next,
                         r0, replace_way,
                         req_hit_way, req_same_tag,
                         r0_valid, req_op, cache_tags, req_go, ra)
        #self.dcache_log(m, r1, valid_ra, tlb_hit, self.stall_out, req_op)

        return m


if __name__ == '__main__':
    dut = DCache()
    vl = rtlil.convert(dut, ports=[])
    with open("test_dcache.il", "w") as f:
        f.write(vl)
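
    # Optional smoke-test sketch: elaborate under the simulator and run a
    # few idle cycles.  This only checks that the design elaborates and
    # ticks (no d_in/m_in requests are driven); pass "sim" on the command
    # line to enable it.
    if 'sim' in sys.argv:
        sim = Simulator(DCache())
        sim.add_clock(1e-9)

        def process():
            for _ in range(10):
                yield  # idle cycles: d_in/m_in stay invalid
        sim.add_sync_process(process)
        with sim.write_vcd("test_dcache.vcd"):
            sim.run()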