src/soc/experiment/icache.py

   1 """ICache
   2
   3 based on Anton Blanchard microwatt icache.vhdl
   4
   5 Set associative icache
   6
   7 TODO (in no specific order):
   8 * Add debug interface to inspect cache content
   9 * Add snoop/invalidate path
  10 * Add multi-hit error detection
  11 * Pipelined bus interface (wb or axi)
  12 * Maybe add parity? There's a few bits free in each BRAM row on Xilinx
  13 * Add optimization: service hits on partially loaded lines
  14 * Add optimization: (maybe) interrupt reload on flush/redirect
  15 * Check if playing with the geometry of the cache tags allow for more
  16   efficient use of distributed RAM and less logic/muxes. Currently we
  17   write TAG_BITS width which may not match full ram blocks and might
  18   cause muxes to be inferred for "partial writes".
  19 * Check if making the read size of PLRU a ROM helps utilization
  20
  21 Links:
  22
  23 * https://bugs.libre-soc.org/show_bug.cgi?id=485
  24 * https://libre-soc.org/irclog-microwatt/%23microwatt.2021-12-07.log.html
  25   (discussion about brams for ECP5)
  26
  27 """
  28
  29 from enum import (Enum, unique)
  30 from nmigen import (Module, Signal, Elaboratable, Cat, Array, Const, Repl,
  31                     Record)
  32 from nmigen.cli import main, rtlil
  33 from nmutil.iocontrol import RecordObject
  34 from nmigen.utils import log2_int
  35 from nmigen.lib.coding import Decoder
  36 from nmutil.util import Display
  37
  38 #from nmutil.plru import PLRU
  39 from soc.experiment.plru import PLRU, PLRUs
  40 from soc.experiment.cache_ram import CacheRam
  41
  42 from soc.experiment.mem_types import (Fetch1ToICacheType,
  43                                       ICacheToDecode1Type,
  44                                       MMUToICacheType)
  45
  46 from soc.experiment.wb_types import (WB_ADDR_BITS, WB_DATA_BITS,
  47                                      WB_SEL_BITS, WBAddrType, WBDataType,
  48                                      WBSelType, WBMasterOut, WBSlaveOut,
  49                                      )
  50
  51 from nmigen_soc.wishbone.bus import Interface
  52 from soc.minerva.units.fetch import FetchUnitInterface
  53
  54
  55 # for test
  56 from soc.bus.sram import SRAM
  57 from nmigen import Memory
  58 from nmutil.util import wrap
  59 from nmigen.cli import main, rtlil
  60
  61 # NOTE: to use cxxsim, export NMIGEN_SIM_MODE=cxxsim from the shell
  62 # Also, check out the cxxsim nmigen branch, and latest yosys from git
  63 from nmutil.sim_tmp_alternative import Simulator, Settle
  64
  65
  66 SIM            = 0
  67 LINE_SIZE      = 64
  68 # BRAM organisation: We never access more than wishbone_data_bits
  69 # at a time so to save resources we make the array only that wide,
  70 # and use consecutive indices for to make a cache "line"
  71 #
  72 # ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
  73 ROW_SIZE       = WB_DATA_BITS // 8
  74 # Number of lines in a set
  75 NUM_LINES      = 32
  76 # Number of ways
  77 NUM_WAYS       = 4
  78 # L1 ITLB number of entries (direct mapped)
  79 TLB_SIZE       = 64
  80 # L1 ITLB log_2(page_size)
  81 TLB_LG_PGSZ    = 12
  82 # Number of real address bits that we store
  83 REAL_ADDR_BITS = 56
  84 # Non-zero to enable log data collection
  85 LOG_LENGTH     = 0
  86
  87 ROW_SIZE_BITS  = ROW_SIZE * 8
  88 # ROW_PER_LINE is the number of row (wishbone) transactions in a line
  89 ROW_PER_LINE   = LINE_SIZE // ROW_SIZE
  90 # BRAM_ROWS is the number of rows in BRAM needed to represent the full icache
  91 BRAM_ROWS      = NUM_LINES * ROW_PER_LINE
  92 # INSN_PER_ROW is the number of 32bit instructions per BRAM row
  93 INSN_PER_ROW   = ROW_SIZE_BITS // 32
  94
  95 # Bit fields counts in the address
  96 #
  97 # INSN_BITS is the number of bits to select an instruction in a row
  98 INSN_BITS      = log2_int(INSN_PER_ROW)
  99 # ROW_BITS is the number of bits to select a row
 100 ROW_BITS       = log2_int(BRAM_ROWS)
 101 # ROW_LINE_BITS is the number of bits to select a row within a line
 102 ROW_LINE_BITS  = log2_int(ROW_PER_LINE)
 103 # LINE_OFF_BITS is the number of bits for the offset in a cache line
 104 LINE_OFF_BITS  = log2_int(LINE_SIZE)
 105 # ROW_OFF_BITS is the number of bits for the offset in a row
 106 ROW_OFF_BITS   = log2_int(ROW_SIZE)
 107 # INDEX_BITS is the number of bits to select a cache line
 108 INDEX_BITS     = log2_int(NUM_LINES)
 109 # SET_SIZE_BITS is the log base 2 of the set size
 110 SET_SIZE_BITS  = LINE_OFF_BITS + INDEX_BITS
 111 # TAG_BITS is the number of bits of the tag part of the address
 112 TAG_BITS       = REAL_ADDR_BITS - SET_SIZE_BITS
 113 # TAG_WIDTH is the width in bits of each way of the tag RAM
 114 TAG_WIDTH      = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)
 115
 116 # WAY_BITS is the number of bits to select a way
 117 WAY_BITS       = log2_int(NUM_WAYS)
 118 TAG_RAM_WIDTH  = TAG_BITS * NUM_WAYS
 119
 120 # L1 ITLB
 121 TLB_BITS        = log2_int(TLB_SIZE)
 122 TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
 123 TLB_PTE_BITS    = 64
 124
 125 print("BRAM_ROWS       =", BRAM_ROWS)
 126 print("INDEX_BITS      =", INDEX_BITS)
 127 print("INSN_BITS       =", INSN_BITS)
 128 print("INSN_PER_ROW    =", INSN_PER_ROW)
 129 print("LINE_SIZE       =", LINE_SIZE)
 130 print("LINE_OFF_BITS   =", LINE_OFF_BITS)
 131 print("LOG_LENGTH      =", LOG_LENGTH)
 132 print("NUM_LINES       =", NUM_LINES)
 133 print("NUM_WAYS        =", NUM_WAYS)
 134 print("REAL_ADDR_BITS  =", REAL_ADDR_BITS)
 135 print("ROW_BITS        =", ROW_BITS)
 136 print("ROW_OFF_BITS    =", ROW_OFF_BITS)
 137 print("ROW_LINE_BITS   =", ROW_LINE_BITS)
 138 print("ROW_PER_LINE    =", ROW_PER_LINE)
 139 print("ROW_SIZE        =", ROW_SIZE)
 140 print("ROW_SIZE_BITS   =", ROW_SIZE_BITS)
 141 print("SET_SIZE_BITS   =", SET_SIZE_BITS)
 142 print("SIM             =", SIM)
 143 print("TAG_BITS        =", TAG_BITS)
 144 print("TAG_RAM_WIDTH   =", TAG_RAM_WIDTH)
 145 print("TAG_BITS        =", TAG_BITS)
 146 print("TLB_BITS        =", TLB_BITS)
 147 print("TLB_EA_TAG_BITS =", TLB_EA_TAG_BITS)
 148 print("TLB_LG_PGSZ     =", TLB_LG_PGSZ)
 149 print("TLB_PTE_BITS    =", TLB_PTE_BITS)
 150 print("TLB_SIZE        =", TLB_SIZE)
 151 print("WAY_BITS        =", WAY_BITS)
 152
 153 # from microwatt/utils.vhdl
 154 def ispow2(n):
 155     return n != 0 and (n & (n - 1)) == 0
 156
 157 assert LINE_SIZE % ROW_SIZE == 0
 158 assert ispow2(LINE_SIZE), "LINE_SIZE not power of 2"
 159 assert ispow2(NUM_LINES), "NUM_LINES not power of 2"
 160 assert ispow2(ROW_PER_LINE), "ROW_PER_LINE not power of 2"
 161 assert ispow2(INSN_PER_ROW), "INSN_PER_ROW not power of 2"
 162 assert (ROW_BITS == (INDEX_BITS + ROW_LINE_BITS)), \
 163     "geometry bits don't add up"
 164 assert (LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS)), \
 165    "geometry bits don't add up"
 166 assert (REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS)), \
 167     "geometry bits don't add up"
 168 assert (REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS)), \
 169     "geometry bits don't add up"
 170
 171 # Example of layout for 32 lines of 64 bytes:
 172 #
 173 # ..  tag    |index|  line  |
 174 # ..         |   row   |    |
 175 # ..         |     |   | |00| zero          (2)
 176 # ..         |     |   |-|  | INSN_BITS     (1)
 177 # ..         |     |---|    | ROW_LINE_BITS  (3)
 178 # ..         |     |--- - --| LINE_OFF_BITS (6)
 179 # ..         |         |- --| ROW_OFF_BITS  (3)
 180 # ..         |----- ---|    | ROW_BITS      (8)
 181 # ..         |-----|        | INDEX_BITS    (5)
  182 # .. --------|              | TAG_BITS      (45)
 183
 184 # The cache data BRAM organized as described above for each way
 185 #subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
 186 #
 187 # The cache tags LUTRAM has a row per set. Vivado is a pain and will
 188 # not handle a clean (commented) definition of the cache tags as a 3d
 189 # memory. For now, work around it by putting all the tags
 190 def CacheTagArray():
 191     tag_layout = [('valid', NUM_WAYS),
 192                   ('tag', TAG_RAM_WIDTH),
 193                  ]
 194     return Array(Record(tag_layout, name="tag%d" % x) for x in range(NUM_LINES))
 195
 196 def RowPerLineValidArray():
 197     return Array(Signal(name="rows_valid_%d" %x) \
 198                  for x in range(ROW_PER_LINE))
 199
 200
  201 # TODO to be passed to nmigen as ram attributes
 202 # attribute ram_style : string;
 203 # attribute ram_style of cache_tags : signal is "distributed";
 204
 205 def TLBArray():
 206     tlb_layout = [('valid', 1),
 207                   ('tag', TLB_EA_TAG_BITS),
 208                   ('pte', TLB_PTE_BITS)
 209                  ]
 210     return Array(Record(tlb_layout, name="tlb%d" % x) for x in range(TLB_SIZE))
 211
 212 # Cache RAM interface
 213 def CacheRamOut():
 214     return Array(Signal(ROW_SIZE_BITS, name="cache_out_%d" %x) \
 215                  for x in range(NUM_WAYS))
 216
 217 # PLRU output interface
 218 def PLRUOut():
 219     return Array(Signal(WAY_BITS, name="plru_out_%d" %x) \
 220                  for x in range(NUM_LINES))
 221
 222 # Return the cache line index (tag index) for an address
 223 def get_index(addr):
 224     return addr[LINE_OFF_BITS:SET_SIZE_BITS]
 225
 226 # Return the cache row index (data memory) for an address
 227 def get_row(addr):
 228     return addr[ROW_OFF_BITS:SET_SIZE_BITS]
 229
 230 # Return the index of a row within a line
 231 def get_row_of_line(row):
 232     return row[:ROW_BITS][:ROW_LINE_BITS]
 233
 234 # Returns whether this is the last row of a line
 235 def is_last_row_addr(addr, last):
 236     return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last
 237
 238 # Returns whether this is the last row of a line
 239 def is_last_row(row, last):
 240     return get_row_of_line(row) == last
 241
 242 # Return the next row in the current cache line. We use a dedicated
 243 # function in order to limit the size of the generated adder to be
 244 # only the bits within a cache line (3 bits with default settings)
 245 def next_row(row):
 246     row_v = row[0:ROW_LINE_BITS] + 1
 247     return Cat(row_v[:ROW_LINE_BITS], row[ROW_LINE_BITS:])
 248
 249 # Read the instruction word for the given address
 250 # in the current cache row
 251 def read_insn_word(addr, data):
 252     word = addr[2:INSN_BITS+2]
 253     return data.word_select(word, 32)
 254
 255 # Get the tag value from the address
 256 def get_tag(addr):
 257     return addr[SET_SIZE_BITS:REAL_ADDR_BITS]
 258
 259 # Read a tag from a tag memory row
 260 def read_tag(way, tagset):
 261     return tagset.word_select(way, TAG_WIDTH)[:TAG_BITS]
 262
 263 # Write a tag to tag memory row
 264 def write_tag(way, tagset, tag):
 265     return read_tag(way, tagset).eq(tag)
 266
 267 # Simple hash for direct-mapped TLB index
 268 def hash_ea(addr):
 269     hsh = (addr[TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_BITS] ^
 270            addr[TLB_LG_PGSZ + TLB_BITS:TLB_LG_PGSZ + 2 * TLB_BITS ] ^
 271            addr[TLB_LG_PGSZ + 2 * TLB_BITS:TLB_LG_PGSZ + 3 * TLB_BITS])
 272     return hsh
 273
 274
 275 # Cache reload state machine
@unique
class State(Enum):
    """States of the cache reload (miss handling) state machine."""
    IDLE     = 0  # waiting for a miss
    CLR_TAG  = 1  # first cycle after a miss: pick way, rewrite its tag
    WAIT_ACK = 2  # waiting for the wishbone acks of the line reload
 281
 282
class RegInternal(RecordObject):
    """Internal state of the icache, grouped into one RecordObject.

    Holds the latched hit information, the reload state machine state
    and the TLB-miss flag.
    """
    def __init__(self):
        super().__init__()
        # Cache hit state (Latches for 1 cycle BRAM access)
        self.hit_way      = Signal(WAY_BITS)
        self.hit_nia      = Signal(64)
        self.hit_smark    = Signal()
        self.hit_valid    = Signal()

        # Cache miss state (reload state machine)
        self.state        = Signal(State, reset=State.IDLE)
        self.wb           = WBMasterOut("wb")
        self.req_adr      = Signal(64)          # address of row being requested
        self.store_way    = Signal(WAY_BITS)    # way being reloaded
        self.store_index  = Signal(INDEX_BITS)  # line being reloaded
        self.store_row    = Signal(ROW_BITS)    # row the next ack is stored to
        self.store_tag    = Signal(TAG_BITS)    # tag of the line being reloaded
        self.store_valid  = Signal()            # cleared by an invalidate
        self.end_row_ix   = Signal(ROW_LINE_BITS) # last row of the reload
        self.rows_valid   = RowPerLineValidArray() # rows already reloaded

        # TLB miss state
        self.fetch_failed = Signal()
 306
 307
 308 class ICache(FetchUnitInterface, Elaboratable):
 309     """64 bit direct mapped icache. All instructions are 4B aligned."""
 310     def __init__(self, pspec):
 311         FetchUnitInterface.__init__(self, pspec)
 312         self.i_in           = Fetch1ToICacheType(name="i_in")
 313         self.i_out          = ICacheToDecode1Type(name="i_out")
 314
 315         self.m_in           = MMUToICacheType(name="m_in")
 316
 317         self.stall_in       = Signal()
 318         self.stall_out      = Signal()
 319         self.flush_in       = Signal()
 320         self.inval_in       = Signal()
 321
 322         # standard naming (wired to non-standard for compatibility)
 323         self.bus = Interface(addr_width=32,
 324                             data_width=64,
 325                             granularity=8,
 326                             features={'stall'},
 327                             alignment=0,
 328                             name="icache_wb")
 329
 330         self.log_out        = Signal(54)
 331
 332         # use FetchUnitInterface, helps keep some unit tests running
 333         self.use_fetch_iface = False
 334
    def use_fetch_interface(self):
        """Set ``use_fetch_iface`` (False after __init__) to True.

        NOTE(review): where the flag is consumed is outside this view;
        presumably elaborate() wires up the FetchUnitInterface ports
        when it is set -- confirm.
        """
        self.use_fetch_iface = True
 337
 338     # Generate a cache RAM for each way
 339     def rams(self, m, r, cache_out_row, use_previous,
 340              replace_way, req_row):
 341
 342         comb = m.d.comb
 343         sync = m.d.sync
 344
 345         bus, stall_in = self.bus, self.stall_in
 346
 347         # read condition (for every cache ram)
 348         do_read  = Signal()
 349         comb += do_read.eq(~(stall_in | use_previous))
 350
 351         rd_addr  = Signal(ROW_BITS)
 352         wr_addr  = Signal(ROW_BITS)
 353         comb += rd_addr.eq(req_row)
 354         comb += wr_addr.eq(r.store_row)
 355
 356         # binary-to-unary converters: replace-way enabled by bus.ack,
 357         # hit-way left permanently enabled
 358         m.submodules.replace_way_e = re = Decoder(NUM_WAYS)
 359         m.submodules.hit_way_e = he = Decoder(NUM_WAYS)
 360         comb += re.i.eq(replace_way)
 361         comb += re.n.eq(~bus.ack)
 362         comb += he.i.eq(r.hit_way)
 363
 364         for i in range(NUM_WAYS):
 365             do_write = Signal(name="do_wr_%d" % i)
 366             d_out    = Signal(ROW_SIZE_BITS, name="d_out_%d" % i)
 367             wr_sel   = Signal(ROW_SIZE, name="wr_sel_%d" % i)
 368
 369             way = CacheRam(ROW_BITS, ROW_SIZE_BITS, TRACE=True, ram_num=i)
 370             m.submodules["cacheram_%d" % i] =  way
 371
 372             comb += way.rd_en.eq(do_read)
 373             comb += way.rd_addr.eq(rd_addr)
 374             comb += d_out.eq(way.rd_data_o)
 375             comb += way.wr_sel.eq(wr_sel)
 376             comb += way.wr_addr.eq(wr_addr)
 377             comb += way.wr_data.eq(bus.dat_r)
 378
 379             comb += do_write.eq(re.o[i])
 380
 381             with m.If(do_write):
 382                 sync += Display("cache write adr: %x data: %lx",
 383                                 wr_addr, way.wr_data)
 384
 385             with m.If(he.o[i]):
 386                 comb += cache_out_row.eq(d_out)
 387                 with m.If(do_read):
 388                     sync += Display("cache read adr: %x data: %x",
 389                                      req_row, d_out)
 390
 391             comb += wr_sel.eq(Repl(do_write, ROW_SIZE))
 392
 393     # Generate PLRUs
 394     def maybe_plrus(self, m, r, plru_victim):
 395         comb = m.d.comb
 396
 397         if NUM_WAYS == 0:
 398             return
 399
 400
 401         m.submodules.plrus = plru = PLRUs(NUM_LINES, WAY_BITS)
 402         comb += plru.way.eq(r.hit_way)
 403         comb += plru.valid.eq(r.hit_valid)
 404         comb += plru.index.eq(get_index(r.hit_nia))
 405         comb += plru.isel.eq(r.store_index) # select victim
 406         comb += plru_victim.eq(plru.o_index) # selected victim
 407
 408     # TLB hit detection and real address generation
 409     def itlb_lookup(self, m, tlb_req_index, itlb,
 410                     real_addr, ra_valid, eaa_priv,
 411                     priv_fault, access_ok):
 412
 413         comb = m.d.comb
 414
 415         i_in = self.i_in
 416
 417         pte  = Signal(TLB_PTE_BITS)
 418         ttag = Signal(TLB_EA_TAG_BITS)
 419
 420         comb += tlb_req_index.eq(hash_ea(i_in.nia))
 421         comb += pte.eq(itlb[tlb_req_index].pte)
 422         comb += ttag.eq(itlb[tlb_req_index].tag)
 423
 424         with m.If(i_in.virt_mode):
 425             comb += real_addr.eq(Cat(
 426                      i_in.nia[:TLB_LG_PGSZ],
 427                      pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
 428                     ))
 429
 430             with m.If(ttag == i_in.nia[TLB_LG_PGSZ + TLB_BITS:64]):
 431                 comb += ra_valid.eq(itlb[tlb_req_index].valid)
 432
 433             comb += eaa_priv.eq(pte[3])
 434
 435         with m.Else():
 436             comb += real_addr.eq(i_in.nia[:REAL_ADDR_BITS])
 437             comb += ra_valid.eq(1)
 438             comb += eaa_priv.eq(1)
 439
 440         # No IAMR, so no KUEP support for now
 441         comb += priv_fault.eq(eaa_priv & ~i_in.priv_mode)
 442         comb += access_ok.eq(ra_valid & ~priv_fault)
 443
 444     # iTLB update
 445     def itlb_update(self, m, itlb):
 446         comb = m.d.comb
 447         sync = m.d.sync
 448
 449         m_in = self.m_in
 450
 451         wr_index = Signal(TLB_SIZE)
 452         comb += wr_index.eq(hash_ea(m_in.addr))
 453
 454         with m.If(m_in.tlbie & m_in.doall):
 455             # Clear all valid bits
 456             for i in range(TLB_SIZE):
 457                 sync += itlb[i].valid.eq(0)
 458
 459         with m.Elif(m_in.tlbie):
 460             # Clear entry regardless of hit or miss
 461             sync += itlb[wr_index].valid.eq(0)
 462
 463         with m.Elif(m_in.tlbld):
 464             sync += itlb[wr_index].tag.eq(m_in.addr[TLB_LG_PGSZ + TLB_BITS:64])
 465             sync += itlb[wr_index].pte.eq(m_in.pte)
 466             sync += itlb[wr_index].valid.eq(1)
 467
 468     # Cache hit detection, output to fetch2 and other misc logic
    def icache_comb(self, m, use_previous, r, req_index, req_row,
                    req_hit_way, req_tag, real_addr, req_laddr,
                    cache_tags, access_ok,
                    req_is_hit, req_is_miss, replace_way,
                    plru_victim, cache_out_row):
        """Cache hit detection, output to fetch2 and other misc logic.

        Purely combinatorial.  Drives ``use_previous``, the request
        fields (``req_index``, ``req_row``, ``req_tag``, ``req_laddr``),
        the hit/miss results (``req_is_hit``, ``req_is_miss``,
        ``req_hit_way``), ``replace_way``, the ``i_out`` port,
        ``stall_out`` and the wishbone master outputs on ``self.bus``.
        """

        comb = m.d.comb

        i_in, i_out, bus = self.i_in, self.i_out, self.bus
        flush_in, stall_out = self.flush_in, self.stall_out

        is_hit  = Signal()
        hit_way = Signal(WAY_BITS)

        # i_in.sequential means that i_in.nia this cycle is 4 more than
        # last cycle.  If we read more than 32 bits at a time, had a
        # cache hit last cycle, and we don't want the first 32-bit chunk
        # then we can keep the data we read last cycle and just use that.
        with m.If(i_in.nia[2:INSN_BITS+2] != 0):
            comb += use_previous.eq(i_in.sequential & r.hit_valid)

        # Extract line, row and tag from request
        # (index and row come from the fetch address, the tag from the
        # translated real address)
        comb += req_index.eq(get_index(i_in.nia))
        comb += req_row.eq(get_row(i_in.nia))
        comb += req_tag.eq(get_tag(real_addr))

        # Calculate address of beginning of cache row, will be
        # used for cache miss processing if needed
        comb += req_laddr.eq(Cat(
                 Const(0, ROW_OFF_BITS),
                 real_addr[ROW_OFF_BITS:REAL_ADDR_BITS],
                ))

        # Test if pending request is a hit on any way
        # hitcond: a reload of the requested line is in progress and the
        # requested row of it has already been received
        hitcond = Signal()
        comb += hitcond.eq((r.state == State.WAIT_ACK)
                 & (req_index == r.store_index)
                 & r.rows_valid[req_row % ROW_PER_LINE]
                )
        # i_in.req asserts Decoder active
        cvb = Signal(NUM_WAYS)
        ctag = Signal(TAG_RAM_WIDTH)
        comb += ctag.eq(cache_tags[req_index].tag)
        comb += cvb.eq(cache_tags[req_index].valid)
        m.submodules.store_way_e = se = Decoder(NUM_WAYS)
        comb += se.i.eq(r.store_way)
        comb += se.n.eq(~i_in.req)
        for i in range(NUM_WAYS):
            tagi = Signal(TAG_BITS, name="tag_i%d" % i)
            hit_test = Signal(name="hit_test%d" % i)
            is_tag_hit = Signal(name="is_tag_hit_%d" % i)
            comb += tagi.eq(read_tag(i, ctag))
            comb += hit_test.eq(se.o[i])
            # a way hits when its tag matches and it is either valid or
            # is the way being reloaded with the wanted row present
            comb += is_tag_hit.eq((cvb[i] | (hitcond & hit_test)) &
                                  (tagi == req_tag))
            # later ways override earlier ones if several match
            with m.If(is_tag_hit):
                comb += hit_way.eq(i)
                comb += is_hit.eq(1)

        # Generate the "hit" and "miss" signals
        # for the synchronous blocks
        with m.If(i_in.req & access_ok & ~flush_in):
            comb += req_is_hit.eq(is_hit)
            comb += req_is_miss.eq(~is_hit)

        comb += req_hit_way.eq(hit_way)

        # The way to replace on a miss
        # (the PLRU victim is only sampled in CLR_TAG; afterwards the
        # latched r.store_way is used)
        with m.If(r.state == State.CLR_TAG):
            comb += replace_way.eq(plru_victim)
        with m.Else():
            comb += replace_way.eq(r.store_way)

        # Output instruction from current cache row
        #
        # Note: This is a mild violation of our design principle of
        # having pipeline stages output from a clean latch. In this
        # case we output the result of a mux. The alternative would
        # be output an entire row which I prefer not to do just yet
        # as it would force fetch2 to know about some of the cache
        # geometry information.
        comb += i_out.insn.eq(read_insn_word(r.hit_nia, cache_out_row))
        comb += i_out.valid.eq(r.hit_valid)
        comb += i_out.nia.eq(r.hit_nia)
        comb += i_out.stop_mark.eq(r.hit_smark)
        comb += i_out.fetch_failed.eq(r.fetch_failed)

        # Stall fetch1 if we have a miss on cache or TLB
        # or a protection fault
        comb += stall_out.eq(~(is_hit & access_ok))

        # Wishbone requests output (from the cache miss reload machine)
        comb += bus.we.eq(r.wb.we)
        comb += bus.adr.eq(r.wb.adr)
        comb += bus.sel.eq(r.wb.sel)
        comb += bus.stb.eq(r.wb.stb)
        comb += bus.dat_w.eq(r.wb.dat)
        comb += bus.cyc.eq(r.wb.cyc)
 567
 568     # Cache hit synchronous machine
 569     def icache_hit(self, m, use_previous, r, req_is_hit, req_hit_way,
 570                    req_index, req_tag, real_addr):
 571         sync = m.d.sync
 572
 573         i_in, stall_in = self.i_in, self.stall_in
 574         flush_in       = self.flush_in
 575
 576         # keep outputs to fetch2 unchanged on a stall
 577         # except that flush or reset sets valid to 0
 578         # If use_previous, keep the same data as last
 579         # cycle and use the second half
 580         with m.If(stall_in | use_previous):
 581             with m.If(flush_in):
 582                 sync += r.hit_valid.eq(0)
 583         with m.Else():
 584             # On a hit, latch the request for the next cycle,
 585             # when the BRAM data will be available on the
 586             # cache_out output of the corresponding way
 587             sync += r.hit_valid.eq(req_is_hit)
 588
 589             with m.If(req_is_hit):
 590                 sync += r.hit_way.eq(req_hit_way)
 591                 sync += Display("cache hit nia:%x IR:%x SM:%x idx:%x tag:%x "
 592                                 "way:%x RA:%x", i_in.nia, i_in.virt_mode,
 593                                  i_in.stop_mark, req_index, req_tag,
 594                                  req_hit_way, real_addr)
 595
 596         with m.If(~stall_in):
 597             # Send stop marks and NIA down regardless of validity
 598             sync += r.hit_smark.eq(i_in.stop_mark)
 599             sync += r.hit_nia.eq(i_in.nia)
 600
 601     def icache_miss_idle(self, m, r, req_is_miss, req_laddr,
 602                          req_index, req_tag, replace_way, real_addr):
 603         comb = m.d.comb
 604         sync = m.d.sync
 605
 606         i_in = self.i_in
 607
 608         # Reset per-row valid flags, only used in WAIT_ACK
 609         for i in range(ROW_PER_LINE):
 610             sync += r.rows_valid[i].eq(0)
 611
 612         # We need to read a cache line
 613         with m.If(req_is_miss):
 614             sync += Display(
 615                      "cache miss nia:%x IR:%x SM:%x idx:%x "
 616                      " way:%x tag:%x RA:%x", i_in.nia,
 617                      i_in.virt_mode, i_in.stop_mark, req_index,
 618                      replace_way, req_tag, real_addr)
 619
 620             # Keep track of our index and way for subsequent stores
 621             st_row = Signal(ROW_BITS)
 622             comb += st_row.eq(get_row(req_laddr))
 623             sync += r.store_index.eq(req_index)
 624             sync += r.store_row.eq(st_row)
 625             sync += r.store_tag.eq(req_tag)
 626             sync += r.store_valid.eq(1)
 627             sync += r.end_row_ix.eq(get_row_of_line(st_row) - 1)
 628
 629             # Prep for first wishbone read.  We calculate the address
 630             # of the start of the cache line and start the WB cycle.
 631             sync += r.req_adr.eq(req_laddr)
 632             sync += r.wb.cyc.eq(1)
 633             sync += r.wb.stb.eq(1)
 634
 635             # Track that we had one request sent
 636             sync += r.state.eq(State.CLR_TAG)
 637
 638     def icache_miss_clr_tag(self, m, r, replace_way,
 639                             req_index,
 640                             tagset, cache_tags):
 641         comb = m.d.comb
 642         sync = m.d.sync
 643
 644         # Get victim way from plru
 645         sync += r.store_way.eq(replace_way)
 646
 647         # Force misses on that way while reloading that line
 648         cv = Signal(INDEX_BITS)
 649         comb += cv.eq(cache_tags[req_index].valid)
 650         comb += cv.bit_select(replace_way, 1).eq(0)
 651         sync += cache_tags[req_index].valid.eq(cv)
 652
 653         for i in range(NUM_WAYS):
 654             with m.If(i == replace_way):
 655                 comb += tagset.eq(cache_tags[r.store_index].tag)
 656                 comb += write_tag(i, tagset, r.store_tag)
 657                 sync += cache_tags[r.store_index].tag.eq(tagset)
 658
 659         sync += r.state.eq(State.WAIT_ACK)
 660
 661     def icache_miss_wait_ack(self, m, r, replace_way, inval_in,
 662                              cache_tags, stbs_done):
 663         comb = m.d.comb
 664         sync = m.d.sync
 665
 666         bus = self.bus
 667
 668         # Requests are all sent if stb is 0
 669         stbs_zero = Signal()
 670         comb += stbs_zero.eq(r.wb.stb == 0)
 671         comb += stbs_done.eq(stbs_zero)
 672
 673         # If we are still sending requests, was one accepted?
 674         with m.If(~bus.stall & ~stbs_zero):
 675             # That was the last word? We are done sending.
 676             # Clear stb and set stbs_done so we can handle
 677             # an eventual last ack on the same cycle.
 678             with m.If(is_last_row_addr(r.req_adr, r.end_row_ix)):
 679                 sync += Display("IS_LAST_ROW_ADDR r.wb.addr:%x "
 680                          "r.end_row_ix:%x r.wb.stb:%x stbs_zero:%x "
 681                          "stbs_done:%x", r.wb.adr, r.end_row_ix,
 682                          r.wb.stb, stbs_zero, stbs_done)
 683                 sync += r.wb.stb.eq(0)
 684                 comb += stbs_done.eq(1)
 685
 686             # Calculate the next row address
 687             rarange = Signal(LINE_OFF_BITS - ROW_OFF_BITS)
 688             comb += rarange.eq(r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS] + 1)
 689             sync += r.req_adr[ROW_OFF_BITS:LINE_OFF_BITS].eq(rarange)
 690             sync += Display("RARANGE r.req_adr:%x rarange:%x "
 691                             "stbs_zero:%x stbs_done:%x",
 692                             r.req_adr, rarange, stbs_zero, stbs_done)
 693
 694         # Incoming acks processing
 695         with m.If(bus.ack):
 696             sync += Display("WB_IN_ACK data:%x stbs_zero:%x "
 697                             "stbs_done:%x",
 698                             bus.dat_r, stbs_zero, stbs_done)
 699
 700             sync += r.rows_valid[r.store_row % ROW_PER_LINE].eq(1)
 701
 702             # Check for completion
 703             with m.If(stbs_done & is_last_row(r.store_row, r.end_row_ix)):
 704                 # Complete wishbone cycle
 705                 sync += r.wb.cyc.eq(0)
 706                 # be nice, clear addr
 707                 sync += r.req_adr.eq(0)
 708
 709                 # Cache line is now valid
 710                 cv = Signal(INDEX_BITS)
 711                 comb += cv.eq(cache_tags[r.store_index].valid)
 712                 comb += cv.bit_select(replace_way, 1).eq(
 713                          r.store_valid & ~inval_in)
 714                 sync += cache_tags[r.store_index].valid.eq(cv)
 715
 716                 sync += r.state.eq(State.IDLE)
 717
 718             # move on to next request in row
 719             # Increment store row counter
 720             sync += r.store_row.eq(next_row(r.store_row))
 721
 722     # Cache miss/reload synchronous machine
 723     def icache_miss(self, m, r, req_is_miss,
 724                     req_index, req_laddr, req_tag, replace_way,
 725                     cache_tags, access_ok, real_addr):
 726         comb = m.d.comb
 727         sync = m.d.sync
 728
 729         i_in, bus, m_in  = self.i_in, self.bus, self.m_in
 730         stall_in, flush_in = self.stall_in, self.flush_in
 731         inval_in           = self.inval_in
 732
 733         tagset    = Signal(TAG_RAM_WIDTH)
 734         stbs_done = Signal()
 735
 736         comb += r.wb.sel.eq(-1)
 737         comb += r.wb.adr.eq(r.req_adr[3:])
 738
 739         # Process cache invalidations
 740         with m.If(inval_in):
 741             for i in range(NUM_LINES):
 742                 sync += cache_tags[i].valid.eq(0)
 743             sync += r.store_valid.eq(0)
 744
 745         # Main state machine
 746         with m.Switch(r.state):
 747
 748             with m.Case(State.IDLE):
 749                 self.icache_miss_idle(m, r, req_is_miss, req_laddr,
 750                                       req_index, req_tag, replace_way,
 751                                       real_addr)
 752
 753             with m.Case(State.CLR_TAG, State.WAIT_ACK):
 754                 with m.If(r.state == State.CLR_TAG):
 755                     self.icache_miss_clr_tag(m, r, replace_way,
 756                                              req_index, tagset, cache_tags)
 757
 758                 self.icache_miss_wait_ack(m, r, replace_way, inval_in,
 759                                           cache_tags, stbs_done)
 760
 761         # TLB miss and protection fault processing
 762         with m.If(flush_in | m_in.tlbld):
 763             sync += r.fetch_failed.eq(0)
 764         with m.Elif(i_in.req & ~access_ok & ~stall_in):
 765             sync += r.fetch_failed.eq(1)
 766
 767     # icache_log: if LOG_LENGTH > 0 generate
 768     def icache_log(self, m, req_hit_way, ra_valid, access_ok,
 769                    req_is_miss, req_is_hit, lway, wstate, r):
 770         comb = m.d.comb
 771         sync = m.d.sync
 772
 773         bus, i_out       = self.bus, self.i_out
 774         log_out, stall_out = self.log_out, self.stall_out
 775
 776         # Output data to logger
 777         for i in range(LOG_LENGTH):
 778             log_data = Signal(54)
 779             lway     = Signal(WAY_BITS)
 780             wstate   = Signal()
 781
 782             sync += lway.eq(req_hit_way)
 783             sync += wstate.eq(0)
 784
 785             with m.If(r.state != State.IDLE):
 786                 sync += wstate.eq(1)
 787
 788             sync += log_data.eq(Cat(
 789                      ra_valid, access_ok, req_is_miss, req_is_hit,
 790                      lway, wstate, r.hit_nia[2:6], r.fetch_failed,
 791                      stall_out, bus.stall, r.wb.cyc, r.wb.stb,
 792                      r.real_addr[3:6], bus.ack, i_out.insn, i_out.valid
 793                     ))
 794             comb += log_out.eq(log_data)
 795
 796     def elaborate(self, platform):
 797
 798         m                = Module()
 799         comb             = m.d.comb
 800
 801         # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
 802         cache_tags       = CacheTagArray()
 803
 804         # TLB Array
 805         itlb            = TLBArray()
 806
 807         # TODO to be passed to nmigen as ram attributes
 808         # attribute ram_style of itlb_tags : signal is "distributed";
 809         # attribute ram_style of itlb_ptes : signal is "distributed";
 810
 811         # Privilege bit from PTE EAA field
 812         eaa_priv         = Signal()
 813
 814         r                = RegInternal()
 815
 816         # Async signal on incoming request
 817         req_index        = Signal(INDEX_BITS)
 818         req_row          = Signal(ROW_BITS)
 819         req_hit_way      = Signal(WAY_BITS)
 820         req_tag          = Signal(TAG_BITS)
 821         req_is_hit       = Signal()
 822         req_is_miss      = Signal()
 823         req_laddr        = Signal(64)
 824
 825         tlb_req_index    = Signal(TLB_BITS)
 826         real_addr        = Signal(REAL_ADDR_BITS)
 827         ra_valid         = Signal()
 828         priv_fault       = Signal()
 829         access_ok        = Signal()
 830         use_previous     = Signal()
 831
 832         cache_out_row    = Signal(ROW_SIZE_BITS)
 833
 834         plru_victim      = Signal(WAY_BITS)
 835         replace_way      = Signal(WAY_BITS)
 836
 837         # call sub-functions putting everything together,
 838         # using shared signals established above
 839         self.rams(m, r, cache_out_row, use_previous, replace_way, req_row)
 840         self.maybe_plrus(m, r, plru_victim)
 841         self.itlb_lookup(m, tlb_req_index, itlb, real_addr,
 842                          ra_valid, eaa_priv, priv_fault,
 843                          access_ok)
 844         self.itlb_update(m, itlb)
 845         self.icache_comb(m, use_previous, r, req_index, req_row, req_hit_way,
 846                          req_tag, real_addr, req_laddr,
 847                          cache_tags, access_ok, req_is_hit, req_is_miss,
 848                          replace_way, plru_victim, cache_out_row)
 849         self.icache_hit(m, use_previous, r, req_is_hit, req_hit_way,
 850                         req_index, req_tag, real_addr)
 851         self.icache_miss(m, r, req_is_miss, req_index,
 852                          req_laddr, req_tag, replace_way, cache_tags,
 853                          access_ok, real_addr)
 854         #self.icache_log(m, log_out, req_hit_way, ra_valid, access_ok,
 855         #                req_is_miss, req_is_hit, lway, wstate, r)
 856
 857         # don't connect up to FetchUnitInterface so that some unit tests
 858         # can continue to operate
 859         if not self.use_fetch_iface:
 860             return m
 861
 862         # connect to FetchUnitInterface. FetchUnitInterface is undocumented
 863         # so needs checking and iterative revising
 864         i_in, bus, i_out = self.i_in, self.bus, self.i_out
 865         comb += i_in.req.eq(self.a_i_valid)
 866         comb += i_in.nia.eq(self.a_pc_i)
 867         comb += self.stall_in.eq(self.a_stall_i)
 868         comb += self.f_fetch_err_o.eq(i_out.fetch_failed)
 869         comb += self.f_badaddr_o.eq(i_out.nia)
 870         comb += self.f_instr_o.eq(i_out.insn)
 871         comb += self.f_busy_o.eq(~i_out.valid) # probably
 872
 873         # TODO, connect dcache wb_in/wb_out to "standard" nmigen Wishbone bus
 874         ibus = self.ibus
 875         comb += ibus.adr.eq(self.bus.adr)
 876         comb += ibus.dat_w.eq(self.bus.dat_w)
 877         comb += ibus.sel.eq(self.bus.sel)
 878         comb += ibus.cyc.eq(self.bus.cyc)
 879         comb += ibus.stb.eq(self.bus.stb)
 880         comb += ibus.we.eq(self.bus.we)
 881
 882         comb += self.bus.dat_r.eq(ibus.dat_r)
 883         comb += self.bus.ack.eq(ibus.ack)
 884         if hasattr(ibus, "stall"):
 885             comb += self.bus.stall.eq(ibus.stall)
 886         else:
 887             # fake-up the wishbone stall signal to comply with pipeline mode
 888             # same thing is done in dcache.py
 889             comb += self.bus.stall.eq(self.bus.cyc & ~self.bus.ack)
 890
 891         return m
 892
 893
 894 def icache_sim(dut):
 895     i_in = dut.i_in
 896     i_out  = dut.i_out
 897     m_out = dut.m_in
 898
 899     yield i_in.priv_mode.eq(1)
 900     yield i_in.req.eq(0)
 901     yield i_in.nia.eq(0)
 902     yield i_in.stop_mark.eq(0)
 903     yield m_out.tlbld.eq(0)
 904     yield m_out.tlbie.eq(0)
 905     yield m_out.addr.eq(0)
 906     yield m_out.pte.eq(0)
 907     yield
 908     yield
 909     yield
 910     yield
 911
 912     # miss, stalls for a bit
 913     yield i_in.req.eq(1)
 914     yield i_in.nia.eq(Const(0x0000000000000004, 64))
 915     yield
 916     valid = yield i_out.valid
 917     while not valid:
 918         yield
 919         valid = yield i_out.valid
 920     yield i_in.req.eq(0)
 921
 922     insn  = yield i_out.insn
 923     nia   = yield i_out.nia
 924     assert insn == 0x00000001, \
 925         "insn @%x=%x expected 00000001" % (nia, insn)
 926     yield i_in.req.eq(0)
 927     yield
 928
 929     # hit
 930     yield i_in.req.eq(1)
 931     yield i_in.nia.eq(Const(0x0000000000000008, 64))
 932     yield
 933     valid = yield i_out.valid
 934     while not valid:
 935         yield
 936         valid = yield i_out.valid
 937     yield i_in.req.eq(0)
 938
 939     nia   = yield i_out.nia
 940     insn  = yield i_out.insn
 941     yield
 942     assert insn == 0x00000002, \
 943         "insn @%x=%x expected 00000002" % (nia, insn)
 944
 945     # another miss
 946     yield i_in.req.eq(1)
 947     yield i_in.nia.eq(Const(0x0000000000000040, 64))
 948     yield
 949     valid = yield i_out.valid
 950     while not valid:
 951         yield
 952         valid = yield i_out.valid
 953     yield i_in.req.eq(0)
 954
 955     nia   = yield i_in.nia
 956     insn  = yield i_out.insn
 957     assert insn == 0x00000010, \
 958         "insn @%x=%x expected 00000010" % (nia, insn)
 959
 960     # test something that aliases (this only works because
 961     # the unit test SRAM is a depth of 512)
 962     yield i_in.req.eq(1)
 963     yield i_in.nia.eq(Const(0x0000000000000100, 64))
 964     yield
 965     yield
 966     valid = yield i_out.valid
 967     assert ~valid
 968     for i in range(30):
 969         yield
 970     yield
 971     insn  = yield i_out.insn
 972     valid = yield i_out.valid
 973     insn  = yield i_out.insn
 974     assert valid
 975     assert insn == 0x00000040, \
 976          "insn @%x=%x expected 00000040" % (nia, insn)
 977     yield i_in.req.eq(0)
 978
 979
 980 def test_icache(mem):
 981     from soc.config.test.test_loadstore import TestMemPspec
 982     pspec = TestMemPspec(addr_wid=32,
 983                          mask_wid=8,
 984                          reg_wid=64,
 985                          )
 986     dut    = ICache(pspec)
 987
 988     memory = Memory(width=64, depth=512, init=mem)
 989     sram   = SRAM(memory=memory, granularity=8)
 990
 991     m      = Module()
 992
 993     m.submodules.icache = dut
 994     m.submodules.sram   = sram
 995
 996     m.d.comb += sram.bus.cyc.eq(dut.bus.cyc)
 997     m.d.comb += sram.bus.stb.eq(dut.bus.stb)
 998     m.d.comb += sram.bus.we.eq(dut.bus.we)
 999     m.d.comb += sram.bus.sel.eq(dut.bus.sel)
1000     m.d.comb += sram.bus.adr.eq(dut.bus.adr)
1001     m.d.comb += sram.bus.dat_w.eq(dut.bus.dat_w)
1002
1003     m.d.comb += dut.bus.ack.eq(sram.bus.ack)
1004     m.d.comb += dut.bus.dat_r.eq(sram.bus.dat_r)
1005
1006     # nmigen Simulation
1007     sim = Simulator(m)
1008     sim.add_clock(1e-6)
1009
1010     sim.add_sync_process(wrap(icache_sim(dut)))
1011     with sim.write_vcd('test_icache.vcd'):
1012          sim.run()
1013
1014
if __name__ == '__main__':
    from soc.config.test.test_loadstore import TestMemPspec

    # Convert a 64-bit-address ICache to RTLIL and save it.
    dut = ICache(TestMemPspec(addr_wid=64, mask_wid=8, reg_wid=64))
    with open("test_icache.il", "w") as f:
        f.write(rtlil.convert(dut, ports=[]))

    # Memory image with incrementing 32-bit values 0 1 2 ...: each 64-bit
    # entry i packs 2*i in the low half and 2*i+1 in the high half.
    mem = [(i*2) | ((i*2+1) << 32) for i in range(512)]

    test_icache(mem)