From: Cole Poirier
Date: Tue, 25 Aug 2020 20:33:17 +0000 (-0700)
Subject: dcache.py rearrange, transform classes into functions with input
X-Git-Tag: semi_working_ecp5~255
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d197e57f759767537576f4115c97bc63b0a62401;p=soc.git

dcache.py rearrange, transform classes into functions with input
parameters, fix typos, whitespace, syntax
---

diff --git a/src/soc/experiment/dcache.py b/src/soc/experiment/dcache.py
index f9ea561d..70c5a073 100644
--- a/src/soc/experiment/dcache.py
+++ b/src/soc/experiment/dcache.py
@@ -209,618 +209,1045 @@ class Dcache(Elaboratable):
     self.log_out = Signal(20)

-    def elaborate(self, platform):
-        LINE_SIZE = self.LINE_SIZE
-        NUM_LINES = self.NUM_LINES
-        NUM_WAYS = self.NUM_WAYS
-        TLB_SET_SIZE = self.TLB_SET_SIZE
-        TLB_NUM_WAYS = self.TLB_NUM_WAYS
-        TLB_LG_PGSZ = self.TLB_LG_PGSZ
-        LOG_LENGTH = self.LOG_LENGTH
-
-        # BRAM organisation: We never access more than
-        # -- wishbone_data_bits at a time so to save
-        # -- resources we make the array only that wide, and
-        # -- use consecutive indices for to make a cache "line"
-        # --
-        # -- ROW_SIZE is the width in bytes of the BRAM
-        # -- (based on WB, so 64-bits)
-        ROW_SIZE = WB_DATA_BITS / 8;
-
-        # ROW_PER_LINE is the number of row (wishbone
-        # transactions) in a line
-        ROW_PER_LINE = LINE_SIZE // ROW_SIZE
-
-        # BRAM_ROWS is the number of rows in BRAM needed
-        # to represent the full dcache
-        BRAM_ROWS = NUM_LINES * ROW_PER_LINE

+    # Latch the request in r0.req as long as we're not stalling
+    def stage_0(self, m, d_in, m_in):
+        comb = m.d.comb
+        sync = m.d.sync
+
+        # variable r : reg_stage_0_t;
+        r = RegStage0()
+        comb += r
+
+        # begin
+        # if rising_edge(clk) then
+        #     assert (d_in.valid and m_in.valid) = '0'
+        #     report "request collision loadstore vs MMU";
+        assert ~(d_in.valid & m_in.valid), \
+            "request collision loadstore vs MMU"
+
+        # if m_in.valid = '1' then
+        with m.If(m_in.valid):
+            # r.req.valid := '1';
+            # r.req.load := not (m_in.tlbie or m_in.tlbld);
+            # r.req.dcbz := '0';
+            # r.req.nc := '0';
+            # r.req.reserve := '0';
+            # r.req.virt_mode := '0';
+            # r.req.priv_mode := '1';
+            # r.req.addr := m_in.addr;
+            # r.req.data := m_in.pte;
+            # r.req.byte_sel := (others => '1');
+            # r.tlbie := m_in.tlbie;
+            # r.doall := m_in.doall;
+            # r.tlbld := m_in.tlbld;
+            # r.mmu_req := '1';
+            sync += r.req.valid.eq(1)
+            sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld))
+            sync += r.req.dcbz.eq(0)
+            sync += r.req.nc.eq(0)
+            sync += r.req.reserve.eq(0)
+            sync += r.req.virt_mode.eq(0)
+            sync += r.req.priv_mode.eq(1)
+            sync += r.req.addr.eq(m_in.addr)
+            sync += r.req.data.eq(m_in.pte)
+            sync += r.req.byte_sel.eq(~0)
+            sync += r.tlbie.eq(m_in.tlbie)
+            sync += r.doall.eq(m_in.doall)
+            sync += r.tlbld.eq(m_in.tlbld)
+            sync += r.mmu_req.eq(1)
+        # else
+        with m.Else():
+            # r.req := d_in;
+            # r.tlbie := '0';
+            # r.doall := '0';
+            # r.tlbld := '0';
+            # r.mmu_req := '0';
+            sync += r.req.eq(d_in)
+            sync += r.tlbie.eq(0)
+            sync += r.doall.eq(0)
+            sync += r.tlbld.eq(0)
+            sync += r.mmu_req.eq(0)
+        # end if;
+        # if rst = '1' then
+        #     r0_full <= '0';
+        # elsif r1.full = '0' or r0_full = '0' then
+        with m.If(~r1.full | ~r0_full):
+            # r0 <= r;
+            # r0_full <= r.req.valid;
+            sync += r0.eq(r)
+            sync += r0_full.eq(r.req.valid)
+        # end if;
+        # end if;
+        # end process;
+
+    # TLB
+    # Operates in the second cycle on the request latched in r0.req.
+    # TLB updates write the entry at the end of the second cycle.
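As a self-contained illustration of the registered-read pattern tlb_read uses
below (present the index in one cycle, capture the entry on the clock edge,
hold the last output while stalled), here is a minimal runnable nmigen sketch
with a toy 4-entry table -- the names are illustrative only, not the real
dtlb arrays:

    from nmigen import Array, Elaboratable, Module, Signal

    class TLBReadStage(Elaboratable):
        """Toy registered table read: the output appears the cycle after
        the index is presented, and is held while the pipeline stalls."""
        def __init__(self):
            self.index   = Signal(2)   # table index
            self.stall   = Signal()    # hold the previous output
            self.entries = Array(Signal(64, name=f"e{i}") for i in range(4))
            self.dout    = Signal(64)  # registered read output

        def elaborate(self, platform):
            m = Module()
            # mirrors "if r0_stall = '0' then ... end if" in tlb_read:
            # only capture a new entry when not stalled
            with m.If(~self.stall):
                m.d.sync += self.dout.eq(self.entries[self.index])
            return m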
+    def tlb_read(self, m, m_in, d_in, r0_stall, tlb_valid_way,
+                 tlb_tag_way, tlb_pte_way, dtlb_valid_bits,
+                 dtlb_tags, dtlb_ptes):
+        comb = m.d.comb
+        sync = m.d.sync

-        # Bit fields counts in the address

+        # variable index : tlb_index_t;
+        # variable addrbits :
+        #     std_ulogic_vector(TLB_SET_BITS - 1 downto 0);
+        index = Signal(TLB_SET_BITS)
+        addrbits = Signal(TLB_SET_BITS)

-        # REAL_ADDR_BITS is the number of real address
-        # bits that we store
-        REAL_ADDR_BITS = 56

+        comb += index
+        comb += addrbits

-        # ROW_BITS is the number of bits to select a row
-        ROW_BITS = log2_int(BRAM_ROWS)

+        # begin
+        # if rising_edge(clk) then
+        #     if m_in.valid = '1' then
+        with m.If(m_in.valid):
+            # addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS
+            #                       - 1 downto TLB_LG_PGSZ);
+            sync += addrbits.eq(m_in.addr[
+                TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_SET_BITS
+            ])
+        # else
+        with m.Else():
+            # addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS
+            #                       - 1 downto TLB_LG_PGSZ);
+            sync += addrbits.eq(d_in.addr[
+                TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_SET_BITS
+            ])
+        # end if;

-        # ROW_LINE_BITS is the number of bits to select
-        # a row within a line
-        ROW_LINE_BITS = log2_int(ROW_PER_LINE)

+        # index := to_integer(unsigned(addrbits));
+        sync += index.eq(addrbits)
+        # -- If we have any op and the previous op isn't
+        # -- finished, then keep the same output for next cycle.
+        # if r0_stall = '0' then
+        # If we have any op and the previous op isn't finished,
+        # then keep the same output for next cycle.
+        with m.If(~r0_stall):
+            sync += tlb_valid_way.eq(dtlb_valid_bits[index])
+            sync += tlb_tag_way.eq(dtlb_tags[index])
+            sync += tlb_pte_way.eq(dtlb_ptes[index])
+        # end if;
+        # end if;
+        # end process;

+    # -- Generate TLB PLRUs
+    # maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate
+    # Generate TLB PLRUs
+    def maybe_tlb_plrus(self, m, r1, tlb_plru_victim):
+        comb = m.d.comb
+        sync = m.d.sync
+
+        with m.If(TLB_NUM_WAYS > 1):
+            # begin
+            # TODO understand how to convert generate statements
+            # tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate
+            #     -- TLB PLRU interface
+            #     signal tlb_plru_acc :
+            #      std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
+            #     signal tlb_plru_acc_en : std_ulogic;
+            #     signal tlb_plru_out :
+            #      std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
+            # begin
+            #     tlb_plru : entity work.plru
+            #     generic map (
+            #         BITS => TLB_WAY_BITS
+            #     )
+            #     port map (
+            #         clk => clk,
+            #         rst => rst,
+            #         acc => tlb_plru_acc,
+            #         acc_en => tlb_plru_acc_en,
+            #         lru => tlb_plru_out
+            #     );
+            #
+            #     process(all)
+            #     begin
+            #         -- PLRU interface
+            #         if r1.tlb_hit_index = i then
+            #             tlb_plru_acc_en <= r1.tlb_hit;
+            #         else
+            #             tlb_plru_acc_en <= '0';
+            #         end if;
+            #         tlb_plru_acc <=
+            #          std_ulogic_vector(to_unsigned(
+            #           r1.tlb_hit_way, TLB_WAY_BITS
+            #          ));
+            #         tlb_plru_victim(i) <= tlb_plru_out;
+            #     end process;
+            # end generate;
+            # end generate;
+            # end TODO

+    def tlb_search(self, m, tlb_req_index, r0, tlb_valid_way, tlb_tag_way,
+                   tlb_pte_way, pte, tlb_hit, valid_ra, perm_attr, ra):

-        # LINE_OFF_BITS is the number of bits for
-        # the offset in a cache line
-        LINE_OFF_BITS = log2_int(LINE_SIZE)

+        comb = m.d.comb
+        sync = m.d.sync

-        # ROW_OFF_BITS is the number of bits for
-        # the offset in a row
-        ROW_OFF_BITS = log2_int(ROW_SIZE)

+#       variable hitway : tlb_way_t;
+#       variable hit : std_ulogic;
+#       variable eatag : tlb_tag_t;
+        hitway = TLBWay()
+        hit = Signal()
+        eatag = TLBTag()

-        # INDEX_BITS is the number if bits to
-        # select a cache line
-        INDEX_BITS = log2_int(NUM_LINES)

+        comb += hitway
+        comb += hit
+        comb += eatag

-        # SET_SIZE_BITS is the log base 2 of the set size
-        SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS

+#   begin
+#       tlb_req_index <=
+#        to_integer(unsigned(r0.req.addr(
+#         TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ
+#        )));
+#       hitway := 0;
+#       hit := '0';
+#       eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS);
+#       for i in tlb_way_t loop
+#           if tlb_valid_way(i) = '1' and
+#            read_tlb_tag(i, tlb_tag_way) = eatag then
+#               hitway := i;
+#               hit := '1';
+#           end if;
+#       end loop;
+#       tlb_hit <= hit and r0_valid;
+#       tlb_hit_way <= hitway;
+        comb += tlb_req_index.eq(r0.req.addr[
+            TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_SET_BITS
+        ])

-        # TAG_BITS is the number of bits of
-        # the tag part of the address
-        TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS

+        comb += eatag.eq(r0.req.addr[
+            TLB_LG_PGSZ + TLB_SET_BITS:64
+        ])

-        # TAG_WIDTH is the width in bits of each way of the tag RAM
-        TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8)

+        for i in range(TLB_NUM_WAYS):
+            with m.If(tlb_valid_way[i]
+                      & (read_tlb_tag(i, tlb_tag_way) == eatag)):

-        # WAY_BITS is the number of bits to select a way
-        WAY_BITS = log2_int(NUM_WAYS)

+                comb += hitway.eq(i)
+                comb += hit.eq(1)

-        # Example of layout for 32 lines of 64 bytes:
-        #
-        # ..  tag    |index|  line  |
-        # ..         |   row   |    |
-        # ..         |     |---|    | ROW_LINE_BITS  (3)
-        # ..         |     |--- - --| LINE_OFF_BITS (6)
-        # ..         |         |- --| ROW_OFF_BITS  (3)
-        # ..         |----- ---|    | ROW_BITS      (8)
-        # ..         |-----|        | INDEX_BITS    (5)
-        # ..  --------|             | TAG_BITS      (45)

+        comb += tlb_hit.eq(hit & r0_valid)
+        comb += tlb_hit_way.eq(hitway)

+#       if tlb_hit = '1' then
+        with m.If(tlb_hit):
+#           pte <= read_tlb_pte(hitway, tlb_pte_way);
+            comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way))
+#       else
+        with m.Else():
+#           pte <= (others => '0');
+            comb += pte.eq(0)
+#       end if;
+#       valid_ra <= tlb_hit or not r0.req.virt_mode;
+        comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode)
+#       if r0.req.virt_mode = '1' then
+        with m.If(r0.req.virt_mode):
+#           ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
+#                 r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) &
+#                 (ROW_OFF_BITS-1 downto 0 => '0');
+#           perm_attr <= extract_perm_attr(pte);
+            comb += ra.eq(Cat(
+                Const(0, ROW_OFF_BITS),
+                r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ],
+                pte[TLB_LG_PGSZ:REAL_ADDR_BITS]
+            ))
+            comb += perm_attr.eq(extract_perm_attr(pte))
+#       else
+        with m.Else():
+#           ra <= r0.req.addr(
+#            REAL_ADDR_BITS - 1 downto ROW_OFF_BITS
+#           ) & (ROW_OFF_BITS-1 downto 0 => '0');
+            comb += ra.eq(Cat(
+                Const(0, ROW_OFF_BITS),
+                r0.req.addr[ROW_OFF_BITS:REAL_ADDR_BITS]
+            ))

-# subtype row_t is integer range 0 to BRAM_ROWS-1;
-# subtype index_t is integer range 0 to NUM_LINES-1;
-"""wherever way_t is used to make a Signal it must be substituted with
-   log2_int(NUM_WAYS) i.e. WAY_BITS. this because whilst the *range*
-   of the number is 0..NUM_WAYS it requires log2_int(NUM_WAYS) i.e.
- WAY_BITS of space to store it -""" -# subtype way_t is integer range 0 to NUM_WAYS-1; -# subtype row_in_line_t is unsigned(ROW_LINE_BITS-1 downto 0); - ROW = BRAM_ROWS # yyyeah not really necessary, delete - INDEX = NUM_LINES # yyyeah not really necessary, delete - WAY = NUM_WAYS # yyyeah not really necessary, delete - ROW_IN_LINE = ROW_LINE_BITS # yyyeah not really necessary, delete +# perm_attr <= real_mode_perm_attr; + comb += perm_attr.reference.eq(1) + comb += perm_attr.changed.eq(1) + comb += perm_attr.priv.eq(1) + comb += perm_attr.nocache.eq(0) + comb += perm_attr.rd_perm.eq(1) + comb += perm_attr.wr_perm.eq(1) +# end if; +# end process; -# -- The cache data BRAM organized as described above for each way -# subtype cache_row_t is -# std_ulogic_vector(wishbone_data_bits-1 downto 0); - # The cache data BRAM organized as described above for each way - CACHE_ROW = WB_DATA_BITS + def tlb_update(self, r0_valid, r0, dtlb_valid_bits, tlb_req_index, + tlb_hit_way, tlb_hit, tlb_plru_victim, tlb_tag_way, + dtlb_tags, tlb_pte_way, dtlb_ptes, dtlb_valid_bits): -# -- The cache tags LUTRAM has a row per set. -# -- Vivado is a pain and will not handle a -# -- clean (commented) definition of the cache -# -- tags as a 3d memory. For now, work around -# -- it by putting all the tags -# subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); - # The cache tags LUTRAM has a row per set. - # Vivado is a pain and will not handle a - # clean (commented) definition of the cache - # tags as a 3d memory. For now, work around - # it by putting all the tags - CACHE_TAG = TAG_BITS + comb = m.d.comb + sync = m.d.sync -# -- type cache_tags_set_t is array(way_t) of cache_tag_t; -# -- type cache_tags_array_t is array(index_t) of cache_tags_set_t; -# constant TAG_RAM_WIDTH : natural := TAG_WIDTH * NUM_WAYS; -# subtype cache_tags_set_t is -# std_logic_vector(TAG_RAM_WIDTH-1 downto 0); -# type cache_tags_array_t is array(index_t) of cache_tags_set_t; - # type cache_tags_set_t is array(way_t) of cache_tag_t; - # type cache_tags_array_t is array(index_t) of cache_tags_set_t; - TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS + # variable tlbie : std_ulogic; + # variable tlbwe : std_ulogic; + # variable repl_way : tlb_way_t; + # variable eatag : tlb_tag_t; + # variable tagset : tlb_way_tags_t; + # variable pteset : tlb_way_ptes_t; + tlbie = Signal() + tlbwe = Signal() + repl_way = TLBWay() + eatag = TLBTag() + tagset = TLBWayTags() + pteset = TLBWayPtes() - CACHE_TAG_SET = TAG_RAM_WIDTH + comb += tlbie + comb += tlbwe + comb += repl_way + comb += eatag + comb += tagset + comb += pteset - def CacheTagArray(): - return Array(CacheTagSet() for x in range(INDEX)) + # begin + # if rising_edge(clk) then + # tlbie := r0_valid and r0.tlbie; + # tlbwe := r0_valid and r0.tlbldoi; + sync += tlbie.eq(r0_valid & r0.tlbie) + sync += tlbwe.eq(r0_valid & r0.tlbldoi) -# -- The cache valid bits -# subtype cache_way_valids_t is -# std_ulogic_vector(NUM_WAYS-1 downto 0); -# type cache_valids_t is array(index_t) of cache_way_valids_t; -# type row_per_line_valid_t is -# array(0 to ROW_PER_LINE - 1) of std_ulogic; - # The cache valid bits - CACHE_WAY_VALID_BITS = NUM_WAYS + # if rst = '1' or (tlbie = '1' and r0.doall = '1') then + # with m.If (TODO understand how signal resets work in nmigen) + # -- clear all valid bits at once + # for i in tlb_index_t loop + # dtlb_valids(i) <= (others => '0'); + # end loop; + # clear all valid bits at once + for i in range(TLB_SET_SIZE): + sync += dtlb_valid_bits[i].eq(0) + # elsif tlbie = '1' then + with 
m.Elif(tlbie): + # if tlb_hit = '1' then + with m.If(tlb_hit): + # dtlb_valids(tlb_req_index)(tlb_hit_way) <= '0'; + sync += dtlb_valid_bits[tlb_req_index][tlb_hit_way].eq(0) + # end if; + # elsif tlbwe = '1' then + with m.Elif(tlbwe): + # if tlb_hit = '1' then + with m.If(tlb_hit): + # repl_way := tlb_hit_way; + sync += repl_way.eq(tlb_hit_way) + # else + with m.Else(): + # repl_way := to_integer(unsigned( + # tlb_plru_victim(tlb_req_index))); + sync += repl_way.eq(tlb_plru_victim[tlb_req_index]) + # end if; + # eatag := r0.req.addr( + # 63 downto TLB_LG_PGSZ + TLB_SET_BITS + # ); + # tagset := tlb_tag_way; + # write_tlb_tag(repl_way, tagset, eatag); + # dtlb_tags(tlb_req_index) <= tagset; + # pteset := tlb_pte_way; + # write_tlb_pte(repl_way, pteset, r0.req.data); + # dtlb_ptes(tlb_req_index) <= pteset; + # dtlb_valids(tlb_req_index)(repl_way) <= '1'; + sync += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64]) + sync += tagset.eq(tlb_tag_way) + sync += write_tlb_tag(repl_way, tagset, eatag) + sync += dtlb_tags[tlb_req_index].eq(tagset) + sync += pteset.eq(tlb_pte_way) + sync += write_tlb_pte(repl_way, pteset, r0.req.data) + sync += dtlb_ptes[tlb_req_index].eq(pteset) + sync += dtlb_valid_bits[tlb_req_index][repl_way].eq(1) + # end if; + # end if; + # end process; - def CacheValidBitsArray(): - return Array(CacheWayValidBits() for x in range(INDEX)) +# -- Generate PLRUs +# maybe_plrus: if NUM_WAYS > 1 generate + # Generate PLRUs + def maybe_plrus(self, r1): - def RowPerLineValidArray(): - return Array(Signal() for x in range(ROW_PER_LINE)) + comb = m.d.comb + sync = m.d.sync -# -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs -# signal cache_tags : cache_tags_array_t; -# signal cache_tag_set : cache_tags_set_t; -# signal cache_valids : cache_valids_t; +# begin + # TODO learn translation of generate into nmgien @lkcl +# plrus: for i in 0 to NUM_LINES-1 generate +# -- PLRU interface +# signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); +# signal plru_acc_en : std_ulogic; +# signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); # -# attribute ram_style : string; -# attribute ram_style of cache_tags : signal is "distributed"; - # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs - cache_tags = CacheTagArray() - cache_tag_set = Signal(CACHE_TAG_SET) - cache_valid_bits = CacheValidBitsArray() - - # TODO attribute ram_style : string; - # TODO attribute ram_style of cache_tags : signal is "distributed"; - -# -- L1 TLB. 
-# constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE); -# constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS); -# constant TLB_EA_TAG_BITS : natural := -# 64 - (TLB_LG_PGSZ + TLB_SET_BITS); -# constant TLB_TAG_WAY_BITS : natural := -# TLB_NUM_WAYS * TLB_EA_TAG_BITS; -# constant TLB_PTE_BITS : natural := 64; -# constant TLB_PTE_WAY_BITS : natural := -# TLB_NUM_WAYS * TLB_PTE_BITS; - # L1 TLB - TLB_SET_BITS = log2_int(TLB_SET_SIZE) - TLB_WAY_BITS = log2_int(TLB_NUM_WAYS) - TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS) - TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS - TLB_PTE_BITS = 64 - TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS; - -# subtype tlb_way_t is integer range 0 to TLB_NUM_WAYS - 1; -# subtype tlb_index_t is integer range 0 to TLB_SET_SIZE - 1; -# subtype tlb_way_valids_t is -# std_ulogic_vector(TLB_NUM_WAYS-1 downto 0); -# type tlb_valids_t is -# array(tlb_index_t) of tlb_way_valids_t; -# subtype tlb_tag_t is -# std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); -# subtype tlb_way_tags_t is -# std_ulogic_vector(TLB_TAG_WAY_BITS-1 downto 0); -# type tlb_tags_t is -# array(tlb_index_t) of tlb_way_tags_t; -# subtype tlb_pte_t is -# std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); -# subtype tlb_way_ptes_t is -# std_ulogic_vector(TLB_PTE_WAY_BITS-1 downto 0); -# type tlb_ptes_t is array(tlb_index_t) of tlb_way_ptes_t; -# type hit_way_set_t is array(tlb_way_t) of way_t; - TLB_WAY = TLB_NUM_WAYS +# begin + # TODO learn tranlation of entity, generic map, port map in + # nmigen @lkcl +# plru : entity work.plru +# generic map ( +# BITS => WAY_BITS +# ) +# port map ( +# clk => clk, +# rst => rst, +# acc => plru_acc, +# acc_en => plru_acc_en, +# lru => plru_out +# ); +# +# process(all) +# begin +# -- PLRU interface +# if r1.hit_index = i then + # PLRU interface + with m.If(r1.hit_index == i): +# plru_acc_en <= r1.cache_hit; + comb += plru_acc_en.eq(r1.cache_hit) +# else + with m.Else(): +# plru_acc_en <= '0'; + comb += plru_acc_en.eq(0) +# end if; +# plru_acc <= std_ulogic_vector(to_unsigned( +# r1.hit_way, WAY_BITS +# )); +# plru_victim(i) <= plru_out; + comb += plru_acc.eq(r1.hit_way) + comb += plru_victim[i].eq(plru_out) +# end process; +# end generate; +# end generate; - TLB_INDEX = TLB_SET_SIZE +# -- Cache tag RAM read port +# cache_tag_read : process(clk) + # Cache tag RAM read port + def cache_tag_read(self, r0_stall, req_index, m_in, d_in, + cache_tag_set, cache_tags): - TLB_WAY_VALID_BITS = TLB_NUM_WAYS + comb = m.d.comb + sync = m.d.sync - def TLBValidBitsArray(): - return Array( - Signal(TLB_WAY_VALID_BITS) for x in range(TLB_SET_SIZE) - ) +# variable index : index_t; + index = Signal(INDEX) - TLB_TAG = TLB_EA_TAG_BITS + comb += index - TLB_WAY_TAGS = TLB_TAG_WAY_BITS +# begin +# if rising_edge(clk) then +# if r0_stall = '1' then + with m.If(r0_stall): +# index := req_index; + sync += index.eq(req_index) - def TLBTagsArray(): - return Array( - Signal(TLB_WAY_TAGS) for x in range (TLB_SET_SIZE) - ) +# elsif m_in.valid = '1' then + with m.Elif(m_in.valid): +# index := get_index(m_in.addr); + sync += index.eq(get_index(m_in.addr)) - TLB_PTE = TLB_PTE_BITS +# else + with m.Else(): +# index := get_index(d_in.addr); + sync += index.eq(get_index(d_in.addr)) +# end if; +# cache_tag_set <= cache_tags(index); + sync += cache_tag_set.eq(cache_tags[index]) +# end if; +# end process; - TLB_WAY_PTES = TLB_PTE_WAY_BITS + # Cache request parsing and hit detection + def dcache_request(self, r0, ra, req_index, req_row, req_tag, + r0_valid, r1, cache_valid_bits, replace_way, + 
use_forward1_next, use_forward2_next, + req_hit_way, plru_victim, rc_ok, perm_attr, + valid_ra, perm_ok, access_ok, req_op, req_ok, + r0_stall, m_in, early_req_row, d_in): - def TLBPtesArray(): - return Array( - Signal(TLB_WAY_PTES) for x in range(TLB_SET_SIZE) - ) + comb = m.d.comb + sync = m.d.sync - def HitWaySet(): - return Array(Signal(NUM_WAYS) for x in range(TLB_NUM_WAYS)) +# variable is_hit : std_ulogic; +# variable hit_way : way_t; +# variable op : op_t; +# variable opsel : std_ulogic_vector(2 downto 0); +# variable go : std_ulogic; +# variable nc : std_ulogic; +# variable s_hit : std_ulogic; +# variable s_tag : cache_tag_t; +# variable s_pte : tlb_pte_t; +# variable s_ra : std_ulogic_vector( +# REAL_ADDR_BITS - 1 downto 0 +# ); +# variable hit_set : std_ulogic_vector( +# TLB_NUM_WAYS - 1 downto 0 +# ); +# variable hit_way_set : hit_way_set_t; +# variable rel_matches : std_ulogic_vector( +# TLB_NUM_WAYS - 1 downto 0 +# ); + rel_match = Signal() + is_hit = Signal() + hit_way = Signal(WAY_BITS) + op = Op() + opsel = Signal(3) + go = Signal() + nc = Signal() + s_hit = Signal() + s_tag = Signal(CACHE_TAG) + s_pte = Signal(TLB_PTE) + s_ra = Signal(REAL_ADDR_BITS) + hit_set = Signal(TLB_NUM_WAYS) + hit_way_set = HitWaySet() + rel_matches = Signal(TLB_NUM_WAYS) + rel_match = Signal() -# signal dtlb_valids : tlb_valids_t; -# signal dtlb_tags : tlb_tags_t; -# signal dtlb_ptes : tlb_ptes_t; + comb += rel_match + comb += is_hit + comb += hit_way + comb += op + comb += opsel + comb += go + comb += nc + comb += s_hit + comb += s_tag + comb += s_pte + comb += s_ra + comb += hit_set + comb += hit_way_set + comb += rel_matches + comb += rel_match -"""note: these are passed to nmigen.hdl.Memory as "attributes". - don't know how, just that they are. -""" -# attribute ram_style of dtlb_tags : signal is "distributed"; -# attribute ram_style of dtlb_ptes : signal is "distributed"; - dtlb_valids = TLBValidBitsArray() - dtlb_tags = TLBTagsArray() - dtlb_ptes = TLBPtesArray() - # TODO attribute ram_style of - # dtlb_tags : signal is "distributed"; - # TODO attribute ram_style of - # dtlb_ptes : signal is "distributed"; +# begin +# -- Extract line, row and tag from request +# req_index <= get_index(r0.req.addr); +# req_row <= get_row(r0.req.addr); +# req_tag <= get_tag(ra); +# +# go := r0_valid and not (r0.tlbie or r0.tlbld) +# and not r1.ls_error; + # Extract line, row and tag from request + comb += req_index.eq(get_index(r0.req.addr)) + comb += req_row.eq(get_row(r0.req.addr)) + comb += req_tag.eq(get_tag(ra)) -# signal r0 : reg_stage_0_t; -# signal r0_full : std_ulogic; - r0 = RegStage0() - r0_full = Signal() + comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error) -# signal r1 : reg_stage_1_t; - r1 = RegStage1() +# hit_way := 0; +# is_hit := '0'; +# rel_match := '0'; + # Test if pending request is a hit on any way + # In order to make timing in virtual mode, + # when we are using the TLB, we compare each + # way with each of the real addresses from each way of + # the TLB, and then decide later which match to use. 
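        # (An added note, not from the original VHDL: the loop below
        # builds a speculative per-way hit result for every TLB way j,
        # and the tlb_hit_way mux afterwards picks the winning set once
        # the TLB lookup resolves. When writing these conditions in
        # nmigen, remember that Python's `&` binds more tightly than
        # `==`, so each equality inside an m.If() needs its own
        # parentheses, e.g.
        #     m.If(go & (read_tag(i, cache_tag_set) == s_tag))
        # otherwise the expression groups as
        #     (go & read_tag(...)) == s_tag.)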
+ comb += hit_way.eq(0) + comb += is_hit.eq(0) + comb += rel_match.eq(0) -# signal reservation : reservation_t; - reservation = Reservation() +# if r0.req.virt_mode = '1' then + with m.If(r0.req.virt_mode): +# rel_matches := (others => '0'); + comb += rel_matches.eq(0) +# for j in tlb_way_t loop + for j in range(TLB_WAY): +# hit_way_set(j) := 0; +# s_hit := '0'; +# s_pte := read_tlb_pte(j, tlb_pte_way); +# s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) +# & r0.req.addr(TLB_LG_PGSZ - 1 downto 0); +# s_tag := get_tag(s_ra); + comb += hit_way_set[j].eq(0) + comb += s_hit.eq(0) + comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way)) + comb += s_ra.eq(Cat( + r0.req.addr[0:TLB_LG_PGSZ], + s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS] + )) + comb += s_tag.eq(get_tag(s_ra)) -# -- Async signals on incoming request -# signal req_index : index_t; -# signal req_row : row_t; -# signal req_hit_way : way_t; -# signal req_tag : cache_tag_t; -# signal req_op : op_t; -# signal req_data : std_ulogic_vector(63 downto 0); -# signal req_same_tag : std_ulogic; -# signal req_go : std_ulogic; - # Async signals on incoming request - req_index = Signal(INDEX) - req_row = Signal(ROW) - req_hit_way = Signal(WAY_BITS) - req_tag = Signal(CACHE_TAG) - req_op = Op() - req_data = Signal(64) - req_same_tag = Signal() - req_go = Signal() - -# signal early_req_row : row_t; -# -# signal cancel_store : std_ulogic; -# signal set_rsrv : std_ulogic; -# signal clear_rsrv : std_ulogic; -# -# signal r0_valid : std_ulogic; -# signal r0_stall : std_ulogic; -# -# signal use_forward1_next : std_ulogic; -# signal use_forward2_next : std_ulogic; - early_req_row = Signal(ROW) +# for i in way_t loop + for i in range(NUM_WAYS): +# if go = '1' and cache_valids(req_index)(i) = '1' +# and read_tag(i, cache_tag_set) = s_tag +# and tlb_valid_way(j) = '1' then + with m.If(go & cache_valid_bits[req_index][i] & + read_tag(i, cache_tag_set) == s_tag + & tlb_valid_way[j]): +# hit_way_set(j) := i; +# s_hit := '1'; + comb += hit_way_set[j].eq(i) + comb += s_hit.eq(1) +# end if; +# end loop; +# hit_set(j) := s_hit; + comb += hit_set[j].eq(s_hit) +# if s_tag = r1.reload_tag then + with m.If(s_tag == r1.reload_tag): +# rel_matches(j) := '1'; + comb += rel_matches[j].eq(1) +# end if; +# end loop; +# if tlb_hit = '1' then + with m.If(tlb_hit): +# is_hit := hit_set(tlb_hit_way); +# hit_way := hit_way_set(tlb_hit_way); +# rel_match := rel_matches(tlb_hit_way); + comb += is_hit.eq(hit_set[tlb_hit_way]) + comb += hit_way.eq(hit_way_set[tlb_hit_way]) + comb += rel_match.eq(rel_matches[tlb_hit_way]) +# end if; +# else + with m.Else(): +# s_tag := get_tag(r0.req.addr); + comb += s_tag.eq(get_tag(r0.req.addr)) +# for i in way_t loop + for i in range(NUM_WAYS): +# if go = '1' and cache_valids(req_index)(i) = '1' and +# read_tag(i, cache_tag_set) = s_tag then + with m.If(go & cache_valid_bits[req_index][i] & + read_tag(i, cache_tag_set) == s_tag): +# hit_way := i; +# is_hit := '1'; + comb += hit_way.eq(i) + comb += is_hit.eq(1) +# end if; +# end loop; +# if s_tag = r1.reload_tag then + with m.If(s_tag == r1.reload_tag): +# rel_match := '1'; + comb += rel_match.eq(1) +# end if; +# end if; +# req_same_tag <= rel_match; + comb += req_same_tag.eq(rel_match) - cancel_store = Signal() - set_rsrv = Signal() - clear_rsrv = Signal() +# if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index +# and rel_match = '1' then + # See if the request matches the line currently being reloaded + with m.If(r1.state == State.RELOAD_WAIT_ACK & req_index == + r1.store_index & rel_match): + # For a 
store, consider this a hit even if the row isn't
+            # valid since it will be by the time we perform the store.
+            # For a load, check the appropriate row valid bit.
+#           is_hit :=
+#            not r0.req.load
+#            or r1.rows_valid(req_row mod ROW_PER_LINE);
+#           hit_way := replace_way;
+            comb += is_hit.eq(~r0.req.load
+                              | r1.rows_valid[req_row % ROW_PER_LINE]
+                             )
+            comb += hit_way.eq(replace_way)
+#       end if;

-        r0_valid = Signal()
-        r0_stall = Signal()

+#       -- Whether to use forwarded data for a load or not
+        # Whether to use forwarded data for a load or not
+#       use_forward1_next <= '0';
+        comb += use_forward1_next.eq(0)
+#       if get_row(r1.req.real_addr) = req_row
+#        and r1.req.hit_way = hit_way then
+        with m.If((get_row(r1.req.real_addr) == req_row)
+                  & (r1.req.hit_way == hit_way)):
+            # Only need to consider r1.write_bram here, since if we
+            # are writing refill data here, then we don't have a
+            # cache hit this cycle on the line being refilled.
+            # (There is the possibility that the load following the
+            # load miss that started the refill could be to the old
+            # contents of the victim line, since it is a couple of
+            # cycles after the refill starts before we see the updated
+            # cache tag. In that case we don't use the bypass.)
+#           use_forward1_next <= r1.write_bram;
+            comb += use_forward1_next.eq(r1.write_bram)
+#       end if;
+#       use_forward2_next <= '0';
+        comb += use_forward2_next.eq(0)
+#       if r1.forward_row1 = req_row
+#        and r1.forward_way1 = hit_way then
+        with m.If((r1.forward_row1 == req_row)
+                  & (r1.forward_way1 == hit_way)):
+#           use_forward2_next <= r1.forward_valid1;
+            comb += use_forward2_next.eq(r1.forward_valid1)
+#       end if;

-        use_forward1_next = Signal()
-        use_forward2_next = Signal()

+        # The way that matched on a hit
+#       req_hit_way <= hit_way;
+        comb += req_hit_way.eq(hit_way)

-#       -- Cache RAM interface
-#       type cache_ram_out_t is array(way_t) of cache_row_t;
-#       signal cache_out : cache_ram_out_t;
-        # Cache RAM interface
-        def CacheRamOut():
-            return Array(Signal(CACHE_ROW) for x in range(NUM_WAYS))

+        # The way to replace on a miss
+#       if r1.write_tag = '1' then
+        with m.If(r1.write_tag):
+#           replace_way <= to_integer(unsigned(
+#            plru_victim(r1.store_index)
+#           ));
+            comb += replace_way.eq(plru_victim[r1.store_index])
+#       else

-        cache_out = CacheRamOut()

+        with m.Else():
+#           replace_way <= r1.store_way;
+            comb += replace_way.eq(r1.store_way)
+#       end if;

+        # work out whether we have permission for this access
+        # NB we don't yet implement AMR, thus no KUAP
+#       rc_ok <= perm_attr.reference and
+#        (r0.req.load or perm_attr.changed);
+#       perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and
+#        (perm_attr.wr_perm or (r0.req.load
+#        and perm_attr.rd_perm));
+#       access_ok <= valid_ra and perm_ok and rc_ok;
+        comb += rc_ok.eq(
+                    perm_attr.reference
+                    & (r0.req.load | perm_attr.changed)
+                )
+        comb += perm_ok.eq((r0.req.priv_mode | ~perm_attr.priv)
+                           & (perm_attr.wr_perm
+                              | (r0.req.load & perm_attr.rd_perm))
+                )
+        comb += access_ok.eq(valid_ra & perm_ok & rc_ok)

+#       nc := r0.req.nc or perm_attr.nocache;
+#       op := OP_NONE;
+        # Combine the request and cache hit status to decide what
+        # operation needs to be done
+        comb += nc.eq(r0.req.nc | perm_attr.nocache)
+        comb += op.eq(Op.OP_NONE)
+#       if go = '1' then
+        with m.If(go):
+#           if access_ok = '0' then
+            with m.If(~access_ok):
+#               op := OP_BAD;
+                comb += op.eq(Op.OP_BAD)
+#           elsif cancel_store = '1' then
+            with m.Elif(cancel_store):
+#               op := OP_STCX_FAIL;
+                comb += op.eq(Op.OP_STCX_FAIL)
+#           else
+            with m.Else():
+#               opsel := r0.req.load & nc & is_hit;
+                comb += opsel.eq(Cat(is_hit, nc, r0.req.load))
+#               case opsel is
+                with m.Switch(opsel):
+#                   when "101" => op := OP_LOAD_HIT;
+#                   when "100" => op := OP_LOAD_MISS;
+#                   when "110" => op := OP_LOAD_NC;
+#                   when "001" => op := OP_STORE_HIT;
+#                   when "000" => op := OP_STORE_MISS;
+#                   when "010" => op := OP_STORE_MISS;
+#                   when "011" => op := OP_BAD;
+#                   when "111" => op := OP_BAD;
+#                   when others => op := OP_NONE;
+                    with m.Case(Const(0b101, 3)):
+                        comb += op.eq(Op.OP_LOAD_HIT)

-#       -- PLRU output interface
-#       type plru_out_t is array(index_t) of
-#        std_ulogic_vector(WAY_BITS-1 downto 0);
-#       signal plru_victim : plru_out_t;
-#       signal replace_way : way_t;
-        # PLRU output interface
-        def PLRUOut():
-            return Array(Signal(WAY_BITS) for x in range(Index()))

+                    with m.Case(Const(0b100, 3)):
+                        comb += op.eq(Op.OP_LOAD_MISS)

-        plru_victim = PLRUOut()
-        replace_way = Signal(WAY_BITS)

+                    with m.Case(Const(0b110, 3)):
+                        comb += op.eq(Op.OP_LOAD_NC)

-#       -- Wishbone read/write/cache write formatting signals
-#       signal bus_sel : std_ulogic_vector(7 downto 0);
-        # Wishbone read/write/cache write formatting signals
-        bus_sel = Signal(8)

+                    with m.Case(Const(0b001, 3)):
+                        comb += op.eq(Op.OP_STORE_HIT)

-#       -- TLB signals
-#       signal tlb_tag_way : tlb_way_tags_t;
-#       signal tlb_pte_way : tlb_way_ptes_t;
-#       signal tlb_valid_way : tlb_way_valids_t;
-#       signal tlb_req_index : tlb_index_t;
-#       signal tlb_hit : std_ulogic;
-#       signal tlb_hit_way : tlb_way_t;
-#       signal pte : tlb_pte_t;
-#       signal ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
-#       signal valid_ra : std_ulogic;
-#       signal perm_attr : perm_attr_t;
-#       signal rc_ok : std_ulogic;
-#       signal perm_ok : std_ulogic;
-#       signal access_ok : std_ulogic;
-        # TLB signals
-        tlb_tag_way = Signal(TLB_WAY_TAGS)
-        tlb_pte_way = Signal(TLB_WAY_PTES)
-        tlb_valid_way = Signal(TLB_WAY_VALID_BITS)
-        tlb_req_index = Signal(TLB_SET_SIZE)
-        tlb_hit = Signal()
-        tlb_hit_way = Signal(TLB_WAY)
-        pte = Signal(TLB_PTE)
-        ra = Signal(REAL_ADDR_BITS)
-        valid_ra = Signal()
-        perm_attr = PermAttr()
-        rc_ok = Signal()
-        perm_ok = Signal()
-        access_ok = Signal()

+                    with m.Case(Const(0b000, 3)):
+                        comb += op.eq(Op.OP_STORE_MISS)

-#       -- TLB PLRU output interface
-#       type tlb_plru_out_t is array(tlb_index_t) of
-#        std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
-#       signal tlb_plru_victim : tlb_plru_out_t;
-        # TLB PLRU output interface
-        def TLBPLRUOut():
-            return Array(
-                Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE)
-            )

+                    with m.Case(Const(0b010, 3)):
+                        comb += op.eq(Op.OP_STORE_MISS)

-        tlb_plru_victim = TLBPLRUOut()

+                    with m.Case(Const(0b011, 3)):
+                        comb += op.eq(Op.OP_BAD)

-#       -- Helper functions to decode incoming requests
-#
-#       -- Return the cache line index (tag index) for an address
-#       function get_index(addr: std_ulogic_vector) return index_t is
-#       begin
-#           return to_integer(
-#            unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))
-#           );
-#       end;
-#       Helper functions to decode incoming requests
-#
-        # Return the cache line index (tag index) for an address
-        def get_index(addr):
-            return addr[LINE_OFF_BITS:SET_SIZE_BITS]

+                    with m.Case(Const(0b111, 3)):
+                        comb += op.eq(Op.OP_BAD)

-#       -- Return the cache row index (data memory) for an address
-#       function get_row(addr: std_ulogic_vector) return row_t is
-#       begin
-#           return to_integer(
-#            unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))
-#           );
-#       end;
-        # Return the cache row index (data memory) for an address
-        def get_row(addr):
-            return addr[ROW_OFF_BITS:SET_SIZE_BITS]

+                    with m.Default():
+                        comb += op.eq(Op.OP_NONE)
+#               end case;
+#           end if;
+#       end if;
+#       req_op <= op;
+#       req_go <= go;
+        comb += req_op.eq(op)
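        # (Added decode table for the Switch above: opsel is
        # Cat(is_hit, nc, r0.req.load), i.e. bit 2 = load, bit 1 = nc,
        # bit 0 = is_hit, matching the VHDL "r0.req.load & nc & is_hit":
        #   0b101 cacheable load,  hit  -> OP_LOAD_HIT
        #   0b100 cacheable load,  miss -> OP_LOAD_MISS
        #   0b110 non-cacheable load    -> OP_LOAD_NC
        #   0b001 cacheable store, hit  -> OP_STORE_HIT
        #   0b000 cacheable store, miss -> OP_STORE_MISS
        #   0b010 non-cacheable store   -> OP_STORE_MISS
        #   0b011, 0b111 "hit" on a non-cacheable access -> OP_BAD)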
comb += req_go.eq(go) -# -- Return the index of a row within a line -# function get_row_of_line(row: row_t) return row_in_line_t is -# variable row_v : unsigned(ROW_BITS-1 downto 0); -# begin -# row_v := to_unsigned(row, ROW_BITS); -# return row_v(ROW_LINEBITS-1 downto 0); -# end; - # Return the index of a row within a line - def get_row_of_line(row): - row_v = Signal(ROW_BITS) - row_v = Signal(row) - return row_v[0:ROW_LINE_BITS] + # Version of the row number that is valid one cycle earlier + # in the cases where we need to read the cache data BRAM. + # If we're stalling then we need to keep reading the last + # row requested. +# if r0_stall = '0' then + with m.If(~r0_stall): +# if m_in.valid = '1' then + with m.If(m_in.valid): +# early_req_row <= get_row(m_in.addr); + comb += early_req_row.eq(get_row(m_in.addr)) +# else + with m.Else(): +# early_req_row <= get_row(d_in.addr); + comb += early_req_row.eq(get_row(d_in.addr)) +# end if; +# else + with m.Else(): +# early_req_row <= req_row; + comb += early_req_row.eq(req_row) +# end if; +# end process; -# -- Returns whether this is the last row of a line -# function is_last_row_addr(addr: wishbone_addr_type; -# last: row_in_line_t) return boolean is -# begin -# return -# unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last; -# end; - # Returns whether this is the last row of a line - def is_last_row_addr(addr, last): - return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last + # Handle load-with-reservation and store-conditional instructions + def reservation_comb(self, cancel_store, set_rsrv, clear_rsrv, + r0_valid, r0, reservation): -# -- Returns whether this is the last row of a line -# function is_last_row(row: row_t; last: row_in_line_t) -# return boolean is -# begin -# return get_row_of_line(row) = last; -# end; - # Returns whether this is the last row of a line - def is_last_row(row, last): - return get_row_of_line(row) == last + comb = m.d.comb + sync = m.d.sync -# -- Return the address of the next row in the current cache line -# function next_row_addr(addr: wishbone_addr_type) -# return std_ulogic_vector is -# variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0); -# variable result : wishbone_addr_type; # begin -# -- Is there no simpler way in VHDL to -# -- generate that 3 bits adder ? -# row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS); -# row_idx := std_ulogic_vector(unsigned(row_idx) + 1); -# result := addr; -# result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx; -# return result; -# end; - # Return the address of the next row in the current cache line - def next_row_addr(addr): - row_idx = Signal(ROW_LINE_BITS) - result = WBAddrType() - # Is there no simpler way in VHDL to - # generate that 3 bits adder ? - row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] - row_idx = Signal(row_idx + 1) - result = addr - result[ROW_OFF_BITS:LINE_OFF_BITS] = row_idx - return result +# cancel_store <= '0'; +# set_rsrv <= '0'; +# clear_rsrv <= '0'; +# if r0_valid = '1' and r0.req.reserve = '1' then + with m.If(r0_valid & r0.req.reserve): -# -- Return the next row in the current cache line. 
We use a
-#       -- dedicated function in order to limit the size of the
-#       -- generated adder to be only the bits within a cache line
-#       -- (3 bits with default settings)
-#       function next_row(row: row_t) return row_t is
-#           variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
-#           variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
-#           variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
-#       begin
-#           row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
-#           row_idx := row_v(ROW_LINEBITS-1 downto 0);
-#           row_v(ROW_LINEBITS-1 downto 0) :=
-#            std_ulogic_vector(unsigned(row_idx) + 1);
-#           return to_integer(unsigned(row_v));
-#       end;
-#       Return the next row in the current cache line. We use a
-#       dedicated function in order to limit the size of the
-#       generated adder to be only the bits within a cache line
-#       (3 bits with default settings)
-        def next_row(row)
-            row_v = Signal(ROW_BITS)
-            row_idx = Signal(ROW_LINE_BITS)
-            result = Signal(ROW_BITS)

+#           -- XXX generate alignment interrupt if address
+#           -- is not aligned XXX or if r0.req.nc = '1'
+#           if r0.req.load = '1' then
+            # XXX generate alignment interrupt if address
+            # is not aligned XXX or if r0.req.nc = '1'
+            with m.If(r0.req.load):
+#               -- load with reservation
+#               set_rsrv <= '1';
+                # load with reservation
+                comb += set_rsrv.eq(1)
+#           else
+            with m.Else():
+#               -- store conditional
+#               clear_rsrv <= '1';
+                # store conditional
+                comb += clear_rsrv.eq(1)
+#               if reservation.valid = '0' or r0.req.addr(63
+#                downto LINE_OFF_BITS) /= reservation.addr then
+                with m.If(~reservation.valid
+                          | (r0.req.addr[LINE_OFF_BITS:64]
+                             != reservation.addr)):
+#                   cancel_store <= '1';
+                    comb += cancel_store.eq(1)
+#               end if;
+#           end if;
+#       end if;
+#   end process;

-            row_v = Signal(row)
-            row_idx = row_v[ROW_LINE_BITS]
-            row_v[0:ROW_LINE_BITS] = Signal(row_idx + 1)
-            return row_v

+    def reservation_reg(self, m, r0_valid, access_ok, set_rsrv,
+                        clear_rsrv, reservation, r0):

-#       -- Get the tag value from the address
-#       function get_tag(addr: std_ulogic_vector) return cache_tag_t is
-#       begin
-#           return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
-#       end;
-        # Get the tag value from the address
-        def get_tag(addr):
-            return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

+        comb = m.d.comb
+        sync = m.d.sync

-#       -- Read a tag from a tag memory row
-#       function read_tag(way: way_t; tagset: cache_tags_set_t)
-#        return cache_tag_t is
-#       begin
-#           return tagset(way * TAG_WIDTH + TAG_BITS
-#            - 1 downto way * TAG_WIDTH);
-#       end;
-        # Read a tag from a tag memory row
-        def read_tag(way, tagset):
-            return tagset[way * TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]

+#   begin
+#       if rising_edge(clk) then
+#           if rst = '1' then
+#               reservation.valid <= '0';
+            # TODO understand how resets work in nmigen
+#           elsif r0_valid = '1' and access_ok = '1' then
+            with m.Elif(r0_valid & access_ok):
+#               if clear_rsrv = '1' then
+                with m.If(clear_rsrv):
+#                   reservation.valid <= '0';
+                    sync += reservation.valid.eq(0)
+#               elsif set_rsrv = '1' then
+                with m.Elif(set_rsrv):
+#                   reservation.valid <= '1';
+#                   reservation.addr <=
+#                    r0.req.addr(63 downto LINE_OFF_BITS);
+                    sync += reservation.valid.eq(1)
+                    sync += reservation.addr.eq(
+                        r0.req.addr[LINE_OFF_BITS:64]
+                    )
+#               end if;
+#           end if;
+#       end if;
+#   end process;

-#       -- Read a TLB tag from a TLB tag memory row
-#       function read_tlb_tag(way: tlb_way_t; tags: tlb_way_tags_t)
-#        return tlb_tag_t is
-#           variable j : integer;
-#       begin
-#           j := way * TLB_EA_TAG_BITS;
-#           return tags(j + TLB_EA_TAG_BITS - 1 downto j);
-#       end;
-        # Read a TLB tag from a TLB tag memory row
-        def read_tlb_tag(way, tags):
-            j = Signal()

+    # Return data for loads &
completion control logic + def writeback_control(self, r1, cache_out, d_out, m_out): - j = way * TLB_EA_TAG_BITS - return tags[j:j + TLB_EA_TAG_BITS] + comb = m.d.comb + sync = m.d.sync + +# variable data_out : std_ulogic_vector(63 downto 0); +# variable data_fwd : std_ulogic_vector(63 downto 0); +# variable j : integer; + data_out = Signal(64) + data_fwd = Signal(64) + j = Signal() -# -- Write a TLB tag to a TLB tag memory row -# procedure write_tlb_tag(way: tlb_way_t; tags: inout tlb_way_tags_t; -# tag: tlb_tag_t) is -# variable j : integer; # begin -# j := way * TLB_EA_TAG_BITS; -# tags(j + TLB_EA_TAG_BITS - 1 downto j) := tag; -# end; - # Write a TLB tag to a TLB tag memory row - def write_tlb_tag(way, tags), tag): - j = Signal() +# -- Use the bypass if are reading the row that was +# -- written 1 or 2 cycles ago, including for the +# -- slow_valid = 1 case (i.e. completing a load +# -- miss or a non-cacheable load). +# if r1.use_forward1 = '1' then + # Use the bypass if are reading the row that was + # written 1 or 2 cycles ago, including for the + # slow_valid = 1 case (i.e. completing a load + # miss or a non-cacheable load). + with m.If(r1.use_forward1): +# data_fwd := r1.forward_data1; + comb += data_fwd.eq(r1.forward_data1) +# else + with m.Else(): +# data_fwd := r1.forward_data2; + comb += data_fwd.eq(r1.forward_data2) +# end if; - j = way * TLB_EA_TAG_BITS - tags[j:j + TLB_EA_TAG_BITS] = tag +# data_out := cache_out(r1.hit_way); + comb += data_out.eq(cache_out[r1.hit_way]) -# -- Read a PTE from a TLB PTE memory row -# function read_tlb_pte(way: tlb_way_t; ptes: tlb_way_ptes_t) -# return tlb_pte_t is -# variable j : integer; -# begin -# j := way * TLB_PTE_BITS; -# return ptes(j + TLB_PTE_BITS - 1 downto j); -# end; - # Read a PTE from a TLB PTE memory row - def read_tlb_pte(way, ptes): - j = Signal() +# for i in 0 to 7 loop + for i in range(8): +# j := i * 8; + comb += i * 8 - j = way * TLB_PTE_BITS - return ptes[j:j + TLB_PTE_BITS] +# if r1.forward_sel(i) = '1' then + with m.If(r1.forward_sel[i]): +# data_out(j + 7 downto j) := data_fwd(j + 7 downto j); + comb += data_out[j:j+8].eq(data_fwd[j:j+8]) +# end if; +# end loop; -# procedure write_tlb_pte(way: tlb_way_t; -# ptes: inout tlb_way_ptes_t; newpte: tlb_pte_t) is -# variable j : integer; -# begin -# j := way * TLB_PTE_BITS; -# ptes(j + TLB_PTE_BITS - 1 downto j) := newpte; -# end; - def write_tlb_pte(way, ptes,newpte): - j = Signal() +# d_out.valid <= r1.ls_valid; +# d_out.data <= data_out; +# d_out.store_done <= not r1.stcx_fail; +# d_out.error <= r1.ls_error; +# d_out.cache_paradox <= r1.cache_paradox; + comb += d_out.valid.eq(r1.ls_valid) + comb += d_out.data.eq(data_out) + comb += d_out.store_done.eq(~r1.stcx_fail) + comb += d_out.error.eq(r1.ls_error) + comb += d_out.cache_paradox.eq(r1.cache_paradox) - j = way * TLB_PTE_BITS - return ptes[j:j + TLB_PTE_BITS] = newpte +# -- Outputs to MMU +# m_out.done <= r1.mmu_done; +# m_out.err <= r1.mmu_error; +# m_out.data <= data_out; + comb += m_out.done.eq(r1.mmu_done) + comb += m_out.err.eq(r1.mmu_error) + comb += m_out.data.eq(data_out) -# begin +# -- We have a valid load or store hit or we just completed +# -- a slow op such as a load miss, a NC load or a store +# -- +# -- Note: the load hit is delayed by one cycle. 
However it
+#   -- can still not collide with r.slow_valid (well unless I
+#   -- miscalculated) because slow_valid can only be set on a
+#   -- subsequent request and not on its first cycle (the state
+#   -- machine must have advanced), which makes slow_valid
+#   -- at least 2 cycles from the previous hit_load_valid.
#
-"""these, because they are constants, can actually be done *as*
-   python asserts:
-   assert LINE_SIZE % ROWSIZE == 0, "line size not ...."
-"""
-# assert LINE_SIZE mod ROW_SIZE = 0
-#  report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
-# assert ispow2(LINE_SIZE)
-#  report "LINE_SIZE not power of 2" severity FAILURE;
-# assert ispow2(NUM_LINES)
-#  report "NUM_LINES not power of 2" severity FAILURE;
-# assert ispow2(ROW_PER_LINE)
-#  report "ROW_PER_LINE not power of 2" severity FAILURE;
-# assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
-#  report "geometry bits don't add up" severity FAILURE;
-# assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
-#  report "geometry bits don't add up" severity FAILURE;
-# assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
-#  report "geometry bits don't add up" severity FAILURE;
-# assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
-#  report "geometry bits don't add up" severity FAILURE;
-# assert (64 = wishbone_data_bits)
-#  report "Can't yet handle a wishbone width that isn't 64-bits"
-#  severity FAILURE;
-# assert SET_SIZE_BITS <= TLB_LG_PGSZ
-#  report "Set indexed by virtual address" severity FAILURE;
-        assert (LINE_SIZE % ROW_SIZE) == 0 "LINE_SIZE not " \
-         "multiple of ROW_SIZE"
-
-        assert (LINE_SIZE % 2) == 0 "LINE_SIZE not power of 2"
-
-        assert (NUM_LINES % 2) == 0 "NUM_LINES not power of 2"
-
-        assert (ROW_PER_LINE % 2) == 0 "ROW_PER_LINE not" \
-         "power of 2"

+#   -- Sanity: Only one of these must be set in any given cycle
+#   assert (r1.slow_valid and r1.stcx_fail) /= '1'
+#    report "unexpected slow_valid collision with stcx_fail"
+#    severity FAILURE;
+#   assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid)
+#    /= '1' report "unexpected hit_load_delayed collision with
+#    slow_valid" severity FAILURE;
+        # We have a valid load or store hit or we just completed
+        # a slow op such as a load miss, a NC load or a store
+        #
+        # Note: the load hit is delayed by one cycle. However it
+        # can still not collide with r.slow_valid (well unless I
+        # miscalculated) because slow_valid can only be set on a
+        # subsequent request and not on its first cycle (the state
+        # machine must have advanced), which makes slow_valid
+        # at least 2 cycles from the previous hit_load_valid.

-        assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS) \
-         "geometry bits don't add up"

+        # Sanity: Only one of these must be set in any given cycle
+        assert (r1.slow_valid & r1.stcx_fail) != 1, \
+            "unexpected slow_valid collision with stcx_fail"

-        assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) \
-         "geometry bits don't add up"

+        assert ((r1.slow_valid | r1.stcx_fail) & r1.hit_load_valid) != 1, \
+            "unexpected hit_load_delayed collision with slow_valid"

-        assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS \
-         + LINE_OFF_BITS) "geometry bits don't add up"

+#       if r1.mmu_req = '0' then
+        with m.If(~r1.mmu_req):

-        assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS) \
-         "geometry bits don't add up"

+#           -- Request came from loadstore1...
+#           -- Load hit case is the standard path

-        assert 64 == wishbone_data_bits "Can't yet handle a" \
-         "wishbone width that isn't 64-bits"

+#           if r1.hit_load_valid = '1' then

-        assert SET_SIZE_BITS <= TLB_LG_PGSZ "Set indexed by" \
-         "virtual address"

+            # Request came from loadstore1...
+ # Load hit case is the standard path + with m.If(r1.hit_load_valid): +# report +# "completing load hit data=" & to_hstring(data_out); + print(f"completing load hit data={data_out}") +# end if; - assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS) \ - "geometry bits don't add up" +# -- error cases complete without stalling +# if r1.ls_error = '1' then + # error cases complete without stalling + with m.If(r1.ls_error): +# report "completing ld/st with error"; + print("completing ld/st with error") +# end if; - assert 64 == wishbone_data_bits "Can't yet handle a" \ - "wishbone width that isn't 64-bits" +# -- Slow ops (load miss, NC, stores) +# if r1.slow_valid = '1' then + # Slow ops (load miss, NC, stores) + with m.If(r1.slow_valid): +# report +# "completing store or load miss data=" +# & to_hstring(data_out); + print(f"completing store or load miss data={data_out}") +# end if; - assert SET_SIZE_BITS <= TLB_LG_PGSZ "Set indexed by" \ - "virtual address" +# else + with m.Else(): +# -- Request came from MMU +# if r1.hit_load_valid = '1' then + # Request came from MMU + with m.If(r1.hit_load_valid): +# report "completing load hit to MMU, data=" +# & to_hstring(m_out.data); + print(f"completing load hit to MMU, data={m_out.data}") +# end if; +# +# -- error cases complete without stalling +# if r1.mmu_error = '1' then +# report "completing MMU ld with error"; + # error cases complete without stalling + with m.If(r1.mmu_error): + print("combpleting MMU ld with error") +# end if; +# +# -- Slow ops (i.e. load miss) +# if r1.slow_valid = '1' then + # Slow ops (i.e. load miss) + with m.If(r1.slow_valid): +# report "completing MMU load miss, data=" +# & to_hstring(m_out.data); + print("completing MMU load miss, data={m_out.data}") +# end if; +# end if; +# end process; -# -- Latch the request in r0.req as long as we're not stalling -# stage_0 : process(clk) -# Latch the request in r0.req as long as we're not stalling -class Stage0(Elaboratable): +# begin TODO +# -- Generate a cache RAM for each way. This handles the normal +# -- reads, writes from reloads and the special store-hit update +# -- path as well. +# -- +# -- Note: the BRAMs have an extra read buffer, meaning the output +# -- is pipelined an extra cycle. This differs from the +# -- icache. The writeback logic needs to take that into +# -- account by using 1-cycle delayed signals for load hits. 
+# -- +# rams: for i in 0 to NUM_WAYS-1 generate +# signal do_read : std_ulogic; +# signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0); +# signal do_write : std_ulogic; +# signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); +# signal wr_data : +# std_ulogic_vector(wishbone_data_bits-1 downto 0); +# signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0); +# signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0); +# signal dout : cache_row_t; +# begin +# way: entity work.cache_ram +# generic map ( +# ROW_BITS => ROW_BITS, +# WIDTH => wishbone_data_bits, +# ADD_BUF => true +# ) +# port map ( +# clk => clk, +# rd_en => do_read, +# rd_addr => rd_addr, +# rd_data => dout, +# wr_sel => wr_sel_m, +# wr_addr => wr_addr, +# wr_data => wr_data +# ); +# process(all) +# end TODO +class TODO(Elaboratable): def __init__(self): pass @@ -830,2114 +1257,1581 @@ class Stage0(Elaboratable): comb = m.d.comb sync = m.d.sync -# variable r : reg_stage_0_t; - r = RegStage0() - comb += r - -# begin -# if rising_edge(clk) then -# assert (d_in.valid and m_in.valid) = '0' -# report "request collision loadstore vs MMU"; - assert ~(d_in.valid & m_in.valid) "request collision - loadstore vs MMU" - -# if m_in.valid = '1' then - with m.If(m_in.valid): -# r.req.valid := '1'; -# r.req.load := not (m_in.tlbie or m_in.tlbld); -# r.req.dcbz := '0'; -# r.req.nc := '0'; -# r.req.reserve := '0'; -# r.req.virt_mode := '0'; -# r.req.priv_mode := '1'; -# r.req.addr := m_in.addr; -# r.req.data := m_in.pte; -# r.req.byte_sel := (others => '1'); -# r.tlbie := m_in.tlbie; -# r.doall := m_in.doall; -# r.tlbld := m_in.tlbld; -# r.mmu_req := '1'; - sync += r.req.valid.eq(1) - sync += r.req.load.eq(~(m_in.tlbie | m_in.tlbld)) - sync += r.req.priv_mode.eq(1) - sync += r.req.addr.eq(m_in.addr) - sync += r.req.data.eq(m_in.pte) - sync += r.req.byte_sel.eq(1) - sync += r.tlbie.eq(m_in.tlbie) - sync += r.doall.eq(m_in.doall) - sync += r.tlbld.eq(m_in.tlbld) - sync += r.mmu_req.eq(1) -# else - with m.Else(): -# r.req := d_in; -# r.tlbie := '0'; -# r.doall := '0'; -# r.tlbld := '0'; -# r.mmu_req := '0'; - sync += r.req.eq(d_in) -# end if; -# if rst = '1' then -# r0_full <= '0'; -# elsif r1.full = '0' or r0_full = '0' then - with m.If(~r1.full | ~r0_full): -# r0 <= r; -# r0_full <= r.req.valid; - sync += r0.eq(r) - sync += r0_full.eq(r.req.valid) -# end if; -# end if; -# end process; -# -# -- we don't yet handle collisions between loadstore1 requests -# -- and MMU requests -# m_out.stall <= '0'; -# we don't yet handle collisions between loadstore1 requests -# and MMU requests -comb += m_out.stall.eq(0) +# begin +# -- Cache hit reads +# do_read <= '1'; +# rd_addr <= +# std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS)); +# cache_out(i) <= dout; + # Cache hit reads + comb += do_read.eq(1) + comb += rd_addr.eq(Signal(ROW)) + comb += cache_out[i].eq(dout) -# -- Hold off the request in r0 when r1 has an uncompleted request -# r0_stall <= r0_full and r1.full; -# r0_valid <= r0_full and not r1.full; -# stall_out <= r0_stall; -# Hold off the request in r0 when r1 has an uncompleted request -comb += r0_stall.eq(r0_full & r1.full) -comb += r0_valid.eq(r0_full & ~r1.full) -comb += stall_out.eq(r0_stall) +# -- Write mux: +# -- +# -- Defaults to wishbone read responses (cache refill) +# -- +# -- For timing, the mux on wr_data/sel/addr is not +# -- dependent on anything other than the current state. 
+ # Write mux: + # + # Defaults to wishbone read responses (cache refill) + # + # For timing, the mux on wr_data/sel/addr is not + # dependent on anything other than the current state. +# wr_sel_m <= (others => '0'); + comb += wr_sel_m.eq(0) -# -- TLB -# -- Operates in the second cycle on the request latched in r0.req. -# -- TLB updates write the entry at the end of the second cycle. -# tlb_read : process(clk) -# TLB -# Operates in the second cycle on the request latched in r0.req. -# TLB updates write the entry at the end of the second cycle. -class TLBRead(Elaboratable): - def __init__(self): - pass +# do_write <= '0'; + comb += do_write.eq(0) +# if r1.write_bram = '1' then + with m.If(r1.write_bram): +# -- Write store data to BRAM. This happens one +# -- cycle after the store is in r0. + # Write store data to BRAM. This happens one + # cycle after the store is in r0. +# wr_data <= r1.req.data; +# wr_sel <= r1.req.byte_sel; +# wr_addr <= std_ulogic_vector(to_unsigned( +# get_row(r1.req.real_addr), ROW_BITS +# )); + comb += wr_data.eq(r1.req.data) + comb += wr_sel.eq(r1.req.byte_sel) + comb += wr_addr.eq(Signal(get_row(r1.req.real_addr))) - def elaborate(self, platform): - m = Module() +# if i = r1.req.hit_way then + with m.If(i == r1.req.hit_way): +# do_write <= '1'; + comb += do_write.eq(1) +# end if; +# else + with m.Else(): +# -- Otherwise, we might be doing a reload or a DCBZ +# if r1.dcbz = '1' then + # Otherwise, we might be doing a reload or a DCBZ + with m.If(r1.dcbz): +# wr_data <= (others => '0'); + comb += wr_data.eq(0) +# else + with m.Else(): +# wr_data <= wishbone_in.dat; + comb += wr_data.eq(wishbone_in.dat) +# end if; - comb = m.d.comb - sync = m.d.sync +# wr_addr <= std_ulogic_vector(to_unsigned( +# r1.store_row, ROW_BITS +# )); +# wr_sel <= (others => '1'); + comb += wr_addr.eq(Signal(r1.store_row)) + comb += wr_sel.eq(1) -# variable index : tlb_index_t; -# variable addrbits : -# std_ulogic_vector(TLB_SET_BITS - 1 downto 0); - index = TLB_SET_SIZE - addrbits = Signal(TLB_SET_BITS) +# if r1.state = RELOAD_WAIT_ACK and +# wishbone_in.ack = '1' and replace_way = i then + with m.If(r1.state == State.RELOAD_WAIT_ACK + & wishbone_in.ack & relpace_way == i): +# do_write <= '1'; + comb += do_write.eq(1) +# end if; +# end if; - comb += index - comb += addrbits +# -- Mask write selects with do_write since BRAM +# -- doesn't have a global write-enable +# if do_write = '1' then +# -- Mask write selects with do_write since BRAM +# -- doesn't have a global write-enable + with m.If(do_write): +# wr_sel_m <= wr_sel; + comb += wr_sel_m.eq(wr_sel) +# end if; +# end process; +# end generate; + + # Cache hit synchronous machine for the easy case. + # This handles load hits. 
+ # It also handles error cases (TLB miss, cache paradox) + def dcache_fast_hit(self, req_op, r0_valid, r1, ): + + comb = m.d.comb + sync = m.d.sync # begin # if rising_edge(clk) then -# if m_in.valid = '1' then - with m.If(m_in.valid): -# addrbits := m_in.addr(TLB_LG_PGSZ + TLB_SET_BITS -# - 1 downto TLB_LG_PGSZ); - sync += addrbits.eq(m_in.addr[ - TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_SET_BITS - ]) -# else - with m.Else(): -# addrbits := d_in.addr(TLB_LG_PGSZ + TLB_SET_BITS -# - 1 downto TLB_LG_PGSZ); - sync += addrbits.eq(d_in.addr[ - TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_SET_BITS - ]) +# if req_op /= OP_NONE then + with m.If(req_op != Op.OP_NONE): +# report "op:" & op_t'image(req_op) & +# " addr:" & to_hstring(r0.req.addr) & +# " nc:" & std_ulogic'image(r0.req.nc) & +# " idx:" & integer'image(req_index) & +# " tag:" & to_hstring(req_tag) & +# " way: " & integer'image(req_hit_way); + print(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \ + f"idx:{req_index} tag:{req_tag} way: {req_hit_way}" + ) +# end if; +# if r0_valid = '1' then + with m.If(r0_valid): +# r1.mmu_req <= r0.mmu_req; + sync += r1.mmu_req.eq(r0.mmu_req) # end if; -# index := to_integer(unsigned(addrbits)); - sync += index.eq(addrbits) -# -- If we have any op and the previous op isn't finished, -# -- then keep the same output for next cycle. -# if r0_stall = '0' then -# If we have any op and the previous op isn't finished, -# then keep the same output for next cycle. - with m.If(~r0_stall): - sync += tlb_valid_way.eq(dtlb_valids[index]) - sync += tlb_tag_way.eq(dtlb_tags[index]) - sync += tlb_pte_way.eq(dtlb_ptes[index]) -# end if; -# end if; -# end process; +# -- Fast path for load/store hits. +# -- Set signals for the writeback controls. +# r1.hit_way <= req_hit_way; +# r1.hit_index <= req_index; + # Fast path for load/store hits. + # Set signals for the writeback controls. 
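        # (An added note: the assignments below are on `sync`, not
        # `comb`, on purpose -- the data BRAMs carry an extra read
        # buffer, so hit information must be registered for one cycle
        # to arrive at the writeback mux together with the data; see
        # the comment above the "rams" generate block about 1-cycle
        # delayed signals for load hits.)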
+        sync += r1.hit_way.eq(req_hit_way)
+        sync += r1.hit_index.eq(req_index)

+#       if req_op = OP_LOAD_HIT then
+        with m.If(req_op == Op.OP_LOAD_HIT):
+#           r1.hit_load_valid <= '1';
+            sync += r1.hit_load_valid.eq(1)

+#       else
+        with m.Else():
+#           r1.hit_load_valid <= '0';
+            sync += r1.hit_load_valid.eq(0)
+#       end if;

+#       if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then
+        with m.If((req_op == Op.OP_LOAD_HIT)
+                  | (req_op == Op.OP_STORE_HIT)):
+#           r1.cache_hit <= '1';
+            sync += r1.cache_hit.eq(1)
+#       else
+        with m.Else():
+#           r1.cache_hit <= '0';
+            sync += r1.cache_hit.eq(0)
+#       end if;

+#       if req_op = OP_BAD then
+        with m.If(req_op == Op.OP_BAD):
+#           report "Signalling ld/st error valid_ra=" &
+#            std_ulogic'image(valid_ra) & " rc_ok=" &
+#            std_ulogic'image(rc_ok) & " perm_ok=" &
+#            std_ulogic'image(perm_ok);
+            print(f"Signalling ld/st error valid_ra={valid_ra} "
+                  f"rc_ok={rc_ok} perm_ok={perm_ok}")

+#           r1.ls_error <= not r0.mmu_req;
+#           r1.mmu_error <= r0.mmu_req;
+#           r1.cache_paradox <= access_ok;
+            sync += r1.ls_error.eq(~r0.mmu_req)
+            sync += r1.mmu_error.eq(r0.mmu_req)
+            sync += r1.cache_paradox.eq(access_ok)

+#       else
+        with m.Else():
+#           r1.ls_error <= '0';
+#           r1.mmu_error <= '0';
+#           r1.cache_paradox <= '0';
+            sync += r1.ls_error.eq(0)
+            sync += r1.mmu_error.eq(0)
+            sync += r1.cache_paradox.eq(0)
+#       end if;
+#
+#       if req_op = OP_STCX_FAIL then
+        with m.If(req_op == Op.OP_STCX_FAIL):
+#           r1.stcx_fail <= '1';
+            sync += r1.stcx_fail.eq(1)

+#       else
+        with m.Else():
+#           r1.stcx_fail <= '0';
+            sync += r1.stcx_fail.eq(0)
+#       end if;
+#
+#       -- Record TLB hit information for updating TLB PLRU
+#       r1.tlb_hit <= tlb_hit;
+#       r1.tlb_hit_way <= tlb_hit_way;
+#       r1.tlb_hit_index <= tlb_req_index;
+        # Record TLB hit information for updating TLB PLRU
+        sync += r1.tlb_hit.eq(tlb_hit)
+        sync += r1.tlb_hit_way.eq(tlb_hit_way)
+        sync += r1.tlb_hit_index.eq(tlb_req_index)
+#   end if;
+#   end process;

+    # Memory accesses are handled by this state machine:
+    #
+    #   * Cache load miss/reload (in conjunction with "rams")
+    #   * Load hits for non-cachable forms
+    #   * Stores (the collision case is handled in "rams")
+    #
+    # All wishbone requests generation is done here.
+    # This machine operates at stage 1.
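As a rough, self-contained illustration of how a state machine of this shape
maps onto nmigen, here is a minimal sketch of the reload handshake only --
wait for acks, count them, finish on the last beat. All names here are
illustrative (a toy 8-beat line), not the real dcache_slow interface:

    from nmigen import Elaboratable, Module, Signal

    class ReloadFSM(Elaboratable):
        """Skeleton reload machine: start a transfer, then count acks
        until the last beat of the line has arrived."""
        def __init__(self):
            self.start = Signal()   # begin a reload
            self.ack   = Signal()   # one beat acknowledged (cf. wishbone_in.ack)
            self.acks  = Signal(3)  # beats seen so far (cf. "acks" below)
            self.done  = Signal()

        def elaborate(self, platform):
            m = Module()
            m.d.comb += self.done.eq(0)
            with m.FSM():
                with m.State("IDLE"):
                    with m.If(self.start):
                        m.d.sync += self.acks.eq(0)
                        m.next = "RELOAD_WAIT_ACK"
                with m.State("RELOAD_WAIT_ACK"):
                    with m.If(self.ack):
                        m.d.sync += self.acks.eq(self.acks + 1)
                        with m.If(self.acks == 7):  # last beat of the line
                            m.d.comb += self.done.eq(1)
                            m.next = "IDLE"
            return m

The real logic below keeps its equivalents (acks, stbs_done, r1.store_row)
in stage-1 state rather than in a standalone module.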
+ def dcache_slow(self, r1, use_forward1_next, cache_valid_bits, r0, + r0_valid, req_op, cache_tag, req_go, ra, wb_in): comb = m.d.comb sync = m.d.sync -# variable hitway : tlb_way_t; -# variable hit : std_ulogic; -# variable eatag : tlb_tag_t; - hitway = TLBWay() - hit = Signal() - eatag = TLBTag() - - comb += hitway - comb += hit - comb += eatag +# variable stbs_done : boolean; +# variable req : mem_access_request_t; +# variable acks : unsigned(2 downto 0); + stbs_done = Signal() + req = MemAccessRequest() + acks = Signal(3) -# begin -# tlb_req_index <= -# to_integer(unsigned(r0.req.addr( -# TLB_LG_PGSZ + TLB_SET_BITS - 1 downto TLB_LG_PGSZ -# ))); -# hitway := 0; -# hit := '0'; -# eatag := r0.req.addr(63 downto TLB_LG_PGSZ + TLB_SET_BITS); -# for i in tlb_way_t loop -# if tlb_valid_way(i) = '1' and -# read_tlb_tag(i, tlb_tag_way) = eatag then -# hitway := i; -# hit := '1'; -# end if; -# end loop; -# tlb_hit <= hit and r0_valid; -# tlb_hit_way <= hitway; - comb += tlb_req_index.eq(r0.req.addr[ - TLB_LG_PGSZ:TLB_LG_PGSZ + TLB_SET_BITS - ]) + comb += stbs_done + comb += req + comb += acks - comb += eatag.eq(r0.req.addr[ - TLB_LG_PGSZ + TLB_SET_BITS:64 - ]) +# begin +# if rising_edge(clk) then +# r1.use_forward1 <= use_forward1_next; +# r1.forward_sel <= (others => '0'); + sync += r1.use_forward1.eq(use_forward1_next) + sync += r1.forward_sel.eq(0) - for i in TLBWay(): - with m.If(tlb_valid_way(i) - & read_tlb_tag(i, tlb_tag_way) == eatag): +# if use_forward1_next = '1' then + with m.If(use_forward1_next): +# r1.forward_sel <= r1.req.byte_sel; + sync += r1.forward_sel.eq(r1.req.byte_sel) - comb += hitway.eq(i) - comb += hit.eq(1) +# elsif use_forward2_next = '1' then + with m.Elif(use_forward2_next): +# r1.forward_sel <= r1.forward_sel1; + sync += r1.forward_sel.eq(r1.forward_sel1) +# end if; - comb += tlb_hit.eq(hit & r0_valid) - comb += tlb_hit_way.eq(hitway) +# r1.forward_data2 <= r1.forward_data1; + sync += r1.forward_data2.eq(r1.forward_data1) -# if tlb_hit = '1' then - with m.If(tlb_hit): -# pte <= read_tlb_pte(hitway, tlb_pte_way); - comb += pte.eq(read_tlb_pte(hitway, tlb_pte_way)) -# else - with m.Else(): -# pte <= (others => '0'); - comb += pte.eq(0) -# end if; -# valid_ra <= tlb_hit or not r0.req.virt_mode; - comb += valid_ra.eq(tlb_hit | ~r0.req.virt_mode) -# if r0.req.virt_mode = '1' then - with m.If(r0.req.virt_mode): -# ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) & -# r0.req.addr(TLB_LG_PGSZ - 1 downto ROW_OFF_BITS) & -# (ROW_OFF_BITS-1 downto 0 => '0'); -# perm_attr <= extract_perm_attr(pte); - comb += ra.eq(Cat( - Const(ROW_OFF_BITS, ROW_OFF_BITS), - r0.req.addr[ROW_OFF_BITS:TLB_LG_PGSZ], - pte[TLB_LG_PGSZ:REAL_ADDR_BITS] - )) - comb += perm_attr.eq(extract_perm_attr(pte)) -# else +# if r1.write_bram = '1' then + with m.If(r1.write_bram): +# r1.forward_data1 <= r1.req.data; +# r1.forward_sel1 <= r1.req.byte_sel; +# r1.forward_way1 <= r1.req.hit_way; +# r1.forward_row1 <= get_row(r1.req.real_addr); +# r1.forward_valid1 <= '1'; + sync += r1.forward_data1.eq(r1.req.data) + sync += r1.forward_sel1.eq(r1.req.byte_sel) + sync += r1.forward_way1.eq(r1.req.hit_way) + sync += r1.forward_row1.eq(get_row(r1.req.real_addr)) + sync += r1.forward_valid1.eq(1) +# else with m.Else(): -# ra <= r0.req.addr( -# REAL_ADDR_BITS - 1 downto ROW_OFF_BITS -# ) & (ROW_OFF_BITS-1 downto 0 => '0'); - comb += ra.eq(Cat( - Const(ROW_OFF_BITS, ROW_OFF_BITS), - r0.rq.addr[ROW_OFF_BITS:REAL_ADDR_BITS] - ) - -# perm_attr <= real_mode_perm_attr; - comb += perm_attr.reference.eq(1) - comb += 
perm_attr.changed.eq(1) - comb += perm_attr.priv.eq(1) - comb += perm_attr.nocache.eq(0) - comb += perm_attr.rd_perm.eq(1) - comb += perm_attr.wr_perm.eq(1) -# end if; -# end process; - -# tlb_update : process(clk) -class TLBUpdate(Elaboratable): - def __init__(self): - pass - - def elaborate(self, platform): - m = Module() - - comb = m.d.comb - sync = m.d.sync - -# variable tlbie : std_ulogic; -# variable tlbwe : std_ulogic; -# variable repl_way : tlb_way_t; -# variable eatag : tlb_tag_t; -# variable tagset : tlb_way_tags_t; -# variable pteset : tlb_way_ptes_t; - tlbie = Signal() - tlbwe = Signal() - repl_way = TLBWay() - eatag = TLBTag() - tagset = TLBWayTags() - pteset = TLBWayPtes() - - comb += tlbie - comb += tlbwe - comb += repl_way - comb += eatag - comb += tagset - comb += pteset -# begin -# if rising_edge(clk) then -# tlbie := r0_valid and r0.tlbie; -# tlbwe := r0_valid and r0.tlbldoi; - sync += tlbie.eq(r0_valid & r0.tlbie) - sync += tlbwe.eq(r0_valid & r0.tlbldoi) +# if r1.dcbz = '1' then + with m.If(r1.bcbz): +# r1.forward_data1 <= (others => '0'); + sync += r1.forward_data1.eq(0) -# if rst = '1' or (tlbie = '1' and r0.doall = '1') then -# with m.If (TODO understand how signal resets work in nmigen) -# -- clear all valid bits at once -# for i in tlb_index_t loop -# dtlb_valids(i) <= (others => '0'); -# end loop; - # clear all valid bits at once - for i in range(TLB_SET_SIZE): - sync += dtlb_valids[i].eq(0) -# elsif tlbie = '1' then - with m.Elif(tlbie): -# if tlb_hit = '1' then - with m.If(tlb_hit): -# dtlb_valids(tlb_req_index)(tlb_hit_way) <= '0'; - sync += dtlb_valids[tlb_req_index][tlb_hit_way].eq(0) -# end if; -# elsif tlbwe = '1' then - with m.Elif(tlbwe): -# if tlb_hit = '1' then - with m.If(tlb_hit): -# repl_way := tlb_hit_way; - sync += repl_way.eq(tlb_hit_way) # else with m.Else(): -# repl_way := to_integer(unsigned( -# tlb_plru_victim(tlb_req_index))); - sync += repl_way.eq(tlb_plru_victim[tlb_req_index]) +# r1.forward_data1 <= wishbone_in.dat; + sync += r1.forward_data1.eq(wb_in.dat) # end if; -# eatag := r0.req.addr( -# 63 downto TLB_LG_PGSZ + TLB_SET_BITS -# ); -# tagset := tlb_tag_way; -# write_tlb_tag(repl_way, tagset, eatag); -# dtlb_tags(tlb_req_index) <= tagset; -# pteset := tlb_pte_way; -# write_tlb_pte(repl_way, pteset, r0.req.data); -# dtlb_ptes(tlb_req_index) <= pteset; -# dtlb_valids(tlb_req_index)(repl_way) <= '1'; - sync += eatag.eq(r0.req.addr[TLB_LG_PGSZ + TLB_SET_BITS:64]) - sync += tagset.eq(tlb_tag_way) - sync += write_tlb_tag(repl_way, tagset, eatag) - sync += dtlb_tags[tlb_req_index].eq(tagset) - sync += pteset.eq(tlb_pte_way) - sync += write_tlb_pte(repl_way, pteset, r0.req.data) - sync += dtlb_ptes[tlb_req_index].eq(pteset) - sync += dtlb_valids[tlb_req_index][repl_way].eq(1) -# end if; -# end if; -# end process; -# -- Generate PLRUs -# maybe_plrus: if NUM_WAYS > 1 generate -class MaybePLRUs(Elaboratable): - def __init__(self): - pass +# r1.forward_sel1 <= (others => '1'); +# r1.forward_way1 <= replace_way; +# r1.forward_row1 <= r1.store_row; +# r1.forward_valid1 <= '0'; + sync += r1.forward_sel1.eq(1) + sync += r1.forward_way1.eq(replace_way) + sync += r1.forward_row1.eq(r1.store_row) + sync += r1.forward_valid1.eq(0) +# end if; - def elaborate(self, platform): - m = Module() +# -- On reset, clear all valid bits to force misses +# if rst = '1' then + # On reset, clear all valid bits to force misses + # TODO figure out how reset signal works in nmigeni + with m.If("""TODO RST???"""): +# for i in index_t loop + for i in range(INDEX): +# 
cache_valids(i) <= (others => '0'); + sync += cache_valid_bits[i].eq(0) +# end loop; - comb = m.d.comb - sync = m.d.sync +# r1.state <= IDLE; +# r1.full <= '0'; +# r1.slow_valid <= '0'; +# r1.wb.cyc <= '0'; +# r1.wb.stb <= '0'; +# r1.ls_valid <= '0'; +# r1.mmu_done <= '0'; + sync += r1.state.eq(State.IDLE) + sync += r1.full.eq(0) + sync += r1.slow_valid.eq(0) + sync += r1.wb.cyc.eq(0) + sync += r1.wb.stb.eq(0) + sync += r1.ls_valid.eq(0) + sync += r1.mmu_done.eq(0) -# begin - # TODO learn translation of generate into nmgien @lkcl -# plrus: for i in 0 to NUM_LINES-1 generate -# -- PLRU interface -# signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); -# signal plru_acc_en : std_ulogic; -# signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); -# -# begin - # TODO learn tranlation of entity, generic map, port map in - # nmigen @lkcl -# plru : entity work.plru -# generic map ( -# BITS => WAY_BITS -# ) -# port map ( -# clk => clk, -# rst => rst, -# acc => plru_acc, -# acc_en => plru_acc_en, -# lru => plru_out -# ); -# -# process(all) -# begin -# -- PLRU interface -# if r1.hit_index = i then - # PLRU interface - with m.If(r1.hit_index == i): -# plru_acc_en <= r1.cache_hit; - comb += plru_acc_en.eq(r1.cache_hit) -# else +# -- Not useful normally but helps avoiding +# -- tons of sim warnings + # Not useful normally but helps avoiding + # tons of sim warnings +# r1.wb.adr <= (others => '0'); + sync += r1.wb.adr.eq(0) +# else with m.Else(): -# plru_acc_en <= '0'; - comb += plru_acc_en.eq(0) -# end if; -# plru_acc <= std_ulogic_vector(to_unsigned( -# r1.hit_way, WAY_BITS -# )); -# plru_victim(i) <= plru_out; - comb += plru_acc.eq(r1.hit_way) - comb += plru_victime[i].eq(plru_out) -# end process; -# end generate; -# end generate; - -# -- Cache tag RAM read port -# cache_tag_read : process(clk) -# Cache tag RAM read port -class CacheTagRead(Elaboratable): - def __init__(self): - pass - - def elaborate(self, platform): - m = Module() - - comb = m.d.comb - sync = m.d.sync - -# variable index : index_t; - index = Signal(INDEX) - - comb += index - -# begin -# if rising_edge(clk) then -# if r0_stall = '1' then - with m.If(r0_stall): -# index := req_index; - sync += index.eq(req_index) - -# elsif m_in.valid = '1' then - with m.Elif(m_in.valid): -# index := get_index(m_in.addr); - sync += index.eq(get_index(m_in.addr)) - -# else - with m.Else(): -# index := get_index(d_in.addr); - sync += index.eq(get_index(d_in.addr)) -# end if; -# cache_tag_set <= cache_tags(index); - sync += cache_tag_set.eq(cache_tags(index)) -# end if; -# end process; - -# -- Cache request parsing and hit detection -# dcache_request : process(all) -# Cache request parsing and hit detection -class DcacheRequest(Elaboratable): - def __init__(self): - pass - - def elaborate(self, platform): -# variable is_hit : std_ulogic; -# variable hit_way : way_t; -# variable op : op_t; -# variable opsel : std_ulogic_vector(2 downto 0); -# variable go : std_ulogic; -# variable nc : std_ulogic; -# variable s_hit : std_ulogic; -# variable s_tag : cache_tag_t; -# variable s_pte : tlb_pte_t; -# variable s_ra : std_ulogic_vector( -# REAL_ADDR_BITS - 1 downto 0 -# ); -# variable hit_set : std_ulogic_vector( -# TLB_NUM_WAYS - 1 downto 0 -# ); -# variable hit_way_set : hit_way_set_t; -# variable rel_matches : std_ulogic_vector( -# TLB_NUM_WAYS - 1 downto 0 -# ); - rel_match = Signal() - is_hit = Signal() - hit_way = Signal(WAY_BITS) - op = Op() - opsel = Signal(3) - go = Signal() - nc = Signal() - s_hit = Signal() - s_tag = Signal(CACHE_TAG) - s_pte = 
Signal(TLB_PTE) - s_ra = Signal(REAL_ADDR_BITS) - hit_set = Signal(TLB_NUM_WAYS) - hit_way_set = HitWaySet() - rel_matches = Signal(TLB_NUM_WAYS) - rel_match = Signal() - - comb += rel_match - comb += is_hit - comb += hit_way - comb += op - comb += opsel - comb += go - comb += nc - comb += s_hit - comb += s_tag - comb += s_pte - comb += s_ra - comb += hit_set - comb += hit_way_set - comb += rel_matches - comb += rel_match - -# begin -# -- Extract line, row and tag from request -# req_index <= get_index(r0.req.addr); -# req_row <= get_row(r0.req.addr); -# req_tag <= get_tag(ra); +# -- One cycle pulses reset +# r1.slow_valid <= '0'; +# r1.write_bram <= '0'; +# r1.inc_acks <= '0'; +# r1.dec_acks <= '0'; # -# go := r0_valid and not (r0.tlbie or r0.tlbld) -# and not r1.ls_error; - # Extract line, row and tag from request - comb += req_index.eq(get_index(r0.req.addr)) - comb += req_row.eq(get_row(r0.req.addr)) - comb += req_tag.eq(get_tag(ra)) - - comb += go.eq(r0_valid & ~(r0.tlbie | r0.tlbld) & ~r1.ls_error) - -# -- Test if pending request is a hit on any way -# -- In order to make timing in virtual mode, -# -- when we are using the TLB, we compare each -# --way with each of the real addresses from each way of -# -- the TLB, and then decide later which match to use. -# hit_way := 0; -# is_hit := '0'; -# rel_match := '0'; - # Test if pending request is a hit on any way - # In order to make timing in virtual mode, - # when we are using the TLB, we compare each - # way with each of the real addresses from each way of - # the TLB, and then decide later which match to use. - comb += hit_way.eq(0) - comb += is_hit.eq(0) - comb += rel_match.eq(0) +# r1.ls_valid <= '0'; +# -- complete tlbies and TLB loads in the third cycle +# r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld); + # One cycle pulses reset + sync += r1.slow_valid.eq(0) + sync += r1.write_bram.eq(0) + sync += r1.inc_acks.eq(0) + sync += r1.dec_acks.eq(0) -# if r0.req.virt_mode = '1' then - with m.If(r0.req.virt_mode): -# rel_matches := (others => '0'); - comb += rel_matches.eq(0) -# for j in tlb_way_t loop - for j in range(TLB_WAY): -# hit_way_set(j) := 0; -# s_hit := '0'; -# s_pte := read_tlb_pte(j, tlb_pte_way); -# s_ra := s_pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) -# & r0.req.addr(TLB_LG_PGSZ - 1 downto 0); -# s_tag := get_tag(s_ra); - comb += hit_way_set[j].eq(0) - comb += s_hit.eq(0) - comb += s_pte.eq(read_tlb_pte(j, tlb_pte_way)) - comb += s_ra.eq(Cat( - r0.req.addr[0:TLB_LG_PGSZ], - s_pte[TLB_LG_PGSZ:REAL_ADDR_BITS] - )) - comb += s_tag.eq(get_tag(s_ra)) + sync += r1.ls_valid.eq(0) + # complete tlbies and TLB loads in the third cycle + sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld)) -# for i in way_t loop - for i in range(NUM_WAYS): -# if go = '1' and cache_valids(req_index)(i) = '1' -# and read_tag(i, cache_tag_set) = s_tag -# and tlb_valid_way(j) = '1' then - with m.If(go & cache_valid_bits[req_index][i] & - read_tag(i, cache_tag_set) == s_tag - & tlb_valid_way[j]): -# hit_way_set(j) := i; -# s_hit := '1'; - comb += hit_way_set[j].eq(i) - comb += s_hit.eq(1) +# if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then + with m.If(req_op == Op.OP_LOAD_HIT + | req_op == Op.OP_STCX_FAIL): +# if r0.mmu_req = '0' then + with m.If(~r0.mmu_req): +# r1.ls_valid <= '1'; + sync += r1.ls_valid.eq(1) +# else + with m.Else(): +# r1.mmu_done <= '1'; + sync += r1.mmu_done.eq(1) # end if; -# end loop; -# hit_set(j) := s_hit; - comb += hit_set[j].eq(s_hit) -# if s_tag = r1.reload_tag then - with m.If(s_tag == r1.reload_tag): -# 
rel_matches(j) := '1'; - comb += rel_matches[j].eq(1) # end if; -# end loop; -# if tlb_hit = '1' then - with m.If(tlb_hit): -# is_hit := hit_set(tlb_hit_way); -# hit_way := hit_way_set(tlb_hit_way); -# rel_match := rel_matches(tlb_hit_way); - comb += is_hit.eq(hit_set[tlb_hit_way]) - comb += hit_way.eq(hit_way_set[tlb_hit_way]) - comb += rel_match.eq(rel_matches[tlb_hit_way]) -# end if; -# else - with m.Else(): -# s_tag := get_tag(r0.req.addr); - comb += s_tag.eq(get_tag(r0.req.addr)) -# for i in way_t loop - for i in range(NUM_WAYS): -# if go = '1' and cache_valids(req_index)(i) = '1' and -# read_tag(i, cache_tag_set) = s_tag then - with m.If(go & cache_valid_bits[req_index][i] & - read_tag(i, cache_tag_set) == s_tag): -# hit_way := i; -# is_hit := '1'; - comb += hit_way.eq(i) - comb += is_hit.eq(1) + +# if r1.write_tag = '1' then + with m.If(r1.write_tag): +# -- Store new tag in selected way +# for i in 0 to NUM_WAYS-1 loop + # Store new tag in selected way + for i in range(NUM_WAYS): +# if i = replace_way then + with m.If(i == replace_way): +# cache_tags(r1.store_index)( +# (i + 1) * TAG_WIDTH - 1 +# downto i * TAG_WIDTH +# ) <= +# (TAG_WIDTH - 1 downto TAG_BITS => '0') +# & r1.reload_tag; + sync += cache_tag[ + r1.store_index + ][i * TAG_WIDTH:(i +1) * TAG_WIDTH].eq( + Const(TAG_WIDTH, TAG_WIDTH) + & r1.reload_tag + ) +# end if; +# end loop; +# r1.store_way <= replace_way; +# r1.write_tag <= '0'; + sync += r1.store_way.eq(replace_way) + sync += r1.write_tag.eq(0) # end if; -# end loop; -# if s_tag = r1.reload_tag then - with m.If(s_tag == r1.reload_tag): -# rel_match := '1'; - comb += rel_match.eq(1) -# end if; -# end if; -# req_same_tag <= rel_match; - comb += req_same_tag.eq(rel_match) -# -- See if the request matches the line -# -- currently being reloaded -# if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index -# and rel_match = '1' then - # See if the request matches the line currently being reloaded - with m.If(r1.state == State.RELOAD_WAIT_ACK & req_index == - r1.store_index & rel_match): -# -- For a store, consider this a hit even if the row -# -- isn't valid since it will be by the time we -# -- perform the store. For a load, check the -# -- appropriate row valid bit. - # For a store, consider this a hit even if the row isn't - # valid since it will be by the time we perform the store. - # For a load, check the appropriate row valid bit. -# is_hit := -# not r0.req.load -# or r1.rows_valid(req_row mod ROW_PER_LINE); -# hit_way := replace_way; - comb += is_hit.eq(~r0.req.load - | r1.rows_valid[req_row % ROW_PER_LINE]) - comb += hit_way.eq(replace_way) -# end if; +# -- Take request from r1.req if there is one there, +# -- else from req_op, ra, etc. +# if r1.full = '1' then + # Take request from r1.req if there is one there, + # else from req_op, ra, etc. 
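+ # NOTE (hedged sketch): "req" is a VHDL process *variable*, so
+ # it takes its new value within the same cycle; the closest
+ # nmigen rendering is a comb-driven intermediate, e.g.
+ #
+ #     with m.If(r1.full):
+ #         m.d.comb += req.eq(r1.req)      # replay latched request
+ #     with m.Else():
+ #         m.d.comb += req.op.eq(req_op)   # build from stage-0 inputs
+ #
+ # the sync += form used below registers it a cycle later instead.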
+ with m.If(r1.full):
+# req := r1.req;
+ sync += req.eq(r1.req)

+# else
+ with m.Else():
+# req.op := req_op;
+# req.valid := req_go;
+# req.mmu_req := r0.mmu_req;
+# req.dcbz := r0.req.dcbz;
+# req.real_addr := ra;
+ sync += req.op.eq(req_op)
+ sync += req.valid.eq(req_go)
+ sync += req.mmu_req.eq(r0.mmu_req)
+ sync += req.dcbz.eq(r0.req.dcbz)
+ sync += req.real_addr.eq(ra)

+# -- Force data to 0 for dcbz
+# if r0.req.dcbz = '0' then
+ with m.If(~r0.req.dcbz):
+# req.data := r0.req.data;
+ sync += req.data.eq(r0.req.data)

-# -- Whether to use forwarded data for a load or not
- # Whether to use forwarded data for a load or not
-# use_forward1_next <= '0';
- comb += use_forward1_next.eq(0)
-# if get_row(r1.req.real_addr) = req_row
-# and r1.req.hit_way = hit_way then
- with m.If(get_row(r1.req.real_addr) == req_row
- & r1.req.hit_way == hit_way)
-# -- Only need to consider r1.write_bram here, since if we
-# -- are writing refill data here, then we don't have a
-# -- cache hit this cycle on the line being refilled.
-# -- (There is the possibility that the load following
-# -- the load miss that started the refill could be to
-# -- the old contents of the victim line, since it is a
-# -- couple of cycles after the refill starts before we
-# -- see the updated cache tag.
-# -- In that case we don't use the bypass.)
- # Only need to consider r1.write_bram here, since if we
- # are writing refill data here, then we don't have a
- # cache hit this cycle on the line being refilled.
- # (There is the possibility that the load following the
- # load miss that started the refill could be to the old
- # contents of the victim line, since it is a couple of
- # cycles after the refill starts before we see the updated
- # cache tag. In that case we don't use the bypass.)
-# use_forward1_next <= r1.write_bram; - comb += use_forward1_next.eq(r1.write_bram) -# end if; -# use_forward2_next <= '0'; - comb += use_forward2_next.eq(0) -# if r1.forward_row1 = req_row -# and r1.forward_way1 = hit_way then - with m.If(r1.forward_row1 == req_row - & r1.forward_way1 == hit_way): -# use_forward2_next <= r1.forward_valid1; - comb += use_forward2_next.eq(r1.forward_valid1) -# end if; +# else + with m.Else(): +# req.data := (others => '0'); + sync += req.data.eq(0) +# end if; -# -- The way that matched on a hit - # The way that matched on a hit -# req_hit_way <= hit_way; - comb += req_hit_way.eq(hit_way) +# -- Select all bytes for dcbz +# -- and for cacheable loads +# if r0.req.dcbz = '1' +# or (r0.req.load = '1' and r0.req.nc = '0') then + # Select all bytes for dcbz + # and for cacheable loads + with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc): +# req.byte_sel := (others => '1'); + sync += req.byte_sel.eq(1) -# -- The way to replace on a miss - # The way to replace on a miss -# if r1.write_tag = '1' then - with m.If(r1.write_tag): -# replace_way <= to_integer(unsigned( -# plru_victim(r1.store_index) -# )); - replace_way.eq(plru_victim[r1.store_index]) -# else - with m.Else(): -# replace_way <= r1.store_way; - comb += replace_way.eq(r1.store_way) -# end if; +# else + with m.Else(): +# req.byte_sel := r0.req.byte_sel; + sync += req.byte_sel.eq(r0.req.byte_sel) +# end if; -# -- work out whether we have permission for this access -# -- NB we don't yet implement AMR, thus no KUAP - # work out whether we have permission for this access - # NB we don't yet implement AMR, thus no KUAP -# rc_ok <= perm_attr.reference and -# (r0.req.load or perm_attr.changed); -# perm_ok <= (r0.req.priv_mode or not perm_attr.priv) and -# (perm_attr.wr_perm or (r0.req.load -# and perm_attr.rd_perm)); -# access_ok <= valid_ra and perm_ok and rc_ok; - comb += rc_ok.eq( - perm_attr.reference - & (r0.req.load | perm_attr.changed) - ) - comb += perm_ok.eq((r0.req.prive_mode | ~perm_attr.priv) - & perm_attr.wr_perm - | (r0.req.load & perm_attr.rd_perm) - ) - comb += access_ok.eq(valid_ra & perm_ok & rc_ok) -# -- Combine the request and cache hit status to decide what -# -- operation needs to be done -# nc := r0.req.nc or perm_attr.nocache; -# op := OP_NONE; - # Combine the request and cache hit status to decide what - # operation needs to be done - comb += nc.eq(r0.req.nc | perm_attr.nocache) - comb += op.eq(Op.OP_NONE) -# if go = '1' then - with m.If(go): -# if access_ok = '0' then - with m.If(~access_ok): -# op := OP_BAD; - comb += op.eq(Op.OP_BAD) -# elsif cancel_store = '1' then - with m.Elif(cancel_store): -# op := OP_STCX_FAIL; - comb += op.eq(Op.OP_STCX_FAIL) -# else - with m.Else(): -# opsel := r0.req.load & nc & is_hit; - comb += opsel.eq(Cat(is_hit, nc, r0.req.load)) -# case opsel is - with m.Switch(opsel): -# when "101" => op := OP_LOAD_HIT; -# when "100" => op := OP_LOAD_MISS; -# when "110" => op := OP_LOAD_NC; -# when "001" => op := OP_STORE_HIT; -# when "000" => op := OP_STORE_MISS; -# when "010" => op := OP_STORE_MISS; -# when "011" => op := OP_BAD; -# when "111" => op := OP_BAD; -# when others => op := OP_NONE; - with m.Case(Const(0b101, 3)): - comb += op.eq(Op.OP_LOAD_HIT) +# req.hit_way := req_hit_way; +# req.same_tag := req_same_tag; + sync += req.hit_way.eq(req_hit_way) + sync += req.same_tag.eq(req_same_tag) - with m.Case(Cosnt(0b100, 3)): - comb += op.eq(Op.OP_LOAD_MISS) +# -- Store the incoming request from r0, +# -- if it is a slow request +# -- Note that r1.full = 1 implies 
req_op = OP_NONE +# if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC +# or req_op = OP_STORE_MISS +# or req_op = OP_STORE_HIT then + # Store the incoming request from r0, + # if it is a slow request + # Note that r1.full = 1 implies req_op = OP_NONE + with m.If(req_op == Op.OP_LOAD_MISS + | req_op == Op.OP_LOAD_NC + | req_op == Op.OP_STORE_MISS + | req_op == Op.OP_STORE_HIT): +# r1.req <= req; +# r1.full <= '1'; + sync += r1.req(req) + sync += r1.full.eq(1) +# end if; +# end if; +# +# -- Main state machine +# case r1.state is + # Main state machine + with m.Switch(r1.state): - with m.Case(Const(0b110, 3)): - comb += op.eq(Op.OP_LOAD_NC) +# when IDLE => + with m.Case(State.IDLE) +# r1.wb.adr <= req.real_addr( +# r1.wb.adr'left downto 0 +# ); +# r1.wb.sel <= req.byte_sel; +# r1.wb.dat <= req.data; +# r1.dcbz <= req.dcbz; +# +# -- Keep track of our index and way +# -- for subsequent stores. +# r1.store_index <= get_index(req.real_addr); +# r1.store_row <= get_row(req.real_addr); +# r1.end_row_ix <= +# get_row_of_line(get_row(req.real_addr)) - 1; +# r1.reload_tag <= get_tag(req.real_addr); +# r1.req.same_tag <= '1'; + sync += r1.wb.adr.eq(req.real_addr[0:r1.wb.adr]) + sync += r1.wb.sel.eq(req.byte_sel) + sync += r1.wb.dat.eq(req.data) + sync += r1.dcbz.eq(req.dcbz) - with m.Case(Const(0b001, 3)): - comb += op.eq(Op.OP_STORE_HIT) + # Keep track of our index and way + # for subsequent stores. + sync += r1.store_index.eq(get_index(req.real_addr)) + sync += r1.store_row.eq(get_row(req.real_addr)) + sync += r1.end_row_ix.eq( + get_row_of_line(get_row(req.real_addr)) + ) + sync += r1.reload_tag.eq(get_tag(req.real_addr)) + sync += r1.req.same_tag.eq(1) - with m.Case(Const(0b000, 3)): - comb += op.eq(Op.OP_STORE_MISS) +# if req.op = OP_STORE_HIT theni + with m.If(req.op == Op.OP_STORE_HIT): +# r1.store_way <= req.hit_way; + sync += r1.store_way.eq(req.hit_way) +# end if; - with m.Case(Const(0b010, 3)): - comb += op.eq(Op.OP_STORE_MISS) +# -- Reset per-row valid bits, +# -- ready for handling OP_LOAD_MISS +# for i in 0 to ROW_PER_LINE - 1 loop + # Reset per-row valid bits, + # ready for handling OP_LOAD_MISS + for i in range(ROW_PER_LINE): +# r1.rows_valid(i) <= '0'; + sync += r1.rows_valid[i].eq(0) +# end loop; - with m.Case(Const(0b011, 3)): - comb += op.eq(Op.OP_BAD) +# case req.op is + with m.Switch(req.op): +# when OP_LOAD_HIT => + with m.Case(Op.OP_LOAD_HIT): +# -- stay in IDLE state + # stay in IDLE state + pass - with m.Case(Const(0b111, 3)): - comb += op.eq(Op.OP_BAD) +# when OP_LOAD_MISS => + with m.Case(Op.OP_LOAD_MISS): +# -- Normal load cache miss, +# -- start the reload machine +# report "cache miss real addr:" & +# to_hstring(req.real_addr) & " idx:" & +# integer'image(get_index(req.real_addr)) & +# " tag:" & to_hstring(get_tag(req.real_addr)); + # Normal load cache miss, + # start the reload machine + print(f"cache miss real addr:" \ + f"{req_real_addr}" \ + f" idx:{get_index(req_real_addr)}" \ + f" tag:{get_tag(req.real_addr)}") - with m.Default(): - comb += op.eq(Op.OP_NONE) -# end case; -# end if; -# end if; -# req_op <= op; -# req_go <= go; - comb += req_op.eq(op) - comb += req_go.eq(go) +# -- Start the wishbone cycle +# r1.wb.we <= '0'; +# r1.wb.cyc <= '1'; +# r1.wb.stb <= '1'; + # Start the wishbone cycle + sync += r1.wb.we.eq(0) + sync += r1.wb.cyc.eq(1) + sync += r1.wb.stb.eq(1) -# -- Version of the row number that is valid one cycle earlier -# -- in the cases where we need to read the cache data BRAM. 
-# -- If we're stalling then we need to keep reading the last -# -- row requested. - # Version of the row number that is valid one cycle earlier - # in the cases where we need to read the cache data BRAM. - # If we're stalling then we need to keep reading the last - # row requested. -# if r0_stall = '0' then - with m.If(~r0_stall): -# if m_in.valid = '1' then - with m.If(m_in.valid): -# early_req_row <= get_row(m_in.addr); - comb += early_req_row.eq(get_row(m_in.addr)) -# else - with m.Else(): -# early_req_row <= get_row(d_in.addr); - comb += early_req_row.eq(get_row(d_in.addr)) -# end if; -# else - with m.Else(): -# early_req_row <= req_row; - comb += early_req_row.eq(req_row) -# end if; -# end process; +# -- Track that we had one request sent +# r1.state <= RELOAD_WAIT_ACK; +# r1.write_tag <= '1'; + # Track that we had one request sent + sync += r1.state.eq(State.RELOAD_WAIT_ACK) + sync += r1.write_tag.eq(1) -# -- Wire up wishbone request latch out of stage 1 -# wishbone_out <= r1.wb; - # Wire up wishbone request latch out of stage 1 - comb += wishbone_out.eq(r1.wb) +# when OP_LOAD_NC => + with m.Case(Op.OP_LOAD_NC): +# r1.wb.cyc <= '1'; +# r1.wb.stb <= '1'; +# r1.wb.we <= '0'; +# r1.state <= NC_LOAD_WAIT_ACK; + sync += r1.wb.cyc.eq(1) + sync += r1.wb.stb.eq(1) + sync += r1.wb.we.eq(0) + sync += r1.state.eq(State.NC_LOAD_WAIT_ACK) -# -- Handle load-with-reservation and store-conditional instructions -# reservation_comb: process(all) -# Handle load-with-reservation and store-conditional instructions -class ReservationComb(Elaboratable): - def __init__(self): - pass +# when OP_STORE_HIT | OP_STORE_MISS => + with m.Case(Op.OP_STORE_HIT + | Op.OP_STORE_MISS): +# if req.dcbz = '0' then + with m.If(~req.bcbz): +# r1.state <= STORE_WAIT_ACK; +# r1.acks_pending <= to_unsigned(1, 3); +# r1.full <= '0'; +# r1.slow_valid <= '1'; + sync += r1.state.eq( + State.STORE_WAIT_ACK + ) + sync += r1.acks_pending.eq( + '''TODO to_unsignes(1,3)''' + ) + sync += r1.full.eq(0) + sync += r1.slow_valid.eq(1) - def elaborate(self, platform): - m = Module() +# if req.mmu_req = '0' then + with m.If(~req.mmu_req): +# r1.ls_valid <= '1'; + sync += r1.ls_valid.eq(1) +# else + with m.Else(): +# r1.mmu_done <= '1'; + sync += r1.mmu_done.eq(1) +# end if; - comb = m.d.comb - sync = m.d.sync +# if req.op = OP_STORE_HIT then + with m.If(req.op == Op.OP_STORE_HIT): +# r1.write_bram <= '1'; + sync += r1.write_bram.eq(1) +# end if; -# begin -# cancel_store <= '0'; -# set_rsrv <= '0'; -# clear_rsrv <= '0'; -# if r0_valid = '1' and r0.req.reserve = '1' then - with m.If(r0_valid & r0.req.reserve): +# else + with m.Else(): +# -- dcbz is handled much like a load +# -- miss except that we are writing +# -- to memory instead of reading +# r1.state <= RELOAD_WAIT_ACK; + # dcbz is handled much like a load + # miss except that we are writing + # to memory instead of reading + sync += r1.state.eq(Op.RELOAD_WAIT_ACK) -# -- XXX generate alignment interrupt if address -# -- is not aligned XXX or if r0.req.nc = '1' -# if r0.req.load = '1' then - # XXX generate alignment interrupt if address - # is not aligned XXX or if r0.req.nc = '1' - with m.If(r0.req.load): -# -- load with reservation -# set_rsrv <= '1'; - # load with reservation - comb += set_rsrv(1) -# else - with m.Else(): -# -- store conditional -# clear_rsrv <= '1'; - # store conditional - comb += clear_rsrv.eq(1) -# if reservation.valid = '0' or r0.req.addr(63 -# downto LINE_OFF_BITS) /= reservation.addr then - with m.If(~reservation.valid - | r0.req.addr[LINE_OFF_BITS:64]): -# 
cancel_store <= '1'; - comb += cancel_store.eq(1) -# end if; -# end if; -# end if; -# end process; +# if req.op = OP_STORE_MISS then + with m.If(req.op == Op.OP_STORE_MISS): +# r1.write_tag <= '1'; + sync += r1.write_tag.eq(1) +# end if; +# end if; -# reservation_reg: process(clk) -class ReservationReg(Elaboratable): - def __init__(self): - pass +# r1.wb.we <= '1'; +# r1.wb.cyc <= '1'; +# r1.wb.stb <= '1'; + sync += r1.wb.we.eq(1) + sync += r1.wb.cyc.eq(1) + sync += r1.wb.stb.eq(1) - def elaborate(self, platform): - m = Module() +# -- OP_NONE and OP_BAD do nothing +# -- OP_BAD & OP_STCX_FAIL were handled above already +# when OP_NONE => +# when OP_BAD => +# when OP_STCX_FAIL => + # OP_NONE and OP_BAD do nothing + # OP_BAD & OP_STCX_FAIL were + # handled above already + with m.Case(Op.OP_NONE): + pass - comb = m.d.comb - sync = m.d.sync + with m.Case(OP_BAD): + pass -# begin -# if rising_edge(clk) then -# if rst = '1' then -# reservation.valid <= '0'; - # TODO understand how resets work in nmigen -# elsif r0_valid = '1' and access_ok = '1' then - with m.Elif(r0_valid & access_ok)"" -# if clear_rsrv = '1' then - with m.If(clear_rsrv): -# reservation.valid <= '0'; - sync += reservation.valid.ea(0) -# elsif set_rsrv = '1' then - with m.Elif(set_rsrv): -# reservation.valid <= '1'; -# reservation.addr <= -# r0.req.addr(63 downto LINE_OFF_BITS); - sync += reservation.valid.eq(1) - sync += reservation.addr.eq( - r0.req.addr[LINE_OFF_BITS:64] - ) -# end if; -# end if; -# end if; -# end process; -# -# -- Return data for loads & completion control logic -# writeback_control: process(all) -# Return data for loads & completion control logic -class WriteBackControl(Elaboratable): - def __init__(self): - pass + with m.Case(OP_STCX_FAIL): + pass +# end case; - def elaborate(self, platform): - m = Module() +# when RELOAD_WAIT_ACK => + with m.Case(State.RELOAD_WAIT_ACK): +# -- Requests are all sent if stb is 0 + # Requests are all sent if stb is 0 + sync += stbs_done.eq(~r1.wb.stb) +# stbs_done := r1.wb.stb = '0'; - comb = m.d.comb - sync = m.d.sync +# -- If we are still sending requests, +# -- was one accepted? +# if wishbone_in.stall = '0' and not stbs_done then + # If we are still sending requests, + # was one accepted? + with m.If(~wb_in.stall & ~stbs_done): +# -- That was the last word ? We are done sending. +# -- Clear stb and set stbs_done so we can handle +# -- an eventual last ack on the same cycle. +# if is_last_row_addr( +# r1.wb.adr, r1.end_row_ix +# ) then + # That was the last word? + # We are done sending. + # Clear stb and set stbs_done + # so we can handle an eventual + # last ack on the same cycle. + with m.If(is_last_row_addr( + r1.wb.adr, r1.end_row_ix)): +# r1.wb.stb <= '0'; +# stbs_done := true; + sync += r1.wb.stb.eq(0) + sync += stbs_done.eq(0) +# end if; -# variable data_out : std_ulogic_vector(63 downto 0); -# variable data_fwd : std_ulogic_vector(63 downto 0); -# variable j : integer; - data_out = Signal(64) - data_fwd = Signal(64) - j = Signal() +# -- Calculate the next row address +# r1.wb.adr <= next_row_addr(r1.wb.adr); + # Calculate the next row address + sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr)) +# end if; -# begin -# -- Use the bypass if are reading the row that was -# -- written 1 or 2 cycles ago, including for the -# -- slow_valid = 1 case (i.e. completing a load -# -- miss or a non-cacheable load). -# if r1.use_forward1 = '1' then - # Use the bypass if are reading the row that was - # written 1 or 2 cycles ago, including for the - # slow_valid = 1 case (i.e. 
completing a load
- # miss or a non-cacheable load).
- with m.If(r1.use_forward1):
-# data_fwd := r1.forward_data1;
- comb += data_fwd.eq(r1.forward_data1)
-# else
- with m.Else():
-# data_fwd := r1.forward_data2;
- comb += data_fwd.eq(r1.forward_data2)
-# end if;
+# -- Incoming acks processing
+# r1.forward_valid1 <= wishbone_in.ack;
+ # Incoming acks processing
+ sync += r1.forward_valid1.eq(wb_in.ack)

-# data_out := cache_out(r1.hit_way);
- comb += data_out.eq(cache_out[r1.hit_way])
+# if wishbone_in.ack = '1' then
+ with m.If(wb_in.ack):
+# r1.rows_valid(
+# r1.store_row mod ROW_PER_LINE
+# ) <= '1';
+ sync += r1.rows_valid[
+ r1.store_row % ROW_PER_LINE
+ ].eq(1)

-# for i in 0 to 7 loop
- for i in range(8):
-# j := i * 8;
- comb += i * 8
+# -- If this is the data we were looking for,
+# -- we can complete the request next cycle.
+# -- Compare the whole address in case the
+# -- request in r1.req is not the one that
+# -- started this refill.
+# if r1.full = '1' and r1.req.same_tag = '1'
+# and ((r1.dcbz = '1' and r1.req.dcbz = '1')
+# or (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS))
+# and r1.store_row = get_row(r1.req.real_addr) then
+ # If this is the data we were looking for,
+ # we can complete the request next cycle.
+ # Compare the whole address in case the
+ # request in r1.req is not the one that
+ # started this refill.
+ with m.If(r1.full & r1.req.same_tag &
+ ((r1.dcbz & r1.req.dcbz)
+ | (~r1.dcbz &
+ (r1.req.op == Op.OP_LOAD_MISS))
+ ) &
+ (r1.store_row
+ == get_row(r1.req.real_addr))):
+# r1.full <= '0';
+# r1.slow_valid <= '1';
+ sync += r1.full.eq(0)
+ sync += r1.slow_valid.eq(1)

-# if r1.forward_sel(i) = '1' then
- with m.If(r1.forward_sel[i]):
-# data_out(j + 7 downto j) := data_fwd(j + 7 downto j);
- comb += data_out[j:j+8].eq(data_fwd[j:j+8])
-# end if;
-# end loop;
+# if r1.mmu_req = '0' then
+ with m.If(~r1.mmu_req):
+# r1.ls_valid <= '1';
+ sync += r1.ls_valid.eq(1)
+# else
+ with m.Else():
+# r1.mmu_done <= '1';
+ sync += r1.mmu_done.eq(1)
+# end if;
+# r1.forward_sel <= (others => '1');
+# r1.use_forward1 <= '1';
+ sync += r1.forward_sel.eq(1)
+ sync += r1.use_forward1.eq(1)
+# end if;

-# d_out.valid <= r1.ls_valid;
-# d_out.data <= data_out;
-# d_out.store_done <= not r1.stcx_fail;
-# d_out.error <= r1.ls_error;
-# d_out.cache_paradox <= r1.cache_paradox;
- comb += d_out.valid.eq(r1.ls_valid)
- comb += d_out.data.eq(data_out)
- comb += d_out.store_done.eq(~r1.stcx_fail)
- comb += d_out.error.eq(r1.ls_error)
- comb += d_out.cache_paradox.eq(r1.cache_paradox)
+# -- Check for completion
+# if stbs_done and is_last_row(r1.store_row,
+# r1.end_row_ix) then
+ # Check for completion
+ with m.If(stbs_done &
+ is_last_row(r1.store_row,
+ r1.end_row_ix)):

-# -- Outputs to MMU
-# m_out.done <= r1.mmu_done;
-# m_out.err <= r1.mmu_error;
-# m_out.data <= data_out;
- comb += m_out.done.eq(r1.mmu_done)
- comb += m_out.err.eq(r1.mmu_error)
- comb += m_out.data.eq(data_out)
+# -- Complete wishbone cycle
+# r1.wb.cyc <= '0';
+ # Complete wishbone cycle
+ sync += r1.wb.cyc.eq(0)

-# -- We have a valid load or store hit or we just completed
-# -- a slow op such as a load miss, a NC load or a store
-# --
-# -- Note: the load hit is delayed by one cycle. However it
-# -- can still not collide with r.slow_valid (well unless I
-# -- miscalculated) because slow_valid can only be set on a
-# -- subsequent request and not on its first cycle (the state
-# -- machine must have advanced), which makes slow_valid
-# -- at least 2 cycles from the previous hit_load_valid.
-# -# -- Sanity: Only one of these must be set in any given cycle -# assert (r1.slow_valid and r1.stcx_fail) /= '1' -# report "unexpected slow_valid collision with stcx_fail" -# severity FAILURE; -# assert ((r1.slow_valid or r1.stcx_fail) and r1.hit_load_valid) -# /= '1' report "unexpected hit_load_delayed collision with -# slow_valid" severity FAILURE; - # We have a valid load or store hit or we just completed - # a slow op such as a load miss, a NC load or a store - # - # Note: the load hit is delayed by one cycle. However it - # can still not collide with r.slow_valid (well unless I - # miscalculated) because slow_valid can only be set on a - # subsequent request and not on its first cycle (the state - # machine must have advanced), which makes slow_valid - # at least 2 cycles from the previous hit_load_valid. +# -- Cache line is now valid +# cache_valids(r1.store_index)( +# r1.store_way +# ) <= '1'; + # Cache line is now valid + sync += cache_valid_bits[ + r1.store_index + ][r1.store_way].eq(1) - # Sanity: Only one of these must be set in any given cycle - assert (r1.slow_valid & r1.stcx_fail) != 1 "unexpected" \ - "slow_valid collision with stcx_fail -!- severity FAILURE" +# r1.state <= IDLE; + sync += r1.state.eq(State.IDLE) +# end if; - assert ((r1.slow_valid | r1.stcx_fail) | r1.hit_load_valid) != 1 - "unexpected hit_load_delayed collision with slow_valid -!-" \ - "severity FAILURE" +# -- Increment store row counter +# r1.store_row <= next_row(r1.store_row); + # Increment store row counter + sync += r1.store_row.eq(next_row( + r1.store_row + )) +# end if; -# if r1.mmu_req = '0' then - with m.If(~r1._mmu_req): -# -- Request came from loadstore1... -# -- Load hit case is the standard path -# if r1.hit_load_valid = '1' then - # Request came from loadstore1... - # Load hit case is the standard path - with m.If(r1.hit_load_valid): -# report -# "completing load hit data=" & to_hstring(data_out); - print(f"completing load hit data={data_out}") -# end if; +# when STORE_WAIT_ACK => + with m.Case(State.STORE_WAIT_ACK): +# stbs_done := r1.wb.stb = '0'; +# acks := r1.acks_pending; + sync += stbs_done.eq(~r1.wb.stb) + sync += acks.eq(r1.acks_pending) -# -- error cases complete without stalling -# if r1.ls_error = '1' then - # error cases complete without stalling - with m.If(r1.ls_error): -# report "completing ld/st with error"; - print("completing ld/st with error") -# end if; +# if r1.inc_acks /= r1.dec_acks then + with m.If(r1.inc_acks != r1.dec_acks): -# -- Slow ops (load miss, NC, stores) -# if r1.slow_valid = '1' then - # Slow ops (load miss, NC, stores) - with m.If(r1.slow_valid): -# report -# "completing store or load miss data=" -# & to_hstring(data_out); - print(f"completing store or load miss data={data_out}") -# end if; +# if r1.inc_acks = '1' then + with m.If(r1.inc_acks): +# acks := acks + 1; + sync += acks.eq(acks + 1) -# else - with m.Else(): -# -- Request came from MMU -# if r1.hit_load_valid = '1' then - # Request came from MMU - with m.If(r1.hit_load_valid): -# report "completing load hit to MMU, data=" -# & to_hstring(m_out.data); - print(f"completing load hit to MMU, data={m_out.data}") -# end if; -# -# -- error cases complete without stalling -# if r1.mmu_error = '1' then -# report "completing MMU ld with error"; - # error cases complete without stalling - with m.If(r1.mmu_error): - print("combpleting MMU ld with error") -# end if; -# -# -- Slow ops (i.e. load miss) -# if r1.slow_valid = '1' then - # Slow ops (i.e. 
load miss)
- with m.If(r1.slow_valid):
-# report "completing MMU load miss, data="
-# & to_hstring(m_out.data);
- print("completing MMU load miss, data={m_out.data}")
-# end if;
-# end if;
-# end process;

+# else
+ with m.Else():
+# acks := acks - 1;
+ sync += acks.eq(acks - 1)
+# end if;
+# end if;

-# begin TODO
-# -- Generate a cache RAM for each way. This handles the normal
-# -- reads, writes from reloads and the special store-hit update
-# -- path as well.
-# --
-# -- Note: the BRAMs have an extra read buffer, meaning the output
-# -- is pipelined an extra cycle. This differs from the
-# -- icache. The writeback logic needs to take that into
-# -- account by using 1-cycle delayed signals for load hits.
-# --
-# rams: for i in 0 to NUM_WAYS-1 generate
-# signal do_read : std_ulogic;
-# signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
-# signal do_write : std_ulogic;
-# signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
-# signal wr_data :
-# std_ulogic_vector(wishbone_data_bits-1 downto 0);
-# signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
-# signal wr_sel_m : std_ulogic_vector(ROW_SIZE-1 downto 0);
-# signal dout : cache_row_t;
-# begin
-# way: entity work.cache_ram
-# generic map (
-# ROW_BITS => ROW_BITS,
-# WIDTH => wishbone_data_bits,
-# ADD_BUF => true
-# )
-# port map (
-# clk => clk,
-# rd_en => do_read,
-# rd_addr => rd_addr,
-# rd_data => dout,
-# wr_sel => wr_sel_m,
-# wr_addr => wr_addr,
-# wr_data => wr_data
-# );
-# process(all)
-# end TODO
-class TODO(Elaboratable):
- def __init__(self):
- pass

+# r1.acks_pending <= acks;
+ sync += r1.acks_pending.eq(acks)

- def elaborate(self, platform):
- m = Module()

+# -- Clear stb when slave accepted request
+# if wishbone_in.stall = '0' then
+ # Clear stb when slave accepted request
+ with m.If(~wb_in.stall):
+# -- See if there is another store waiting
+# -- to be done which is in the same real page.
+# if req.valid = '1' then
+ # See if there is another store waiting
+ # to be done which is in the same real page.
+ with m.If(req.valid):
+# r1.wb.adr(
+# SET_SIZE_BITS - 1 downto 0
+# ) <= req.real_addr(
+# SET_SIZE_BITS - 1 downto 0
+# );
+# r1.wb.dat <= req.data;
+# r1.wb.sel <= req.byte_sel;
+ sync += r1.wb.adr[0:SET_SIZE_BITS].eq(
+ req.real_addr[0:SET_SIZE_BITS]
+ )
+ sync += r1.wb.dat.eq(req.data)
+ sync += r1.wb.sel.eq(req.byte_sel)
+# end if;

- comb = m.d.comb
- sync = m.d.sync

+# if acks < 7 and req.same_tag = '1'
+# and (req.op = OP_STORE_MISS
+# or req.op = OP_STORE_HIT) then
+ with m.If((acks < 7) & req.same_tag &
+ ((req.op == Op.OP_STORE_MISS)
+ | (req.op == Op.OP_STORE_HIT))):
+# r1.wb.stb <= '1';
+# stbs_done := false;
+ sync += r1.wb.stb.eq(1)
+ sync += stbs_done.eq(0)

-# begin
-# -- Cache hit reads
-# do_read <= '1';
-# rd_addr <=
-# std_ulogic_vector(to_unsigned(early_req_row, ROW_BITS));
-# cache_out(i) <= dout;
- # Cache hit reads
- comb += do_read.eq(1)
- comb += rd_addr.eq(Signal(ROW))
- comb += cache_out[i].eq(dout)

+# if req.op = OP_STORE_HIT then
+ with m.If(req.op == Op.OP_STORE_HIT):
+# r1.write_bram <= '1';
+ sync += r1.write_bram.eq(1)
+# end if;
+# r1.full <= '0';
+# r1.slow_valid <= '1';
+ sync += r1.full.eq(0)
+ sync += r1.slow_valid.eq(1)

-# -- Write mux:
-# --
-# -- Defaults to wishbone read responses (cache refill)
-# --
-# -- For timing, the mux on wr_data/sel/addr is not
-# -- dependent on anything other than the current state.
- # Write mux:
- #
- # Defaults to wishbone read responses (cache refill)
- #
- # For timing, the mux on wr_data/sel/addr is not
- # dependent on anything other than the current state.
-# wr_sel_m <= (others => '0');
- comb += wr_sel_m.eq(0)
+# -- Store requests never come from the MMU
+# r1.ls_valid <= '1';
+# stbs_done := false;
+# r1.inc_acks <= '1';
+ # Store requests never come from the MMU
+ sync += r1.ls_valid.eq(1)
+ sync += stbs_done.eq(0)
+ sync += r1.inc_acks.eq(1)
+# else
+ with m.Else():
+# r1.wb.stb <= '0';
+# stbs_done := true;
+ sync += r1.wb.stb.eq(0)
+ sync += stbs_done.eq(1)
+# end if;
+# end if;

-# do_write <= '0';
- comb += do_write.eq(0)
-# if r1.write_bram = '1' then
- with m.If(r1.write_bram):
-# -- Write store data to BRAM. This happens one
-# -- cycle after the store is in r0.
- # Write store data to BRAM. This happens one
- # cycle after the store is in r0.
-# wr_data <= r1.req.data;
-# wr_sel <= r1.req.byte_sel;
-# wr_addr <= std_ulogic_vector(to_unsigned(
-# get_row(r1.req.real_addr), ROW_BITS
-# ));
- comb += wr_data.eq(r1.req.data)
- comb += wr_sel.eq(r1.req.byte_sel)
- comb += wr_addr.eq(Signal(get_row(r1.req.real_addr)))

+# -- Got ack ? See if complete.
+# if wishbone_in.ack = '1' then
+ # Got ack ? See if complete.
+ with m.If(wb_in.ack):
+# if stbs_done and acks = 1 then
+ with m.If(stbs_done & (acks == 1)):
+# r1.state <= IDLE;
+# r1.wb.cyc <= '0';
+# r1.wb.stb <= '0';
+ sync += r1.state.eq(State.IDLE)
+ sync += r1.wb.cyc.eq(0)
+ sync += r1.wb.stb.eq(0)
+# end if;
+# r1.dec_acks <= '1';
+ sync += r1.dec_acks.eq(1)
+# end if;

-# if i = r1.req.hit_way then
- with m.If(i == r1.req.hit_way):
-# do_write <= '1';
- comb += do_write.eq(1)
-# end if;
-# else
- with m.Else():
-# -- Otherwise, we might be doing a reload or a DCBZ
-# if r1.dcbz = '1' then
- # Otherwise, we might be doing a reload or a DCBZ
- with m.If(r1.dcbz):
-# wr_data <= (others => '0');
- comb += wr_data.eq(0)
-# else
- with m.Else():
-# wr_data <= wishbone_in.dat;
- comb += wr_data.eq(wishbone_in.dat)
-# end if;

+# when NC_LOAD_WAIT_ACK =>
+ with m.Case(State.NC_LOAD_WAIT_ACK):
+# -- Clear stb when slave accepted request
+# if wishbone_in.stall = '0' then
+ # Clear stb when slave accepted request
+ with m.If(~wb_in.stall):
+# r1.wb.stb <= '0';
+ sync += r1.wb.stb.eq(0)
+# end if;

-# wr_addr <= std_ulogic_vector(to_unsigned(
-# r1.store_row, ROW_BITS
-# ));
-# wr_sel <= (others => '1');
- comb += wr_addr.eq(Signal(r1.store_row))
- comb += wr_sel.eq(1)

+# -- Got ack ? complete.
+# if wishbone_in.ack = '1' then
+ # Got ack ? complete.
+ with m.If(wb_in.ack):
+# r1.state <= IDLE;
+# r1.full <= '0';
+# r1.slow_valid <= '1';
+ sync += r1.state.eq(State.IDLE)
+ sync += r1.full.eq(0)
+ sync += r1.slow_valid.eq(1)

-# if r1.state = RELOAD_WAIT_ACK and
-# wishbone_in.ack = '1' and replace_way = i then
- with m.If(r1.state == State.RELOAD_WAIT_ACK
- & wishbone_in.ack & relpace_way == i):
-# do_write <= '1';
- comb += do_write.eq(1)
-# end if;
-# end if;

+# if r1.mmu_req = '0' then
+ with m.If(~r1.mmu_req):
+# r1.ls_valid <= '1';
+ sync += r1.ls_valid.eq(1)

-# -- Mask write selects with do_write since BRAM
-# -- doesn't have a global write-enable
-# if do_write = '1' then
-# -- Mask write selects with do_write since BRAM
-# -- doesn't have a global write-enable
- with m.If(do_write):
-# wr_sel_m <= wr_sel;
- comb += wr_sel_m.eq(wr_sel)
-# end if;
-# end process;
-# end generate;

+# else
+ with m.Else():
+# r1.mmu_done <= '1';
+ sync += r1.mmu_done.eq(1)
+# end if;

-# -- Cache hit synchronous machine for the easy case.
-# -- This handles load hits.
-# -- It also handles error cases (TLB miss, cache paradox)
-# dcache_fast_hit : process(clk)
-# Cache hit synchronous machine for the easy case.
-# This handles load hits. -# It also handles error cases (TLB miss, cache paradox) -class DcacheFastHit(Elaboratable): - def __init__(self): - pass +# r1.forward_sel <= (others => '1'); +# r1.use_forward1 <= '1'; +# r1.wb.cyc <= '0'; +# r1.wb.stb <= '0'; + sync += r1.forward_sel.eq(1) + sync += r1.use_forward1.eq(1) + sync += r1.wb.cyc.eq(0) + sync += r1.wb.stb.eq(0) +# end if; +# end case; +# end if; +# end if; +# end process; - def elaborate(self, platform): - m = Module() +# dc_log: if LOG_LENGTH > 0 generate +# TODO learn how to tranlate vhdl generate into nmigen + def dcache_log(self, r1, valid_ra, tlb_hit_way, stall_out, + d_out, wb_in, log_out): comb = m.d.comb sync = m.d.sync +# signal log_data : std_ulogic_vector(19 downto 0); + log_data = Signal(20) + + comb += log_data + # begin -# if rising_edge(clk) then -# if req_op /= OP_NONE then - with m.If(req_op != Op.OP_NONE): -# report "op:" & op_t'image(req_op) & -# " addr:" & to_hstring(r0.req.addr) & -# " nc:" & std_ulogic'image(r0.req.nc) & -# " idx:" & integer'image(req_index) & -# " tag:" & to_hstring(req_tag) & -# " way: " & integer'image(req_hit_way); - print(f"op:{req_op} addr:{r0.req.addr} nc: {r0.req.nc}" \ - f"idx:{req_index} tag:{req_tag} way: {req_hit_way}" - ) -# end if; -# if r0_valid = '1' then - with m.If(r0_valid): -# r1.mmu_req <= r0.mmu_req; - sync += r1.mmu_req.eq(r0.mmu_req) +# dcache_log: process(clk) +# begin +# if rising_edge(clk) then +# log_data <= r1.wb.adr(5 downto 3) & +# wishbone_in.stall & +# wishbone_in.ack & +# r1.wb.stb & r1.wb.cyc & +# d_out.error & +# d_out.valid & +# std_ulogic_vector( +# to_unsigned(op_t'pos(req_op), 3)) & +# stall_out & +# std_ulogic_vector( +# to_unsigned(tlb_hit_way, 3)) & +# valid_ra & +# std_ulogic_vector( +# to_unsigned(state_t'pos(r1.state), 3)); + sync += log_data.eq(Cat( + Const(r1.state, 3), valid_ra, Const(tlb_hit_way, 3), + stall_out, Const(req_op, 3), d_out.valid, d_out.error, + r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall, + r1.wb.adr[3:6] + )) # end if; +# end process; +# log_out <= log_data; + # TODO ??? I am very confused need help + comb += log_out.eq(log_data) +# end generate; +# end; + + def elaborate(self, platform): + LINE_SIZE = self.LINE_SIZE + NUM_LINES = self.NUM_LINES + NUM_WAYS = self.NUM_WAYS + TLB_SET_SIZE = self.TLB_SET_SIZE + TLB_NUM_WAYS = self.TLB_NUM_WAYS + TLB_LG_PGSZ = self.TLB_LG_PGSZ + LOG_LENGTH = self.LOG_LENGTH -# -- Fast path for load/store hits. -# -- Set signals for the writeback controls. -# r1.hit_way <= req_hit_way; -# r1.hit_index <= req_index; - # Fast path for load/store hits. - # Set signals for the writeback controls. 
- sync += r1.hit_way.eq(req_hit_way) - sync += r1.hit_index.eq(req_index) + # BRAM organisation: We never access more than + # -- wishbone_data_bits at a time so to save + # -- resources we make the array only that wide, and + # -- use consecutive indices for to make a cache "line" + # -- + # -- ROW_SIZE is the width in bytes of the BRAM + # -- (based on WB, so 64-bits) + ROW_SIZE = WB_DATA_BITS / 8; -# if req_op = OP_LOAD_HIT then - with m.If(req_op == Op.OP_LOAD_HIT): -# r1.hit_load_valid <= '1'; - sync += r1.hit_load_valid.eq(1) + # ROW_PER_LINE is the number of row (wishbone + # transactions) in a line + ROW_PER_LINE = LINE_SIZE // ROW_SIZE -# else - with m.Else(): -# r1.hit_load_valid <= '0'; - sync += r1.hit_load_valid.eq(0) -# end if; + # BRAM_ROWS is the number of rows in BRAM needed + # to represent the full dcache + BRAM_ROWS = NUM_LINES * ROW_PER_LINE -# if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then - with m.If(req_op == Op.OP_LOAD_HIT | req_op == Op.OP_STORE_HIT): -# r1.cache_hit <= '1'; - sync += r1.cache_hit.eq(1) -# else - with m.Else(): -# r1.cache_hit <= '0'; - sync += r1.cache_hit.eq(0) -# end if; -# if req_op = OP_BAD then - with m.If(req_op == Op.OP_BAD): -# report "Signalling ld/st error valid_ra=" & -# std_ulogic'image(valid_ra) & " rc_ok=" & -# std_ulogic'image(rc_ok) & " perm_ok=" & -# std_ulogic'image(perm_ok); - print(f"Signalling ld/st error valid_ra={valid_ra}" - f"rc_ok={rc_ok} perm_ok={perm_ok}" + # Bit fields counts in the address -# r1.ls_error <= not r0.mmu_req; -# r1.mmu_error <= r0.mmu_req; -# r1.cache_paradox <= access_ok; - sync += r1.ls_error.eq(~r0.mmu_req) - sync += r1.mmu_error.eq(r0.mmu_req) - sync += r1.cache_paradox.eq(access_ok) + # REAL_ADDR_BITS is the number of real address + # bits that we store + REAL_ADDR_BITS = 56 -# else - with m.Else(): -# r1.ls_error <= '0'; -# r1.mmu_error <= '0'; -# r1.cache_paradox <= '0'; - sync += r1.ls_error.eq(0) - sync += r1.mmu_error.eq(0) - sync += r1.cache_paradox.eq(0) -# end if; -# -# if req_op = OP_STCX_FAIL then - with m.If(req_op == Op.OP_STCX_FAIL): -# r1.stcx_fail <= '1'; - r1.stcx_fail.eq(1) + # ROW_BITS is the number of bits to select a row + ROW_BITS = log2_int(BRAM_ROWS) -# else - with m.Else(): -# r1.stcx_fail <= '0'; - sync += r1.stcx_fail.eq(0) -# end if; -# -# -- Record TLB hit information for updating TLB PLRU -# r1.tlb_hit <= tlb_hit; -# r1.tlb_hit_way <= tlb_hit_way; -# r1.tlb_hit_index <= tlb_req_index; - # Record TLB hit information for updating TLB PLRU - sync += r1.tlb_hit.eq(tlb_hit) - sync += r1.tlb_hit_way.eq(tlb_hit_way) - sync += r1.tlb_hit_index.eq(tlb_req_index) -# end if; -# end process; + # ROW_LINE_BITS is the number of bits to select + # a row within a line + ROW_LINE_BITS = log2_int(ROW_PER_LINE) -# -- Memory accesses are handled by this state machine: -# -- -# -- * Cache load miss/reload (in conjunction with "rams") -# -- * Load hits for non-cachable forms -# -- * Stores (the collision case is handled in "rams") -# -- -# -- All wishbone requests generation is done here. -# -- This machine operates at stage 1. -# dcache_slow : process(clk) -# Memory accesses are handled by this state machine: -# -# * Cache load miss/reload (in conjunction with "rams") -# * Load hits for non-cachable forms -# * Stores (the collision case is handled in "rams") -# -# All wishbone requests generation is done here. -# This machine operates at stage 1. 
-class DcacheSlow(Elaboratable): - def __init__(self): - pass + # LINE_OFF_BITS is the number of bits for + # the offset in a cache line + LINE_OFF_BITS = log2_int(LINE_SIZE) - def elaborate(self, platform): - m = Module() + # ROW_OFF_BITS is the number of bits for + # the offset in a row + ROW_OFF_BITS = log2_int(ROW_SIZE) - comb = m.d.comb - sync = m.d.sync + # INDEX_BITS is the number if bits to + # select a cache line + INDEX_BITS = log2_int(NUM_LINES) -# variable stbs_done : boolean; -# variable req : mem_access_request_t; -# variable acks : unsigned(2 downto 0); - stbs_done = Signal() - req = MemAccessRequest() - acks = Signal(3) + # SET_SIZE_BITS is the log base 2 of the set size + SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS - comb += stbs_done - comb += req - comb += acks + # TAG_BITS is the number of bits of + # the tag part of the address + TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS -# begin -# if rising_edge(clk) then -# r1.use_forward1 <= use_forward1_next; -# r1.forward_sel <= (others => '0'); - sync += r1.use_forward1.eq(use_forward1_next) - sync += r1.forward_sel.eq(0) + # TAG_WIDTH is the width in bits of each way of the tag RAM + TAG_WIDTH = TAG_BITS + 7 - ((TAG_BITS + 7) % 8) -# if use_forward1_next = '1' then - with m.If(use_forward1_next): -# r1.forward_sel <= r1.req.byte_sel; - sync += r1.forward_sel.eq(r1.req.byte_sel) + # WAY_BITS is the number of bits to select a way + WAY_BITS = log2_int(NUM_WAYS) -# elsif use_forward2_next = '1' then - with m.Elif(use_forward2_next): -# r1.forward_sel <= r1.forward_sel1; - sync += r1.forward_sel.eq(r1.forward_sel1) -# end if; + # Example of layout for 32 lines of 64 bytes: + # + # .. tag |index| line | + # .. | row | | + # .. | |---| | ROW_LINE_BITS (3) + # .. | |--- - --| LINE_OFF_BITS (6) + # .. | |- --| ROW_OFF_BITS (3) + # .. |----- ---| | ROW_BITS (8) + # .. |-----| | INDEX_BITS (5) + # .. --------| | TAG_BITS (45) -# r1.forward_data2 <= r1.forward_data1; - sync += r1.forward_data2.eq(r1.forward_data1) -# if r1.write_bram = '1' then - with m.If(r1.write_bram): -# r1.forward_data1 <= r1.req.data; -# r1.forward_sel1 <= r1.req.byte_sel; -# r1.forward_way1 <= r1.req.hit_way; -# r1.forward_row1 <= get_row(r1.req.real_addr); -# r1.forward_valid1 <= '1'; - sync += r1.forward_data1.eq(r1.req.data) - sync += r1.forward_sel1.eq(r1.req.byte_sel) - sync += r1.forward_way1.eq(r1.req.hit_way) - sync += r1.forward_row1.eq(get_row(r1.req.real_addr)) - sync += r1.forward_valid1.eq(1) -# else - with m.Else(): +# subtype row_t is integer range 0 to BRAM_ROWS-1; +# subtype index_t is integer range 0 to NUM_LINES-1; +"""wherever way_t is used to make a Signal it must be substituted with + log2_int(NUM_WAYS) i.e. WAY_BITS. this because whilst the *range* + of the number is 0..NUM_WAYS it requires log2_int(NUM_WAYS) i.e. 
+ WAY_BITS of space to store it +""" +# subtype way_t is integer range 0 to NUM_WAYS-1; +# subtype row_in_line_t is unsigned(ROW_LINE_BITS-1 downto 0); + ROW = BRAM_ROWS # yyyeah not really necessary, delete + INDEX = NUM_LINES # yyyeah not really necessary, delete + WAY = NUM_WAYS # yyyeah not really necessary, delete + ROW_IN_LINE = ROW_LINE_BITS # yyyeah not really necessary, delete -# if r1.dcbz = '1' then - with m.If(r1.bcbz): -# r1.forward_data1 <= (others => '0'); - sync += r1.forward_data1.eq(0) +# -- The cache data BRAM organized as described above for each way +# subtype cache_row_t is +# std_ulogic_vector(wishbone_data_bits-1 downto 0); + # The cache data BRAM organized as described above for each way + CACHE_ROW = WB_DATA_BITS -# else - with m.Else(): -# r1.forward_data1 <= wishbone_in.dat; - sync += r1.forward_data1.eq(wb_in.dat) -# end if; +# -- The cache tags LUTRAM has a row per set. +# -- Vivado is a pain and will not handle a +# -- clean (commented) definition of the cache +# -- tags as a 3d memory. For now, work around +# -- it by putting all the tags +# subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); + # The cache tags LUTRAM has a row per set. + # Vivado is a pain and will not handle a + # clean (commented) definition of the cache + # tags as a 3d memory. For now, work around + # it by putting all the tags + CACHE_TAG = TAG_BITS -# r1.forward_sel1 <= (others => '1'); -# r1.forward_way1 <= replace_way; -# r1.forward_row1 <= r1.store_row; -# r1.forward_valid1 <= '0'; - sync += r1.forward_sel1.eq(1) - sync += r1.forward_way1.eq(replace_way) - sync += r1.forward_row1.eq(r1.store_row) - sync += r1.forward_valid1.eq(0) -# end if; +# -- type cache_tags_set_t is array(way_t) of cache_tag_t; +# -- type cache_tags_array_t is array(index_t) of cache_tags_set_t; +# constant TAG_RAM_WIDTH : natural := TAG_WIDTH * NUM_WAYS; +# subtype cache_tags_set_t is +# std_logic_vector(TAG_RAM_WIDTH-1 downto 0); +# type cache_tags_array_t is array(index_t) of cache_tags_set_t; + # type cache_tags_set_t is array(way_t) of cache_tag_t; + # type cache_tags_array_t is array(index_t) of cache_tags_set_t; + TAG_RAM_WIDTH = TAG_WIDTH * NUM_WAYS -# -- On reset, clear all valid bits to force misses -# if rst = '1' then - # On reset, clear all valid bits to force misses - # TODO figure out how reset signal works in nmigeni - with m.If("""TODO RST???"""): -# for i in index_t loop - for i in range(INDEX): -# cache_valids(i) <= (others => '0'); - sync += cache_valid_bits[i].eq(0) -# end loop; + CACHE_TAG_SET = TAG_RAM_WIDTH -# r1.state <= IDLE; -# r1.full <= '0'; -# r1.slow_valid <= '0'; -# r1.wb.cyc <= '0'; -# r1.wb.stb <= '0'; -# r1.ls_valid <= '0'; -# r1.mmu_done <= '0'; - sync += r1.state.eq(State.IDLE) - sync += r1.full.eq(0) - sync += r1.slow_valid.eq(0) - sync += r1.wb.cyc.eq(0) - sync += r1.wb.stb.eq(0) - sync += r1.ls_valid.eq(0) - sync += r1.mmu_done.eq(0) + def CacheTagArray(): + return Array(CacheTagSet() for x in range(INDEX)) -# -- Not useful normally but helps avoiding -# -- tons of sim warnings - # Not useful normally but helps avoiding - # tons of sim warnings -# r1.wb.adr <= (others => '0'); - sync += r1.wb.adr.eq(0) -# else - with m.Else(): -# -- One cycle pulses reset -# r1.slow_valid <= '0'; -# r1.write_bram <= '0'; -# r1.inc_acks <= '0'; -# r1.dec_acks <= '0'; +# -- The cache valid bits +# subtype cache_way_valids_t is +# std_ulogic_vector(NUM_WAYS-1 downto 0); +# type cache_valids_t is array(index_t) of cache_way_valids_t; +# type row_per_line_valid_t is +# array(0 to 
-#     -- Not useful normally but helps avoiding
-#     -- tons of sim warnings
-            # Not useful normally but helps avoiding
-            # tons of sim warnings
-#     r1.wb.adr <= (others => '0');
-                sync += r1.wb.adr.eq(0)
-#     else
-            with m.Else():
-#         -- One cycle pulses reset
-#         r1.slow_valid <= '0';
-#         r1.write_bram <= '0';
-#         r1.inc_acks <= '0';
-#         r1.dec_acks <= '0';

+# -- The cache valid bits
+# subtype cache_way_valids_t is
+#     std_ulogic_vector(NUM_WAYS-1 downto 0);
+# type cache_valids_t is array(index_t) of cache_way_valids_t;
+# type row_per_line_valid_t is
+#     array(0 to ROW_PER_LINE - 1) of std_ulogic;
+    # The cache valid bits
+    CACHE_WAY_VALID_BITS = NUM_WAYS
+
+    def CacheValidBitsArray():
+        # one NUM_WAYS-wide valid bitmask per set index
+        return Array(Signal(CACHE_WAY_VALID_BITS) for x in range(INDEX))
+
+    def RowPerLineValidArray():
+        return Array(Signal() for x in range(ROW_PER_LINE))
+
+# -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
+# signal cache_tags : cache_tags_array_t;
+# signal cache_tag_set : cache_tags_set_t;
+# signal cache_valids : cache_valids_t;
+#
-#         r1.ls_valid <= '0';
-#         -- complete tlbies and TLB loads in the third cycle
-#         r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld);
-                # One cycle pulses reset
-                sync += r1.slow_valid.eq(0)
-                sync += r1.write_bram.eq(0)
-                sync += r1.inc_acks.eq(0)
-                sync += r1.dec_acks.eq(0)
+# attribute ram_style : string;
+# attribute ram_style of cache_tags : signal is "distributed";
+    # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
+    cache_tags = CacheTagArray()
+    cache_tag_set = Signal(CACHE_TAG_SET)
+    cache_valid_bits = CacheValidBitsArray()

-                sync += r1.ls_valid.eq(0)
-                # complete tlbies and TLB loads in the third cycle
-                sync += r1.mmu_done.eq(r0_valid & (r0.tlbie | r0.tlbld))
+    # TODO attribute ram_style : string;
+    # TODO attribute ram_style of cache_tags : signal is "distributed";

-# if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then
-                with m.If(req_op == Op.OP_LOAD_HIT
-                          | req_op == Op.OP_STCX_FAIL):
-#     if r0.mmu_req = '0' then
-                    with m.If(~r0.mmu_req):
-#         r1.ls_valid <= '1';
-                        sync += r1.ls_valid.eq(1)
-#     else
-                    with m.Else():
-#         r1.mmu_done <= '1';
-                        sync += r1.mmu_done.eq(1)
-#     end if;
-# end if;

+# -- L1 TLB.
+# constant TLB_SET_BITS : natural := log2(TLB_SET_SIZE);
+# constant TLB_WAY_BITS : natural := log2(TLB_NUM_WAYS);
+# constant TLB_EA_TAG_BITS : natural :=
+#     64 - (TLB_LG_PGSZ + TLB_SET_BITS);
+# constant TLB_TAG_WAY_BITS : natural :=
+#     TLB_NUM_WAYS * TLB_EA_TAG_BITS;
+# constant TLB_PTE_BITS : natural := 64;
+# constant TLB_PTE_WAY_BITS : natural :=
+#     TLB_NUM_WAYS * TLB_PTE_BITS;
+    # L1 TLB
+    TLB_SET_BITS = log2_int(TLB_SET_SIZE)
+    TLB_WAY_BITS = log2_int(TLB_NUM_WAYS)
+    TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_SET_BITS)
+    TLB_TAG_WAY_BITS = TLB_NUM_WAYS * TLB_EA_TAG_BITS
+    TLB_PTE_BITS = 64
+    TLB_PTE_WAY_BITS = TLB_NUM_WAYS * TLB_PTE_BITS
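+"""editor's note: worked example of the TLB constants above, assuming
+   (values not given in this file) 4K pages and a 64-entry, 2-way TLB:
+   TLB_LG_PGSZ = 12 and TLB_SET_BITS = log2_int(64) = 6 give
+   TLB_EA_TAG_BITS = 64 - (12 + 6) = 46, i.e. the tag is whatever is
+   left of the 64-bit effective address once the page offset and the
+   set index are removed, and TLB_PTE_WAY_BITS = 2 * 64 = 128.
+"""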
-# if r1.write_tag = '1' then
-                with m.If(r1.write_tag):
-#     -- Store new tag in selected way
-#     for i in 0 to NUM_WAYS-1 loop
-                    # Store new tag in selected way
-                    for i in range(NUM_WAYS):
-#         if i = replace_way then
-                        with m.If(i == replace_way):
-#             cache_tags(r1.store_index)(
-#                 (i + 1) * TAG_WIDTH - 1
-#                 downto i * TAG_WIDTH
-#             ) <=
-#                 (TAG_WIDTH - 1 downto TAG_BITS => '0')
-#                 & r1.reload_tag;
-                            sync += cache_tag[
-                                r1.store_index
-                            ][i * TAG_WIDTH:(i +1) * TAG_WIDTH].eq(
-                                Const(TAG_WIDTH, TAG_WIDTH)
-                                & r1.reload_tag
-                            )
-#         end if;
-#     end loop;
-#     r1.store_way <= replace_way;
-#     r1.write_tag <= '0';
-                    sync += r1.store_way.eq(replace_way)
-                    sync += r1.write_tag.eq(0)
-# end if;

+# subtype tlb_way_t is integer range 0 to TLB_NUM_WAYS - 1;
+# subtype tlb_index_t is integer range 0 to TLB_SET_SIZE - 1;
+# subtype tlb_way_valids_t is
+#     std_ulogic_vector(TLB_NUM_WAYS-1 downto 0);
+# type tlb_valids_t is
+#     array(tlb_index_t) of tlb_way_valids_t;
+# subtype tlb_tag_t is
+#     std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
+# subtype tlb_way_tags_t is
+#     std_ulogic_vector(TLB_TAG_WAY_BITS-1 downto 0);
+# type tlb_tags_t is
+#     array(tlb_index_t) of tlb_way_tags_t;
+# subtype tlb_pte_t is
+#     std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
+# subtype tlb_way_ptes_t is
+#     std_ulogic_vector(TLB_PTE_WAY_BITS-1 downto 0);
+# type tlb_ptes_t is array(tlb_index_t) of tlb_way_ptes_t;
+# type hit_way_set_t is array(tlb_way_t) of way_t;
+    TLB_WAY = TLB_NUM_WAYS

-# -- Take request from r1.req if there is one there,
-# -- else from req_op, ra, etc.
-# if r1.full = '1' then
-                # Take request from r1.req if there is one there,
-                # else from req_op, ra, etc.
-                with m.If(r1.full)
-#     req := r1.req;
-                    sync += req.eq(r1.req)

+    TLB_INDEX = TLB_SET_SIZE

-# else
-                with m.Else():
-#     req.op := req_op;
-#     req.valid := req_go;
-#     req.mmu_req := r0.mmu_req;
-#     req.dcbz := r0.req.dcbz;
-#     req.real_addr := ra;
-                    sync += req.op.eq(req_op)
-                    sync += req.valid.eq(req_go)
-                    sync += req.mmu_req.eq(r0.mmu_req)
-                    sync += req.dcbz.eq(r0.req.dcbz)
-                    sync += req.real_addr.eq(ra)

+    TLB_WAY_VALID_BITS = TLB_NUM_WAYS

-#     -- Force data to 0 for dcbz
-#     if r0.req.dcbz = '0' then
-                    with m.If(~r0.req.dcbz):
-#         req.data := r0.req.data;
-                        sync += req.data.eq(r0.req.data)

+    def TLBValidBitsArray():
+        return Array(
+            Signal(TLB_WAY_VALID_BITS) for x in range(TLB_SET_SIZE)
+        )

-#     else
-                    with m.Else():
-#         req.data := (others => '0');
-                        sync += req.data.eq(0)
-#     end if;

+    TLB_TAG = TLB_EA_TAG_BITS

-#     -- Select all bytes for dcbz
-#     -- and for cacheable loads
-#     if r0.req.dcbz = '1'
-#         or (r0.req.load = '1' and r0.req.nc = '0') then
-                    # Select all bytes for dcbz
-                    # and for cacheable loads
-                    with m.If(r0.req.dcbz | (r0.req.load & ~r0.req.nc):
-#         req.byte_sel := (others => '1');
-                        sync += req.byte_sel.eq(1)

+    TLB_WAY_TAGS = TLB_TAG_WAY_BITS

-#     else
-                    with m.Else():
-#         req.byte_sel := r0.req.byte_sel;
-                        sync += req.byte_sel.eq(r0.req.byte_sel)
-#     end if;

+    def TLBTagsArray():
+        return Array(
+            Signal(TLB_WAY_TAGS) for x in range(TLB_SET_SIZE)
+        )

-#     req.hit_way := req_hit_way;
-#     req.same_tag := req_same_tag;
-                    sync += req.hit_way.eq(req_hit_way)
-                    sync += req.same_tag.eq(req_same_tag)

+    TLB_PTE = TLB_PTE_BITS

-# -- Store the incoming request from r0,
-# -- if it is a slow request
-# -- Note that r1.full = 1 implies req_op = OP_NONE
-# if req_op = OP_LOAD_MISS or req_op = OP_LOAD_NC
-#     or req_op = OP_STORE_MISS
-#     or req_op = OP_STORE_HIT then
-                # Store the incoming request from r0,
-                # if it is a slow request
-                # Note that r1.full = 1 implies req_op = OP_NONE
-                with m.If(req_op == Op.OP_LOAD_MISS
-                          | req_op == Op.OP_LOAD_NC
-                          | req_op == Op.OP_STORE_MISS
-                          | req_op == Op.OP_STORE_HIT):
-#     r1.req <= req;
-#     r1.full <= '1';
-                    sync += r1.req(req)
-                    sync += r1.full.eq(1)
-# end if;
-# end if;
-#
-# -- Main state machine
-# case r1.state is
-                # Main state machine
-                with m.Switch(r1.state):

+    TLB_WAY_PTES = TLB_PTE_WAY_BITS

-# when IDLE =>
-                    with m.Case(State.IDLE)
-#     r1.wb.adr <= req.real_addr(
-#         r1.wb.adr'left downto 0
-#     );
-#     r1.wb.sel <= req.byte_sel;
-#     r1.wb.dat <= req.data;
-#     r1.dcbz <= req.dcbz;
-#
-#     -- Keep track of our index and way
-#     -- for subsequent stores.
-#     r1.store_index <= get_index(req.real_addr);
-#     r1.store_row <= get_row(req.real_addr);
-#     r1.end_row_ix <=
-#         get_row_of_line(get_row(req.real_addr)) - 1;
-#     r1.reload_tag <= get_tag(req.real_addr);
-#     r1.req.same_tag <= '1';
-                        sync += r1.wb.adr.eq(req.real_addr[0:r1.wb.adr])
-                        sync += r1.wb.sel.eq(req.byte_sel)
-                        sync += r1.wb.dat.eq(req.data)
-                        sync += r1.dcbz.eq(req.dcbz)

+    def TLBPtesArray():
+        return Array(
+            Signal(TLB_WAY_PTES) for x in range(TLB_SET_SIZE)
+        )

-                        # Keep track of our index and way
-                        # for subsequent stores.
-                        sync += r1.store_index.eq(get_index(req.real_addr))
-                        sync += r1.store_row.eq(get_row(req.real_addr))
-                        sync += r1.end_row_ix.eq(
-                            get_row_of_line(get_row(req.real_addr))
-                        )
-                        sync += r1.reload_tag.eq(get_tag(req.real_addr))
-                        sync += r1.req.same_tag.eq(1)

+    def HitWaySet():
+        # one entry per TLB way, each wide enough to hold a way *index*
+        return Array(Signal(WAY_BITS) for x in range(TLB_NUM_WAYS))
+
+# signal dtlb_valids : tlb_valids_t;
+# signal dtlb_tags : tlb_tags_t;
+# signal dtlb_ptes : tlb_ptes_t;
+
+"""note: these are passed to nmigen.hdl.Memory as "attributes".
+   don't know how, just that they are.
+"""
+# attribute ram_style of dtlb_tags : signal is "distributed";
+# attribute ram_style of dtlb_ptes : signal is "distributed";
+    dtlb_valid_bits = TLBValidBitsArray()
+    dtlb_tags = TLBTagsArray()
+    dtlb_ptes = TLBPtesArray()
+    # TODO attribute ram_style of
+    # dtlb_tags : signal is "distributed";
+    # TODO attribute ram_style of
+    # dtlb_ptes : signal is "distributed";
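+"""editor's note: re the "attributes" question above -- nmigen's Memory
+   constructor accepts an attrs mapping which should be emitted as RTL
+   attributes on the inferred RAM.  a minimal sketch (an assumption to
+   verify against the nmigen version in use, not tested here):
+
+       from nmigen.hdl.mem import Memory
+       dtlb_tags_mem = Memory(width=TLB_TAG_WAY_BITS,
+                              depth=TLB_SET_SIZE,
+                              attrs={"ram_style": "distributed"})
+"""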
-# if req.op = OP_STORE_HIT theni
-                with m.If(req.op == Op.OP_STORE_HIT):
-#     r1.store_way <= req.hit_way;
-                    sync += r1.store_way.eq(req.hit_way)
-# end if;

+# signal r0 : reg_stage_0_t;
+# signal r0_full : std_ulogic;
+    r0 = RegStage0()
+    r0_full = Signal()

-# -- Reset per-row valid bits,
-# -- ready for handling OP_LOAD_MISS
-# for i in 0 to ROW_PER_LINE - 1 loop
-                # Reset per-row valid bits,
-                # ready for handling OP_LOAD_MISS
-                for i in range(ROW_PER_LINE):
-#     r1.rows_valid(i) <= '0';
-                    sync += r1.rows_valid[i].eq(0)
-# end loop;

+# signal r1 : reg_stage_1_t;
+    r1 = RegStage1()

-# case req.op is
-                with m.Switch(req.op):
-#     when OP_LOAD_HIT =>
-                    with m.Case(Op.OP_LOAD_HIT):
-#         -- stay in IDLE state
-                        # stay in IDLE state
-                        pass

+# signal reservation : reservation_t;
+    reservation = Reservation()

-#     when OP_LOAD_MISS =>
-                    with m.Case(Op.OP_LOAD_MISS):
-#         -- Normal load cache miss,
-#         -- start the reload machine
-#         report "cache miss real addr:" &
-#             to_hstring(req.real_addr) & " idx:" &
-#             integer'image(get_index(req.real_addr)) &
-#             " tag:" & to_hstring(get_tag(req.real_addr));
-                        # Normal load cache miss,
-                        # start the reload machine
-                        print(f"cache miss real addr:" \
-                              f"{req_real_addr}" \
-                              f" idx:{get_index(req_real_addr)}" \
-                              f" tag:{get_tag(req.real_addr)}")

+# -- Async signals on incoming request
+# signal req_index : index_t;
+# signal req_row : row_t;
+# signal req_hit_way : way_t;
+# signal req_tag : cache_tag_t;
+# signal req_op : op_t;
+# signal req_data : std_ulogic_vector(63 downto 0);
+# signal req_same_tag : std_ulogic;
+# signal req_go : std_ulogic;
+    # Async signals on incoming request
+    req_index = Signal(INDEX_BITS)
+    req_row = Signal(ROW_BITS)
+    req_hit_way = Signal(WAY_BITS)
+    req_tag = Signal(CACHE_TAG)
+    req_op = Signal(Op)
+    req_data = Signal(64)
+    req_same_tag = Signal()
+    req_go = Signal()

-#         -- Start the wishbone cycle
-#         r1.wb.we  <= '0';
-#         r1.wb.cyc <= '1';
-#         r1.wb.stb <= '1';
-                        # Start the wishbone cycle
-                        sync += r1.wb.we.eq(0)
-                        sync += r1.wb.cyc.eq(1)
-                        sync += r1.wb.stb.eq(1)

+# signal early_req_row : row_t;
+#
+# signal cancel_store : std_ulogic;
+# signal set_rsrv : std_ulogic;
+# signal clear_rsrv : std_ulogic;
+#
+# signal r0_valid : std_ulogic;
+# signal r0_stall : std_ulogic;
+#
+# signal use_forward1_next : std_ulogic;
+# signal use_forward2_next : std_ulogic;
+    early_req_row = Signal(ROW_BITS)

-#         -- Track that we had one request sent
-#         r1.state <= RELOAD_WAIT_ACK;
-#         r1.write_tag <= '1';
-                        # Track that we had one request sent
-                        sync += r1.state.eq(State.RELOAD_WAIT_ACK)
-                        sync += r1.write_tag.eq(1)

+    cancel_store = Signal()
+    set_rsrv = Signal()
+    clear_rsrv = Signal()

-# when OP_LOAD_NC =>
-                    with m.Case(Op.OP_LOAD_NC):
-#     r1.wb.cyc <= '1';
-#     r1.wb.stb <= '1';
-#     r1.wb.we <= '0';
-#     r1.state <= NC_LOAD_WAIT_ACK;
-                        sync += r1.wb.cyc.eq(1)
-                        sync += r1.wb.stb.eq(1)
-                        sync += r1.wb.we.eq(0)
-                        sync += r1.state.eq(State.NC_LOAD_WAIT_ACK)

+    r0_valid = Signal()
+    r0_stall = Signal()

-# when OP_STORE_HIT | OP_STORE_MISS =>
-                    with m.Case(Op.OP_STORE_HIT
-                                | Op.OP_STORE_MISS):
-#     if req.dcbz = '0' then
-                        with m.If(~req.bcbz):
-#         r1.state <= STORE_WAIT_ACK;
-#         r1.acks_pending <= to_unsigned(1, 3);
-#         r1.full <= '0';
-#         r1.slow_valid <= '1';
-                            sync += r1.state.eq(
-                                State.STORE_WAIT_ACK
-                            )
-                            sync += r1.acks_pending.eq(
-                                '''TODO to_unsignes(1,3)'''
-                            )
-                            sync += r1.full.eq(0)
-                            sync += r1.slow_valid.eq(1)

+    use_forward1_next = Signal()
+    use_forward2_next = Signal()

-#         if req.mmu_req = '0' then
-                            with m.If(~req.mmu_req):
-#             r1.ls_valid <= '1';
-                                sync += r1.ls_valid.eq(1)
-#         else
-                            with m.Else():
-#             r1.mmu_done <= '1';
-                                sync += r1.mmu_done.eq(1)
-#         end if;

+# -- Cache RAM interface
+# type cache_ram_out_t is array(way_t) of cache_row_t;
+# signal cache_out : cache_ram_out_t;
+    # Cache RAM interface
+    def CacheRamOut():
+        return Array(Signal(CACHE_ROW) for x in range(NUM_WAYS))

-#         if req.op = OP_STORE_HIT then
-                            with m.If(req.op == Op.OP_STORE_HIT):
-#             r1.write_bram <= '1';
-                                sync += r1.write_bram.eq(1)
-#         end if;

+    cache_out = CacheRamOut()

-#     else
-                        with m.Else():
-#         -- dcbz is handled much like a load
-#         -- miss except that we are writing
-#         -- to memory instead of reading
-#         r1.state <= RELOAD_WAIT_ACK;
-                            # dcbz is handled much like a load
-                            # miss except that we are writing
-                            # to memory instead of reading
-                            sync += r1.state.eq(Op.RELOAD_WAIT_ACK)

+# -- PLRU output interface
+# type plru_out_t is array(index_t) of
+#     std_ulogic_vector(WAY_BITS-1 downto 0);
+# signal plru_victim : plru_out_t;
+    # PLRU output interface
+    def PLRUOut():
+        # one PLRU victim way per cache line index
+        return Array(Signal(WAY_BITS) for x in range(INDEX))

-#         if req.op = OP_STORE_MISS then
-                            with m.If(req.op == Op.OP_STORE_MISS):
-#             r1.write_tag <= '1';
-                                sync += r1.write_tag.eq(1)
-#         end if;
-#     end if;

+    plru_victim = PLRUOut()
+    replace_way = Signal(WAY_BITS)

-#     r1.wb.we <= '1';
-#     r1.wb.cyc <= '1';
-#     r1.wb.stb <= '1';
-                        sync += r1.wb.we.eq(1)
-                        sync += r1.wb.cyc.eq(1)
-                        sync += r1.wb.stb.eq(1)

+# -- Wishbone read/write/cache write formatting signals
+# signal bus_sel : std_ulogic_vector(7 downto 0);
+    # Wishbone read/write/cache write formatting signals
+    bus_sel = Signal(8)

-# -- OP_NONE and OP_BAD do nothing
-# -- OP_BAD & OP_STCX_FAIL were handled above already
-# when OP_NONE =>
-# when OP_BAD =>
-# when OP_STCX_FAIL =>
-                    # OP_NONE and OP_BAD do nothing
-                    # OP_BAD & OP_STCX_FAIL were
-                    # handled above already
-                    with m.Case(Op.OP_NONE):
-                        pass

+# -- TLB signals
+# signal tlb_tag_way : tlb_way_tags_t;
+# signal tlb_pte_way : tlb_way_ptes_t;
+# signal tlb_valid_way : tlb_way_valids_t;
+# signal tlb_req_index : tlb_index_t;
+# signal tlb_hit : std_ulogic;
+# signal tlb_hit_way : tlb_way_t;
+# signal pte : tlb_pte_t;
+# signal ra : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
+# signal valid_ra : std_ulogic;
+# signal perm_attr : perm_attr_t;
+# signal rc_ok : std_ulogic;
+# signal perm_ok : std_ulogic;
+# signal access_ok : std_ulogic;
+    # TLB signals
+    tlb_tag_way = Signal(TLB_WAY_TAGS)
+    tlb_pte_way = Signal(TLB_WAY_PTES)
+    tlb_valid_way = Signal(TLB_WAY_VALID_BITS)
+    tlb_req_index = Signal(TLB_SET_BITS)
+    tlb_hit = Signal()
+    tlb_hit_way = Signal(TLB_WAY_BITS)
+    pte = Signal(TLB_PTE)
+    ra = Signal(REAL_ADDR_BITS)
+    valid_ra = Signal()
+    perm_attr = PermAttr()
+    rc_ok = Signal()
+    perm_ok = Signal()
+    access_ok = Signal()

-                    with m.Case(OP_BAD):
-                        pass

+# -- TLB PLRU output interface
+# type tlb_plru_out_t is array(tlb_index_t) of
+#     std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
+# signal tlb_plru_victim : tlb_plru_out_t;
+    # TLB PLRU output interface
+    def TLBPLRUOut():
+        return Array(
+            Signal(TLB_WAY_BITS) for x in range(TLB_SET_SIZE)
+        )

-                    with m.Case(OP_STCX_FAIL):
-                        pass
-# end case;

+    tlb_plru_victim = TLBPLRUOut()

-# when RELOAD_WAIT_ACK =>
-                with m.Case(State.RELOAD_WAIT_ACK):
-#     -- Requests are all sent if stb is 0
-                    # Requests are all sent if stb is 0
-                    sync += stbs_done.eq(~r1.wb.stb)
-# stbs_done := r1.wb.stb = '0';

+# -- Helper functions to decode incoming requests
+#
+# -- Return the cache line index (tag index) for an address
+# function get_index(addr: std_ulogic_vector) return index_t is
+# begin
+#     return to_integer(
+#         unsigned(addr(SET_SIZE_BITS - 1 downto LINE_OFF_BITS))
+#     );
+# end;
+    # Helper functions to decode incoming requests
+    #
+    # Return the cache line index (tag index) for an address
+    def get_index(addr):
+        return addr[LINE_OFF_BITS:SET_SIZE_BITS]

-#     -- If we are still sending requests,
-#     -- was one accepted?
-#     if wishbone_in.stall = '0' and not stbs_done then
-                    # If we are still sending requests,
-                    # was one accepted?
-                    with m.If(~wb_in.stall & ~stbs_done):
-#         -- That was the last word ? We are done sending.
-#         -- Clear stb and set stbs_done so we can handle
-#         -- an eventual last ack on the same cycle.
-#         if is_last_row_addr(
-#             r1.wb.adr, r1.end_row_ix
-#         ) then
-                        # That was the last word?
-                        # We are done sending.
-                        # Clear stb and set stbs_done
-                        # so we can handle an eventual
-                        # last ack on the same cycle.
-                        with m.If(is_last_row_addr(
-                                  r1.wb.adr, r1.end_row_ix)):
-#             r1.wb.stb <= '0';
-#             stbs_done := true;
-                            sync += r1.wb.stb.eq(0)
-                            sync += stbs_done.eq(0)
-#         end if;

+# -- Return the cache row index (data memory) for an address
+# function get_row(addr: std_ulogic_vector) return row_t is
+# begin
+#     return to_integer(
+#         unsigned(addr(SET_SIZE_BITS - 1 downto ROW_OFF_BITS))
+#     );
+# end;
+    # Return the cache row index (data memory) for an address
+    def get_row(addr):
+        return addr[ROW_OFF_BITS:SET_SIZE_BITS]

-#         -- Calculate the next row address
-#         r1.wb.adr <= next_row_addr(r1.wb.adr);
-                        # Calculate the next row address
-                        sync += r1.wb.adr.eq(next_row_addr(r1.wb.adr))
-#     end if;

+# -- Return the index of a row within a line
+# function get_row_of_line(row: row_t) return row_in_line_t is
+# variable row_v : unsigned(ROW_BITS-1 downto 0);
+# begin
+#     row_v := to_unsigned(row, ROW_BITS);
+#     return row_v(ROW_LINEBITS-1 downto 0);
+# end;
+    # Return the index of a row within a line
+    def get_row_of_line(row):
+        return row[0:ROW_LINE_BITS]

-#     -- Incoming acks processing
-#     r1.forward_valid1 <= wishbone_in.ack;
-                # Incoming acks processing
-                sync += r1.forward_valid1.eq(wb_in.ack)

+# -- Returns whether this is the last row of a line
+# function is_last_row_addr(addr: wishbone_addr_type;
+#     last: row_in_line_t) return boolean is
+# begin
+#     return
+#         unsigned(addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS)) = last;
+# end;
+    # Returns whether this is the last row of a line
+    def is_last_row_addr(addr, last):
+        return addr[ROW_OFF_BITS:LINE_OFF_BITS] == last

-#     if wishbone_in.ack = '1' then
-                with m.If(wb_in.ack):
-#         r1.rows_valid(
-#             r1.store_row mod ROW_PER_LINE
-#         ) <= '1';
-                    sync += r1.rows_valid[
-                        r1.store_row % ROW_PER_LINE
-                    ].eq(1)

+# -- Returns whether this is the last row of a line
+# function is_last_row(row: row_t; last: row_in_line_t)
+#     return boolean is
+# begin
+#     return get_row_of_line(row) = last;
+# end;
+    # Returns whether this is the last row of a line
+    def is_last_row(row, last):
+        return get_row_of_line(row) == last
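+"""editor's note: the slicing convention used by the helpers above --
+   nmigen's addr[a:b] is VHDL's addr(b-1 downto a).  with the example
+   geometry from the layout diagram (ROW_OFF_BITS=3, LINE_OFF_BITS=6,
+   SET_SIZE_BITS=11, TAG_BITS=45), a real address decomposes as:
+
+       index = addr[6:11]   # 5 bits: which cache line
+       row   = addr[3:11]   # 8 bits: which BRAM row
+       # tag = addr[11:56]  # 45 bits: see get_tag() below
+"""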
-#         -- If this is the data we were looking for,
-#         -- we can complete the request next cycle.
-#         -- Compare the whole address in case the
-#         -- request in r1.req is not the one that
-#         -- started this refill.
-#         if r1.full = '1' and r1.req.same_tag = '1'
-#             and ((r1.dcbz = '1' and r1.req.dcbz = '1')
-#             or (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS))
-#             and r1.store_row = get_row(r1.req.real_addr) then
-                    # If this is the data we were looking for,
-                    # we can complete the request next cycle.
-                    # Compare the whole address in case the
-                    # request in r1.req is not the one that
-                    # started this refill.
-                    with m.If(r1.full & r1.req.same_tag &
-                              ((r1.dcbz & r1.req.dcbz)
-                               (~r1.dcbz &
-                                r1.req.op == Op.OP_LOAD_MISS)
-                              ) &
-                              r1.store_row
-                              == get_row(r1.req.real_addr):
-#             r1.full <= '0';
-#             r1.slow_valid <= '1';
-                        sync += r1.full.eq(0)
-                        sync += r1.slow_valid.eq(1)

+# -- Return the address of the next row in the current cache line
+# function next_row_addr(addr: wishbone_addr_type)
+#     return std_ulogic_vector is
+# variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
+# variable result : wishbone_addr_type;
+# begin
+#     -- Is there no simpler way in VHDL to
+#     -- generate that 3 bits adder ?
+#     row_idx := addr(LINE_OFF_BITS-1 downto ROW_OFF_BITS);
+#     row_idx := std_ulogic_vector(unsigned(row_idx) + 1);
+#     result := addr;
+#     result(LINE_OFF_BITS-1 downto ROW_OFF_BITS) := row_idx;
+#     return result;
+# end;
+    # Return the address of the next row in the current cache line
+    def next_row_addr(addr):
+        # Is there no simpler way in VHDL to
+        # generate that 3 bits adder ?
+        row_idx = addr[ROW_OFF_BITS:LINE_OFF_BITS] + 1
+        return Cat(addr[0:ROW_OFF_BITS],
+                   row_idx[0:ROW_LINE_BITS],
+                   addr[LINE_OFF_BITS:])

-#             if r1.mmu_req = '0' then
-                        with m.If(~r1.mmu_req):
-#                 r1.ls_valid <= '1';
-                            sync += r1.ls_valid.eq(1)
-#             else
-                        with m.Else():
-#                 r1.mmu_done <= '1';
-                            sync += r1.mmu_done.eq(1)
-#             end if;
-#             r1.forward_sel <= (others => '1');
-#             r1.use_forward1 <= '1';
-                        sync += r1.forward_sel.eq(1)
-                        sync += r1.use_forward1.eq(1)
-#         end if;

+# -- Return the next row in the current cache line. We use a
+# -- dedicated function in order to limit the size of the
+# -- generated adder to be only the bits within a cache line
+# -- (3 bits with default settings)
+# function next_row(row: row_t) return row_t is
+# variable row_v : std_ulogic_vector(ROW_BITS-1 downto 0);
+# variable row_idx : std_ulogic_vector(ROW_LINEBITS-1 downto 0);
+# variable result : std_ulogic_vector(ROW_BITS-1 downto 0);
+# begin
+#     row_v := std_ulogic_vector(to_unsigned(row, ROW_BITS));
+#     row_idx := row_v(ROW_LINEBITS-1 downto 0);
+#     row_v(ROW_LINEBITS-1 downto 0) :=
+#         std_ulogic_vector(unsigned(row_idx) + 1);
+#     return to_integer(unsigned(row_v));
+# end;
+    # Return the next row in the current cache line. We use a
+    # dedicated function in order to limit the size of the
+    # generated adder to be only the bits within a cache line
+    # (3 bits with default settings)
+    def next_row(row):

-#         -- Check for completion
-#         if stbs_done and is_last_row(r1.store_row,
-#             r1.end_row_ix) then
-                    # Check for completion
-                    with m.If(stbs_done &
-                              is_last_row(r1.store_row,
-                                          r1.end_row_ix)):

+        row_idx = row[0:ROW_LINE_BITS] + 1
+        return Cat(row_idx[0:ROW_LINE_BITS], row[ROW_LINE_BITS:])

-#             -- Complete wishbone cycle
-#             r1.wb.cyc <= '0';
-                        # Complete wishbone cycle
-                        sync += r1.wb.cyc.eq(0)

+# -- Get the tag value from the address
+# function get_tag(addr: std_ulogic_vector) return cache_tag_t is
+# begin
+#     return addr(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS);
+# end;
+    # Get the tag value from the address
+    def get_tag(addr):
+        return addr[SET_SIZE_BITS:REAL_ADDR_BITS]

-#             -- Cache line is now valid
-#             cache_valids(r1.store_index)(
-#                 r1.store_way
-#             ) <= '1';
-                        # Cache line is now valid
-                        sync += cache_valid_bits[
-                            r1.store_index
-                        ][r1.store_way].eq(1)

+# -- Read a tag from a tag memory row
+# function read_tag(way: way_t; tagset: cache_tags_set_t)
+#     return cache_tag_t is
+# begin
+#     return tagset(way * TAG_WIDTH + TAG_BITS
+#                   - 1 downto way * TAG_WIDTH);
+# end;
+    # Read a tag from a tag memory row
+    def read_tag(way, tagset):
+        return tagset[way * TAG_WIDTH:way * TAG_WIDTH + TAG_BITS]

-#             r1.state <= IDLE;
-                        sync += r1.state.eq(State.IDLE)
-#         end if;
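+"""editor's note: next_row_addr()/next_row() above are written with
+   Cat() because python cannot assign into a slice of an nmigen Value;
+   Cat() rebuilds the value with only the row-within-line field
+   incremented, so the adder stays ROW_LINE_BITS (3 bits) wide exactly
+   as the VHDL comment asks, and the [0:ROW_LINE_BITS] slice drops the
+   carry so the count wraps within the cache line.
+"""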
+# -- Read a TLB tag from a TLB tag memory row
+# function read_tlb_tag(way: tlb_way_t; tags: tlb_way_tags_t)
+#     return tlb_tag_t is
+# variable j : integer;
+# begin
+#     j := way * TLB_EA_TAG_BITS;
+#     return tags(j + TLB_EA_TAG_BITS - 1 downto j);
+# end;
+    # Read a TLB tag from a TLB tag memory row
+    def read_tlb_tag(way, tags):
+        j = way * TLB_EA_TAG_BITS
+        return tags[j:j + TLB_EA_TAG_BITS]

-#             -- Increment store row counter
-#             r1.store_row <= next_row(r1.store_row);
-                        # Increment store row counter
-                        sync += r1.store_row.eq(next_row(r1.store_row))
-#     end if;

+# -- Write a TLB tag to a TLB tag memory row
+# procedure write_tlb_tag(way: tlb_way_t; tags: inout tlb_way_tags_t;
+#     tag: tlb_tag_t) is
+# variable j : integer;
+# begin
+#     j := way * TLB_EA_TAG_BITS;
+#     tags(j + TLB_EA_TAG_BITS - 1 downto j) := tag;
+# end;
+    # Write a TLB tag to a TLB tag memory row
+    def write_tlb_tag(way, tags, tag):
+        j = way * TLB_EA_TAG_BITS

-# when STORE_WAIT_ACK =>
-                with m.Case(State.STORE_WAIT_ACK):
-#     stbs_done := r1.wb.stb = '0';
-#     acks := r1.acks_pending;
-                    sync += stbs_done.eq(~r1.wb.stb)
-                    sync += acks.eq(r1.acks_pending)

+        return tags[j:j + TLB_EA_TAG_BITS].eq(tag)

-#     if r1.inc_acks /= r1.dec_acks then
-                    with m.If(r1.inc_acks != r1.dec_acks):
-#         if r1.inc_acks = '1' then
-                        with m.If(r1.inc_acks):
-#             acks := acks + 1;
-                            sync += acks.eq(acks + 1)
-#         else
-                        with m.Else():
-#             acks := acks - 1;
-                            sync += acks.eq(acks - 1)
-#         end if;
-#     end if;

+# -- Read a PTE from a TLB PTE memory row
+# function read_tlb_pte(way: tlb_way_t; ptes: tlb_way_ptes_t)
+#     return tlb_pte_t is
+# variable j : integer;
+# begin
+#     j := way * TLB_PTE_BITS;
+#     return ptes(j + TLB_PTE_BITS - 1 downto j);
+# end;
+    # Read a PTE from a TLB PTE memory row
+    def read_tlb_pte(way, ptes):
+        j = way * TLB_PTE_BITS

-#     r1.acks_pending <= acks;
-                    sync += r1.acks_pending.eq(acks)

+        return ptes[j:j + TLB_PTE_BITS]

-#     -- Clear stb when slave accepted request
-#     if wishbone_in.stall = '0' then
-                    # Clear stb when slave accepted request
-                    with m.If(~wb_in.stall):
-#         -- See if there is another store waiting
-#         -- to be done which is in the same real page.
-#         if req.valid = '1' then
-                        # See if there is another store waiting
-                        # to be done which is in the same real page.
-                        with m.If(req.valid):
-#             r1.wb.adr(
-#                 SET_SIZE_BITS - 1 downto 0
-#             ) <= req.real_addr(
-#                 SET_SIZE_BITS - 1 downto 0
-#             );
-#             r1.wb.dat <= req.data;
-#             r1.wb.sel <= req.byte_sel;
-                            sync += r1.wb.adr[0:SET_SIZE_BITS].eq(
-                                req.real_addr[0:SET_SIZE_BITS]
-                            )
-#         end if;

+# procedure write_tlb_pte(way: tlb_way_t;
+#     ptes: inout tlb_way_ptes_t; newpte: tlb_pte_t) is
+# variable j : integer;
+# begin
+#     j := way * TLB_PTE_BITS;
+#     ptes(j + TLB_PTE_BITS - 1 downto j) := newpte;
+# end;
+    def write_tlb_pte(way, ptes, newpte):
+        j = way * TLB_PTE_BITS

-#         if acks < 7 and req.same_tag = '1'
-#             and (req.op = OP_STORE_MISS
-#             or req.op = OP_STORE_HIT) then
-                    with m.Elif(acks < 7 & req.same_tag &
-                                (req.op == Op.Op_STORE_MISS
-                                 | req.op == Op.OP_SOTRE_HIT)):
-#             r1.wb.stb <= '1';
-#             stbs_done := false;
-                        sync += r1.wb.stb.eq(1)
-                        sync += stbs_done.eq(0)

+        return ptes[j:j + TLB_PTE_BITS].eq(newpte)
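+"""editor's note: with the .eq() form above, write_tlb_tag() /
+   write_tlb_pte() return nmigen assignment statements rather than
+   mutating their arguments, so a caller adds them to a domain, e.g.
+   (hypothetical call site, names for illustration only):
+
+       sync += write_tlb_pte(way, dtlb_ptes[index], newpte)
+"""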
+""" +# assert LINE_SIZE mod ROW_SIZE = 0 +# report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE; +# assert ispow2(LINE_SIZE) +# report "LINE_SIZE not power of 2" severity FAILURE; +# assert ispow2(NUM_LINES) +# report "NUM_LINES not power of 2" severity FAILURE; +# assert ispow2(ROW_PER_LINE) +# report "ROW_PER_LINE not power of 2" severity FAILURE; +# assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS) +# report "geometry bits don't add up" severity FAILURE; +# assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) +# report "geometry bits don't add up" severity FAILURE; +# assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) +# report "geometry bits don't add up" severity FAILURE; +# assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS) +# report "geometry bits don't add up" severity FAILURE; +# assert (64 = wishbone_data_bits) +# report "Can't yet handle a wishbone width that isn't 64-bits" +# severity FAILURE; +# assert SET_SIZE_BITS <= TLB_LG_PGSZ +# report "Set indexed by virtual address" severity FAILURE; + assert (LINE_SIZE % ROW_SIZE) == 0 "LINE_SIZE not " \ + "multiple of ROW_SIZE" -# if req.op = OP_STORE_HIT then - with m.If(req.op == Op.OP_STORE_HIT): -# r1.write_bram <= '1'; - sync += r1.write_bram.eq(1) -# end if; -# r1.full <= '0'; -# r1.slow_valid <= '1'; - sync += r1.full.eq(0) - sync += r1.slow_valid.eq(1) + assert (LINE_SIZE % 2) == 0 "LINE_SIZE not power of 2" -# -- Store requests never come from the MMU -# r1.ls_valid <= '1'; -# stbs_done := false; -# r1.inc_acks <= '1'; - # Store request never come from the MMU - sync += r1.ls_valid.eq(1) - sync += stbs_done.eq(0) - sync += r1.inc_acks.eq(1) -# else - with m.Else(): -# r1.wb.stb <= '0'; -# stbs_done := true; - sync += r1.wb.stb.eq(0) - sync += stbs_done.eq(1) -# end if; -# end if; + assert (NUM_LINES % 2) == 0 "NUM_LINES not power of 2" -# -- Got ack ? See if complete. -# if wishbone_in.ack = '1' then - # Got ack ? See if complete. - with m.If(wb_in.ack): -# if stbs_done and acks = 1 then - with m.If(stbs_done & acks) -# r1.state <= IDLE; -# r1.wb.cyc <= '0'; -# r1.wb.stb <= '0'; - sync += r1.state.eq(State.IDLE) - sync += r1.wb.cyc.eq(0) - sync += r1.wb.stb.eq(0) -# end if; -# r1.dec_acks <= '1'; - sync += r1.dec_acks.eq(1) -# end if; + assert (ROW_PER_LINE % 2) == 0 "ROW_PER_LINE not" \ + "power of 2" -# when NC_LOAD_WAIT_ACK => - with m.Case(State.NC_LOAD_WAIT_ACK): -# -- Clear stb when slave accepted request -# if wishbone_in.stall = '0' then - # Clear stb when slave accepted request - with m.If(~wb_in.stall): -# r1.wb.stb <= '0'; - sync += r1.wb.stb.eq(0) -# end if; + assert ROW_BITS == (INDEX_BITS + ROW_LINE_BITS) \ + "geometry bits don't add up" -# -- Got ack ? complete. -# if wishbone_in.ack = '1' then - # Got ack ? complete. 
-                    with m.If(wb_in.ack):
-#         r1.state <= IDLE;
-#         r1.full <= '0';
-#         r1.slow_valid <= '1';
-                        sync += r1.state.eq(State.IDLE)
-                        sync += r1.full.eq(0)
-                        sync += r1.slow_valid.eq(1)

+    assert LINE_OFF_BITS == (ROW_OFF_BITS + ROW_LINE_BITS), \
+        "geometry bits don't add up"

-#         if r1.mmu_req = '0' then
-                        with m.If(~r1.mmu_req):
-#             r1.ls_valid <= '1';
-                            sync += r1.ls_valid.eq(1)

+    assert REAL_ADDR_BITS == (TAG_BITS + INDEX_BITS + LINE_OFF_BITS), \
+        "geometry bits don't add up"

-#         else
-                        with m.Else():
-#             r1.mmu_done <= '1';
-                            sync += r1.mmu_done.eq(1)
-#         end if;

+    assert REAL_ADDR_BITS == (TAG_BITS + ROW_BITS + ROW_OFF_BITS), \
+        "geometry bits don't add up"

-#         r1.forward_sel <= (others => '1');
-#         r1.use_forward1 <= '1';
-#         r1.wb.cyc <= '0';
-#         r1.wb.stb <= '0';
-                            sync += r1.forward_sel.eq(1)
-                            sync += r1.use_forward1.eq(1)
-                            sync += r1.wb.cyc.eq(0)
-                            sync += r1.wb.stb.eq(0)
-#     end if;
-# end case;
-# end if;
-# end if;
-# end process;

+    assert 64 == WB_DATA_BITS, \
+        "Can't yet handle a wishbone width that isn't 64-bits"

-# dc_log: if LOG_LENGTH > 0 generate
-# TODO learn how to tranlate vhdl generate into nmigen
-class DcacheLog(Elaborate):
-    def __init__(self):
-        pass

+    assert SET_SIZE_BITS <= TLB_LG_PGSZ, \
+        "Set indexed by virtual address"

-    def elaborate(self, platform):
-        m = Module()

+# -- we don't yet handle collisions between loadstore1 requests
+# -- and MMU requests
+# m_out.stall <= '0';
+    # we don't yet handle collisions between loadstore1 requests
+    # and MMU requests
+    comb += m_out.stall.eq(0)

-        comb = m.d.comb
-        sync = m.d.sync

+# -- Hold off the request in r0 when r1 has an uncompleted request
+# r0_stall <= r0_full and r1.full;
+# r0_valid <= r0_full and not r1.full;
+# stall_out <= r0_stall;
+    # Hold off the request in r0 when r1 has an uncompleted request
+    comb += r0_stall.eq(r0_full & r1.full)
+    comb += r0_valid.eq(r0_full & ~r1.full)
+    comb += stall_out.eq(r0_stall)

-# signal log_data : std_ulogic_vector(19 downto 0);
-        log_data = Signal(20)
-        comb += log_data

+# -- Wire up wishbone request latch out of stage 1
+# wishbone_out <= r1.wb;
+    # Wire up wishbone request latch out of stage 1
+    comb += wishbone_out.eq(r1.wb)

-# begin
-# dcache_log: process(clk)
-# begin
-#     if rising_edge(clk) then
-#         log_data <= r1.wb.adr(5 downto 3) &
-#                     wishbone_in.stall &
-#                     wishbone_in.ack &
-#                     r1.wb.stb & r1.wb.cyc &
-#                     d_out.error &
-#                     d_out.valid &
-#                     std_ulogic_vector(
-#                         to_unsigned(op_t'pos(req_op), 3)) &
-#                     stall_out &
-#                     std_ulogic_vector(
-#                         to_unsigned(tlb_hit_way, 3)) &
-#                     valid_ra &
-#                     std_ulogic_vector(
-#                         to_unsigned(state_t'pos(r1.state), 3));
-        sync += log_data.eq(Cat(
-            Const(r1.state, 3), valid_ra, Const(tlb_hit_way, 3),
-            stall_out, Const(req_op, 3), d_out.valid, d_out.error,
-            r1.wb.cyc, r1.wb.stb, wb_in.ack, wb_in.stall,
-            r1.wb.adr[3:6]
-        ))
-#     end if;
-# end process;
-# log_out <= log_data;
-        # TODO ??? I am very confused need help
-        comb += log_out.eq(log_data)
-# end generate;
-# end;
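+"""editor's note on the log_data translation in the removed DcacheLog
+   code above: VHDL '&' concatenation places its first operand in the
+   most significant bits, while nmigen's Cat() places its first
+   argument in the *least* significant bits, so translating simply
+   reverses the operand list -- which is what the Cat(...) shown does.
+   minimal sketch:
+
+       # VHDL: log_data <= a & b;        -- a occupies the MSBs
+       comb += log_data.eq(Cat(b, a))    # nmigen: b in LSBs, a in MSBs
+"""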