From: Cole Poirier Date: Thu, 10 Sep 2020 23:30:23 +0000 (-0700) Subject: icache.py rearrange the code within the base class ICache X-Git-Tag: semi_working_ecp5~112 X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=3dfba77ef496d2beeb94a4084b9e99af1758e9e9;p=soc.git icache.py rearrange the code within the base class ICache --- diff --git a/src/soc/experiment/icache.py b/src/soc/experiment/icache.py index a8b3bad0..94a1068c 100644 --- a/src/soc/experiment/icache.py +++ b/src/soc/experiment/icache.py @@ -338,409 +338,134 @@ class ICache(Elaboratable): ^ addr[TLB_LG_PGSZ + 2 * TLB_BITS: TLB_LG_PGSZE + 3 * TLB_BITS] return hash - def elaborate(self, platform): -# architecture rtl of icache is -# constant ROW_SIZE_BITS : natural := ROW_SIZE*8; -# -- ROW_PER_LINE is the number of row (wishbone transactions) in a line -# constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE; -# -- BRAM_ROWS is the number of rows in BRAM needed to represent the full -# -- icache -# constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE; -# -- INSN_PER_ROW is the number of 32bit instructions per BRAM row -# constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32; -# -- Bit fields counts in the address -# -# -- INSN_BITS is the number of bits to select an instruction in a row -# constant INSN_BITS : natural := log2(INSN_PER_ROW); -# -- ROW_BITS is the number of bits to select a row -# constant ROW_BITS : natural := log2(BRAM_ROWS); -# -- ROW_LINEBITS is the number of bits to select a row within a line -# constant ROW_LINEBITS : natural := log2(ROW_PER_LINE); -# -- LINE_OFF_BITS is the number of bits for the offset in a cache line -# constant LINE_OFF_BITS : natural := log2(LINE_SIZE); -# -- ROW_OFF_BITS is the number of bits for the offset in a row -# constant ROW_OFF_BITS : natural := log2(ROW_SIZE); -# -- INDEX_BITS is the number of bits to select a cache line -# constant INDEX_BITS : natural := log2(NUM_LINES); -# -- SET_SIZE_BITS is the log base 2 of the set size -# constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; -# -- TAG_BITS is the number of bits of the tag part of the address -# constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; -# -- WAY_BITS is the number of bits to select a way -# constant WAY_BITS : natural := log2(NUM_WAYS); +# -- Generate a cache RAM for each way +# rams: for i in 0 to NUM_WAYS-1 generate +# signal do_read : std_ulogic; +# signal do_write : std_ulogic; +# signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0); +# signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); +# signal dout : cache_row_t; +# signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0); +# begin +# way: entity work.cache_ram +# generic map ( +# ROW_BITS => ROW_BITS, +# WIDTH => ROW_SIZE_BITS +# ) +# port map ( +# clk => clk, +# rd_en => do_read, +# rd_addr => rd_addr, +# rd_data => dout, +# wr_sel => wr_sel, +# wr_addr => wr_addr, +# wr_data => wishbone_in.dat +# ); +# process(all) +# begin +# do_read <= not (stall_in or use_previous); +# do_write <= '0'; +# if wishbone_in.ack = '1' and replace_way = i then +# do_write <= '1'; +# end if; +# cache_out(i) <= dout; +# rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); +# wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS)); +# for i in 0 to ROW_SIZE-1 loop +# wr_sel(i) <= do_write; +# end loop; +# end process; +# end generate; + def rams(self, m): + comb = m.d.comb - ROW_SIZE_BITS = ROW_SIZE * 8 - # ROW_PER_LINE is the number of row - # (wishbone) transactions in a line - ROW_PER_LINE = LINE_SIZE / ROW_SIZE - # BRAM_ROWS is the number of rows in - # BRAM needed to represent the full icache - BRAM_ROWS = NUM_LINES * ROW_PER_LINE - # INSN_PER_ROW is the number of 32bit - # instructions per BRAM row - INSN_PER_ROW = ROW_SIZE_BITS / 32 + do_read = Signal() + do_write = Signal() + rd_addr = Signal(ROW_BITS) + wr_addr = Signal(ROW_BITS) + _d_out = Signal(ROW_SIZE_BITS) + wr_sel = Signal(ROW_SIZE) - # Bit fields counts in the address - # - # INSN_BITS is the number of bits to - # select an instruction in a row - INSN_BITS = log2_int(INSN_PER_ROW) - # ROW_BITS is the number of bits to - # select a row - ROW_BITS = log2_int(BRAM_ROWS) - # ROW_LINEBITS is the number of bits to - # select a row within a line - ROW_LINE_BITS = log2_int(ROW_PER_LINE) - # LINE_OFF_BITS is the number of bits for - # the offset in a cache line - LINE_OFF_BITS = log2_int(LINE_SIZE) - # ROW_OFF_BITS is the number of bits for - # the offset in a row - ROW_OFF_BITS = log2_int(ROW_SIZE) - # INDEX_BITS is the number of bits to - # select a cache line - INDEX_BITS = log2_int(NUM_LINES) - # SET_SIZE_BITS is the log base 2 of - # the set size - SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS - # TAG_BITS is the number of bits of - # the tag part of the address - TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS - # WAY_BITS is the number of bits to - # select a way - WAY_BITS = log2_int(NUM_WAYS) - TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS + for i in range(NUM_WAYS) + way = CacheRam(ROW_BITS, ROW_SIZE_BITS) + comb += way.rd_en.eq(do_read) + comb += way.rd_addr.eq(rd_addr) + comb += way.rd_data.eq(_d_out) + comb += way.wr_sel.eq(wr_sel) + comb += way.wr_add.eq(wr_addr) + comb += way.wr_data.eq('''TODO ?? wishbone_in.data ??''') -# -- Example of layout for 32 lines of 64 bytes: -# -- -# -- .. tag |index| line | -# -- .. | row | | -# -- .. | | | |00| zero (2) -# -- .. | | |-| | INSN_BITS (1) -# -- .. | |---| | ROW_LINEBITS (3) -# -- .. | |--- - --| LINE_OFF_BITS (6) -# -- .. | |- --| ROW_OFF_BITS (3) -# -- .. |----- ---| | ROW_BITS (8) -# -- .. |-----| | INDEX_BITS (5) -# -- .. --------| | TAG_BITS (53) - # Example of layout for 32 lines of 64 bytes: - # - # .. tag |index| line | - # .. | row | | - # .. | | | |00| zero (2) - # .. | | |-| | INSN_BITS (1) - # .. | |---| | ROW_LINEBITS (3) - # .. | |--- - --| LINE_OFF_BITS (6) - # .. | |- --| ROW_OFF_BITS (3) - # .. |----- ---| | ROW_BITS (8) - # .. |-----| | INDEX_BITS (5) - # .. --------| | TAG_BITS (53) + comb += do_read.eq(~(stall_in | use_previous)) + comb += do_write.eq(0) -# subtype row_t is integer range 0 to BRAM_ROWS-1; -# subtype index_t is integer range 0 to NUM_LINES-1; -# subtype way_t is integer range 0 to NUM_WAYS-1; -# subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0); + with m.If(wb_in.ack & replace_way == i): + do_write.eq(1) + + comb += cache_out[i].eq(_d_out) + comb += rd_addr.eq(Signal(req_row)) + comb += wr_addr.eq(Signal(r.store_row)) + for j in range(ROW_SIZE): + comb += wr_sel[j].eq(do_write) + +# -- Generate PLRUs +# maybe_plrus: if NUM_WAYS > 1 generate +# begin +# plrus: for i in 0 to NUM_LINES-1 generate +# -- PLRU interface +# signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); +# signal plru_acc_en : std_ulogic; +# signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); # -# -- The cache data BRAM organized as described above for each way -# subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0); +# begin +# plru : entity work.plru +# generic map ( +# BITS => WAY_BITS +# ) +# port map ( +# clk => clk, +# rst => rst, +# acc => plru_acc, +# acc_en => plru_acc_en, +# lru => plru_out +# ); # -# -- The cache tags LUTRAM has a row per set. Vivado is a pain and will -# -- not handle a clean (commented) definition of the cache tags as a 3d -# -- memory. For now, work around it by putting all the tags -# subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); -# -- type cache_tags_set_t is array(way_t) of cache_tag_t; -# -- type cache_tags_array_t is array(index_t) of cache_tags_set_t; -# constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS; -# subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0); -# type cache_tags_array_t is array(index_t) of cache_tags_set_t; - def CacheTagArray(): - return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES)) +# process(all) +# begin +# -- PLRU interface +# if get_index(r.hit_nia) = i then +# plru_acc_en <= r.hit_valid; +# else +# plru_acc_en <= '0'; +# end if; +# plru_acc <= +# std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS)); +# plru_victim(i) <= plru_out; +# end process; +# end generate; +# end generate; + def maybe_plrus(self, m): + comb += m.d.comb -# -- The cache valid bits -# subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); -# type cache_valids_t is array(index_t) of cache_way_valids_t; -# type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; - def CacheValidBitsArray(): - return Array(Signal() for x in ROW_PER_LINE) + with m.If(NUM_WAYS > 1): + plru_acc = Signal(WAY_BITS) + plru_acc_en = Signal() + plru_out = Signal(WAY_BITS) - def RowPerLineValidArray(): - return Array(Signal() for x in range ROW_PER_LINE) + for i in range(NUM_LINES): + plru = PLRU(WAY_BITS) + comb += plru.acc.eq(plru_acc) + comb += plru.acc_en.eq(plru_acc_en) + comb += plru.lru.eq(plru_out) -# -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs -# signal cache_tags : cache_tags_array_t; -# signal cache_valids : cache_valids_t; - # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs - cache_tags = CacheTagArray() - cache_valid_bits = CacheValidBitsArray() + # PLRU interface + with m.If(get_index(r.hit_nia) == i): + comb += plru.acc_en.eq(r.hit_valid) -# attribute ram_style : string; -# attribute ram_style of cache_tags : signal is "distributed"; - # TODO to be passed to nigmen as ram attributes - # attribute ram_style : string; - # attribute ram_style of cache_tags : signal is "distributed"; + with m.Else(): + comb += plru.acc_en.eq(0) -# -- L1 ITLB. -# constant TLB_BITS : natural := log2(TLB_SIZE); -# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS); -# constant TLB_PTE_BITS : natural := 64; - TLB_BITS = log2_int(TLB_SIZE) - TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS) - TLB_PTE_BITS = 64 - -# subtype tlb_index_t is integer range 0 to TLB_SIZE - 1; -# type tlb_valids_t is array(tlb_index_t) of std_ulogic; -# subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); -# type tlb_tags_t is array(tlb_index_t) of tlb_tag_t; -# subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); -# type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t; - def TLBValidBitsArray(): - return Array(Signal() for x in range(TLB_SIZE)) - - def TLBTagArray(): - return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE)) - - def TLBPTEArray(): - return Array(Signal(LTB_PTE_BITS) for x in range(TLB_SIZE)) - -# signal itlb_valids : tlb_valids_t; -# signal itlb_tags : tlb_tags_t; -# signal itlb_ptes : tlb_ptes_t; -# attribute ram_style of itlb_tags : signal is "distributed"; -# attribute ram_style of itlb_ptes : signal is "distributed"; - itlb_valid_bits = TLBValidBitsArray() - itlb_tags = TLBTagArray() - itlb_ptes = TLBPTEArray() - # TODO to be passed to nmigen as ram attributes - # attribute ram_style of itlb_tags : signal is "distributed"; - # attribute ram_style of itlb_ptes : signal is "distributed"; - -# -- Privilege bit from PTE EAA field -# signal eaa_priv : std_ulogic; - # Privilege bit from PTE EAA field - eaa_priv = Signal() - - -# signal r : reg_internal_t; - r = RegInternal() - -# -- Async signals on incoming request -# signal req_index : index_t; -# signal req_row : row_t; -# signal req_hit_way : way_t; -# signal req_tag : cache_tag_t; -# signal req_is_hit : std_ulogic; -# signal req_is_miss : std_ulogic; -# signal req_laddr : std_ulogic_vector(63 downto 0); - # Async signal on incoming request - req_index = Signal(NUM_LINES) - req_row = Signal(BRAM_ROWS) - req_hit_way = Signal(NUM_WAYS) - req_tag = Signal(TAG_BITS) - req_is_hit = Signal() - req_is_miss = Signal() - req_laddr = Signal(64) - -# signal tlb_req_index : tlb_index_t; -# signal real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); -# signal ra_valid : std_ulogic; -# signal priv_fault : std_ulogic; -# signal access_ok : std_ulogic; -# signal use_previous : std_ulogic; - tlb_req_index = Signal(TLB_SIZE) - real_addr = Signal(REAL_ADDR_BITS) - ra_valid = Signal() - priv_fault = Signal() - access_ok = Signal() - use_previous = Signal() - -# -- Cache RAM interface -# type cache_ram_out_t is array(way_t) of cache_row_t; -# signal cache_out : cache_ram_out_t; - # Cache RAM interface - def CacheRamOut(): - return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS)) - - cache_out = CacheRamOut() - -# -- PLRU output interface -# type plru_out_t is array(index_t) of -# std_ulogic_vector(WAY_BITS-1 downto 0); -# signal plru_victim : plru_out_t; -# signal replace_way : way_t; - # PLRU output interface - def PLRUOut(): - return Array(Signal(WAY_BITS) for x in range(NUM_LINES)) - - plru_victim = PLRUOut() - replace_way = Signal(NUM_WAYS) - -# begin -# -# assert LINE_SIZE mod ROW_SIZE = 0; -# assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" -# severity FAILURE; -# assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" -# severity FAILURE; -# assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" -# severity FAILURE; -# assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2" -# severity FAILURE; -# assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS) -# report "geometry bits don't add up" severity FAILURE; -# assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) -# report "geometry bits don't add up" severity FAILURE; -# assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) -# report "geometry bits don't add up" severity FAILURE; -# assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS) -# report "geometry bits don't add up" severity FAILURE; -# -# sim_debug: if SIM generate -# debug: process -# begin -# report "ROW_SIZE = " & natural'image(ROW_SIZE); -# report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE); -# report "BRAM_ROWS = " & natural'image(BRAM_ROWS); -# report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW); -# report "INSN_BITS = " & natural'image(INSN_BITS); -# report "ROW_BITS = " & natural'image(ROW_BITS); -# report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS); -# report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS); -# report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS); -# report "INDEX_BITS = " & natural'image(INDEX_BITS); -# report "TAG_BITS = " & natural'image(TAG_BITS); -# report "WAY_BITS = " & natural'image(WAY_BITS); -# wait; -# end process; -# end generate; - -# -- Generate a cache RAM for each way -# rams: for i in 0 to NUM_WAYS-1 generate -# signal do_read : std_ulogic; -# signal do_write : std_ulogic; -# signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0); -# signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0); -# signal dout : cache_row_t; -# signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0); -# begin -# way: entity work.cache_ram -# generic map ( -# ROW_BITS => ROW_BITS, -# WIDTH => ROW_SIZE_BITS -# ) -# port map ( -# clk => clk, -# rd_en => do_read, -# rd_addr => rd_addr, -# rd_data => dout, -# wr_sel => wr_sel, -# wr_addr => wr_addr, -# wr_data => wishbone_in.dat -# ); -# process(all) -# begin -# do_read <= not (stall_in or use_previous); -# do_write <= '0'; -# if wishbone_in.ack = '1' and replace_way = i then -# do_write <= '1'; -# end if; -# cache_out(i) <= dout; -# rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS)); -# wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS)); -# for i in 0 to ROW_SIZE-1 loop -# wr_sel(i) <= do_write; -# end loop; -# end process; -# end generate; - def rams(self, m): - comb = m.d.comb - - do_read = Signal() - do_write = Signal() - rd_addr = Signal(ROW_BITS) - wr_addr = Signal(ROW_BITS) - _d_out = Signal(ROW_SIZE_BITS) - wr_sel = Signal(ROW_SIZE) - - for i in range(NUM_WAYS) - way = CacheRam(ROW_BITS, ROW_SIZE_BITS) - comb += way.rd_en.eq(do_read) - comb += way.rd_addr.eq(rd_addr) - comb += way.rd_data.eq(_d_out) - comb += way.wr_sel.eq(wr_sel) - comb += way.wr_add.eq(wr_addr) - comb += way.wr_data.eq('''TODO ?? wishbone_in.data ??''') - - comb += do_read.eq(~(stall_in | use_previous)) - comb += do_write.eq(0) - - with m.If(wb_in.ack & replace_way == i): - do_write.eq(1) - - comb += cache_out[i].eq(_d_out) - comb += rd_addr.eq(Signal(req_row)) - comb += wr_addr.eq(Signal(r.store_row)) - for j in range(ROW_SIZE): - comb += wr_sel[j].eq(do_write) - -# -- Generate PLRUs -# maybe_plrus: if NUM_WAYS > 1 generate -# begin -# plrus: for i in 0 to NUM_LINES-1 generate -# -- PLRU interface -# signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); -# signal plru_acc_en : std_ulogic; -# signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); -# -# begin -# plru : entity work.plru -# generic map ( -# BITS => WAY_BITS -# ) -# port map ( -# clk => clk, -# rst => rst, -# acc => plru_acc, -# acc_en => plru_acc_en, -# lru => plru_out -# ); -# -# process(all) -# begin -# -- PLRU interface -# if get_index(r.hit_nia) = i then -# plru_acc_en <= r.hit_valid; -# else -# plru_acc_en <= '0'; -# end if; -# plru_acc <= -# std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS)); -# plru_victim(i) <= plru_out; -# end process; -# end generate; -# end generate; - def maybe_plrus(self, m): - comb += m.d.comb - - with m.If(NUM_WAYS > 1): - plru_acc = Signal(WAY_BITS) - plru_acc_en = Signal() - plru_out = Signal(WAY_BITS) - - for i in range(NUM_LINES): - plru = PLRU(WAY_BITS) - comb += plru.acc.eq(plru_acc) - comb += plru.acc_en.eq(plru_acc_en) - comb += plru.lru.eq(plru_out) - - # PLRU interface - with m.If(get_index(r.hit_nia) == i): - comb += plru.acc_en.eq(r.hit_valid) - - with m.Else(): - comb += plru.acc_en.eq(0) - - comb += plru.acc.eq(r.hit_way) - comb += plru_victim[i].eq(plru.lru) + comb += plru.acc.eq(r.hit_way) + comb += plru_victim[i].eq(plru.lru) # -- TLB hit detection and real address generation # itlb_lookup : process(all) @@ -1226,154 +951,430 @@ class ICache(Elaboratable): tagset ) -# r.state <= WAIT_ACK; - sync += r.state.eq(State.WAIT_ACK) -# end if; +# r.state <= WAIT_ACK; + sync += r.state.eq(State.WAIT_ACK) +# end if; + +# -- Requests are all sent if stb is 0 +# stbs_done := r.wb.stb = '0'; + # Requests are all sent if stb is 0 + comb += stbs_done.eq(r.wb.stb == 0) + +# -- If we are still sending requests, was one accepted ? +# if wishbone_in.stall = '0' and not stbs_done then + # If we are still sending requests, was one accepted? + with m.If(~wb_in.stall & ~stbs_done): +# -- That was the last word ? We are done sending. +# -- Clear stb and set stbs_done so we can handle +# -- an eventual last ack on the same cycle. +# if is_last_row_addr(r.wb.adr, r.end_row_ix) then +# r.wb.stb <= '0'; +# stbs_done := true; +# end if; + # That was the last word ? We are done sending. + # Clear stb and set stbs_done so we can handle + # an eventual last ack on the same cycle. + with m.If(is_last_row_addr( + r.wb.adr, r.end_row_ix)): + sync += r.wb.stb.eq(0) + stbs_done.eq(1) + +# -- Calculate the next row address +# r.wb.adr <= next_row_addr(r.wb.adr); + # Calculate the next row address + sync += r.wb.adr.eq(next_row_addr(r.wb.adr)) +# end if; + +# -- Incoming acks processing +# if wishbone_in.ack = '1' then + # Incoming acks processing + with m.If(wb_in.ack): +# r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1'; + sync += r.rows_valid[ + r.store_row & ROW_PER_LINE + ].eq(1) + +# -- Check for completion +# if stbs_done and +# is_last_row(r.store_row, r.end_row_ix) then + # Check for completion + with m.If(stbs_done & is_last_row( + r.store_row, r.end_row_ix)): +# -- Complete wishbone cycle +# r.wb.cyc <= '0'; + # Complete wishbone cycle + sync += r.wb.cyc.eq(0) + +# -- Cache line is now valid +# cache_valids(r.store_index)(replace_way) <= +# r.store_valid and not inval_in; + # Cache line is now valid + sync += cache_valid_bits[ + r.store_index + ][relace_way].eq( + r.store_valid & ~inval_in + ) + +# -- We are done +# r.state <= IDLE; + # We are done + sync += r.state.eq(State.IDLE) +# end if; + +# -- Increment store row counter +# r.store_row <= next_row(r.store_row); + # Increment store row counter + sync += store_row.eq(next_row(r.store_row)) +# end if; +# end case; +# end if; +# +# -- TLB miss and protection fault processing +# if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then +# r.fetch_failed <= '0'; +# elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then +# r.fetch_failed <= '1'; +# end if; + # TLB miss and protection fault processing + with m.If('''TODO nmigen rst''' | flush_in | m_in.tlbld): + sync += r.fetch_failed.eq(0) + + with m.Elif(i_in.req & ~access_ok & ~stall_in): + sync += r.fetch_failed.eq(1) +# end if; +# end process; + +# icache_log: if LOG_LENGTH > 0 generate + def icache_log(self, m, log_out): + comb = m.d.comb + sync = m.d.sync + +# -- Output data to logger +# signal log_data : std_ulogic_vector(53 downto 0); +# begin +# data_log: process(clk) +# variable lway: way_t; +# variable wstate: std_ulogic; + # Output data to logger + for i in range(LOG_LENGTH) + # Output data to logger + log_data = Signal(54) + lway = Signal(NUM_WAYS) + wstate = Signal() + +# begin +# if rising_edge(clk) then +# lway := req_hit_way; +# wstate := '0'; + comb += lway.eq(req_hit_way) + comb += wstate.eq(0) + +# if r.state /= IDLE then +# wstate := '1'; +# end if; + with m.If(r.state != State.IDLE): + comb += wstate.eq(1) + +# log_data <= i_out.valid & +# i_out.insn & +# wishbone_in.ack & +# r.wb.adr(5 downto 3) & +# r.wb.stb & r.wb.cyc & +# wishbone_in.stall & +# stall_out & +# r.fetch_failed & +# r.hit_nia(5 downto 2) & +# wstate & +# std_ulogic_vector(to_unsigned(lway, 3)) & +# req_is_hit & req_is_miss & +# access_ok & +# ra_valid; + sync += log_data.eq(Cat( + ra_valid, access_ok, req_is_miss, req_is_hit, + lway '''truncate to 3 bits?''', wstate, r.hit_nia[2:6], + r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc, + r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn, + i_out.valid + )) +# end if; +# end process; +# log_out <= log_data; + comb += log_out.eq(log_data) +# end generate; +# end; + + def elaborate(self, platform): +# architecture rtl of icache is +# constant ROW_SIZE_BITS : natural := ROW_SIZE*8; +# -- ROW_PER_LINE is the number of row (wishbone transactions) in a line +# constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE; +# -- BRAM_ROWS is the number of rows in BRAM needed to represent the full +# -- icache +# constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE; +# -- INSN_PER_ROW is the number of 32bit instructions per BRAM row +# constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32; +# -- Bit fields counts in the address +# +# -- INSN_BITS is the number of bits to select an instruction in a row +# constant INSN_BITS : natural := log2(INSN_PER_ROW); +# -- ROW_BITS is the number of bits to select a row +# constant ROW_BITS : natural := log2(BRAM_ROWS); +# -- ROW_LINEBITS is the number of bits to select a row within a line +# constant ROW_LINEBITS : natural := log2(ROW_PER_LINE); +# -- LINE_OFF_BITS is the number of bits for the offset in a cache line +# constant LINE_OFF_BITS : natural := log2(LINE_SIZE); +# -- ROW_OFF_BITS is the number of bits for the offset in a row +# constant ROW_OFF_BITS : natural := log2(ROW_SIZE); +# -- INDEX_BITS is the number of bits to select a cache line +# constant INDEX_BITS : natural := log2(NUM_LINES); +# -- SET_SIZE_BITS is the log base 2 of the set size +# constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS; +# -- TAG_BITS is the number of bits of the tag part of the address +# constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS; +# -- WAY_BITS is the number of bits to select a way +# constant WAY_BITS : natural := log2(NUM_WAYS); + + ROW_SIZE_BITS = ROW_SIZE * 8 + # ROW_PER_LINE is the number of row + # (wishbone) transactions in a line + ROW_PER_LINE = LINE_SIZE / ROW_SIZE + # BRAM_ROWS is the number of rows in + # BRAM needed to represent the full icache + BRAM_ROWS = NUM_LINES * ROW_PER_LINE + # INSN_PER_ROW is the number of 32bit + # instructions per BRAM row + INSN_PER_ROW = ROW_SIZE_BITS / 32 + + # Bit fields counts in the address + # + # INSN_BITS is the number of bits to + # select an instruction in a row + INSN_BITS = log2_int(INSN_PER_ROW) + # ROW_BITS is the number of bits to + # select a row + ROW_BITS = log2_int(BRAM_ROWS) + # ROW_LINEBITS is the number of bits to + # select a row within a line + ROW_LINE_BITS = log2_int(ROW_PER_LINE) + # LINE_OFF_BITS is the number of bits for + # the offset in a cache line + LINE_OFF_BITS = log2_int(LINE_SIZE) + # ROW_OFF_BITS is the number of bits for + # the offset in a row + ROW_OFF_BITS = log2_int(ROW_SIZE) + # INDEX_BITS is the number of bits to + # select a cache line + INDEX_BITS = log2_int(NUM_LINES) + # SET_SIZE_BITS is the log base 2 of + # the set size + SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS + # TAG_BITS is the number of bits of + # the tag part of the address + TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS + # WAY_BITS is the number of bits to + # select a way + WAY_BITS = log2_int(NUM_WAYS) + TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS + +# -- Example of layout for 32 lines of 64 bytes: +# -- +# -- .. tag |index| line | +# -- .. | row | | +# -- .. | | | |00| zero (2) +# -- .. | | |-| | INSN_BITS (1) +# -- .. | |---| | ROW_LINEBITS (3) +# -- .. | |--- - --| LINE_OFF_BITS (6) +# -- .. | |- --| ROW_OFF_BITS (3) +# -- .. |----- ---| | ROW_BITS (8) +# -- .. |-----| | INDEX_BITS (5) +# -- .. --------| | TAG_BITS (53) + # Example of layout for 32 lines of 64 bytes: + # + # .. tag |index| line | + # .. | row | | + # .. | | | |00| zero (2) + # .. | | |-| | INSN_BITS (1) + # .. | |---| | ROW_LINEBITS (3) + # .. | |--- - --| LINE_OFF_BITS (6) + # .. | |- --| ROW_OFF_BITS (3) + # .. |----- ---| | ROW_BITS (8) + # .. |-----| | INDEX_BITS (5) + # .. --------| | TAG_BITS (53) + +# subtype row_t is integer range 0 to BRAM_ROWS-1; +# subtype index_t is integer range 0 to NUM_LINES-1; +# subtype way_t is integer range 0 to NUM_WAYS-1; +# subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0); +# +# -- The cache data BRAM organized as described above for each way +# subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0); +# +# -- The cache tags LUTRAM has a row per set. Vivado is a pain and will +# -- not handle a clean (commented) definition of the cache tags as a 3d +# -- memory. For now, work around it by putting all the tags +# subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0); +# -- type cache_tags_set_t is array(way_t) of cache_tag_t; +# -- type cache_tags_array_t is array(index_t) of cache_tags_set_t; +# constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS; +# subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0); +# type cache_tags_array_t is array(index_t) of cache_tags_set_t; + def CacheTagArray(): + return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES)) + +# -- The cache valid bits +# subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0); +# type cache_valids_t is array(index_t) of cache_way_valids_t; +# type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic; + def CacheValidBitsArray(): + return Array(Signal() for x in ROW_PER_LINE) + + def RowPerLineValidArray(): + return Array(Signal() for x in range ROW_PER_LINE) + +# -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs +# signal cache_tags : cache_tags_array_t; +# signal cache_valids : cache_valids_t; + # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs + cache_tags = CacheTagArray() + cache_valid_bits = CacheValidBitsArray() + +# attribute ram_style : string; +# attribute ram_style of cache_tags : signal is "distributed"; + # TODO to be passed to nigmen as ram attributes + # attribute ram_style : string; + # attribute ram_style of cache_tags : signal is "distributed"; + +# -- L1 ITLB. +# constant TLB_BITS : natural := log2(TLB_SIZE); +# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS); +# constant TLB_PTE_BITS : natural := 64; + TLB_BITS = log2_int(TLB_SIZE) + TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS) + TLB_PTE_BITS = 64 -# -- Requests are all sent if stb is 0 -# stbs_done := r.wb.stb = '0'; - # Requests are all sent if stb is 0 - comb += stbs_done.eq(r.wb.stb == 0) +# subtype tlb_index_t is integer range 0 to TLB_SIZE - 1; +# type tlb_valids_t is array(tlb_index_t) of std_ulogic; +# subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0); +# type tlb_tags_t is array(tlb_index_t) of tlb_tag_t; +# subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0); +# type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t; + def TLBValidBitsArray(): + return Array(Signal() for x in range(TLB_SIZE)) -# -- If we are still sending requests, was one accepted ? -# if wishbone_in.stall = '0' and not stbs_done then - # If we are still sending requests, was one accepted? - with m.If(~wb_in.stall & ~stbs_done): -# -- That was the last word ? We are done sending. -# -- Clear stb and set stbs_done so we can handle -# -- an eventual last ack on the same cycle. -# if is_last_row_addr(r.wb.adr, r.end_row_ix) then -# r.wb.stb <= '0'; -# stbs_done := true; -# end if; - # That was the last word ? We are done sending. - # Clear stb and set stbs_done so we can handle - # an eventual last ack on the same cycle. - with m.If(is_last_row_addr( - r.wb.adr, r.end_row_ix)): - sync += r.wb.stb.eq(0) - stbs_done.eq(1) + def TLBTagArray(): + return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE)) -# -- Calculate the next row address -# r.wb.adr <= next_row_addr(r.wb.adr); - # Calculate the next row address - sync += r.wb.adr.eq(next_row_addr(r.wb.adr)) -# end if; + def TLBPTEArray(): + return Array(Signal(LTB_PTE_BITS) for x in range(TLB_SIZE)) -# -- Incoming acks processing -# if wishbone_in.ack = '1' then - # Incoming acks processing - with m.If(wb_in.ack): -# r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1'; - sync += r.rows_valid[ - r.store_row & ROW_PER_LINE - ].eq(1) +# signal itlb_valids : tlb_valids_t; +# signal itlb_tags : tlb_tags_t; +# signal itlb_ptes : tlb_ptes_t; +# attribute ram_style of itlb_tags : signal is "distributed"; +# attribute ram_style of itlb_ptes : signal is "distributed"; + itlb_valid_bits = TLBValidBitsArray() + itlb_tags = TLBTagArray() + itlb_ptes = TLBPTEArray() + # TODO to be passed to nmigen as ram attributes + # attribute ram_style of itlb_tags : signal is "distributed"; + # attribute ram_style of itlb_ptes : signal is "distributed"; -# -- Check for completion -# if stbs_done and -# is_last_row(r.store_row, r.end_row_ix) then - # Check for completion - with m.If(stbs_done & is_last_row( - r.store_row, r.end_row_ix)): -# -- Complete wishbone cycle -# r.wb.cyc <= '0'; - # Complete wishbone cycle - sync += r.wb.cyc.eq(0) +# -- Privilege bit from PTE EAA field +# signal eaa_priv : std_ulogic; + # Privilege bit from PTE EAA field + eaa_priv = Signal() -# -- Cache line is now valid -# cache_valids(r.store_index)(replace_way) <= -# r.store_valid and not inval_in; - # Cache line is now valid - sync += cache_valid_bits[ - r.store_index - ][relace_way].eq( - r.store_valid & ~inval_in - ) -# -- We are done -# r.state <= IDLE; - # We are done - sync += r.state.eq(State.IDLE) -# end if; +# signal r : reg_internal_t; + r = RegInternal() -# -- Increment store row counter -# r.store_row <= next_row(r.store_row); - # Increment store row counter - sync += store_row.eq(next_row(r.store_row)) -# end if; -# end case; -# end if; -# -# -- TLB miss and protection fault processing -# if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then -# r.fetch_failed <= '0'; -# elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then -# r.fetch_failed <= '1'; -# end if; - # TLB miss and protection fault processing - with m.If('''TODO nmigen rst''' | flush_in | m_in.tlbld): - sync += r.fetch_failed.eq(0) +# -- Async signals on incoming request +# signal req_index : index_t; +# signal req_row : row_t; +# signal req_hit_way : way_t; +# signal req_tag : cache_tag_t; +# signal req_is_hit : std_ulogic; +# signal req_is_miss : std_ulogic; +# signal req_laddr : std_ulogic_vector(63 downto 0); + # Async signal on incoming request + req_index = Signal(NUM_LINES) + req_row = Signal(BRAM_ROWS) + req_hit_way = Signal(NUM_WAYS) + req_tag = Signal(TAG_BITS) + req_is_hit = Signal() + req_is_miss = Signal() + req_laddr = Signal(64) - with m.Elif(i_in.req & ~access_ok & ~stall_in): - sync += r.fetch_failed.eq(1) -# end if; -# end process; +# signal tlb_req_index : tlb_index_t; +# signal real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0); +# signal ra_valid : std_ulogic; +# signal priv_fault : std_ulogic; +# signal access_ok : std_ulogic; +# signal use_previous : std_ulogic; + tlb_req_index = Signal(TLB_SIZE) + real_addr = Signal(REAL_ADDR_BITS) + ra_valid = Signal() + priv_fault = Signal() + access_ok = Signal() + use_previous = Signal() -# icache_log: if LOG_LENGTH > 0 generate - def icache_log(self, m, log_out): - comb = m.d.comb - sync = m.d.sync +# -- Cache RAM interface +# type cache_ram_out_t is array(way_t) of cache_row_t; +# signal cache_out : cache_ram_out_t; + # Cache RAM interface + def CacheRamOut(): + return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS)) -# -- Output data to logger -# signal log_data : std_ulogic_vector(53 downto 0); -# begin -# data_log: process(clk) -# variable lway: way_t; -# variable wstate: std_ulogic; - # Output data to logger - for i in range(LOG_LENGTH) - # Output data to logger - log_data = Signal(54) - lway = Signal(NUM_WAYS) - wstate = Signal() + cache_out = CacheRamOut() -# begin -# if rising_edge(clk) then -# lway := req_hit_way; -# wstate := '0'; - comb += lway.eq(req_hit_way) - comb += wstate.eq(0) +# -- PLRU output interface +# type plru_out_t is array(index_t) of +# std_ulogic_vector(WAY_BITS-1 downto 0); +# signal plru_victim : plru_out_t; +# signal replace_way : way_t; + # PLRU output interface + def PLRUOut(): + return Array(Signal(WAY_BITS) for x in range(NUM_LINES)) -# if r.state /= IDLE then -# wstate := '1'; -# end if; - with m.If(r.state != State.IDLE): - comb += wstate.eq(1) + plru_victim = PLRUOut() + replace_way = Signal(NUM_WAYS) -# log_data <= i_out.valid & -# i_out.insn & -# wishbone_in.ack & -# r.wb.adr(5 downto 3) & -# r.wb.stb & r.wb.cyc & -# wishbone_in.stall & -# stall_out & -# r.fetch_failed & -# r.hit_nia(5 downto 2) & -# wstate & -# std_ulogic_vector(to_unsigned(lway, 3)) & -# req_is_hit & req_is_miss & -# access_ok & -# ra_valid; - sync += log_data.eq(Cat( - ra_valid, access_ok, req_is_miss, req_is_hit, - lway '''truncate to 3 bits?''', wstate, r.hit_nia[2:6], - r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc, - r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn, - i_out.valid - )) -# end if; -# end process; -# log_out <= log_data; - comb += log_out.eq(log_data) +# begin +# +# assert LINE_SIZE mod ROW_SIZE = 0; +# assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" +# severity FAILURE; +# assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" +# severity FAILURE; +# assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2" +# severity FAILURE; +# assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2" +# severity FAILURE; +# assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS) +# report "geometry bits don't add up" severity FAILURE; +# assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS) +# report "geometry bits don't add up" severity FAILURE; +# assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS) +# report "geometry bits don't add up" severity FAILURE; +# assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS) +# report "geometry bits don't add up" severity FAILURE; +# +# sim_debug: if SIM generate +# debug: process +# begin +# report "ROW_SIZE = " & natural'image(ROW_SIZE); +# report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE); +# report "BRAM_ROWS = " & natural'image(BRAM_ROWS); +# report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW); +# report "INSN_BITS = " & natural'image(INSN_BITS); +# report "ROW_BITS = " & natural'image(ROW_BITS); +# report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS); +# report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS); +# report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS); +# report "INDEX_BITS = " & natural'image(INDEX_BITS); +# report "TAG_BITS = " & natural'image(TAG_BITS); +# report "WAY_BITS = " & natural'image(WAY_BITS); +# wait; +# end process; # end generate; -# end; +