^ addr[TLB_LG_PGSZ + 2 * TLB_BITS: TLB_LG_PGSZE + 3 * TLB_BITS]
return hash
- def elaborate(self, platform):
-# architecture rtl of icache is
-# constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
-# -- ROW_PER_LINE is the number of row (wishbone transactions) in a line
-# constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
-# -- BRAM_ROWS is the number of rows in BRAM needed to represent the full
-# -- icache
-# constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
-# -- INSN_PER_ROW is the number of 32bit instructions per BRAM row
-# constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
-# -- Bit fields counts in the address
-#
-# -- INSN_BITS is the number of bits to select an instruction in a row
-# constant INSN_BITS : natural := log2(INSN_PER_ROW);
-# -- ROW_BITS is the number of bits to select a row
-# constant ROW_BITS : natural := log2(BRAM_ROWS);
-# -- ROW_LINEBITS is the number of bits to select a row within a line
-# constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
-# -- LINE_OFF_BITS is the number of bits for the offset in a cache line
-# constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
-# -- ROW_OFF_BITS is the number of bits for the offset in a row
-# constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
-# -- INDEX_BITS is the number of bits to select a cache line
-# constant INDEX_BITS : natural := log2(NUM_LINES);
-# -- SET_SIZE_BITS is the log base 2 of the set size
-# constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
-# -- TAG_BITS is the number of bits of the tag part of the address
-# constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
-# -- WAY_BITS is the number of bits to select a way
-# constant WAY_BITS : natural := log2(NUM_WAYS);
+# -- Generate a cache RAM for each way
+# rams: for i in 0 to NUM_WAYS-1 generate
+# signal do_read : std_ulogic;
+# signal do_write : std_ulogic;
+# signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
+# signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
+# signal dout : cache_row_t;
+# signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
+# begin
+# way: entity work.cache_ram
+# generic map (
+# ROW_BITS => ROW_BITS,
+# WIDTH => ROW_SIZE_BITS
+# )
+# port map (
+# clk => clk,
+# rd_en => do_read,
+# rd_addr => rd_addr,
+# rd_data => dout,
+# wr_sel => wr_sel,
+# wr_addr => wr_addr,
+# wr_data => wishbone_in.dat
+# );
+# process(all)
+# begin
+# do_read <= not (stall_in or use_previous);
+# do_write <= '0';
+# if wishbone_in.ack = '1' and replace_way = i then
+# do_write <= '1';
+# end if;
+# cache_out(i) <= dout;
+# rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
+# wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
+# for i in 0 to ROW_SIZE-1 loop
+# wr_sel(i) <= do_write;
+# end loop;
+# end process;
+# end generate;
+ def rams(self, m):
+ comb = m.d.comb
- ROW_SIZE_BITS = ROW_SIZE * 8
- # ROW_PER_LINE is the number of row
- # (wishbone) transactions in a line
- ROW_PER_LINE = LINE_SIZE / ROW_SIZE
- # BRAM_ROWS is the number of rows in
- # BRAM needed to represent the full icache
- BRAM_ROWS = NUM_LINES * ROW_PER_LINE
- # INSN_PER_ROW is the number of 32bit
- # instructions per BRAM row
- INSN_PER_ROW = ROW_SIZE_BITS / 32
+ do_read = Signal()
+ do_write = Signal()
+ rd_addr = Signal(ROW_BITS)
+ wr_addr = Signal(ROW_BITS)
+ _d_out = Signal(ROW_SIZE_BITS)
+ wr_sel = Signal(ROW_SIZE)
- # Bit fields counts in the address
- #
- # INSN_BITS is the number of bits to
- # select an instruction in a row
- INSN_BITS = log2_int(INSN_PER_ROW)
- # ROW_BITS is the number of bits to
- # select a row
- ROW_BITS = log2_int(BRAM_ROWS)
- # ROW_LINEBITS is the number of bits to
- # select a row within a line
- ROW_LINE_BITS = log2_int(ROW_PER_LINE)
- # LINE_OFF_BITS is the number of bits for
- # the offset in a cache line
- LINE_OFF_BITS = log2_int(LINE_SIZE)
- # ROW_OFF_BITS is the number of bits for
- # the offset in a row
- ROW_OFF_BITS = log2_int(ROW_SIZE)
- # INDEX_BITS is the number of bits to
- # select a cache line
- INDEX_BITS = log2_int(NUM_LINES)
- # SET_SIZE_BITS is the log base 2 of
- # the set size
- SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
- # TAG_BITS is the number of bits of
- # the tag part of the address
- TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
- # WAY_BITS is the number of bits to
- # select a way
- WAY_BITS = log2_int(NUM_WAYS)
- TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
+ for i in range(NUM_WAYS)
+ way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
+ comb += way.rd_en.eq(do_read)
+ comb += way.rd_addr.eq(rd_addr)
+ comb += way.rd_data.eq(_d_out)
+ comb += way.wr_sel.eq(wr_sel)
+ comb += way.wr_add.eq(wr_addr)
+ comb += way.wr_data.eq('''TODO ?? wishbone_in.data ??''')
-# -- Example of layout for 32 lines of 64 bytes:
-# --
-# -- .. tag |index| line |
-# -- .. | row | |
-# -- .. | | | |00| zero (2)
-# -- .. | | |-| | INSN_BITS (1)
-# -- .. | |---| | ROW_LINEBITS (3)
-# -- .. | |--- - --| LINE_OFF_BITS (6)
-# -- .. | |- --| ROW_OFF_BITS (3)
-# -- .. |----- ---| | ROW_BITS (8)
-# -- .. |-----| | INDEX_BITS (5)
-# -- .. --------| | TAG_BITS (53)
- # Example of layout for 32 lines of 64 bytes:
- #
- # .. tag |index| line |
- # .. | row | |
- # .. | | | |00| zero (2)
- # .. | | |-| | INSN_BITS (1)
- # .. | |---| | ROW_LINEBITS (3)
- # .. | |--- - --| LINE_OFF_BITS (6)
- # .. | |- --| ROW_OFF_BITS (3)
- # .. |----- ---| | ROW_BITS (8)
- # .. |-----| | INDEX_BITS (5)
- # .. --------| | TAG_BITS (53)
+ comb += do_read.eq(~(stall_in | use_previous))
+ comb += do_write.eq(0)
-# subtype row_t is integer range 0 to BRAM_ROWS-1;
-# subtype index_t is integer range 0 to NUM_LINES-1;
-# subtype way_t is integer range 0 to NUM_WAYS-1;
-# subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
+ with m.If(wb_in.ack & replace_way == i):
+ do_write.eq(1)
+
+ comb += cache_out[i].eq(_d_out)
+ comb += rd_addr.eq(Signal(req_row))
+ comb += wr_addr.eq(Signal(r.store_row))
+ for j in range(ROW_SIZE):
+ comb += wr_sel[j].eq(do_write)
+
+# -- Generate PLRUs
+# maybe_plrus: if NUM_WAYS > 1 generate
+# begin
+# plrus: for i in 0 to NUM_LINES-1 generate
+# -- PLRU interface
+# signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
+# signal plru_acc_en : std_ulogic;
+# signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
#
-# -- The cache data BRAM organized as described above for each way
-# subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
+# begin
+# plru : entity work.plru
+# generic map (
+# BITS => WAY_BITS
+# )
+# port map (
+# clk => clk,
+# rst => rst,
+# acc => plru_acc,
+# acc_en => plru_acc_en,
+# lru => plru_out
+# );
#
-# -- The cache tags LUTRAM has a row per set. Vivado is a pain and will
-# -- not handle a clean (commented) definition of the cache tags as a 3d
-# -- memory. For now, work around it by putting all the tags
-# subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
-# -- type cache_tags_set_t is array(way_t) of cache_tag_t;
-# -- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
-# constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
-# subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
-# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
- def CacheTagArray():
- return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))
+# process(all)
+# begin
+# -- PLRU interface
+# if get_index(r.hit_nia) = i then
+# plru_acc_en <= r.hit_valid;
+# else
+# plru_acc_en <= '0';
+# end if;
+# plru_acc <=
+# std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
+# plru_victim(i) <= plru_out;
+# end process;
+# end generate;
+# end generate;
+ def maybe_plrus(self, m):
+ comb += m.d.comb
-# -- The cache valid bits
-# subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
-# type cache_valids_t is array(index_t) of cache_way_valids_t;
-# type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
- def CacheValidBitsArray():
- return Array(Signal() for x in ROW_PER_LINE)
+ with m.If(NUM_WAYS > 1):
+ plru_acc = Signal(WAY_BITS)
+ plru_acc_en = Signal()
+ plru_out = Signal(WAY_BITS)
- def RowPerLineValidArray():
- return Array(Signal() for x in range ROW_PER_LINE)
+ for i in range(NUM_LINES):
+ plru = PLRU(WAY_BITS)
+ comb += plru.acc.eq(plru_acc)
+ comb += plru.acc_en.eq(plru_acc_en)
+ comb += plru.lru.eq(plru_out)
-# -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
-# signal cache_tags : cache_tags_array_t;
-# signal cache_valids : cache_valids_t;
- # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
- cache_tags = CacheTagArray()
- cache_valid_bits = CacheValidBitsArray()
+ # PLRU interface
+ with m.If(get_index(r.hit_nia) == i):
+ comb += plru.acc_en.eq(r.hit_valid)
-# attribute ram_style : string;
-# attribute ram_style of cache_tags : signal is "distributed";
- # TODO to be passed to nigmen as ram attributes
- # attribute ram_style : string;
- # attribute ram_style of cache_tags : signal is "distributed";
+ with m.Else():
+ comb += plru.acc_en.eq(0)
-# -- L1 ITLB.
-# constant TLB_BITS : natural := log2(TLB_SIZE);
-# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
-# constant TLB_PTE_BITS : natural := 64;
- TLB_BITS = log2_int(TLB_SIZE)
- TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
- TLB_PTE_BITS = 64
-
-# subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
-# type tlb_valids_t is array(tlb_index_t) of std_ulogic;
-# subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
-# type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
-# subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
-# type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
- def TLBValidBitsArray():
- return Array(Signal() for x in range(TLB_SIZE))
-
- def TLBTagArray():
- return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))
-
- def TLBPTEArray():
- return Array(Signal(LTB_PTE_BITS) for x in range(TLB_SIZE))
-
-# signal itlb_valids : tlb_valids_t;
-# signal itlb_tags : tlb_tags_t;
-# signal itlb_ptes : tlb_ptes_t;
-# attribute ram_style of itlb_tags : signal is "distributed";
-# attribute ram_style of itlb_ptes : signal is "distributed";
- itlb_valid_bits = TLBValidBitsArray()
- itlb_tags = TLBTagArray()
- itlb_ptes = TLBPTEArray()
- # TODO to be passed to nmigen as ram attributes
- # attribute ram_style of itlb_tags : signal is "distributed";
- # attribute ram_style of itlb_ptes : signal is "distributed";
-
-# -- Privilege bit from PTE EAA field
-# signal eaa_priv : std_ulogic;
- # Privilege bit from PTE EAA field
- eaa_priv = Signal()
-
-
-# signal r : reg_internal_t;
- r = RegInternal()
-
-# -- Async signals on incoming request
-# signal req_index : index_t;
-# signal req_row : row_t;
-# signal req_hit_way : way_t;
-# signal req_tag : cache_tag_t;
-# signal req_is_hit : std_ulogic;
-# signal req_is_miss : std_ulogic;
-# signal req_laddr : std_ulogic_vector(63 downto 0);
- # Async signal on incoming request
- req_index = Signal(NUM_LINES)
- req_row = Signal(BRAM_ROWS)
- req_hit_way = Signal(NUM_WAYS)
- req_tag = Signal(TAG_BITS)
- req_is_hit = Signal()
- req_is_miss = Signal()
- req_laddr = Signal(64)
-
-# signal tlb_req_index : tlb_index_t;
-# signal real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
-# signal ra_valid : std_ulogic;
-# signal priv_fault : std_ulogic;
-# signal access_ok : std_ulogic;
-# signal use_previous : std_ulogic;
- tlb_req_index = Signal(TLB_SIZE)
- real_addr = Signal(REAL_ADDR_BITS)
- ra_valid = Signal()
- priv_fault = Signal()
- access_ok = Signal()
- use_previous = Signal()
-
-# -- Cache RAM interface
-# type cache_ram_out_t is array(way_t) of cache_row_t;
-# signal cache_out : cache_ram_out_t;
- # Cache RAM interface
- def CacheRamOut():
- return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))
-
- cache_out = CacheRamOut()
-
-# -- PLRU output interface
-# type plru_out_t is array(index_t) of
-# std_ulogic_vector(WAY_BITS-1 downto 0);
-# signal plru_victim : plru_out_t;
-# signal replace_way : way_t;
- # PLRU output interface
- def PLRUOut():
- return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
-
- plru_victim = PLRUOut()
- replace_way = Signal(NUM_WAYS)
-
-# begin
-#
-# assert LINE_SIZE mod ROW_SIZE = 0;
-# assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
-# severity FAILURE;
-# assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
-# severity FAILURE;
-# assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
-# severity FAILURE;
-# assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
-# severity FAILURE;
-# assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
-# report "geometry bits don't add up" severity FAILURE;
-# assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
-# report "geometry bits don't add up" severity FAILURE;
-# assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
-# report "geometry bits don't add up" severity FAILURE;
-# assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
-# report "geometry bits don't add up" severity FAILURE;
-#
-# sim_debug: if SIM generate
-# debug: process
-# begin
-# report "ROW_SIZE = " & natural'image(ROW_SIZE);
-# report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
-# report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
-# report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
-# report "INSN_BITS = " & natural'image(INSN_BITS);
-# report "ROW_BITS = " & natural'image(ROW_BITS);
-# report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
-# report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
-# report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
-# report "INDEX_BITS = " & natural'image(INDEX_BITS);
-# report "TAG_BITS = " & natural'image(TAG_BITS);
-# report "WAY_BITS = " & natural'image(WAY_BITS);
-# wait;
-# end process;
-# end generate;
-
-# -- Generate a cache RAM for each way
-# rams: for i in 0 to NUM_WAYS-1 generate
-# signal do_read : std_ulogic;
-# signal do_write : std_ulogic;
-# signal rd_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
-# signal wr_addr : std_ulogic_vector(ROW_BITS-1 downto 0);
-# signal dout : cache_row_t;
-# signal wr_sel : std_ulogic_vector(ROW_SIZE-1 downto 0);
-# begin
-# way: entity work.cache_ram
-# generic map (
-# ROW_BITS => ROW_BITS,
-# WIDTH => ROW_SIZE_BITS
-# )
-# port map (
-# clk => clk,
-# rd_en => do_read,
-# rd_addr => rd_addr,
-# rd_data => dout,
-# wr_sel => wr_sel,
-# wr_addr => wr_addr,
-# wr_data => wishbone_in.dat
-# );
-# process(all)
-# begin
-# do_read <= not (stall_in or use_previous);
-# do_write <= '0';
-# if wishbone_in.ack = '1' and replace_way = i then
-# do_write <= '1';
-# end if;
-# cache_out(i) <= dout;
-# rd_addr <= std_ulogic_vector(to_unsigned(req_row, ROW_BITS));
-# wr_addr <= std_ulogic_vector(to_unsigned(r.store_row, ROW_BITS));
-# for i in 0 to ROW_SIZE-1 loop
-# wr_sel(i) <= do_write;
-# end loop;
-# end process;
-# end generate;
- def rams(self, m):
- comb = m.d.comb
-
- do_read = Signal()
- do_write = Signal()
- rd_addr = Signal(ROW_BITS)
- wr_addr = Signal(ROW_BITS)
- _d_out = Signal(ROW_SIZE_BITS)
- wr_sel = Signal(ROW_SIZE)
-
- for i in range(NUM_WAYS)
- way = CacheRam(ROW_BITS, ROW_SIZE_BITS)
- comb += way.rd_en.eq(do_read)
- comb += way.rd_addr.eq(rd_addr)
- comb += way.rd_data.eq(_d_out)
- comb += way.wr_sel.eq(wr_sel)
- comb += way.wr_add.eq(wr_addr)
- comb += way.wr_data.eq('''TODO ?? wishbone_in.data ??''')
-
- comb += do_read.eq(~(stall_in | use_previous))
- comb += do_write.eq(0)
-
- with m.If(wb_in.ack & replace_way == i):
- do_write.eq(1)
-
- comb += cache_out[i].eq(_d_out)
- comb += rd_addr.eq(Signal(req_row))
- comb += wr_addr.eq(Signal(r.store_row))
- for j in range(ROW_SIZE):
- comb += wr_sel[j].eq(do_write)
-
-# -- Generate PLRUs
-# maybe_plrus: if NUM_WAYS > 1 generate
-# begin
-# plrus: for i in 0 to NUM_LINES-1 generate
-# -- PLRU interface
-# signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
-# signal plru_acc_en : std_ulogic;
-# signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
-#
-# begin
-# plru : entity work.plru
-# generic map (
-# BITS => WAY_BITS
-# )
-# port map (
-# clk => clk,
-# rst => rst,
-# acc => plru_acc,
-# acc_en => plru_acc_en,
-# lru => plru_out
-# );
-#
-# process(all)
-# begin
-# -- PLRU interface
-# if get_index(r.hit_nia) = i then
-# plru_acc_en <= r.hit_valid;
-# else
-# plru_acc_en <= '0';
-# end if;
-# plru_acc <=
-# std_ulogic_vector(to_unsigned(r.hit_way, WAY_BITS));
-# plru_victim(i) <= plru_out;
-# end process;
-# end generate;
-# end generate;
- def maybe_plrus(self, m):
- comb += m.d.comb
-
- with m.If(NUM_WAYS > 1):
- plru_acc = Signal(WAY_BITS)
- plru_acc_en = Signal()
- plru_out = Signal(WAY_BITS)
-
- for i in range(NUM_LINES):
- plru = PLRU(WAY_BITS)
- comb += plru.acc.eq(plru_acc)
- comb += plru.acc_en.eq(plru_acc_en)
- comb += plru.lru.eq(plru_out)
-
- # PLRU interface
- with m.If(get_index(r.hit_nia) == i):
- comb += plru.acc_en.eq(r.hit_valid)
-
- with m.Else():
- comb += plru.acc_en.eq(0)
-
- comb += plru.acc.eq(r.hit_way)
- comb += plru_victim[i].eq(plru.lru)
+ comb += plru.acc.eq(r.hit_way)
+ comb += plru_victim[i].eq(plru.lru)
# -- TLB hit detection and real address generation
# itlb_lookup : process(all)
tagset
)
-# r.state <= WAIT_ACK;
- sync += r.state.eq(State.WAIT_ACK)
-# end if;
+# r.state <= WAIT_ACK;
+ sync += r.state.eq(State.WAIT_ACK)
+# end if;
+
+# -- Requests are all sent if stb is 0
+# stbs_done := r.wb.stb = '0';
+ # Requests are all sent if stb is 0
+ comb += stbs_done.eq(r.wb.stb == 0)
+
+# -- If we are still sending requests, was one accepted ?
+# if wishbone_in.stall = '0' and not stbs_done then
+ # If we are still sending requests, was one accepted?
+ with m.If(~wb_in.stall & ~stbs_done):
+# -- That was the last word ? We are done sending.
+# -- Clear stb and set stbs_done so we can handle
+# -- an eventual last ack on the same cycle.
+# if is_last_row_addr(r.wb.adr, r.end_row_ix) then
+# r.wb.stb <= '0';
+# stbs_done := true;
+# end if;
+ # That was the last word ? We are done sending.
+ # Clear stb and set stbs_done so we can handle
+ # an eventual last ack on the same cycle.
+ with m.If(is_last_row_addr(
+ r.wb.adr, r.end_row_ix)):
+ sync += r.wb.stb.eq(0)
+ stbs_done.eq(1)
+
+# -- Calculate the next row address
+# r.wb.adr <= next_row_addr(r.wb.adr);
+ # Calculate the next row address
+ sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
+# end if;
+
+# -- Incoming acks processing
+# if wishbone_in.ack = '1' then
+ # Incoming acks processing
+ with m.If(wb_in.ack):
+# r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1';
+ sync += r.rows_valid[
+ r.store_row & ROW_PER_LINE
+ ].eq(1)
+
+# -- Check for completion
+# if stbs_done and
+# is_last_row(r.store_row, r.end_row_ix) then
+ # Check for completion
+ with m.If(stbs_done & is_last_row(
+ r.store_row, r.end_row_ix)):
+# -- Complete wishbone cycle
+# r.wb.cyc <= '0';
+ # Complete wishbone cycle
+ sync += r.wb.cyc.eq(0)
+
+# -- Cache line is now valid
+# cache_valids(r.store_index)(replace_way) <=
+# r.store_valid and not inval_in;
+ # Cache line is now valid
+ sync += cache_valid_bits[
+ r.store_index
+ ][relace_way].eq(
+ r.store_valid & ~inval_in
+ )
+
+# -- We are done
+# r.state <= IDLE;
+ # We are done
+ sync += r.state.eq(State.IDLE)
+# end if;
+
+# -- Increment store row counter
+# r.store_row <= next_row(r.store_row);
+ # Increment store row counter
+ sync += store_row.eq(next_row(r.store_row))
+# end if;
+# end case;
+# end if;
+#
+# -- TLB miss and protection fault processing
+# if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
+# r.fetch_failed <= '0';
+# elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
+# r.fetch_failed <= '1';
+# end if;
+ # TLB miss and protection fault processing
+ with m.If('''TODO nmigen rst''' | flush_in | m_in.tlbld):
+ sync += r.fetch_failed.eq(0)
+
+ with m.Elif(i_in.req & ~access_ok & ~stall_in):
+ sync += r.fetch_failed.eq(1)
+# end if;
+# end process;
+
+# icache_log: if LOG_LENGTH > 0 generate
+ def icache_log(self, m, log_out):
+ comb = m.d.comb
+ sync = m.d.sync
+
+# -- Output data to logger
+# signal log_data : std_ulogic_vector(53 downto 0);
+# begin
+# data_log: process(clk)
+# variable lway: way_t;
+# variable wstate: std_ulogic;
+ # Output data to logger
+ for i in range(LOG_LENGTH)
+ # Output data to logger
+ log_data = Signal(54)
+ lway = Signal(NUM_WAYS)
+ wstate = Signal()
+
+# begin
+# if rising_edge(clk) then
+# lway := req_hit_way;
+# wstate := '0';
+ comb += lway.eq(req_hit_way)
+ comb += wstate.eq(0)
+
+# if r.state /= IDLE then
+# wstate := '1';
+# end if;
+ with m.If(r.state != State.IDLE):
+ comb += wstate.eq(1)
+
+# log_data <= i_out.valid &
+# i_out.insn &
+# wishbone_in.ack &
+# r.wb.adr(5 downto 3) &
+# r.wb.stb & r.wb.cyc &
+# wishbone_in.stall &
+# stall_out &
+# r.fetch_failed &
+# r.hit_nia(5 downto 2) &
+# wstate &
+# std_ulogic_vector(to_unsigned(lway, 3)) &
+# req_is_hit & req_is_miss &
+# access_ok &
+# ra_valid;
+ sync += log_data.eq(Cat(
+ ra_valid, access_ok, req_is_miss, req_is_hit,
+ lway '''truncate to 3 bits?''', wstate, r.hit_nia[2:6],
+ r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
+ r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
+ i_out.valid
+ ))
+# end if;
+# end process;
+# log_out <= log_data;
+ comb += log_out.eq(log_data)
+# end generate;
+# end;
+
+ def elaborate(self, platform):
+# architecture rtl of icache is
+# constant ROW_SIZE_BITS : natural := ROW_SIZE*8;
+# -- ROW_PER_LINE is the number of row (wishbone transactions) in a line
+# constant ROW_PER_LINE : natural := LINE_SIZE / ROW_SIZE;
+# -- BRAM_ROWS is the number of rows in BRAM needed to represent the full
+# -- icache
+# constant BRAM_ROWS : natural := NUM_LINES * ROW_PER_LINE;
+# -- INSN_PER_ROW is the number of 32bit instructions per BRAM row
+# constant INSN_PER_ROW : natural := ROW_SIZE_BITS / 32;
+# -- Bit fields counts in the address
+#
+# -- INSN_BITS is the number of bits to select an instruction in a row
+# constant INSN_BITS : natural := log2(INSN_PER_ROW);
+# -- ROW_BITS is the number of bits to select a row
+# constant ROW_BITS : natural := log2(BRAM_ROWS);
+# -- ROW_LINEBITS is the number of bits to select a row within a line
+# constant ROW_LINEBITS : natural := log2(ROW_PER_LINE);
+# -- LINE_OFF_BITS is the number of bits for the offset in a cache line
+# constant LINE_OFF_BITS : natural := log2(LINE_SIZE);
+# -- ROW_OFF_BITS is the number of bits for the offset in a row
+# constant ROW_OFF_BITS : natural := log2(ROW_SIZE);
+# -- INDEX_BITS is the number of bits to select a cache line
+# constant INDEX_BITS : natural := log2(NUM_LINES);
+# -- SET_SIZE_BITS is the log base 2 of the set size
+# constant SET_SIZE_BITS : natural := LINE_OFF_BITS + INDEX_BITS;
+# -- TAG_BITS is the number of bits of the tag part of the address
+# constant TAG_BITS : natural := REAL_ADDR_BITS - SET_SIZE_BITS;
+# -- WAY_BITS is the number of bits to select a way
+# constant WAY_BITS : natural := log2(NUM_WAYS);
+
+ ROW_SIZE_BITS = ROW_SIZE * 8
+ # ROW_PER_LINE is the number of row
+ # (wishbone) transactions in a line
+ ROW_PER_LINE = LINE_SIZE / ROW_SIZE
+ # BRAM_ROWS is the number of rows in
+ # BRAM needed to represent the full icache
+ BRAM_ROWS = NUM_LINES * ROW_PER_LINE
+ # INSN_PER_ROW is the number of 32bit
+ # instructions per BRAM row
+ INSN_PER_ROW = ROW_SIZE_BITS / 32
+
+ # Bit fields counts in the address
+ #
+ # INSN_BITS is the number of bits to
+ # select an instruction in a row
+ INSN_BITS = log2_int(INSN_PER_ROW)
+ # ROW_BITS is the number of bits to
+ # select a row
+ ROW_BITS = log2_int(BRAM_ROWS)
+ # ROW_LINEBITS is the number of bits to
+ # select a row within a line
+ ROW_LINE_BITS = log2_int(ROW_PER_LINE)
+ # LINE_OFF_BITS is the number of bits for
+ # the offset in a cache line
+ LINE_OFF_BITS = log2_int(LINE_SIZE)
+ # ROW_OFF_BITS is the number of bits for
+ # the offset in a row
+ ROW_OFF_BITS = log2_int(ROW_SIZE)
+ # INDEX_BITS is the number of bits to
+ # select a cache line
+ INDEX_BITS = log2_int(NUM_LINES)
+ # SET_SIZE_BITS is the log base 2 of
+ # the set size
+ SET_SIZE_BITS = LINE_OFF_BITS + INDEX_BITS
+ # TAG_BITS is the number of bits of
+ # the tag part of the address
+ TAG_BITS = REAL_ADDR_BITS - SET_SIZE_BITS
+ # WAY_BITS is the number of bits to
+ # select a way
+ WAY_BITS = log2_int(NUM_WAYS)
+ TAG_RAM_WIDTH = TAG_BITS * NUM_WAYS
+
+# -- Example of layout for 32 lines of 64 bytes:
+# --
+# -- .. tag |index| line |
+# -- .. | row | |
+# -- .. | | | |00| zero (2)
+# -- .. | | |-| | INSN_BITS (1)
+# -- .. | |---| | ROW_LINEBITS (3)
+# -- .. | |--- - --| LINE_OFF_BITS (6)
+# -- .. | |- --| ROW_OFF_BITS (3)
+# -- .. |----- ---| | ROW_BITS (8)
+# -- .. |-----| | INDEX_BITS (5)
+# -- .. --------| | TAG_BITS (53)
+ # Example of layout for 32 lines of 64 bytes:
+ #
+ # .. tag |index| line |
+ # .. | row | |
+ # .. | | | |00| zero (2)
+ # .. | | |-| | INSN_BITS (1)
+ # .. | |---| | ROW_LINEBITS (3)
+ # .. | |--- - --| LINE_OFF_BITS (6)
+ # .. | |- --| ROW_OFF_BITS (3)
+ # .. |----- ---| | ROW_BITS (8)
+ # .. |-----| | INDEX_BITS (5)
+ # .. --------| | TAG_BITS (53)
+
+# subtype row_t is integer range 0 to BRAM_ROWS-1;
+# subtype index_t is integer range 0 to NUM_LINES-1;
+# subtype way_t is integer range 0 to NUM_WAYS-1;
+# subtype row_in_line_t is unsigned(ROW_LINEBITS-1 downto 0);
+#
+# -- The cache data BRAM organized as described above for each way
+# subtype cache_row_t is std_ulogic_vector(ROW_SIZE_BITS-1 downto 0);
+#
+# -- The cache tags LUTRAM has a row per set. Vivado is a pain and will
+# -- not handle a clean (commented) definition of the cache tags as a 3d
+# -- memory. For now, work around it by putting all the tags
+# subtype cache_tag_t is std_logic_vector(TAG_BITS-1 downto 0);
+# -- type cache_tags_set_t is array(way_t) of cache_tag_t;
+# -- type cache_tags_array_t is array(index_t) of cache_tags_set_t;
+# constant TAG_RAM_WIDTH : natural := TAG_BITS * NUM_WAYS;
+# subtype cache_tags_set_t is std_logic_vector(TAG_RAM_WIDTH-1 downto 0);
+# type cache_tags_array_t is array(index_t) of cache_tags_set_t;
+ def CacheTagArray():
+ return Array(Signal(TAG_RAM_WIDTH) for x in range(NUM_LINES))
+
+# -- The cache valid bits
+# subtype cache_way_valids_t is std_ulogic_vector(NUM_WAYS-1 downto 0);
+# type cache_valids_t is array(index_t) of cache_way_valids_t;
+# type row_per_line_valid_t is array(0 to ROW_PER_LINE - 1) of std_ulogic;
+ def CacheValidBitsArray():
+ return Array(Signal() for x in ROW_PER_LINE)
+
+ def RowPerLineValidArray():
+ return Array(Signal() for x in range ROW_PER_LINE)
+
+# -- Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
+# signal cache_tags : cache_tags_array_t;
+# signal cache_valids : cache_valids_t;
+ # Storage. Hopefully "cache_rows" is a BRAM, the rest is LUTs
+ cache_tags = CacheTagArray()
+ cache_valid_bits = CacheValidBitsArray()
+
+# attribute ram_style : string;
+# attribute ram_style of cache_tags : signal is "distributed";
+ # TODO to be passed to nigmen as ram attributes
+ # attribute ram_style : string;
+ # attribute ram_style of cache_tags : signal is "distributed";
+
+# -- L1 ITLB.
+# constant TLB_BITS : natural := log2(TLB_SIZE);
+# constant TLB_EA_TAG_BITS : natural := 64 - (TLB_LG_PGSZ + TLB_BITS);
+# constant TLB_PTE_BITS : natural := 64;
+ TLB_BITS = log2_int(TLB_SIZE)
+ TLB_EA_TAG_BITS = 64 - (TLB_LG_PGSZ + TLB_BITS)
+ TLB_PTE_BITS = 64
-# -- Requests are all sent if stb is 0
-# stbs_done := r.wb.stb = '0';
- # Requests are all sent if stb is 0
- comb += stbs_done.eq(r.wb.stb == 0)
+# subtype tlb_index_t is integer range 0 to TLB_SIZE - 1;
+# type tlb_valids_t is array(tlb_index_t) of std_ulogic;
+# subtype tlb_tag_t is std_ulogic_vector(TLB_EA_TAG_BITS - 1 downto 0);
+# type tlb_tags_t is array(tlb_index_t) of tlb_tag_t;
+# subtype tlb_pte_t is std_ulogic_vector(TLB_PTE_BITS - 1 downto 0);
+# type tlb_ptes_t is array(tlb_index_t) of tlb_pte_t;
+ def TLBValidBitsArray():
+ return Array(Signal() for x in range(TLB_SIZE))
-# -- If we are still sending requests, was one accepted ?
-# if wishbone_in.stall = '0' and not stbs_done then
- # If we are still sending requests, was one accepted?
- with m.If(~wb_in.stall & ~stbs_done):
-# -- That was the last word ? We are done sending.
-# -- Clear stb and set stbs_done so we can handle
-# -- an eventual last ack on the same cycle.
-# if is_last_row_addr(r.wb.adr, r.end_row_ix) then
-# r.wb.stb <= '0';
-# stbs_done := true;
-# end if;
- # That was the last word ? We are done sending.
- # Clear stb and set stbs_done so we can handle
- # an eventual last ack on the same cycle.
- with m.If(is_last_row_addr(
- r.wb.adr, r.end_row_ix)):
- sync += r.wb.stb.eq(0)
- stbs_done.eq(1)
+ def TLBTagArray():
+ return Array(Signal(TLB_EA_TAG_BITS) for x in range(TLB_SIZE))
-# -- Calculate the next row address
-# r.wb.adr <= next_row_addr(r.wb.adr);
- # Calculate the next row address
- sync += r.wb.adr.eq(next_row_addr(r.wb.adr))
-# end if;
+ def TLBPTEArray():
+ return Array(Signal(LTB_PTE_BITS) for x in range(TLB_SIZE))
-# -- Incoming acks processing
-# if wishbone_in.ack = '1' then
- # Incoming acks processing
- with m.If(wb_in.ack):
-# r.rows_valid(r.store_row mod ROW_PER_LINE) <= '1';
- sync += r.rows_valid[
- r.store_row & ROW_PER_LINE
- ].eq(1)
+# signal itlb_valids : tlb_valids_t;
+# signal itlb_tags : tlb_tags_t;
+# signal itlb_ptes : tlb_ptes_t;
+# attribute ram_style of itlb_tags : signal is "distributed";
+# attribute ram_style of itlb_ptes : signal is "distributed";
+ itlb_valid_bits = TLBValidBitsArray()
+ itlb_tags = TLBTagArray()
+ itlb_ptes = TLBPTEArray()
+ # TODO to be passed to nmigen as ram attributes
+ # attribute ram_style of itlb_tags : signal is "distributed";
+ # attribute ram_style of itlb_ptes : signal is "distributed";
-# -- Check for completion
-# if stbs_done and
-# is_last_row(r.store_row, r.end_row_ix) then
- # Check for completion
- with m.If(stbs_done & is_last_row(
- r.store_row, r.end_row_ix)):
-# -- Complete wishbone cycle
-# r.wb.cyc <= '0';
- # Complete wishbone cycle
- sync += r.wb.cyc.eq(0)
+# -- Privilege bit from PTE EAA field
+# signal eaa_priv : std_ulogic;
+ # Privilege bit from PTE EAA field
+ eaa_priv = Signal()
-# -- Cache line is now valid
-# cache_valids(r.store_index)(replace_way) <=
-# r.store_valid and not inval_in;
- # Cache line is now valid
- sync += cache_valid_bits[
- r.store_index
- ][relace_way].eq(
- r.store_valid & ~inval_in
- )
-# -- We are done
-# r.state <= IDLE;
- # We are done
- sync += r.state.eq(State.IDLE)
-# end if;
+# signal r : reg_internal_t;
+ r = RegInternal()
-# -- Increment store row counter
-# r.store_row <= next_row(r.store_row);
- # Increment store row counter
- sync += store_row.eq(next_row(r.store_row))
-# end if;
-# end case;
-# end if;
-#
-# -- TLB miss and protection fault processing
-# if rst = '1' or flush_in = '1' or m_in.tlbld = '1' then
-# r.fetch_failed <= '0';
-# elsif i_in.req = '1' and access_ok = '0' and stall_in = '0' then
-# r.fetch_failed <= '1';
-# end if;
- # TLB miss and protection fault processing
- with m.If('''TODO nmigen rst''' | flush_in | m_in.tlbld):
- sync += r.fetch_failed.eq(0)
+# -- Async signals on incoming request
+# signal req_index : index_t;
+# signal req_row : row_t;
+# signal req_hit_way : way_t;
+# signal req_tag : cache_tag_t;
+# signal req_is_hit : std_ulogic;
+# signal req_is_miss : std_ulogic;
+# signal req_laddr : std_ulogic_vector(63 downto 0);
+ # Async signal on incoming request
+ req_index = Signal(NUM_LINES)
+ req_row = Signal(BRAM_ROWS)
+ req_hit_way = Signal(NUM_WAYS)
+ req_tag = Signal(TAG_BITS)
+ req_is_hit = Signal()
+ req_is_miss = Signal()
+ req_laddr = Signal(64)
- with m.Elif(i_in.req & ~access_ok & ~stall_in):
- sync += r.fetch_failed.eq(1)
-# end if;
-# end process;
+# signal tlb_req_index : tlb_index_t;
+# signal real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
+# signal ra_valid : std_ulogic;
+# signal priv_fault : std_ulogic;
+# signal access_ok : std_ulogic;
+# signal use_previous : std_ulogic;
+ tlb_req_index = Signal(TLB_SIZE)
+ real_addr = Signal(REAL_ADDR_BITS)
+ ra_valid = Signal()
+ priv_fault = Signal()
+ access_ok = Signal()
+ use_previous = Signal()
-# icache_log: if LOG_LENGTH > 0 generate
- def icache_log(self, m, log_out):
- comb = m.d.comb
- sync = m.d.sync
+# -- Cache RAM interface
+# type cache_ram_out_t is array(way_t) of cache_row_t;
+# signal cache_out : cache_ram_out_t;
+ # Cache RAM interface
+ def CacheRamOut():
+ return Array(Signal(ROW_SIZE_BITS) for x in range(NUM_WAYS))
-# -- Output data to logger
-# signal log_data : std_ulogic_vector(53 downto 0);
-# begin
-# data_log: process(clk)
-# variable lway: way_t;
-# variable wstate: std_ulogic;
- # Output data to logger
- for i in range(LOG_LENGTH)
- # Output data to logger
- log_data = Signal(54)
- lway = Signal(NUM_WAYS)
- wstate = Signal()
+ cache_out = CacheRamOut()
-# begin
-# if rising_edge(clk) then
-# lway := req_hit_way;
-# wstate := '0';
- comb += lway.eq(req_hit_way)
- comb += wstate.eq(0)
+# -- PLRU output interface
+# type plru_out_t is array(index_t) of
+# std_ulogic_vector(WAY_BITS-1 downto 0);
+# signal plru_victim : plru_out_t;
+# signal replace_way : way_t;
+ # PLRU output interface
+ def PLRUOut():
+ return Array(Signal(WAY_BITS) for x in range(NUM_LINES))
-# if r.state /= IDLE then
-# wstate := '1';
-# end if;
- with m.If(r.state != State.IDLE):
- comb += wstate.eq(1)
+ plru_victim = PLRUOut()
+ replace_way = Signal(NUM_WAYS)
-# log_data <= i_out.valid &
-# i_out.insn &
-# wishbone_in.ack &
-# r.wb.adr(5 downto 3) &
-# r.wb.stb & r.wb.cyc &
-# wishbone_in.stall &
-# stall_out &
-# r.fetch_failed &
-# r.hit_nia(5 downto 2) &
-# wstate &
-# std_ulogic_vector(to_unsigned(lway, 3)) &
-# req_is_hit & req_is_miss &
-# access_ok &
-# ra_valid;
- sync += log_data.eq(Cat(
- ra_valid, access_ok, req_is_miss, req_is_hit,
- lway '''truncate to 3 bits?''', wstate, r.hit_nia[2:6],
- r.fetch_failed, stall_out, wb_in.stall, r.wb.cyc,
- r.wb.stb, r.wb.adr[3:6], wb_in.ack, i_out.insn,
- i_out.valid
- ))
-# end if;
-# end process;
-# log_out <= log_data;
- comb += log_out.eq(log_data)
+# begin
+#
+# assert LINE_SIZE mod ROW_SIZE = 0;
+# assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2"
+# severity FAILURE;
+# assert ispow2(NUM_LINES) report "NUM_LINES not power of 2"
+# severity FAILURE;
+# assert ispow2(ROW_PER_LINE) report "ROW_PER_LINE not power of 2"
+# severity FAILURE;
+# assert ispow2(INSN_PER_ROW) report "INSN_PER_ROW not power of 2"
+# severity FAILURE;
+# assert (ROW_BITS = INDEX_BITS + ROW_LINEBITS)
+# report "geometry bits don't add up" severity FAILURE;
+# assert (LINE_OFF_BITS = ROW_OFF_BITS + ROW_LINEBITS)
+# report "geometry bits don't add up" severity FAILURE;
+# assert (REAL_ADDR_BITS = TAG_BITS + INDEX_BITS + LINE_OFF_BITS)
+# report "geometry bits don't add up" severity FAILURE;
+# assert (REAL_ADDR_BITS = TAG_BITS + ROW_BITS + ROW_OFF_BITS)
+# report "geometry bits don't add up" severity FAILURE;
+#
+# sim_debug: if SIM generate
+# debug: process
+# begin
+# report "ROW_SIZE = " & natural'image(ROW_SIZE);
+# report "ROW_PER_LINE = " & natural'image(ROW_PER_LINE);
+# report "BRAM_ROWS = " & natural'image(BRAM_ROWS);
+# report "INSN_PER_ROW = " & natural'image(INSN_PER_ROW);
+# report "INSN_BITS = " & natural'image(INSN_BITS);
+# report "ROW_BITS = " & natural'image(ROW_BITS);
+# report "ROW_LINEBITS = " & natural'image(ROW_LINEBITS);
+# report "LINE_OFF_BITS = " & natural'image(LINE_OFF_BITS);
+# report "ROW_OFF_BITS = " & natural'image(ROW_OFF_BITS);
+# report "INDEX_BITS = " & natural'image(INDEX_BITS);
+# report "TAG_BITS = " & natural'image(TAG_BITS);
+# report "WAY_BITS = " & natural'image(WAY_BITS);
+# wait;
+# end process;
# end generate;
-# end;
+