$(shell scripts/make_version.sh git.vhdl)
core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
- utils.vhdl plru.vhdl cache_ram.vhdl icache.vhdl predecode.vhdl \
- decode1.vhdl helpers.vhdl insn_helpers.vhdl \
+ utils.vhdl plru.vhdl plrufn.vhdl cache_ram.vhdl icache.vhdl \
+ predecode.vhdl decode1.vhdl helpers.vhdl insn_helpers.vhdl \
control.vhdl decode2.vhdl register_file.vhdl \
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \
-- efficient use of distributed RAM and less logic/muxes. Currently we
-- write TAG_BITS width which may not match full ram blocks and might
-- cause muxes to be inferred for "partial writes".
--- * Check if making the read size of PLRU a ROM helps utilization
--
library ieee;
use ieee.std_logic_1164.all;
signal wb_rd_data : std_ulogic_vector(ROW_SIZE_BITS - 1 downto 0);
-- PLRU output interface
- type plru_out_t is array(index_t) of std_ulogic_vector(WAY_BITS-1 downto 0);
- signal plru_victim : plru_out_t;
+ signal plru_victim : way_sig_t;
-- Memory write snoop signals
signal snoop_valid : std_ulogic;
-- Generate PLRUs
maybe_plrus: if NUM_WAYS > 1 generate
+ type plru_array is array(index_t) of std_ulogic_vector(NUM_WAYS - 2 downto 0); -- one (NUM_WAYS-1)-bit PLRU vector per index_t value
+ signal plru_ram : plru_array; -- stored PLRU bits, written by the clocked process below
+ signal plru_cur : std_ulogic_vector(NUM_WAYS - 2 downto 0); -- PLRU bits read from plru_ram at the index of r.hit_nia
+ signal plru_upd : std_ulogic_vector(NUM_WAYS - 2 downto 0); -- PLRU bits returned by plrufn on tree_out
+ signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0); -- way passed to plrufn as the accessed way (r.hit_way)
+ signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0); -- way returned by plrufn on its lru output
begin
- plrus: for i in 0 to NUM_LINES-1 generate
- -- PLRU interface
- signal plru_acc : std_ulogic_vector(WAY_BITS-1 downto 0);
- signal plru_acc_en : std_ulogic;
- signal plru_out : std_ulogic_vector(WAY_BITS-1 downto 0);
-
- begin
- plru : entity work.plru
- generic map (
- BITS => WAY_BITS
- )
- port map (
- clk => clk,
- rst => rst,
- acc => plru_acc,
- acc_en => plru_acc_en,
- lru => plru_out
- );
-
- process(all)
- begin
- -- PLRU interface
- if is_X(r.hit_nia) then
- plru_acc_en <= 'X';
- elsif get_index(r.hit_nia) = i then
- plru_acc_en <= r.hit_valid;
- else
- plru_acc_en <= '0';
- end if;
- plru_acc <= std_ulogic_vector(r.hit_way);
- plru_victim(i) <= plru_out;
- end process;
- end generate;
+ plru : entity work.plrufn -- single instance; the PLRU bits are passed in on tree_in and come back on tree_out
+ generic map (
+ BITS => WAY_BITS -- NOTE(review): plrufn's tree ports are 2**BITS-1 bits wide, so this presumably relies on NUM_WAYS = 2**WAY_BITS - confirm
+ )
+ port map (
+ acc => plru_acc,
+ tree_in => plru_cur,
+ tree_out => plru_upd,
+ lru => plru_out
+ );
+
+ process(all)
+ begin
+ -- Read PLRU bits from array at the index of r.hit_nia; drive all 'X' when r.hit_nia is unknown
+ if is_X(r.hit_nia) then
+ plru_cur <= (others => 'X');
+ else
+ plru_cur <= plru_ram(to_integer(get_index(r.hit_nia)));
+ end if;
+
+ -- PLRU interface: r.hit_way goes in as the accessed way, plru_out comes back as plru_victim
+ plru_acc <= std_ulogic_vector(r.hit_way);
+ plru_victim <= unsigned(plru_out);
+ end process;
+
+ -- synchronous writes to PLRU array: store plru_upd at the index of r.hit_nia when r.hit_valid is set
+ process(clk)
+ begin
+ if rising_edge(clk) then
+ if r.hit_valid = '1' then
+ assert not is_X(r.hit_nia) severity failure; -- an unknown r.hit_nia while r.hit_valid = '1' fails the assertion
+ plru_ram(to_integer(get_index(r.hit_nia))) <= plru_upd;
+ end if;
+ end if;
+ end process;
end generate;
-- TLB hit detection and real address generation
replace_way := to_unsigned(0, WAY_BITS);
if NUM_WAYS > 1 then
-- Get victim way from plru
- replace_way := unsigned(plru_victim(to_integer(r.store_index)));
+ replace_way := plru_victim;
end if;
r.store_way <= replace_way;
--- /dev/null
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+use ieee.math_real.all;
+
+-- Stateless pseudo-LRU tree function.
+--
+-- The PLRU state is a binary tree held in a flat vector: node 0 is the
+-- root, and node n has children 2n+1 (followed when the selecting bit is
+-- not '1') and 2n+2 (followed when it is '1').
+--
+--   acc      : way being accessed
+--   tree_in  : current tree bits
+--   tree_out : tree_in with each node on the path to acc set to the
+--              inverse of the corresponding acc bit
+--   lru      : way obtained by following the bits of tree_in from the root
+--
+-- Unknown (X) bits in tree_in or acc are treated as '0'.
+entity plrufn is
+    generic (
+        BITS : positive := 2
+        );
+    port (
+        acc      : in  std_ulogic_vector(BITS-1 downto 0);
+        tree_in  : in  std_ulogic_vector(2 ** BITS - 2 downto 0);
+        tree_out : out std_ulogic_vector(2 ** BITS - 2 downto 0);
+        lru      : out std_ulogic_vector(BITS-1 downto 0)
+        );
+end entity plrufn;
+
+architecture rtl of plrufn is
+    -- The tree has BITS levels; the deepest level holds 2**(BITS-1) nodes
+    -- and each level above it half as many, down to the single root.
+    -- That is 2**BITS - 1 nodes in total: 3 (2 + 1) for BITS = 2,
+    -- 15 (8 + 4 + 2 + 1) for BITS = 4.
+    constant count : positive := 2 ** BITS - 1;
+    subtype node_t is integer range 0 to count - 1;
+
+    -- Index of the child of 'parent' selected by 'sel'.
+    function child_of(parent : node_t; sel : std_ulogic) return node_t is
+    begin
+        if sel = '1' then
+            return 2 * parent + 2;
+        end if;
+        return 2 * parent + 1;
+    end function child_of;
+begin
+
+    -- Follow tree_in from the root, producing one bit of lru per level,
+    -- most significant bit first.
+    get_lru: process(tree_in)
+        variable idx : node_t;
+        variable sel : std_ulogic;
+    begin
+        idx := 0;
+        for lvl in BITS-1 downto 0 loop
+            if is_X(tree_in(idx)) then
+                sel := '0';
+            else
+                sel := tree_in(idx);
+            end if;
+            lru(lvl) <= sel;
+            -- No descent after the last level: there is no node below it
+            if lvl /= 0 then
+                idx := child_of(idx, sel);
+            end if;
+        end loop;
+    end process get_lru;
+
+    -- Copy tree_in to tree_out, then overwrite every node on the path
+    -- selected by acc with the inverse of the acc bit for that level.
+    update_lru: process(all)
+        variable idx : node_t;
+        variable sel : std_ulogic;
+    begin
+        tree_out <= tree_in;
+        idx := 0;
+        for lvl in BITS-1 downto 0 loop
+            if is_X(acc(lvl)) then
+                sel := '0';
+            else
+                sel := acc(lvl);
+            end if;
+            tree_out(idx) <= not sel;
+            -- No descent after the last level: there is no node below it
+            if lvl /= 0 then
+                idx := child_of(idx, sel);
+            end if;
+        end loop;
+    end process update_lru;
+end architecture rtl;