From: Paul Mackerras Date: Tue, 19 Oct 2021 01:22:10 +0000 (+1100) Subject: core: Make popcnt* take two cycles X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=2491aa7fc5ea013d0f384d707f3df3902d9caa77;p=microwatt.git core: Make popcnt* take two cycles This moves the calculation of the result for popcnt* into the countbits unit, renamed from countzero, so that we can take two cycles to get the result. The motivation for this is that the popcnt* calculation was showing up as a critical path. Signed-off-by: Paul Mackerras --- diff --git a/Makefile b/Makefile index eb46c5b..cf723e3 100644 --- a/Makefile +++ b/Makefile @@ -60,7 +60,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \ decode1.vhdl helpers.vhdl insn_helpers.vhdl \ control.vhdl decode2.vhdl register_file.vhdl \ cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \ - logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \ + logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \ loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \ core.vhdl fpu.vhdl pmu.vhdl diff --git a/countbits.vhdl b/countbits.vhdl new file mode 100644 index 0000000..134540f --- /dev/null +++ b/countbits.vhdl @@ -0,0 +1,130 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.helpers.all; + +entity bit_counter is + port ( + clk : in std_logic; + rs : in std_ulogic_vector(63 downto 0); + count_right : in std_ulogic; + do_popcnt : in std_ulogic; + is_32bit : in std_ulogic; + datalen : in std_ulogic_vector(3 downto 0); + result : out std_ulogic_vector(63 downto 0) + ); +end entity bit_counter; + +architecture behaviour of bit_counter is + -- signals for count-leading/trailing-zeroes + signal inp : std_ulogic_vector(63 downto 0); + signal sum : std_ulogic_vector(64 downto 0); + signal msb_r : std_ulogic; + signal onehot : std_ulogic_vector(63 downto 0); + signal onehot_r : std_ulogic_vector(63 downto 0); + signal bitnum : std_ulogic_vector(5 downto 0); + signal cntz : std_ulogic_vector(63 downto 0); + + -- signals for popcnt + signal dlen_r : std_ulogic_vector(3 downto 0); + signal pcnt_r : std_ulogic; + subtype twobit is unsigned(1 downto 0); + type twobit32 is array(0 to 31) of twobit; + signal pc2 : twobit32; + subtype threebit is unsigned(2 downto 0); + type threebit16 is array(0 to 15) of threebit; + signal pc4 : threebit16; + subtype fourbit is unsigned(3 downto 0); + type fourbit8 is array(0 to 7) of fourbit; + signal pc8 : fourbit8; + signal pc8_r : fourbit8; + subtype sixbit is unsigned(5 downto 0); + type sixbit2 is array(0 to 1) of sixbit; + signal pc32 : sixbit2; + signal popcnt : std_ulogic_vector(63 downto 0); + +begin + countzero_r: process(clk) + begin + if rising_edge(clk) then + msb_r <= sum(64); + onehot_r <= onehot; + end if; + end process; + + countzero: process(all) + begin + if is_32bit = '0' then + if count_right = '0' then + inp <= bit_reverse(rs); + else + inp <= rs; + end if; + else + inp(63 downto 32) <= x"FFFFFFFF"; + if count_right = '0' then + inp(31 downto 0) <= bit_reverse(rs(31 downto 0)); + else + inp(31 downto 0) <= rs(31 downto 0); + end if; + end if; + + sum <= std_ulogic_vector(unsigned('0' & not inp) + 1); + onehot <= sum(63 downto 0) and inp; + + -- The following occurs after a clock edge + bitnum <= bit_number(onehot_r); + + cntz <= 57x"0" & msb_r & bitnum; + end process; + + popcnt_r: process(clk) + begin + if rising_edge(clk) then + for i in 0 to 7 loop + pc8_r(i) <= pc8(i); + end loop; + dlen_r <= datalen; + pcnt_r <= do_popcnt; + end if; + end process; + + popcnt_a: process(all) + begin + for i in 0 to 31 loop + pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1)); + end loop; + for i in 0 to 15 loop + pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1)); + end loop; + for i in 0 to 7 loop + pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1)); + end loop; + + -- after a clock edge + for i in 0 to 1 loop + pc32(i) <= ("00" & pc8_r(i * 4)) + ("00" & pc8_r(i * 4 + 1)) + + ("00" & pc8_r(i * 4 + 2)) + ("00" & pc8_r(i * 4 + 3)); + end loop; + + popcnt <= (others => '0'); + if dlen_r(3 downto 2) = "00" then + -- popcntb + for i in 0 to 7 loop + popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8_r(i)); + end loop; + elsif dlen_r(3) = '0' then + -- popcntw + for i in 0 to 1 loop + popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i)); + end loop; + else + popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1))); + end if; + end process; + + result <= cntz when pcnt_r = '0' else popcnt; + +end behaviour; diff --git a/countbits_tb.vhdl b/countbits_tb.vhdl new file mode 100644 index 0000000..c00a6b6 --- /dev/null +++ b/countbits_tb.vhdl @@ -0,0 +1,118 @@ +library vunit_lib; +context vunit_lib.vunit_context; + +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; +use work.common.all; + +library osvvm; +use osvvm.RandomPkg.all; + +entity countbits_tb is + generic (runner_cfg : string := runner_cfg_default); +end countbits_tb; + +architecture behave of countbits_tb is + constant clk_period: time := 10 ns; + signal rs: std_ulogic_vector(63 downto 0); + signal is_32bit, count_right: std_ulogic := '0'; + signal res: std_ulogic_vector(63 downto 0); + signal clk: std_ulogic; + +begin + bitcounter_0: entity work.bit_counter + port map ( + clk => clk, + rs => rs, + result => res, + count_right => count_right, + is_32bit => is_32bit, + do_popcnt => '0', + datalen => "0000" + ); + + clk_process: process + begin + clk <= '0'; + wait for clk_period/2; + clk <= '1'; + wait for clk_period/2; + end process; + + stim_process: process + variable r: std_ulogic_vector(63 downto 0); + variable rnd : RandomPType; + begin + rnd.InitSeed(stim_process'path_name); + + test_runner_setup(runner, runner_cfg); + + while test_suite loop + if run("Test with input = 0") then + rs <= (others => '0'); + is_32bit <= '0'; + count_right <= '0'; + wait for clk_period; + check_equal(res, 16#40#, result("for cntlzd")); + count_right <= '1'; + wait for clk_period; + check_equal(res, 16#40#, result("for cnttzd")); + is_32bit <= '1'; + count_right <= '0'; + wait for clk_period; + check_equal(res, 16#20#, result("for cntlzw")); + count_right <= '1'; + wait for clk_period; + check_equal(res, 16#20#, result("for cnttzw")); + + elsif run("Test cntlzd/w") then + count_right <= '0'; + for j in 0 to 100 loop + r := rnd.RandSlv(64); + r(63) := '1'; + for i in 0 to 63 loop + rs <= r; + is_32bit <= '0'; + wait for clk_period; + check_equal(res, i, result("for cntlzd " & to_hstring(rs))); + rs <= r(31 downto 0) & r(63 downto 32); + is_32bit <= '1'; + wait for clk_period; + if i < 32 then + check_equal(res, i, result("for cntlzw " & to_hstring(rs))); + else + check_equal(res, 32, result("for cntlzw " & to_hstring(rs))); + end if; + r := '0' & r(63 downto 1); + end loop; + end loop; + + elsif run("Test cnttzd/w") then + count_right <= '1'; + for j in 0 to 100 loop + r := rnd.RandSlv(64); + r(0) := '1'; + for i in 0 to 63 loop + rs <= r; + is_32bit <= '0'; + wait for clk_period; + check_equal(res, i, result("for cnttzd " & to_hstring(rs))); + is_32bit <= '1'; + wait for clk_period; + if i < 32 then + check_equal(res, i, result("for cnttzw " & to_hstring(rs))); + else + check_equal(res, 32, result("for cnttzw " & to_hstring(rs))); + end if; + r := r(62 downto 0) & '0'; + end loop; + end loop; + end if; + end loop; + + test_runner_cleanup(runner); + end process; +end behave; diff --git a/countzero.vhdl b/countzero.vhdl deleted file mode 100644 index 55a58b1..0000000 --- a/countzero.vhdl +++ /dev/null @@ -1,60 +0,0 @@ -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -library work; -use work.helpers.all; - -entity zero_counter is - port ( - clk : in std_logic; - rs : in std_ulogic_vector(63 downto 0); - count_right : in std_ulogic; - is_32bit : in std_ulogic; - result : out std_ulogic_vector(63 downto 0) - ); -end entity zero_counter; - -architecture behaviour of zero_counter is - signal inp : std_ulogic_vector(63 downto 0); - signal sum : std_ulogic_vector(64 downto 0); - signal msb_r : std_ulogic; - signal onehot : std_ulogic_vector(63 downto 0); - signal onehot_r : std_ulogic_vector(63 downto 0); - signal bitnum : std_ulogic_vector(5 downto 0); - -begin - countzero_r: process(clk) - begin - if rising_edge(clk) then - msb_r <= sum(64); - onehot_r <= onehot; - end if; - end process; - - countzero: process(all) - begin - if is_32bit = '0' then - if count_right = '0' then - inp <= bit_reverse(rs); - else - inp <= rs; - end if; - else - inp(63 downto 32) <= x"FFFFFFFF"; - if count_right = '0' then - inp(31 downto 0) <= bit_reverse(rs(31 downto 0)); - else - inp(31 downto 0) <= rs(31 downto 0); - end if; - end if; - - sum <= std_ulogic_vector(unsigned('0' & not inp) + 1); - onehot <= sum(63 downto 0) and inp; - - -- The following occurs after a clock edge - bitnum <= bit_number(onehot_r); - - result <= x"00000000000000" & "0" & msb_r & bitnum; - end process; -end behaviour; diff --git a/countzero_tb.vhdl b/countzero_tb.vhdl deleted file mode 100644 index f8319b9..0000000 --- a/countzero_tb.vhdl +++ /dev/null @@ -1,116 +0,0 @@ -library vunit_lib; -context vunit_lib.vunit_context; - -library ieee; -use ieee.std_logic_1164.all; -use ieee.numeric_std.all; - -library work; -use work.common.all; - -library osvvm; -use osvvm.RandomPkg.all; - -entity countzero_tb is - generic (runner_cfg : string := runner_cfg_default); -end countzero_tb; - -architecture behave of countzero_tb is - constant clk_period: time := 10 ns; - signal rs: std_ulogic_vector(63 downto 0); - signal is_32bit, count_right: std_ulogic := '0'; - signal res: std_ulogic_vector(63 downto 0); - signal clk: std_ulogic; - -begin - zerocounter_0: entity work.zero_counter - port map ( - clk => clk, - rs => rs, - result => res, - count_right => count_right, - is_32bit => is_32bit - ); - - clk_process: process - begin - clk <= '0'; - wait for clk_period/2; - clk <= '1'; - wait for clk_period/2; - end process; - - stim_process: process - variable r: std_ulogic_vector(63 downto 0); - variable rnd : RandomPType; - begin - rnd.InitSeed(stim_process'path_name); - - test_runner_setup(runner, runner_cfg); - - while test_suite loop - if run("Test with input = 0") then - rs <= (others => '0'); - is_32bit <= '0'; - count_right <= '0'; - wait for clk_period; - check_equal(res, 16#40#, result("for cntlzd")); - count_right <= '1'; - wait for clk_period; - check_equal(res, 16#40#, result("for cnttzd")); - is_32bit <= '1'; - count_right <= '0'; - wait for clk_period; - check_equal(res, 16#20#, result("for cntlzw")); - count_right <= '1'; - wait for clk_period; - check_equal(res, 16#20#, result("for cnttzw")); - - elsif run("Test cntlzd/w") then - count_right <= '0'; - for j in 0 to 100 loop - r := rnd.RandSlv(64); - r(63) := '1'; - for i in 0 to 63 loop - rs <= r; - is_32bit <= '0'; - wait for clk_period; - check_equal(res, i, result("for cntlzd " & to_hstring(rs))); - rs <= r(31 downto 0) & r(63 downto 32); - is_32bit <= '1'; - wait for clk_period; - if i < 32 then - check_equal(res, i, result("for cntlzw " & to_hstring(rs))); - else - check_equal(res, 32, result("for cntlzw " & to_hstring(rs))); - end if; - r := '0' & r(63 downto 1); - end loop; - end loop; - - elsif run("Test cnttzd/w") then - count_right <= '1'; - for j in 0 to 100 loop - r := rnd.RandSlv(64); - r(0) := '1'; - for i in 0 to 63 loop - rs <= r; - is_32bit <= '0'; - wait for clk_period; - check_equal(res, i, result("for cnttzd " & to_hstring(rs))); - is_32bit <= '1'; - wait for clk_period; - if i < 32 then - check_equal(res, i, result("for cnttzw " & to_hstring(rs))); - else - check_equal(res, 32, result("for cnttzw " & to_hstring(rs))); - end if; - r := r(62 downto 0) & '0'; - end loop; - end loop; - end if; - end loop; - - test_runner_cleanup(runner); - end process; -end behave; diff --git a/decode2.vhdl b/decode2.vhdl index f9fa541..5aa1a6f 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -215,7 +215,6 @@ architecture behaviour of decode2 is OP_AND => "001", -- logical_result OP_OR => "001", OP_XOR => "001", - OP_POPCNT => "001", OP_PRTY => "001", OP_CMPB => "001", OP_EXTS => "001", @@ -234,7 +233,8 @@ architecture behaviour of decode2 is OP_DIV => "011", OP_DIVE => "011", OP_MOD => "011", - OP_CNTZ => "100", -- countzero_result + OP_CNTZ => "100", -- countbits_result + OP_POPCNT => "100", OP_MFSPR => "101", -- spr_result OP_B => "110", -- next_nia OP_BC => "110", diff --git a/execute1.vhdl b/execute1.vhdl index 7b90181..54f8dc1 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -106,7 +106,8 @@ architecture behaviour of execute1 is signal rotator_result: std_ulogic_vector(63 downto 0); signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); - signal countzero_result: std_ulogic_vector(63 downto 0); + signal do_popcnt: std_ulogic; + signal countbits_result: std_ulogic_vector(63 downto 0); signal alu_result: std_ulogic_vector(63 downto 0); signal adder_result: std_ulogic_vector(63 downto 0); signal misc_result: std_ulogic_vector(63 downto 0); @@ -284,13 +285,15 @@ begin datalen => e_in.data_len ); - countzero_0: entity work.zero_counter + countbits_0: entity work.bit_counter port map ( clk => clk, rs => c_in, count_right => e_in.insn(10), is_32bit => e_in.is_32bit, - result => countzero_result + do_popcnt => do_popcnt, + datalen => e_in.data_len, + result => countbits_result ); multiply_0: entity work.multiply @@ -391,7 +394,7 @@ begin logical_result when "001", rotator_result when "010", muldiv_result when "011", - countzero_result when "100", + countbits_result when "100", spr_result when "101", next_nia when "110", misc_result when others; @@ -813,6 +816,8 @@ begin rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0'; rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0'; + do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0'; + illegal := '0'; if r.intr_pending = '1' then v.e.srr1 := r.e.srr1; @@ -963,7 +968,7 @@ begin when OP_ADDG6S => when OP_CMPRB => when OP_CMPEQB => - when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS | + when OP_AND | OP_OR | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS | OP_BPERM | OP_BCD => when OP_B => @@ -1025,7 +1030,7 @@ begin end if; do_trace := '0'; - when OP_CNTZ => + when OP_CNTZ | OP_POPCNT => v.e.valid := '0'; v.cntz_in_progress := '1'; v.busy := '1'; @@ -1220,7 +1225,7 @@ begin -- valid_in = 0. Hence they don't happen in the same cycle as any of -- the cases above which depend on valid_in = 1. if r.cntz_in_progress = '1' then - -- cnt[lt]z always takes two cycles + -- cnt[lt]z and popcnt* always take two cycles v.e.valid := '1'; elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or diff --git a/logical.vhdl b/logical.vhdl index b4ba116..60309ac 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -20,20 +20,7 @@ end entity logical; architecture behaviour of logical is - subtype twobit is unsigned(1 downto 0); - type twobit32 is array(0 to 31) of twobit; - signal pc2 : twobit32; - subtype threebit is unsigned(2 downto 0); - type threebit16 is array(0 to 15) of threebit; - signal pc4 : threebit16; - subtype fourbit is unsigned(3 downto 0); - type fourbit8 is array(0 to 7) of fourbit; - signal pc8 : fourbit8; - subtype sixbit is unsigned(5 downto 0); - type sixbit2 is array(0 to 1) of sixbit; - signal pc32 : sixbit2; signal par0, par1 : std_ulogic; - signal popcnt : std_ulogic_vector(63 downto 0); signal parity : std_ulogic_vector(63 downto 0); signal permute : std_ulogic_vector(7 downto 0); @@ -109,35 +96,6 @@ begin variable negative : std_ulogic; variable j : integer; begin - -- population counts - for i in 0 to 31 loop - pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1)); - end loop; - for i in 0 to 15 loop - pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1)); - end loop; - for i in 0 to 7 loop - pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1)); - end loop; - for i in 0 to 1 loop - pc32(i) <= ("00" & pc8(i * 4)) + ("00" & pc8(i * 4 + 1)) + - ("00" & pc8(i * 4 + 2)) + ("00" & pc8(i * 4 + 3)); - end loop; - popcnt <= (others => '0'); - if datalen(3 downto 2) = "00" then - -- popcntb - for i in 0 to 7 loop - popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8(i)); - end loop; - elsif datalen(3) = '0' then - -- popcntw - for i in 0 to 1 loop - popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i)); - end loop; - else - popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1))); - end if; - -- parity calculations par0 <= rs(0) xor rs(8) xor rs(16) xor rs(24); par1 <= rs(32) xor rs(40) xor rs(48) xor rs(56); @@ -178,8 +136,6 @@ begin tmp := not tmp; end if; - when OP_POPCNT => - tmp := popcnt; when OP_PRTY => tmp := parity; when OP_CMPB => diff --git a/microwatt.core b/microwatt.core index f463d90..46e114e 100644 --- a/microwatt.core +++ b/microwatt.core @@ -18,7 +18,7 @@ filesets: - ppc_fx_insns.vhdl - sim_console.vhdl - logical.vhdl - - countzero.vhdl + - countbits.vhdl - control.vhdl - execute1.vhdl - fpu.vhdl