core: Make popcnt* take two cycles
author Paul Mackerras <paulus@ozlabs.org>
Tue, 19 Oct 2021 01:22:10 +0000 (12:22 +1100)
committer Anton Blanchard <anton@ozlabs.org>
Wed, 2 Feb 2022 05:10:54 +0000 (16:10 +1100)
This moves the calculation of the result for popcnt* into the
countbits unit, renamed from countzero, so that we can take two cycles
to get the result.  The motivation for this is that the popcnt*
calculation was showing up as a critical path.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Makefile
countbits.vhdl [new file with mode: 0644]
countbits_tb.vhdl [new file with mode: 0644]
countzero.vhdl [deleted file]
countzero_tb.vhdl [deleted file]
decode2.vhdl
execute1.vhdl
logical.vhdl
microwatt.core

diff --git a/Makefile b/Makefile
index eb46c5b3fc1ea19b6c419965ce2837dc1f945b3e..cf723e32e066efd901fbaf1f73b7ce434c2c5da6 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -60,7 +60,7 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
        decode1.vhdl helpers.vhdl insn_helpers.vhdl \
        control.vhdl decode2.vhdl register_file.vhdl \
        cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
-       logical.vhdl countzero.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
+       logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
        loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
        core.vhdl fpu.vhdl pmu.vhdl
 
diff --git a/countbits.vhdl b/countbits.vhdl
new file mode 100644 (file)
index 0000000..134540f
--- /dev/null
+++ b/countbits.vhdl
@@ -0,0 +1,130 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.helpers.all;
+
+entity bit_counter is  -- two-cycle unit: count leading/trailing zeroes, or population count, of rs
+    port (
+        clk         : in std_logic;  -- internal registers capture on the rising edge
+        rs          : in std_ulogic_vector(63 downto 0);  -- operand whose bits are counted
+        count_right : in std_ulogic;  -- '1': rs is used as-is (count upward from bit 0); '0': rs is bit-reversed first (count downward from the top bit)
+        do_popcnt   : in std_ulogic;  -- '1': result carries the population count instead of the zero count
+        is_32bit    : in std_ulogic;  -- '1': the zero count only looks at rs(31 downto 0)
+        datalen     : in std_ulogic_vector(3 downto 0);  -- selects the population-count layout: per byte, per 32-bit word, or one count over all 64 bits
+        result      : out std_ulogic_vector(63 downto 0)  -- depends only on registered state, so it reflects the inputs sampled at the previous rising edge
+        );
+end entity bit_counter;
+
+architecture behaviour of bit_counter is
+    -- signals for count-leading/trailing-zeroes
+    signal inp : std_ulogic_vector(63 downto 0);  -- rs, bit-reversed unless count_right = '1'; upper half forced to ones when is_32bit = '1'
+    signal sum : std_ulogic_vector(64 downto 0);  -- (not inp) + 1, one bit wider than inp so that bit 64 holds the carry out
+    signal msb_r : std_ulogic;  -- registered sum(64); ends up as bit 6 of cntz
+    signal onehot : std_ulogic_vector(63 downto 0);  -- inp with everything but its lowest set bit cleared
+    signal onehot_r : std_ulogic_vector(63 downto 0);  -- registered onehot (first-cycle result)
+    signal bitnum : std_ulogic_vector(5 downto 0);  -- bit_number(onehot_r); presumably the index of the single set bit - helper lives in work.helpers, confirm there
+    signal cntz : std_ulogic_vector(63 downto 0);  -- zero-extended msb_r & bitnum
+
+    -- signals for popcnt
+    signal dlen_r   : std_ulogic_vector(3 downto 0);  -- registered datalen, used in the second cycle
+    signal pcnt_r   : std_ulogic;  -- registered do_popcnt; steers the result mux
+    subtype twobit is unsigned(1 downto 0);
+    type twobit32 is array(0 to 31) of twobit;
+    signal pc2      : twobit32;  -- set-bit count of each 2-bit group of rs
+    subtype threebit is unsigned(2 downto 0);
+    type threebit16 is array(0 to 15) of threebit;
+    signal pc4      : threebit16;  -- set-bit count of each 4-bit group
+    subtype fourbit is unsigned(3 downto 0);
+    type fourbit8 is array(0 to 7) of fourbit;
+    signal pc8      : fourbit8;  -- set-bit count of each byte
+    signal pc8_r    : fourbit8;  -- registered pc8: the pipeline cut between the two cycles
+    subtype sixbit is unsigned(5 downto 0);
+    type sixbit2 is array(0 to 1) of sixbit;
+    signal pc32     : sixbit2;  -- set-bit count of each 32-bit half, summed from pc8_r
+    signal popcnt   : std_ulogic_vector(63 downto 0);  -- population-count result laid out according to dlen_r
+
+begin
+    countzero_r: process(clk)  -- pipeline registers for the zero-count path
+    begin
+        if rising_edge(clk) then
+            msb_r <= sum(64);
+            onehot_r <= onehot;
+        end if;
+    end process;
+
+    countzero: process(all)  -- combinational logic on both sides of the countzero_r registers
+    begin
+        if is_32bit = '0' then
+            if count_right = '0' then
+                inp <= bit_reverse(rs);
+            else
+                inp <= rs;
+            end if;
+        else
+            inp(63 downto 32) <= x"FFFFFFFF";  -- guarantees a set bit at position 32, so an all-zero low word is found at index 32
+            if count_right = '0' then
+                inp(31 downto 0) <= bit_reverse(rs(31 downto 0));
+            else
+                inp(31 downto 0) <= rs(31 downto 0);
+            end if;
+        end if;
+
+        sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);  -- two's-complement negation of inp; the carry into bit 64 is set only when inp is all zeroes
+        onehot <= sum(63 downto 0) and inp;  -- inp and (-inp) keeps only the lowest set bit
+
+        -- The following occurs after a clock edge
+        bitnum <= bit_number(onehot_r);
+
+        cntz <= 57x"0" & msb_r & bitnum;  -- 57 + 1 + 6 = 64 bits; msb_r supplies the value 64 for an all-zero input
+    end process;
+
+    popcnt_r: process(clk)  -- pipeline registers for the popcnt path and its control inputs
+    begin
+        if rising_edge(clk) then
+            for i in 0 to 7 loop
+                pc8_r(i) <= pc8(i);
+            end loop;
+            dlen_r <= datalen;
+            pcnt_r <= do_popcnt;
+        end if;
+    end process;
+
+    popcnt_a: process(all)  -- adder tree: pairs -> nibbles -> bytes, then (after the register) words and doubleword
+    begin
+        for i in 0 to 31 loop
+            pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1));  -- one-bit slices are zero-extended to two bits before adding
+        end loop;
+        for i in 0 to 15 loop
+            pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1));
+        end loop;
+        for i in 0 to 7 loop
+            pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1));
+        end loop;
+
+        -- after a clock edge
+        for i in 0 to 1 loop
+            pc32(i) <= ("00" & pc8_r(i * 4)) + ("00" & pc8_r(i * 4 + 1)) +
+                       ("00" & pc8_r(i * 4 + 2)) + ("00" & pc8_r(i * 4 + 3));
+        end loop;
+        
+        popcnt <= (others => '0');  -- default; the branches below overwrite only the count fields
+        if dlen_r(3 downto 2) = "00" then
+            -- popcntb
+            for i in 0 to 7 loop
+                popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8_r(i));
+            end loop;
+        elsif dlen_r(3) = '0' then
+            -- popcntw
+            for i in 0 to 1 loop
+                popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i));
+            end loop;
+        else  -- dlen_r(3) = '1': a single count over all 64 bits (presumably popcntd)
+            popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1)));
+        end if;
+    end process;
+
+    result <= cntz when pcnt_r = '0' else popcnt;  -- selected by the registered do_popcnt, so the mux lines up with the second-cycle data
+
+end behaviour;
diff --git a/countbits_tb.vhdl b/countbits_tb.vhdl
new file mode 100644 (file)
index 0000000..c00a6b6
--- /dev/null
@@ -0,0 +1,118 @@
+library vunit_lib;
+context vunit_lib.vunit_context;
+
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+library osvvm;
+use osvvm.RandomPkg.all;
+
+entity countbits_tb is  -- testbench for work.bit_counter; has no ports
+    generic (runner_cfg : string := runner_cfg_default);  -- handed to test_runner_setup in the stimulus process
+end countbits_tb;
+
+architecture behave of countbits_tb is
+    constant clk_period: time := 10 ns;
+    signal rs: std_ulogic_vector(63 downto 0);
+    signal is_32bit, count_right: std_ulogic := '0';
+    signal res: std_ulogic_vector(63 downto 0);
+    signal clk: std_ulogic;
+
+begin
+    bitcounter_0: entity work.bit_counter  -- unit under test
+        port map (
+            clk => clk,
+            rs => rs,
+            result => res,
+            count_right => count_right,
+            is_32bit => is_32bit,
+            do_popcnt => '0',  -- tied off: only the zero-count path is exercised by this testbench
+            datalen => "0000"  -- constant; with do_popcnt = '0' the popcnt output is never selected
+        );
+
+    clk_process: process  -- free-running clock: low for the first half period, high for the second
+    begin
+        clk <= '0';
+        wait for clk_period/2;
+        clk <= '1';
+        wait for clk_period/2;
+    end process;
+
+    stim_process: process
+        variable r: std_ulogic_vector(63 downto 0);
+        variable rnd : RandomPType;
+    begin
+        rnd.InitSeed(stim_process'path_name);
+
+        test_runner_setup(runner, runner_cfg);
+
+        while test_suite loop
+            if run("Test with input = 0") then
+                rs <= (others => '0');
+                is_32bit <= '0';
+                count_right <= '0';
+                wait for clk_period;  -- each full-period wait contains one rising edge, so res reflects the inputs just applied
+                check_equal(res, 16#40#, result("for cntlzd"));
+                count_right <= '1';
+                wait for clk_period;
+                check_equal(res, 16#40#, result("for cnttzd"));
+                is_32bit <= '1';
+                count_right <= '0';
+                wait for clk_period;
+                check_equal(res, 16#20#, result("for cntlzw"));
+                count_right <= '1';
+                wait for clk_period;
+                check_equal(res, 16#20#, result("for cnttzw"));
+
+            elsif run("Test cntlzd/w") then
+                count_right <= '0';
+                for j in 0 to 100 loop
+                    r := rnd.RandSlv(64);
+                    r(63) := '1';  -- pin the top bit so the expected leading-zero count starts at 0
+                    for i in 0 to 63 loop
+                        rs <= r;
+                        is_32bit <= '0';
+                        wait for clk_period;
+                        check_equal(res, i, result("for cntlzd " & to_hstring(rs)));
+                        rs <= r(31 downto 0) & r(63 downto 32);  -- swap halves so the 32-bit case sees the upper word of r in rs(31 downto 0)
+                        is_32bit <= '1';
+                        wait for clk_period;
+                        if i < 32 then
+                            check_equal(res, i, result("for cntlzw " & to_hstring(rs)));
+                        else
+                            check_equal(res, 32, result("for cntlzw " & to_hstring(rs)));
+                        end if;
+                        r := '0' & r(63 downto 1);  -- shift right: one more leading zero on the next iteration
+                    end loop;
+                end loop;
+
+            elsif run("Test cnttzd/w") then
+                count_right <= '1';
+                for j in 0 to 100 loop
+                    r := rnd.RandSlv(64);
+                    r(0) := '1';  -- pin the bottom bit so the expected trailing-zero count starts at 0
+                    for i in 0 to 63 loop
+                        rs <= r;
+                        is_32bit <= '0';
+                        wait for clk_period;
+                        check_equal(res, i, result("for cnttzd " & to_hstring(rs)));
+                        is_32bit <= '1';  -- rs is left unchanged: the low word of r is already in place
+                        wait for clk_period;
+                        if i < 32 then
+                            check_equal(res, i, result("for cnttzw " & to_hstring(rs)));
+                        else
+                            check_equal(res, 32, result("for cnttzw " & to_hstring(rs)));
+                        end if;
+                        r := r(62 downto 0) & '0';  -- shift left: one more trailing zero on the next iteration
+                    end loop;
+                end loop;
+            end if;
+        end loop;
+
+        test_runner_cleanup(runner);
+    end process;
+end behave;
diff --git a/countzero.vhdl b/countzero.vhdl
deleted file mode 100644 (file)
index 55a58b1..0000000
--- a/countzero.vhdl
+++ /dev/null
@@ -1,60 +0,0 @@
-library ieee;
-use ieee.std_logic_1164.all;
-use ieee.numeric_std.all;
-
-library work;
-use work.helpers.all;
-
-entity zero_counter is
-    port (
-        clk         : in std_logic;
-        rs          : in std_ulogic_vector(63 downto 0);
-        count_right : in std_ulogic;
-        is_32bit    : in std_ulogic;
-        result      : out std_ulogic_vector(63 downto 0)
-        );
-end entity zero_counter;
-
-architecture behaviour of zero_counter is
-    signal inp : std_ulogic_vector(63 downto 0);
-    signal sum : std_ulogic_vector(64 downto 0);
-    signal msb_r : std_ulogic;
-    signal onehot : std_ulogic_vector(63 downto 0);
-    signal onehot_r : std_ulogic_vector(63 downto 0);
-    signal bitnum : std_ulogic_vector(5 downto 0);
-
-begin
-    countzero_r: process(clk)
-    begin
-        if rising_edge(clk) then
-            msb_r <= sum(64);
-            onehot_r <= onehot;
-        end if;
-    end process;
-
-    countzero: process(all)
-    begin
-        if is_32bit = '0' then
-            if count_right = '0' then
-                inp <= bit_reverse(rs);
-            else
-                inp <= rs;
-            end if;
-        else
-            inp(63 downto 32) <= x"FFFFFFFF";
-            if count_right = '0' then
-                inp(31 downto 0) <= bit_reverse(rs(31 downto 0));
-            else
-                inp(31 downto 0) <= rs(31 downto 0);
-            end if;
-        end if;
-
-        sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);
-        onehot <= sum(63 downto 0) and inp;
-
-        -- The following occurs after a clock edge
-        bitnum <= bit_number(onehot_r);
-
-        result <= x"00000000000000" & "0" & msb_r & bitnum;
-    end process;
-end behaviour;
diff --git a/countzero_tb.vhdl b/countzero_tb.vhdl
deleted file mode 100644 (file)
index f8319b9..0000000
+++ /dev/null
@@ -1,116 +0,0 @@
-library vunit_lib;
-context vunit_lib.vunit_context;
-
-library ieee;
-use ieee.std_logic_1164.all;
-use ieee.numeric_std.all;
-
-library work;
-use work.common.all;
-
-library osvvm;
-use osvvm.RandomPkg.all;
-
-entity countzero_tb is
-    generic (runner_cfg : string := runner_cfg_default);
-end countzero_tb;
-
-architecture behave of countzero_tb is
-    constant clk_period: time := 10 ns;
-    signal rs: std_ulogic_vector(63 downto 0);
-    signal is_32bit, count_right: std_ulogic := '0';
-    signal res: std_ulogic_vector(63 downto 0);
-    signal clk: std_ulogic;
-
-begin
-    zerocounter_0: entity work.zero_counter
-        port map (
-            clk => clk,
-            rs => rs,
-            result => res,
-            count_right => count_right,
-            is_32bit => is_32bit
-        );
-
-    clk_process: process
-    begin
-        clk <= '0';
-        wait for clk_period/2;
-        clk <= '1';
-        wait for clk_period/2;
-    end process;
-
-    stim_process: process
-        variable r: std_ulogic_vector(63 downto 0);
-        variable rnd : RandomPType;
-    begin
-        rnd.InitSeed(stim_process'path_name);
-
-        test_runner_setup(runner, runner_cfg);
-
-        while test_suite loop
-            if run("Test with input = 0") then
-                rs <= (others => '0');
-                is_32bit <= '0';
-                count_right <= '0';
-                wait for clk_period;
-                check_equal(res, 16#40#, result("for cntlzd"));
-                count_right <= '1';
-                wait for clk_period;
-                check_equal(res, 16#40#, result("for cnttzd"));
-                is_32bit <= '1';
-                count_right <= '0';
-                wait for clk_period;
-                check_equal(res, 16#20#, result("for cntlzw"));
-                count_right <= '1';
-                wait for clk_period;
-                check_equal(res, 16#20#, result("for cnttzw"));
-
-            elsif run("Test cntlzd/w") then
-                count_right <= '0';
-                for j in 0 to 100 loop
-                    r := rnd.RandSlv(64);
-                    r(63) := '1';
-                    for i in 0 to 63 loop
-                        rs <= r;
-                        is_32bit <= '0';
-                        wait for clk_period;
-                        check_equal(res, i, result("for cntlzd " & to_hstring(rs)));
-                        rs <= r(31 downto 0) & r(63 downto 32);
-                        is_32bit <= '1';
-                        wait for clk_period;
-                        if i < 32 then
-                            check_equal(res, i, result("for cntlzw " & to_hstring(rs)));
-                        else
-                            check_equal(res, 32, result("for cntlzw " & to_hstring(rs)));
-                        end if;
-                        r := '0' & r(63 downto 1);
-                    end loop;
-                end loop;
-
-            elsif run("Test cnttzd/w") then
-                count_right <= '1';
-                for j in 0 to 100 loop
-                    r := rnd.RandSlv(64);
-                    r(0) := '1';
-                    for i in 0 to 63 loop
-                        rs <= r;
-                        is_32bit <= '0';
-                        wait for clk_period;
-                        check_equal(res, i, result("for cnttzd " & to_hstring(rs)));
-                        is_32bit <= '1';
-                        wait for clk_period;
-                        if i < 32 then
-                            check_equal(res, i, result("for cnttzw " & to_hstring(rs)));
-                        else
-                            check_equal(res, 32, result("for cnttzw " & to_hstring(rs)));
-                        end if;
-                        r := r(62 downto 0) & '0';
-                    end loop;
-                end loop;
-            end if;
-        end loop;
-
-        test_runner_cleanup(runner);
-    end process;
-end behave;
diff --git a/decode2.vhdl b/decode2.vhdl
index f9fa5412dce1233eda7d41fc876b83e01fb4ed86..5aa1a6f383dfb945208b10ec00f24b69af42e67e 100644 (file)
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -215,7 +215,6 @@ architecture behaviour of decode2 is
         OP_AND      => "001",           -- logical_result
         OP_OR       => "001",
         OP_XOR      => "001",
-        OP_POPCNT   => "001",
         OP_PRTY     => "001",
         OP_CMPB     => "001",
         OP_EXTS     => "001",
@@ -234,7 +233,8 @@ architecture behaviour of decode2 is
         OP_DIV      => "011",
         OP_DIVE     => "011",
         OP_MOD      => "011",
-        OP_CNTZ     => "100",           -- countzero_result
+        OP_CNTZ     => "100",           -- countbits_result
+        OP_POPCNT   => "100",
         OP_MFSPR    => "101",           -- spr_result
         OP_B        => "110",           -- next_nia
         OP_BC       => "110",
diff --git a/execute1.vhdl b/execute1.vhdl
index 7b901817af5a0ca6fefa42d4c4008ab4ba80ea63..54f8dc1d010b556ea7898d48c06cb8be0c636561 100644 (file)
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -106,7 +106,8 @@ architecture behaviour of execute1 is
     signal rotator_result: std_ulogic_vector(63 downto 0);
     signal rotator_carry: std_ulogic;
     signal logical_result: std_ulogic_vector(63 downto 0);
-    signal countzero_result: std_ulogic_vector(63 downto 0);
+    signal do_popcnt: std_ulogic;
+    signal countbits_result: std_ulogic_vector(63 downto 0);
     signal alu_result: std_ulogic_vector(63 downto 0);
     signal adder_result: std_ulogic_vector(63 downto 0);
     signal misc_result: std_ulogic_vector(63 downto 0);
@@ -284,13 +285,15 @@ begin
             datalen => e_in.data_len
            );
 
-    countzero_0: entity work.zero_counter
+    countbits_0: entity work.bit_counter
        port map (
             clk => clk,
            rs => c_in,
            count_right => e_in.insn(10),
            is_32bit => e_in.is_32bit,
-           result => countzero_result
+            do_popcnt => do_popcnt,
+            datalen => e_in.data_len,
+           result => countbits_result
            );
 
     multiply_0: entity work.multiply
@@ -391,7 +394,7 @@ begin
         logical_result     when "001",
         rotator_result     when "010",
         muldiv_result      when "011",
-        countzero_result   when "100",
+        countbits_result   when "100",
         spr_result         when "101",
         next_nia           when "110",
         misc_result        when others;
@@ -813,6 +816,8 @@ begin
        rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0';
         rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0';
 
+        do_popcnt <= '1' when e_in.insn_type = OP_POPCNT else '0';
+
         illegal := '0';
         if r.intr_pending = '1' then
             v.e.srr1 := r.e.srr1;
@@ -963,7 +968,7 @@ begin
             when OP_ADDG6S =>
             when OP_CMPRB =>
             when OP_CMPEQB =>
-            when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS |
+            when OP_AND | OP_OR | OP_XOR | OP_PRTY | OP_CMPB | OP_EXTS |
                     OP_BPERM | OP_BCD =>
 
            when OP_B =>
@@ -1025,7 +1030,7 @@ begin
                 end if;
                 do_trace := '0';
 
-            when OP_CNTZ =>
+            when OP_CNTZ | OP_POPCNT =>
                 v.e.valid := '0';
                 v.cntz_in_progress := '1';
                 v.busy := '1';
@@ -1220,7 +1225,7 @@ begin
         -- valid_in = 0.  Hence they don't happen in the same cycle as any of
         -- the cases above which depend on valid_in = 1.
         if r.cntz_in_progress = '1' then
-            -- cnt[lt]z always takes two cycles
+            -- cnt[lt]z and popcnt* always take two cycles
             v.e.valid := '1';
        elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
            if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
diff --git a/logical.vhdl b/logical.vhdl
index b4ba11604df176f034683d31ec0b80c0848a6278..60309ac35f40dae257f4c7c91a5011688b850380 100644 (file)
--- a/logical.vhdl
+++ b/logical.vhdl
@@ -20,20 +20,7 @@ end entity logical;
 
 architecture behaviour of logical is
 
-    subtype twobit is unsigned(1 downto 0);
-    type twobit32 is array(0 to 31) of twobit;
-    signal pc2      : twobit32;
-    subtype threebit is unsigned(2 downto 0);
-    type threebit16 is array(0 to 15) of threebit;
-    signal pc4      : threebit16;
-    subtype fourbit is unsigned(3 downto 0);
-    type fourbit8 is array(0 to 7) of fourbit;
-    signal pc8      : fourbit8;
-    subtype sixbit is unsigned(5 downto 0);
-    type sixbit2 is array(0 to 1) of sixbit;
-    signal pc32     : sixbit2;
     signal par0, par1 : std_ulogic;
-    signal popcnt   : std_ulogic_vector(63 downto 0);
     signal parity   : std_ulogic_vector(63 downto 0);
     signal permute  : std_ulogic_vector(7 downto 0);
 
@@ -109,35 +96,6 @@ begin
         variable negative : std_ulogic;
         variable j : integer;
     begin
-        -- population counts
-        for i in 0 to 31 loop
-            pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1));
-        end loop;
-        for i in 0 to 15 loop
-            pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1));
-        end loop;
-        for i in 0 to 7 loop
-            pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1));
-        end loop;
-        for i in 0 to 1 loop
-            pc32(i) <= ("00" & pc8(i * 4)) + ("00" & pc8(i * 4 + 1)) +
-                       ("00" & pc8(i * 4 + 2)) + ("00" & pc8(i * 4 + 3));
-        end loop;
-        popcnt <= (others => '0');
-        if datalen(3 downto 2) = "00" then
-            -- popcntb
-            for i in 0 to 7 loop
-                popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8(i));
-            end loop;
-        elsif datalen(3) = '0' then
-            -- popcntw
-            for i in 0 to 1 loop
-                popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i));
-            end loop;
-        else
-            popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1)));
-        end if;
-
         -- parity calculations
         par0 <= rs(0) xor rs(8) xor rs(16) xor rs(24);
         par1 <= rs(32) xor rs(40) xor rs(48) xor rs(56);
@@ -178,8 +136,6 @@ begin
                     tmp := not tmp;
                 end if;
 
-            when OP_POPCNT =>
-                tmp := popcnt;
             when OP_PRTY =>
                 tmp := parity;
             when OP_CMPB =>
diff --git a/microwatt.core b/microwatt.core
index f463d906f8f566d660ef94d7634a621e0c4da81b..46e114eea3fa8e734ea79a4e4eef7d9cf2487250 100644 (file)
--- a/microwatt.core
+++ b/microwatt.core
@@ -18,7 +18,7 @@ filesets:
       - ppc_fx_insns.vhdl
       - sim_console.vhdl
       - logical.vhdl
-      - countzero.vhdl
+      - countbits.vhdl
       - control.vhdl
       - execute1.vhdl
       - fpu.vhdl