From: Paul Mackerras <paulus@ozlabs.org>
Date: Sat, 11 Jul 2020 02:05:43 +0000 (+1000)
Subject: countzero: Faster algorithm for count leading/trailing zeroes
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=03a3a5d326d8c79f4fd14668534571049d70eaf7;p=microwatt.git

countzero: Faster algorithm for count leading/trailing zeroes

This uses an algorithm for count leading/trailing zeroes that is
faster on FPGAs, which makes timing easier.  cntlz* and cnttz*
still take two cycles, though.

For count trailing zeroes, we compute x & -x, which for non-zero x
has a single 1 bit in the position of the least-significant 1 bit
in x.  This one-hot representation can then be converted to a bit
number with six 32-input OR gates.  For count leading zeroes, we
simply do a bit-reversal on x and then use the same algorithm.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---

diff --git a/countzero.vhdl b/countzero.vhdl
index 50e6ead..18aa043 100644
--- a/countzero.vhdl
+++ b/countzero.vhdl
@@ -15,123 +15,81 @@ entity zero_counter is
 end entity zero_counter;
 
 architecture behaviour of zero_counter is
-    type intermediate_result is record
-        v16: std_ulogic_vector(15 downto 0);
-        sel_hi: std_ulogic_vector(1 downto 0);
-        is_32bit: std_ulogic;
-        count_right: std_ulogic;
-    end record;
-
-    signal r, r_in  : intermediate_result;
+    -- Return a with its bit order reversed; written for a descending (downto) range
+    function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(a'left downto a'right);
+    begin
+        for i in a'right to a'left loop
+            ret(a'left + a'right - i) := a(i);
+        end loop;
+        return ret;
+    end;
 
-    -- Return the index of the leftmost or rightmost 1 in a set of 4 bits.
-    -- Assumes v is not "0000"; if it is, return (right ? "11" : "00").
-    function encoder(v: std_ulogic_vector(3 downto 0); right: std_ulogic) return std_ulogic_vector is
+    -- Convert a one-hot doubleword to the index of its set bit (bit 0 is
+    -- the rightmost).  Result bit i is the OR of every input bit whose
+    -- index has bit i set, i.e. 32 of the 64 inputs per result bit:
+    --  bit 0 = a[1] or a[3] or a[5] or ...
+    --  bit 1 = a[2] or a[3] or a[6] or a[7] or ...
+    --  bit 2 = a[4..7] or a[12..15] or ...
+    --  bit 5 = a[32..63] ORed together; an all-zero input returns 0
+    function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
+        variable ret: std_ulogic_vector(5 downto 0);
+        variable stride: natural;
+        variable bit: std_ulogic;
+        variable k: natural;
     begin
-	if right = '0' then
-	    if v(3) = '1' then
-		return "11";
-	    elsif v(2) = '1' then
-		return "10";
-	    elsif v(1) = '1' then
-		return "01";
-	    else
-		return "00";
-	    end if;
-	else
-	    if v(0) = '1' then
-		return "00";
-	    elsif v(1) = '1' then
-		return "01";
-	    elsif v(2) = '1' then
-		return "10";
-	    else
-		return "11";
-	    end if;
-	end if;
+        stride := 2;
+        for i in 0 to 5 loop
+            bit := '0';
+            for j in 0 to (64 / stride) - 1 loop
+                k := j * stride;
+                bit := bit or (or a(k + stride - 1 downto k + (stride / 2)));
+            end loop;
+            ret(i) := bit;
+            stride := stride * 2;
+        end loop;
+        return ret;
     end;
 
+    signal inp : std_ulogic_vector(63 downto 0);
+    signal sum : std_ulogic_vector(64 downto 0);
+    signal msb_r : std_ulogic;
+    signal onehot : std_ulogic_vector(63 downto 0);
+    signal onehot_r : std_ulogic_vector(63 downto 0);
+    signal bitnum : std_ulogic_vector(5 downto 0);
+
 begin
-    zerocounter_0: process(clk)
+    countzero_r: process(clk)
     begin
-	if rising_edge(clk) then
-            r <= r_in;
+        if rising_edge(clk) then
+            msb_r <= sum(64);
+            onehot_r <= onehot;
         end if;
     end process;
 
-    zerocounter_1: process(all)
-        variable v: intermediate_result;
-        variable y, z: std_ulogic_vector(3 downto 0);
-        variable sel: std_ulogic_vector(5 downto 0);
-        variable v4: std_ulogic_vector(3 downto 0);
-
+    countzero: process(all)
     begin
-	-- Test 4 groups of 16 bits each.
-	-- The top 2 groups are considered to be zero in 32-bit mode.
-	z(0) := or (rs(15 downto 0));
-	z(1) := or (rs(31 downto 16));
-	z(2) := or (rs(47 downto 32));
-	z(3) := or (rs(63 downto 48));
         if is_32bit = '0' then
-            v.sel_hi := encoder(z, count_right);
+            if count_right = '0' then
+                inp <= bit_reverse(rs);
+            else
+                inp <= rs;
+            end if;
         else
-            v.sel_hi(1) := '0';
+            inp(63 downto 32) <= x"FFFFFFFF";
             if count_right = '0' then
-                v.sel_hi(0) := z(1);
+                inp(31 downto 0) <= bit_reverse(rs(31 downto 0));
             else
-                v.sel_hi(0) := not z(0);
+                inp(31 downto 0) <= rs(31 downto 0);
             end if;
         end if;
 
-	-- Select the leftmost/rightmost non-zero group of 16 bits
-	case v.sel_hi is
-	    when "00" =>
-		v.v16 := rs(15 downto 0);
-	    when "01" =>
-		v.v16 := rs(31 downto 16);
-	    when "10" =>
-		v.v16 := rs(47 downto 32);
-	    when others =>
-		v.v16 := rs(63 downto 48);
-	end case;
-
-        -- Latch this and do the rest in the next cycle, for the sake of timing
-        v.is_32bit := is_32bit;
-        v.count_right := count_right;
-        r_in <= v;
-        sel(5 downto 4) := r.sel_hi;
-
-	-- Test 4 groups of 4 bits
-	y(0) := or (r.v16(3 downto 0));
-	y(1) := or (r.v16(7 downto 4));
-	y(2) := or (r.v16(11 downto 8));
-	y(3) := or (r.v16(15 downto 12));
-	sel(3 downto 2) := encoder(y, r.count_right);
-
-	-- Select the leftmost/rightmost non-zero group of 4 bits
-	case sel(3 downto 2) is
-	    when "00" =>
-		v4 := r.v16(3 downto 0);
-	    when "01" =>
-		v4 := r.v16(7 downto 4);
-	    when "10" =>
-		v4 := r.v16(11 downto 8);
-	    when others =>
-		v4 := r.v16(15 downto 12);
-	end case;
-
-	sel(1 downto 0) := encoder(v4, r.count_right);
+        sum <= std_ulogic_vector(unsigned('0' & not inp) + 1);
+        onehot <= sum(63 downto 0) and inp;
 
-	-- sel is now the index of the leftmost/rightmost 1 bit in rs
-	if v4 = "0000" then
-	    -- operand is zero, return 32 for 32-bit, else 64
-	    result <= x"00000000000000" & '0' & not r.is_32bit & r.is_32bit & "00000";
-	elsif r.count_right = '0' then
-	    -- return (63 - sel), trimmed to 5 bits in 32-bit mode
-	    result <= x"00000000000000" & "00" & (not sel(5) and not r.is_32bit) & not sel(4 downto 0);
-	else
-	    result <= x"00000000000000" & "00" & sel;
-	end if;
+        -- Second cycle: the logic below reads only the registered onehot_r and msb_r
+        bitnum <= bit_number(onehot_r);
 
+        result <= x"00000000000000" & "0" & msb_r & bitnum;
     end process;
 end behaviour;