From: Paul Mackerras <paulus@ozlabs.org>
Date: Tue, 8 Oct 2019 21:55:43 +0000 (+1100)
Subject: execute: Consolidate count-leading/trailing-zeroes implementations
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=24a4a796ce1e4bf370e00f801bc1cee6faf7d8f7;p=microwatt.git

execute: Consolidate count-leading/trailing-zeroes implementations

This adds combinatorial logic that does 32-bit and 64-bit count
leading and trailing zeroes in one unit, and consolidates the
four instructions under a single OP_CNTZ opcode.

This saves 84 slice LUTs on the Arty A7-100.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---

diff --git a/Makefile b/Makefile
index 4e394dd..efcebbf 100644
--- a/Makefile
+++ b/Makefile
@@ -18,12 +18,13 @@ sim_jtag.o: sim_jtag_socket.o
 core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o
 core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o execute2.o loadstore1.o loadstore2.o multiply.o writeback.o core_debug.o divider.o
 core_debug.o: common.o
+countzero.o:
 cr_file.o: common.o
 crhelpers.o: common.o
 decode1.o: common.o decode_types.o
 decode2.o: decode_types.o common.o helpers.o insn_helpers.o
 decode_types.o:
-execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o
+execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o
 execute2.o: common.o crhelpers.o ppc_fx_insns.o
 fetch1.o: common.o
 fetch2.o: common.o wishbone_types.o
diff --git a/countzero.vhdl b/countzero.vhdl
new file mode 100644
index 0000000..3e0cec7
--- /dev/null
+++ b/countzero.vhdl
@@ -0,0 +1,103 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+
+entity zero_counter is  -- Combined 32-bit/64-bit count-leading/trailing-zeroes unit
+    port (
+	rs          : in std_ulogic_vector(63 downto 0);  -- operand whose zero bits are counted
+	count_right : in std_ulogic;  -- '0': count zeroes from the top bit down; '1': count zeroes from bit 0 up
+	is_32bit    : in std_ulogic;  -- '1': only rs(31 downto 0) is examined
+	result      : out std_ulogic_vector(63 downto 0)  -- zero-extended count; 32 (32-bit) or 64 when the examined bits are all zero
+	);
+end entity zero_counter;
+
+architecture behaviour of zero_counter is  -- Binary search: each stage halves the window and fixes one bit of sel
+    signal l32, r32 : std_ulogic;  -- '1' when the upper / lower 32 bits of rs contain a set bit
+    signal v32      : std_ulogic_vector(31 downto 0);  -- word of rs chosen by sel(5)
+    signal v16      : std_ulogic_vector(15 downto 0);  -- half of v32 chosen by sel(4)
+    signal v8       : std_ulogic_vector(7 downto 0);  -- half of v16 chosen by sel(3)
+    signal v4       : std_ulogic_vector(3 downto 0);  -- half of v8 chosen by sel(2)
+    signal sel      : std_ulogic_vector(5 downto 0);  -- index of the highest (count_right = '0') or lowest (count_right = '1') set bit
+begin
+    zerocounter0: process(all)  -- l32, r32, sel and v* are signals written and read here, so the process re-runs until they settle
+    begin
+	l32 <= or (rs(63 downto 32));
+	r32 <= or (rs(31 downto 0));
+	if (l32 = '0' or is_32bit = '1') and r32 = '0' then  -- NOTE(review): sel and v32..v4 get no assignment on this path; check whether synthesis infers latches
+	    -- operand is zero, return 32 for 32-bit, else 64
+	    result <= x"00000000000000" & '0' & not is_32bit & is_32bit & "00000";  -- bit 6 set for 64-bit, bit 5 set for 32-bit
+	else
+
+	    if count_right = '0' then
+		sel(5) <= l32 and (not is_32bit);  -- leading: take the upper word when it holds a set bit (64-bit only)
+	    else
+		sel(5) <= (not r32) and (not is_32bit);  -- trailing: take the upper word only when the lower word is all zero (64-bit only)
+	    end if;
+	    if sel(5) = '1' then
+		v32 <= rs(63 downto 32);
+	    else
+		v32 <= rs(31 downto 0);
+	    end if;
+
+	    if count_right = '0' then
+		sel(4) <= or (v32(31 downto 16));  -- leading: upper half if it holds a set bit
+	    else
+		sel(4) <= not (or (v32(15 downto 0)));  -- trailing: upper half only if the lower half is all zero
+	    end if;
+	    if sel(4) = '1' then
+		v16 <= v32(31 downto 16);
+	    else
+		v16 <= v32(15 downto 0);
+	    end if;
+
+	    if count_right = '0' then  -- same selection rule, 16 -> 8 bits
+		sel(3) <= or (v16(15 downto 8));
+	    else
+		sel(3) <= not (or (v16(7 downto 0)));
+	    end if;
+	    if sel(3) = '1' then
+		v8 <= v16(15 downto 8);
+	    else
+		v8 <= v16(7 downto 0);
+	    end if;
+
+	    if count_right = '0' then  -- same selection rule, 8 -> 4 bits
+		sel(2) <= or (v8(7 downto 4));
+	    else
+		sel(2) <= not (or (v8(3 downto 0)));
+	    end if;
+	    if sel(2) = '1' then
+		v4 <= v8(7 downto 4);
+	    else
+		v4 <= v8(3 downto 0);
+	    end if;
+
+	    if count_right = '0' then
+		if v4(3) = '1' then  -- priority-encode the highest set bit of the final nibble
+		    sel(1 downto 0) <= "11";
+		elsif v4(2) = '1' then
+		    sel(1 downto 0) <= "10";
+		elsif v4(1) = '1' then
+		    sel(1 downto 0) <= "01";
+		else  -- none of v4(3 downto 1) is set
+		    sel(1 downto 0) <= "00";
+		end if;
+		result <= x"00000000000000" & "00" & (not sel(5) and not is_32bit) & not sel(4 downto 0);  -- leading count = 63 - sel, or 31 - sel(4 downto 0) in 32-bit mode
+	    else
+		if v4(0) = '1' then  -- priority-encode the lowest set bit of the final nibble
+		    sel(1 downto 0) <= "00";
+		elsif v4(1) = '1' then
+		    sel(1 downto 0) <= "01";
+		elsif v4(2) = '1' then
+		    sel(1 downto 0) <= "10";
+		else  -- none of v4(2 downto 0) is set
+		    sel(1 downto 0) <= "11";
+		end if;
+		result <= x"00000000000000" & "00" & sel;  -- trailing count equals the index of the lowest set bit
+	    end if;
+	end if;
+
+    end process;
+end behaviour;
diff --git a/decode1.vhdl b/decode1.vhdl
index 0e1d44f..c94e3e3 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -145,10 +145,10 @@ architecture behaviour of decode1 is
 		-- 2#0011100000# cmpeqb
 		2#0000100000#  =>       (ALU,    OP_CMPL,      RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- cmpl
 		-- 2#0011000000# cmprb
-		2#0000111010#  =>       (ALU,    OP_CNTLZD,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- cntlzd
-		2#0000011010#  =>       (ALU,    OP_CNTLZW,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- cntlzw
-		2#1000111010#  =>       (ALU,    OP_CNTTZD,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- cnttzd
-		2#1000011010#  =>       (ALU,    OP_CNTTZW,    NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- cnttzw
+		2#0000111010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- cntlzd
+		2#0000011010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '1'), -- cntlzw
+		2#1000111010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'), -- cnttzd
+		2#1000011010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '1'), -- cnttzw
 		-- 2#1011110011# darn
 		2#0001010110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbf
 		2#0000110110#  =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst
diff --git a/decode_types.vhdl b/decode_types.vhdl
index dbcf972..982b172 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -5,7 +5,7 @@ package decode_types is
 	type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD,
 		OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG,
 		OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB,
-		OP_CNTLZD, OP_CNTLZW, OP_CNTTZD, OP_CNTTZW, OP_CRAND,
+		OP_CNTZ, OP_CRAND,
 		OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC,
 		OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
 		OP_DCBZ, OP_DIV, OP_EXTSB, OP_EXTSH, OP_EXTSW,
diff --git a/execute1.vhdl b/execute1.vhdl
index cbc9179..1c5e5cb 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -46,6 +46,7 @@ architecture behaviour of execute1 is
 	signal rotator_result: std_ulogic_vector(63 downto 0);
 	signal rotator_carry: std_ulogic;
 	signal logical_result: std_ulogic_vector(63 downto 0);
+	signal countzero_result: std_ulogic_vector(63 downto 0);
 
         function decode_input_carry (carry_sel : carry_in_t; ca_in : std_ulogic) return std_ulogic is
         begin
@@ -85,6 +86,14 @@ begin
 			result => logical_result
 		);
 
+	countzero_0: entity work.zero_counter  -- count-zeroes unit; its result is used for OP_CNTZ in the execute process
+		port map (
+			rs => e_in.read_data3,
+			count_right => e_in.insn(10),  -- NOTE(review): presumably the opcode bit separating cnttz* from cntlz* (top bit of the decode1 keys) -- confirm
+			is_32bit => e_in.is_32bit,
+			result => countzero_result
+		);
+
 	execute1_0: process(clk)
 	begin
 		if rising_edge(clk) then
@@ -217,17 +226,8 @@ begin
 						hi := lo + 3;
 						v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2);
 					end loop;
-				when OP_CNTLZW =>
-					result := ppc_cntlzw(e_in.read_data3);
-					result_en := 1;
-				when OP_CNTTZW =>
-					result := ppc_cnttzw(e_in.read_data3);
-					result_en := 1;
-				when OP_CNTLZD =>
-					result := ppc_cntlzd(e_in.read_data3);
-					result_en := 1;
-				when OP_CNTTZD =>
-					result := ppc_cnttzd(e_in.read_data3);
+				when OP_CNTZ =>
+					result := countzero_result;
 					result_en := 1;
 				when OP_EXTSB =>
 					result := ppc_extsb(e_in.read_data3);
diff --git a/microwatt.core b/microwatt.core
index a6e0bfa..45405d9 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -19,6 +19,7 @@ filesets:
       - ppc_fx_insns.vhdl
       - sim_console.vhdl
       - logical.vhdl
+      - countzero.vhdl
       - execute1.vhdl
       - execute2.vhdl
       - loadstore1.vhdl