From: Paul Mackerras Date: Tue, 8 Oct 2019 21:55:43 +0000 (+1100) Subject: execute: Consolidate count-leading/trailing-zeroes implementations X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=24a4a796ce1e4bf370e00f801bc1cee6faf7d8f7;p=microwatt.git execute: Consolidate count-leading/trailing-zeroes implementations This adds combinational logic that does 32-bit and 64-bit count leading and trailing zeroes in one unit, and consolidates the four instructions under a single OP_CNTZ opcode. This saves 84 slice LUTs on the Arty A7-100. Signed-off-by: Paul Mackerras --- diff --git a/Makefile b/Makefile index 4e394dd..efcebbf 100644 --- a/Makefile +++ b/Makefile @@ -18,12 +18,13 @@ sim_jtag.o: sim_jtag_socket.o core_tb.o: common.o wishbone_types.o core.o soc.o sim_jtag.o core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o execute2.o loadstore1.o loadstore2.o multiply.o writeback.o core_debug.o divider.o core_debug.o: common.o +countzero.o: cr_file.o: common.o crhelpers.o: common.o decode1.o: common.o decode_types.o decode2.o: decode_types.o common.o helpers.o insn_helpers.o decode_types.o: -execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o +execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o execute2.o: common.o crhelpers.o ppc_fx_insns.o fetch1.o: common.o fetch2.o: common.o wishbone_types.o diff --git a/countzero.vhdl b/countzero.vhdl new file mode 100644 index 0000000..3e0cec7 --- /dev/null +++ b/countzero.vhdl @@ -0,0 +1,103 @@ +library ieee; +use ieee.std_logic_1164.all; +use ieee.numeric_std.all; + +library work; + +entity zero_counter is + port ( + rs : in std_ulogic_vector(63 downto 0); + count_right : in std_ulogic; + is_32bit : in std_ulogic; + result : out std_ulogic_vector(63 downto 0) + ); +end entity zero_counter; + +architecture behaviour of zero_counter is + 
signal l32, r32 : std_ulogic; + signal v32 : std_ulogic_vector(31 downto 0); + signal v16 : std_ulogic_vector(15 downto 0); + signal v8 : std_ulogic_vector(7 downto 0); + signal v4 : std_ulogic_vector(3 downto 0); + signal sel : std_ulogic_vector(5 downto 0); +begin + zerocounter0: process(all) + begin + l32 <= or (rs(63 downto 32)); + r32 <= or (rs(31 downto 0)); + if (l32 = '0' or is_32bit = '1') and r32 = '0' then + -- operand is zero, return 32 for 32-bit, else 64 + result <= x"00000000000000" & '0' & not is_32bit & is_32bit & "00000"; + else + + if count_right = '0' then + sel(5) <= l32 and (not is_32bit); + else + sel(5) <= (not r32) and (not is_32bit); + end if; + if sel(5) = '1' then + v32 <= rs(63 downto 32); + else + v32 <= rs(31 downto 0); + end if; + + if count_right = '0' then + sel(4) <= or (v32(31 downto 16)); + else + sel(4) <= not (or (v32(15 downto 0))); + end if; + if sel(4) = '1' then + v16 <= v32(31 downto 16); + else + v16 <= v32(15 downto 0); + end if; + + if count_right = '0' then + sel(3) <= or (v16(15 downto 8)); + else + sel(3) <= not (or (v16(7 downto 0))); + end if; + if sel(3) = '1' then + v8 <= v16(15 downto 8); + else + v8 <= v16(7 downto 0); + end if; + + if count_right = '0' then + sel(2) <= or (v8(7 downto 4)); + else + sel(2) <= not (or (v8(3 downto 0))); + end if; + if sel(2) = '1' then + v4 <= v8(7 downto 4); + else + v4 <= v8(3 downto 0); + end if; + + if count_right = '0' then + if v4(3) = '1' then + sel(1 downto 0) <= "11"; + elsif v4(2) = '1' then + sel(1 downto 0) <= "10"; + elsif v4(1) = '1' then + sel(1 downto 0) <= "01"; + else + sel(1 downto 0) <= "00"; + end if; + result <= x"00000000000000" & "00" & (not sel(5) and not is_32bit) & not sel(4 downto 0); + else + if v4(0) = '1' then + sel(1 downto 0) <= "00"; + elsif v4(1) = '1' then + sel(1 downto 0) <= "01"; + elsif v4(2) = '1' then + sel(1 downto 0) <= "10"; + else + sel(1 downto 0) <= "11"; + end if; + result <= x"00000000000000" & "00" & sel; + end if; + end if; + 
+ end process; +end behaviour; diff --git a/decode1.vhdl b/decode1.vhdl index 0e1d44f..c94e3e3 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -145,10 +145,10 @@ architecture behaviour of decode1 is -- 2#0011100000# cmpeqb 2#0000100000# => (ALU, OP_CMPL, RA, RB, NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- cmpl -- 2#0011000000# cmprb - 2#0000111010# => (ALU, OP_CNTLZD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cntlzd - 2#0000011010# => (ALU, OP_CNTLZW, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cntlzw - 2#1000111010# => (ALU, OP_CNTTZD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cnttzd - 2#1000011010# => (ALU, OP_CNTTZW, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cnttzw + 2#0000111010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cntlzd + 2#0000011010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '1'), -- cntlzw + 2#1000111010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'), -- cnttzd + 2#1000011010# => (ALU, OP_CNTZ, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '1'), -- cnttzw -- 2#1011110011# darn 2#0001010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbf 2#0000110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst diff --git a/decode_types.vhdl b/decode_types.vhdl index dbcf972..982b172 100644 --- a/decode_types.vhdl +++ 
b/decode_types.vhdl @@ -5,7 +5,7 @@ package decode_types is type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD, OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG, OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB, - OP_CNTLZD, OP_CNTLZW, OP_CNTTZD, OP_CNTTZW, OP_CRAND, + OP_CNTZ, OP_CRAND, OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC, OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST, OP_DCBZ, OP_DIV, OP_EXTSB, OP_EXTSH, OP_EXTSW, diff --git a/execute1.vhdl b/execute1.vhdl index cbc9179..1c5e5cb 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -46,6 +46,7 @@ architecture behaviour of execute1 is signal rotator_result: std_ulogic_vector(63 downto 0); signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); + signal countzero_result: std_ulogic_vector(63 downto 0); function decode_input_carry (carry_sel : carry_in_t; ca_in : std_ulogic) return std_ulogic is begin @@ -85,6 +86,14 @@ begin result => logical_result ); + countzero_0: entity work.zero_counter + port map ( + rs => e_in.read_data3, + count_right => e_in.insn(10), + is_32bit => e_in.is_32bit, + result => countzero_result + ); + execute1_0: process(clk) begin if rising_edge(clk) then @@ -217,17 +226,8 @@ begin hi := lo + 3; v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2); end loop; - when OP_CNTLZW => - result := ppc_cntlzw(e_in.read_data3); - result_en := 1; - when OP_CNTTZW => - result := ppc_cnttzw(e_in.read_data3); - result_en := 1; - when OP_CNTLZD => - result := ppc_cntlzd(e_in.read_data3); - result_en := 1; - when OP_CNTTZD => - result := ppc_cnttzd(e_in.read_data3); + when OP_CNTZ => + result := countzero_result; result_en := 1; when OP_EXTSB => result := ppc_extsb(e_in.read_data3); diff --git a/microwatt.core b/microwatt.core index a6e0bfa..45405d9 100644 --- a/microwatt.core +++ b/microwatt.core @@ -19,6 +19,7 @@ filesets: - ppc_fx_insns.vhdl - sim_console.vhdl - logical.vhdl + - countzero.vhdl - 
execute1.vhdl - execute2.vhdl - loadstore1.vhdl