From: Paul Mackerras Date: Mon, 13 Jan 2020 07:13:09 +0000 (+1100) Subject: execute: Move popcnt and prty instructions into the logical unit X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=0c714f1be680ed36373be0ee9c15d30a7cc263b6;p=microwatt.git execute: Move popcnt and prty instructions into the logical unit This implements logic in the logical entity to calculate the results of the popcnt* and prty* instructions. We now have one insn_type_t value for the 3 popcnt variants and one for the two prty variants, using the length field of the decode_rom_t to distinguish between them. The implementations in logical.vhdl use recursive algorithms rather than the simple functions in ppc_fx_insns.vhdl. This gives a saving of about 140 slice LUTs on the A7-100 and improves timing slightly. Signed-off-by: Paul Mackerras --- diff --git a/decode1.vhdl b/decode1.vhdl index 0e42d1b..d2dbd96 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -263,11 +263,11 @@ architecture behaviour of decode1 is 2#0001111100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '0', '1', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- nor 2#0110111100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- or 2#0110011100# => (ALU, OP_OR, NONE, RB, RS, RA, '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- orc - 2#0001111010# => (ALU, OP_POPCNTB, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb - 2#0111111010# => (ALU, OP_POPCNTD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd - 2#0101111010# => (ALU, OP_POPCNTW, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw - 2#0010111010# => (ALU, OP_PRTYD, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, 
'0', '0'), -- prtyd - 2#0010011010# => (ALU, OP_PRTYW, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw + 2#0001111010# => (ALU, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntb + 2#0111111010# => (ALU, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntd + 2#0101111010# => (ALU, OP_POPCNT, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- popcntw + 2#0010111010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyd + 2#0010011010# => (ALU, OP_PRTY, NONE, NONE, RS, RA, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- prtyw -- 2#0010000000# setb 2#0000011011# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- sld 2#0000011000# => (ALU, OP_SHL, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- slw diff --git a/decode_types.vhdl b/decode_types.vhdl index 82039bd..21d8b68 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -14,8 +14,8 @@ package decode_types is OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFSPR, OP_MOD, OP_MTCRF, OP_MTSPR, OP_MUL_L64, OP_MUL_H64, OP_MUL_H32, OP_OR, - OP_POPCNTB, OP_POPCNTD, OP_POPCNTW, OP_PRTYD, - OP_PRTYW, OP_RLC, OP_RLCL, OP_RLCR, OP_SETB, + OP_POPCNT, OP_PRTY, + OP_RLC, OP_RLCL, OP_RLCR, OP_SETB, OP_SHL, OP_SHR, OP_SYNC, OP_TD, OP_TDI, OP_TW, OP_TWI, OP_XOR, OP_SIM_CONFIG diff --git a/execute1.vhdl b/execute1.vhdl index 6889a6a..5a626f8 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -54,6 +54,8 @@ architecture behaviour of execute1 is signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); signal 
countzero_result: std_ulogic_vector(63 downto 0); + signal popcnt_result: std_ulogic_vector(63 downto 0); + signal parity_result: std_ulogic_vector(63 downto 0); -- multiply signals signal x_to_multiply: Execute1ToMultiplyType; @@ -127,7 +129,10 @@ begin op => e_in.insn_type, invert_in => e_in.invert_a, invert_out => e_in.invert_out, - result => logical_result + result => logical_result, + datalen => e_in.data_len, + popcnt => popcnt_result, + parity => parity_result ); countzero_0: entity work.zero_counter @@ -612,20 +617,11 @@ begin -- when others => -- end case; end if; - when OP_POPCNTB => - result := ppc_popcntb(e_in.read_data3); + when OP_POPCNT => + result := popcnt_result; result_en := '1'; - when OP_POPCNTW => - result := ppc_popcntw(e_in.read_data3); - result_en := '1'; - when OP_POPCNTD => - result := ppc_popcntd(e_in.read_data3); - result_en := '1'; - when OP_PRTYD => - result := ppc_prtyd(e_in.read_data3); - result_en := '1'; - when OP_PRTYW => - result := ppc_prtyw(e_in.read_data3); + when OP_PRTY => + result := parity_result; result_en := '1'; when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR => result := rotator_result; diff --git a/logical.vhdl b/logical.vhdl index b92b98d..4dfc13d 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -12,11 +12,29 @@ entity logical is op : in insn_type_t; invert_in : in std_ulogic; invert_out : in std_ulogic; - result : out std_ulogic_vector(63 downto 0) + result : out std_ulogic_vector(63 downto 0); + datalen : in std_logic_vector(3 downto 0); + popcnt : out std_ulogic_vector(63 downto 0); + parity : out std_ulogic_vector(63 downto 0) ); end entity logical; architecture behaviour of logical is + + subtype twobit is unsigned(1 downto 0); + type twobit32 is array(0 to 31) of twobit; + signal pc2 : twobit32; + subtype threebit is unsigned(2 downto 0); + type threebit16 is array(0 to 15) of threebit; + signal pc4 : threebit16; + subtype fourbit is unsigned(3 downto 0); + type fourbit8 is array(0 to 7) of fourbit; + signal 
pc8 : fourbit8; + subtype sixbit is unsigned(5 downto 0); + type sixbit2 is array(0 to 1) of sixbit; + signal pc32 : sixbit2; + signal par0, par1 : std_ulogic; + begin logical_0: process(all) variable rb_adj, tmp : std_ulogic_vector(63 downto 0); @@ -40,5 +58,45 @@ begin result <= not tmp; end if; + -- population counts + for i in 0 to 31 loop + pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1)); + end loop; + for i in 0 to 15 loop + pc4(i) <= ('0' & pc2(i * 2)) + ('0' & pc2(i * 2 + 1)); + end loop; + for i in 0 to 7 loop + pc8(i) <= ('0' & pc4(i * 2)) + ('0' & pc4(i * 2 + 1)); + end loop; + for i in 0 to 1 loop + pc32(i) <= ("00" & pc8(i * 4)) + ("00" & pc8(i * 4 + 1)) + + ("00" & pc8(i * 4 + 2)) + ("00" & pc8(i * 4 + 3)); + end loop; + popcnt <= (others => '0'); + if datalen(3 downto 2) = "00" then + -- popcntb + for i in 0 to 7 loop + popcnt(i * 8 + 3 downto i * 8) <= std_ulogic_vector(pc8(i)); + end loop; + elsif datalen(3) = '0' then + -- popcntw + for i in 0 to 1 loop + popcnt(i * 32 + 5 downto i * 32) <= std_ulogic_vector(pc32(i)); + end loop; + else + popcnt(6 downto 0) <= std_ulogic_vector(('0' & pc32(0)) + ('0' & pc32(1))); + end if; + + -- parity calculations + par0 <= rs(0) xor rs(8) xor rs(16) xor rs(24); + par1 <= rs(32) xor rs(40) xor rs(48) xor rs(56); + parity <= (others => '0'); + if datalen(3) = '1' then + parity(0) <= par0 xor par1; + else + parity(0) <= par0; + parity(32) <= par1; + end if; + end process; end behaviour;