From ec2fa61792ca73265159f711157ae3dfa6c623e0 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Mon, 15 Jun 2020 16:59:08 +1000 Subject: [PATCH] execute1: Reduce width of the result mux to help timing This reduces the number of different things that are assigned to the result variable. - The computations for the popcnt, prty, cmpb and exts instruction families are moved into the logical unit. - The result of mfspr from the slow SPRs is computed in 'spr_val' before being assigned to 'result'. - Writes to LR as a result of a branch-and-link instruction (e.g. bl or bclrl) are done through the exc_write path to writeback. This eases timing considerably. Signed-off-by: Paul Mackerras --- execute1.vhdl | 60 ++++++++++++++-------------------------------- logical.vhdl | 66 ++++++++++++++++++++++++++++++++++----------------- 2 files changed, 62 insertions(+), 64 deletions(-) diff --git a/execute1.vhdl b/execute1.vhdl index 12d3df1..902af70 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -82,8 +82,6 @@ architecture behaviour of execute1 is signal rotator_carry: std_ulogic; signal logical_result: std_ulogic_vector(63 downto 0); signal countzero_result: std_ulogic_vector(63 downto 0); - signal popcnt_result: std_ulogic_vector(63 downto 0); - signal parity_result: std_ulogic_vector(63 downto 0); -- multiply signals signal x_to_multiply: Execute1ToMultiplyType; @@ -208,9 +206,7 @@ begin invert_in => e_in.invert_a, invert_out => e_in.invert_out, result => logical_result, - datalen => e_in.data_len, - popcnt => popcnt_result, - parity => parity_result + datalen => e_in.data_len ); countzero_0: entity work.zero_counter @@ -295,7 +291,6 @@ begin variable sign1, sign2 : std_ulogic; variable abs1, abs2 : signed(63 downto 0); variable overflow : std_ulogic; - variable negative : std_ulogic; variable zerohi, zerolo : std_ulogic; variable msb_a, msb_b : std_ulogic; variable a_lt : std_ulogic; @@ -308,6 +303,7 @@ begin variable is_branch : std_ulogic; variable taken_branch : std_ulogic; variable abs_branch 
: std_ulogic; + variable spr_val : std_ulogic_vector(63 downto 0); begin result := (others => '0'); result_with_carry := (others => '0'); @@ -627,7 +623,7 @@ begin end if; end if; end if; - when OP_AND | OP_OR | OP_XOR => + when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS => result := logical_result; result_en := '1'; when OP_B => @@ -677,27 +673,10 @@ begin ctrl_tmp.msr(MSR_DR) <= '1'; end if; - when OP_CMPB => - result := ppc_cmpb(c_in, b_in); - result_en := '1'; when OP_CNTZ => v.e.valid := '0'; v.cntz_in_progress := '1'; v.busy := '1'; - when OP_EXTS => - -- note data_len is a 1-hot encoding - negative := (e_in.data_len(0) and c_in(7)) or - (e_in.data_len(1) and c_in(15)) or - (e_in.data_len(2) and c_in(31)); - result := (others => negative); - if e_in.data_len(2) = '1' then - result(31 downto 16) := c_in(31 downto 16); - end if; - if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then - result(15 downto 8) := c_in(15 downto 8); - end if; - result(7 downto 0) := c_in(7 downto 0); - result_en := '1'; when OP_ISEL => crbit := to_integer(unsigned(insn_bc(e_in.insn))); if e_in.cr(31-crbit) = '1' then @@ -769,24 +748,25 @@ begin result(63-45) := v.e.xerc.ca32; end if; else + spr_val := c_in; case decode_spr_num(e_in.insn) is when SPR_TB => - result := ctrl.tb; + spr_val := ctrl.tb; when SPR_DEC => - result := ctrl.dec; + spr_val := ctrl.dec; when 724 => -- LOG_ADDR SPR - result := log_wr_addr & r.log_addr_spr; + spr_val := log_wr_addr & r.log_addr_spr; when 725 => -- LOG_DATA SPR - result := log_rd_data; + spr_val := log_rd_data; v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1); when others => -- mfspr from unimplemented SPRs should be a nop in -- supervisor mode and a program interrupt for user mode - result := c_in; if ctrl.msr(MSR_PR) = '1' then illegal := '1'; end if; end case; + result := spr_val; end if; when OP_MFCR => if e_in.insn(20) = '0' then @@ -862,12 +842,6 @@ begin end if; end case; end if; - when OP_POPCNT => 
- result := popcnt_result; - result_en := '1'; - when OP_PRTY => - result := parity_result; - result_en := '1'; when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI => result := rotator_result; if e_in.output_carry = '1' then @@ -917,12 +891,14 @@ begin -- Update LR on the next cycle after a branch link -- If we're not writing back anything else, we can write back LR - -- this cycle, otherwise we take an extra cycle. + -- this cycle, otherwise we take an extra cycle. We use the + -- exc_write path since next_nia is written through that path + -- in other places. if e_in.lr = '1' then if result_en = '0' then - result_en := '1'; - result := next_nia; - v.e.write_reg := fast_spr_num(SPR_LR); + v.e.exc_write_enable := '1'; + v.e.exc_write_data := next_nia; + v.e.exc_write_reg := fast_spr_num(SPR_LR); else v.lr_update := '1'; v.next_lr := next_nia; @@ -939,9 +915,9 @@ begin end if; elsif r.lr_update = '1' then - result_en := '1'; - result := r.next_lr; - v.e.write_reg := fast_spr_num(SPR_LR); + v.e.exc_write_enable := '1'; + v.e.exc_write_data := r.next_lr; + v.e.exc_write_reg := fast_spr_num(SPR_LR); v.e.valid := '1'; elsif r.cntz_in_progress = '1' then -- cnt[lt]z always takes two cycles diff --git a/logical.vhdl b/logical.vhdl index 4dfc13d..5e6abfa 100644 --- a/logical.vhdl +++ b/logical.vhdl @@ -4,6 +4,7 @@ use ieee.numeric_std.all; library work; use work.decode_types.all; +use work.ppc_fx_insns.all; entity logical is port ( @@ -13,9 +14,7 @@ entity logical is invert_in : in std_ulogic; invert_out : in std_ulogic; result : out std_ulogic_vector(63 downto 0); - datalen : in std_logic_vector(3 downto 0); - popcnt : out std_ulogic_vector(63 downto 0); - parity : out std_ulogic_vector(63 downto 0) + datalen : in std_logic_vector(3 downto 0) ); end entity logical; @@ -34,30 +33,14 @@ architecture behaviour of logical is type sixbit2 is array(0 to 1) of sixbit; signal pc32 : sixbit2; signal par0, par1 : std_ulogic; + signal popcnt : std_ulogic_vector(63 downto 
0); + signal parity : std_ulogic_vector(63 downto 0); begin logical_0: process(all) variable rb_adj, tmp : std_ulogic_vector(63 downto 0); + variable negative : std_ulogic; begin - rb_adj := rb; - if invert_in = '1' then - rb_adj := not rb; - end if; - - case op is - when OP_AND => - tmp := rs and rb_adj; - when OP_OR => - tmp := rs or rb_adj; - when others => - tmp := rs xor rb_adj; - end case; - - result <= tmp; - if invert_out = '1' then - result <= not tmp; - end if; - -- population counts for i in 0 to 31 loop pc2(i) <= unsigned("0" & rs(i * 2 downto i * 2)) + unsigned("0" & rs(i * 2 + 1 downto i * 2 + 1)); @@ -98,5 +81,44 @@ begin parity(32) <= par1; end if; + rb_adj := rb; + if invert_in = '1' then + rb_adj := not rb; + end if; + + case op is + when OP_AND => + tmp := rs and rb_adj; + when OP_OR => + tmp := rs or rb_adj; + when OP_XOR => + tmp := rs xor rb_adj; + when OP_POPCNT => + tmp := popcnt; + when OP_PRTY => + tmp := parity; + when OP_CMPB => + tmp := ppc_cmpb(rs, rb); + when others => + -- EXTS + -- note datalen is a 1-hot encoding + negative := (datalen(0) and rs(7)) or + (datalen(1) and rs(15)) or + (datalen(2) and rs(31)); + tmp := (others => negative); + if datalen(2) = '1' then + tmp(31 downto 16) := rs(31 downto 16); + end if; + if datalen(2) = '1' or datalen(1) = '1' then + tmp(15 downto 8) := rs(15 downto 8); + end if; + tmp(7 downto 0) := rs(7 downto 0); + end case; + + if invert_out = '1' then + tmp := not tmp; + end if; + result <= tmp; + end process; end behaviour; -- 2.30.2