decode1.vhdl helpers.vhdl insn_helpers.vhdl \
control.vhdl decode2.vhdl register_file.vhdl \
cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
- logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
- loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
- core.vhdl fpu.vhdl pmu.vhdl
+ logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \
+ execute1.vhdl loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl \
+ core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl
soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \
write_pmuspr : std_ulogic;
ramspr_write_even : std_ulogic;
ramspr_write_odd : std_ulogic;
+ mult_32s : std_ulogic;
end record;
constant side_effect_init : side_effect_type := (others => '0');
-- multiply signals
signal x_to_multiply: MultiplyInputType;
signal multiply_to_x: MultiplyOutputType;
+ signal x_to_mult_32s: MultiplyInputType;
+ signal mult_32s_to_x: MultiplyOutputType;
-- divider signals
signal x_to_divider: Execute1ToDividerType;
m_out => multiply_to_x
);
+ mult_32s_0: entity work.multiply_32s -- second multiplier instance, started via se.mult_32s (32-bit OP_MUL_L64 and OP_MUL_H32)
+ port map (
+ clk => clk,
+ stall => stage2_stall, -- stalled together with the second execute stage
+ m_in => x_to_mult_32s, -- operands are a_in/b_in(31..0) with bit 32 = bit 31 and is_signed
+ m_out => mult_32s_to_x -- result/overflow are read where ex1.se.mult_32s = '1'
+ );
+
divider_0: if not HAS_FPU generate
div_0: entity work.divider
port map (
addend := not addend;
end if;
+ x_to_multiply.data1 <= std_ulogic_vector(abs1);
+ x_to_multiply.data2 <= std_ulogic_vector(abs2);
x_to_multiply.is_32bit <= e_in.is_32bit;
x_to_multiply.not_result <= sign1 xor sign2;
x_to_multiply.addend <= addend;
x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
if e_in.is_32bit = '0' then
-- 64-bit forms
- x_to_multiply.data1 <= std_ulogic_vector(abs1);
- x_to_multiply.data2 <= std_ulogic_vector(abs2);
if e_in.insn_type = OP_DIVE then
x_to_divider.is_extended <= '1';
end if;
x_to_divider.divisor <= std_ulogic_vector(abs2);
else
-- 32-bit forms
- x_to_multiply.data1 <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
- x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
x_to_divider.is_extended <= '0';
if e_in.insn_type = OP_DIVE then -- extended forms
x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
end if;
+ -- signals to 32-bit multiplier
+ x_to_mult_32s.data1 <= 31x"0" & (a_in(31) and e_in.is_signed) & a_in(31 downto 0);
+ x_to_mult_32s.data2 <= 31x"0" & (b_in(31) and e_in.is_signed) & b_in(31 downto 0);
+ -- The following are unused, but set here to avoid X states
+ x_to_mult_32s.is_32bit <= '1';
+ x_to_mult_32s.not_result <= '0';
+ x_to_mult_32s.addend <= (others => '0');
+
shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
case ex1.mul_select is
when "00" =>
v.se.icache_inval := '1';
when OP_MUL_L64 =>
- if HAS_SHORT_MULT and e_in.reg_valid3 = '0' and
+ if e_in.is_32bit = '1' then
+ v.se.mult_32s := '1';
+ v.res2_sel := "00";
+ slow_op := '1';
+ elsif HAS_SHORT_MULT and e_in.reg_valid3 = '0' and
fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then
-- Operands fit into 16 bits, so use short multiplier
if e_in.oe = '1' then
owait := '1';
end if;
- when OP_MUL_H64 | OP_MUL_H32 =>
+ when OP_MUL_H64 =>
v.start_mul := '1';
slow_op := '1';
owait := '1';
+ when OP_MUL_H32 =>
+ v.se.mult_32s := '1';
+ v.res2_sel := "01";
+ slow_op := '1';
+
when OP_DIV | OP_DIVE | OP_MOD =>
if not HAS_FPU then
v.start_div := '1';
fv := Execute1ToFPUInit;
x_to_multiply.valid <= '0';
+ x_to_mult_32s.valid <= '0';
x_to_divider.valid <= '0';
v.ext_interrupt := '0';
v.taken_branch_event := '0';
v.res2_sel := actions.res2_sel;
v.msr := actions.new_msr;
x_to_multiply.valid <= actions.start_mul;
+ x_to_mult_32s.valid <= actions.se.mult_32s;
v.mul_in_progress := actions.start_mul;
x_to_divider.valid <= actions.start_div;
v.div_in_progress := actions.start_div;
-- Second execute stage control
execute2_1: process(all)
variable v : reg_stage2_type;
- variable overflow : std_ulogic;
- variable lv : Execute1ToLoadstore1Type;
- variable fv : Execute1ToFPUType;
- variable k : integer;
- variable go : std_ulogic;
variable bypass_valid : std_ulogic;
variable rcresult : std_ulogic_vector(63 downto 0);
variable sprres : std_ulogic_vector(63 downto 0);
v.br_mispredict := ex1.br_mispredict;
end if;
+ if ex1.se.mult_32s = '1' and ex1.oe = '1' then
+ v.e.xerc.ov := mult_32s_to_x.overflow;
+ v.e.xerc.ov32 := mult_32s_to_x.overflow;
+ if mult_32s_to_x.overflow = '1' then
+ v.e.xerc.so := '1';
+ end if;
+ end if;
+
ctrl_tmp <= ctrl;
-- FIXME: run at 512MHz not core freq
ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
v.e.write_xerc_enable := '0';
v.e.redirect := '0';
v.e.br_last := '0';
- v.se := side_effect_init;
v.taken_branch_event := '0';
v.br_mispredict := '0';
end if;
if flush_in = '1' then
v.e.valid := '0';
v.e.interrupt := '0';
+ v.se := side_effect_init;
v.ext_interrupt := '0';
end if;
-- This is split like this because mfspr doesn't have an Rc bit,
-- and we don't want the zero-detect logic to be after the
-- SPR mux for timing reasons.
- if ex1.res2_sel(0) = '0' then
+ if ex1.se.mult_32s = '1' then
+ if ex1.res2_sel(0) = '0' then
+ rcresult := mult_32s_to_x.result(63 downto 0);
+ else
+ rcresult := mult_32s_to_x.result(63 downto 32) &
+ mult_32s_to_x.result(63 downto 32);
+ end if;
+ elsif ex1.res2_sel(0) = '0' then
rcresult := ex1.e.write_data;
- sprres := spr_result;
else
rcresult := countbits_result;
+ end if;
+ if ex1.res2_sel(0) = '0' then
+ sprres := spr_result;
+ else
sprres := pmu_to_x.spr_val;
end if;
if ex1.res2_sel(1) = '0' then
cr_res(31) := sign;
cr_res(30) := not (sign or zero);
cr_res(29) := zero;
- cr_res(28) := ex1.e.xerc.so;
+ cr_res(28) := v.e.xerc.so;
cr_mask(7) := '1';
end if;
xilinx_specific:
files:
- xilinx-mult.vhdl : {file_type : vhdlSource-2008}
+ - xilinx-mult-32s.vhdl : {file_type : vhdlSource-2008}
- fpga/fpga-random.vhdl : {file_type : vhdlSource-2008}
- fpga/fpga-random.xdc : {file_type : xdc}
--- /dev/null
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+-- Signed 33b x 33b multiplier giving 64-bit product, with no addend,
+-- with fixed 1-cycle latency.
+
+entity multiply_32s is -- behavioural variant: registered 33b x 33b signed multiply
+ port (
+ clk : in std_logic; -- clock; the product register updates on its rising edge
+ stall : in std_ulogic; -- the product register only updates while this is '0'
+
+ m_in : in MultiplyInputType; -- uses data1(32..0), data2(32..0) and valid
+ m_out : out MultiplyOutputType -- drives result, overflow and valid from the register
+ );
+end entity multiply_32s;
+
+architecture behaviour of multiply_32s is
+    type reg_type is record -- pipeline register contents
+        valid : std_ulogic;
+        data  : signed(65 downto 0); -- full 33b x 33b signed product
+    end record;
+    constant reg_type_init : reg_type := (valid => '0', data => (others => '0'));
+
+    signal r, rin : reg_type := reg_type_init;
+begin
+    multiply_0: process(clk) -- register stage
+    begin
+        if rising_edge(clk) then
+            if stall = '0' then -- r keeps its value unless stall is '0'
+                r <= rin;
+            end if;
+        end if;
+    end process;
+
+    multiply_1: process(all) -- combinational: next state and outputs
+        variable op_a, op_b : signed(32 downto 0);
+        variable prod64     : std_ulogic_vector(63 downto 0);
+    begin
+        -- Next state: signed multiply of the low 33 bits of each operand.
+        op_a := signed(m_in.data1(32 downto 0));
+        op_b := signed(m_in.data2(32 downto 0));
+        rin <= (valid => m_in.valid, data => op_a * op_b);
+
+        -- Outputs are taken from the registered product.
+        prod64 := std_ulogic_vector(r.data(63 downto 0));
+        m_out.result <= 64x"0" & prod64;
+        -- Overflow: bits 63..31 are neither all zeros nor all ones.
+        m_out.overflow <= (or prod64(63 downto 31)) and not (and prod64(63 downto 31));
+        m_out.valid <= r.valid;
+    end process;
+end architecture behaviour;
--- /dev/null
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+library unisim;
+use unisim.vcomponents.all;
+
+-- Signed 33b x 33b multiplier giving 64-bit product, with no addend.
+
+entity multiply_32s is -- Xilinx variant built from DSP48E1 primitives
+ port (
+ clk : in std_logic; -- clock for the DSP slices and the output registers
+ stall : in std_ulogic; -- unless '0', gates off clocken and holds valid/product_lo
+
+ m_in : in MultiplyInputType; -- uses data1(32..0), data2(32..0) and valid
+ m_out : out MultiplyOutputType -- drives result (upper 64 bits zero), overflow and valid
+ );
+end entity multiply_32s;
+
+architecture behaviour of multiply_32s is
+ signal clocken : std_ulogic; -- clock enable for the m10/m11 registers
+ signal data1 : std_ulogic_vector(52 downto 0); -- sign-extended first operand
+ signal data2 : std_ulogic_vector(34 downto 0); -- sign-extended second operand
+ signal m00_p, m01_p : std_ulogic_vector(47 downto 0); -- P outputs of slices m00 and m01
+ signal m00_pc : std_ulogic_vector(47 downto 0); -- PCOUT cascade from m00 into m01
+ signal m10_p, m11_p : std_ulogic_vector(47 downto 0); -- P outputs of slices m10 and m11
+ signal m10_pc : std_ulogic_vector(47 downto 0); -- PCOUT cascade from m10 into m11
+ signal p0_pat, p0_patb : std_ulogic; -- pattern / pattern-bar detect outputs of m10
+ signal p1_pat, p1_patb : std_ulogic; -- pattern / pattern-bar detect outputs of m11
+ signal product_lo : std_ulogic_vector(22 downto 0); -- registered result(22..0)
+
+begin
+ -- sign extend the 33-bit operands; data1 is split 23 + 30 bits and data2 17 + 18 bits across the slices
+ data1 <= std_ulogic_vector(resize(signed(m_in.data1(32 downto 0)), 53));
+ data2 <= std_ulogic_vector(resize(signed(m_in.data2(32 downto 0)), 35));
+
+ clocken <= m_in.valid and not stall; -- registers load only for a valid, unstalled input
+
+ m00: DSP48E1 -- data1(22..0) x data2(16..0); every register generic listed here is 0
+ generic map (
+ ACASCREG => 0,
+ ALUMODEREG => 0,
+ AREG => 0,
+ BCASCREG => 0,
+ BREG => 0,
+ CARRYINREG => 0,
+ CARRYINSELREG => 0,
+ CREG => 0,
+ INMODEREG => 0,
+ MREG => 0,
+ OPMODEREG => 0,
+ PREG => 0
+ )
+ port map (
+ A => "0000000" & data1(22 downto 0), -- low 23 bits of data1, zero-padded to 30 bits
+ ACIN => (others => '0'),
+ ALUMODE => "0000",
+ B => '0' & data2(16 downto 0), -- low 17 bits of data2, zero-padded to 18 bits
+ BCIN => (others => '0'),
+ C => (others => '0'),
+ CARRYCASCIN => '0',
+ CARRYIN => '0',
+ CARRYINSEL => "000",
+ CEA1 => '0',
+ CEA2 => '0',
+ CEAD => '0',
+ CEALUMODE => '0',
+ CEB1 => '0',
+ CEB2 => '0',
+ CEC => '0',
+ CECARRYIN => '0',
+ CECTRL => '0',
+ CED => '0',
+ CEINMODE => '0',
+ CEM => '0',
+ CEP => '0',
+ CLK => clk,
+ D => (others => '0'),
+ INMODE => "00000",
+ MULTSIGNIN => '0',
+ OPMODE => "0110101", -- NOTE(review): per UG479 this should select P = M + C (C is 0 here) - confirm
+ P => m00_p, -- bits 16..0 become result(16..0) via product_lo
+ PCIN => (others => '0'),
+ PCOUT => m00_pc,
+ RSTA => '0',
+ RSTALLCARRYIN => '0',
+ RSTALUMODE => '0',
+ RSTB => '0',
+ RSTC => '0',
+ RSTCTRL => '0',
+ RSTD => '0',
+ RSTINMODE => '0',
+ RSTM => '0',
+ RSTP => '0'
+ );
+
+ m01: DSP48E1 -- data1(22..0) x data2(34..17), cascaded from m00 through PCIN
+ generic map (
+ ACASCREG => 0,
+ ALUMODEREG => 0,
+ AREG => 0,
+ BCASCREG => 0,
+ BREG => 0,
+ CARRYINREG => 0,
+ CARRYINSELREG => 0,
+ CREG => 0,
+ INMODEREG => 0,
+ MREG => 0,
+ OPMODEREG => 0,
+ PREG => 0
+ )
+ port map (
+ A => "0000000" & data1(22 downto 0),
+ ACIN => (others => '0'),
+ ALUMODE => "0000",
+ B => data2(34 downto 17), -- upper 18 bits of the sign-extended data2
+ BCIN => (others => '0'),
+ C => (others => '0'),
+ CARRYCASCIN => '0',
+ CARRYIN => '0',
+ CARRYINSEL => "000",
+ CEA1 => '0',
+ CEA2 => '0',
+ CEAD => '0',
+ CEALUMODE => '0',
+ CEB1 => '0',
+ CEB2 => '0',
+ CEC => '0',
+ CECARRYIN => '0',
+ CECTRL => '0',
+ CED => '0',
+ CEINMODE => '0',
+ CEM => '0',
+ CEP => '0',
+ CLK => clk,
+ D => (others => '0'),
+ INMODE => "00000",
+ MULTSIGNIN => '0',
+ OPMODE => "1010101", -- NOTE(review): per UG479 this should select P = M + (PCIN >> 17) - confirm
+ P => m01_p, -- bits 5..0 go to product_lo, bits 38..6 feed the C input of m10
+ PCIN => m00_pc,
+ RSTA => '0',
+ RSTALLCARRYIN => '0',
+ RSTALUMODE => '0',
+ RSTB => '0',
+ RSTC => '0',
+ RSTCTRL => '0',
+ RSTD => '0',
+ RSTINMODE => '0',
+ RSTM => '0',
+ RSTP => '0'
+ );
+
+ m10: DSP48E1 -- data1(52..23) x data2(16..0) plus the carried-in sum from m01; MREG is not overridden here
+ generic map (
+ ACASCREG => 0,
+ ALUMODEREG => 0,
+ AREG => 0,
+ BCASCREG => 0,
+ BREG => 0,
+ CARRYINREG => 0,
+ CARRYINSELREG => 0,
+ CREG => 1, -- C input is registered, enabled by CEC => clocken
+ INMODEREG => 0,
+ MASK => x"fffffffe00ff", -- zero mask bits are P(16..8) = result(39..31); NOTE(review): presumably 1 = ignore bit, confirm in UG479
+ OPMODEREG => 0,
+ PREG => 0,
+ USE_PATTERN_DETECT => "PATDET"
+ )
+ port map (
+ A => data1(52 downto 23), -- upper 30 bits of the sign-extended data1
+ ACIN => (others => '0'),
+ ALUMODE => "0000",
+ B => '0' & data2(16 downto 0),
+ BCIN => (others => '0'),
+ C => std_ulogic_vector(resize(signed(m01_p(38 downto 6)), 48)), -- m01 sum above the low 6 bits, sign-extended to 48 bits
+ CARRYCASCIN => '0',
+ CARRYIN => '0',
+ CARRYINSEL => "000",
+ CEA1 => '0',
+ CEA2 => '0',
+ CEAD => '0',
+ CEALUMODE => '0',
+ CEB1 => '0',
+ CEB2 => '0',
+ CEC => clocken,
+ CECARRYIN => '0',
+ CECTRL => '0',
+ CED => '0',
+ CEINMODE => '0',
+ CEM => clocken, -- NOTE(review): presumably clocks the default-enabled M register - confirm
+ CEP => '0',
+ CLK => clk,
+ D => (others => '0'),
+ INMODE => "00000",
+ MULTSIGNIN => '0',
+ OPMODE => "0110101", -- NOTE(review): per UG479 this should select P = M + C - confirm
+ P => m10_p, -- bits 16..0 become result(39..23)
+ PATTERNDETECT => p0_pat,
+ PATTERNBDETECT => p0_patb,
+ PCIN => (others => '0'),
+ PCOUT => m10_pc,
+ RSTA => '0',
+ RSTALLCARRYIN => '0',
+ RSTALUMODE => '0',
+ RSTB => '0',
+ RSTC => '0',
+ RSTCTRL => '0',
+ RSTD => '0',
+ RSTINMODE => '0',
+ RSTM => '0',
+ RSTP => '0'
+ );
+
+ m11: DSP48E1 -- data1(52..23) x data2(34..17), cascaded from m10 through PCIN; MREG is not overridden here
+ generic map (
+ ACASCREG => 0,
+ ALUMODEREG => 0,
+ AREG => 0,
+ BCASCREG => 0,
+ BREG => 0,
+ CARRYINREG => 0,
+ CARRYINSELREG => 0,
+ CREG => 0,
+ INMODEREG => 0,
+ MASK => x"fffffc000000", -- zero mask bits are P(25..0); P(23..0) is result(63..40); NOTE(review): presumably 1 = ignore bit, confirm in UG479
+ OPMODEREG => 0,
+ PREG => 0,
+ USE_PATTERN_DETECT => "PATDET"
+ )
+ port map (
+ A => data1(52 downto 23),
+ ACIN => (others => '0'),
+ ALUMODE => "0000",
+ B => data2(34 downto 17),
+ BCIN => (others => '0'),
+ C => (others => '0'),
+ CARRYCASCIN => '0',
+ CARRYIN => '0',
+ CARRYINSEL => "000",
+ CEA1 => '0',
+ CEA2 => '0',
+ CEAD => '0',
+ CEALUMODE => '0',
+ CEB1 => '0',
+ CEB2 => '0',
+ CEC => '0',
+ CECARRYIN => '0',
+ CECTRL => '0',
+ CED => '0',
+ CEINMODE => '0',
+ CEM => clocken, -- NOTE(review): presumably clocks the default-enabled M register - confirm
+ CEP => '0',
+ CLK => clk,
+ D => (others => '0'),
+ INMODE => "00000",
+ MULTSIGNIN => '0',
+ OPMODE => "1010101", -- NOTE(review): per UG479 this should select P = M + (PCIN >> 17) - confirm
+ P => m11_p, -- bits 23..0 become result(63..40)
+ PATTERNDETECT => p1_pat,
+ PATTERNBDETECT => p1_patb,
+ PCIN => m10_pc,
+ RSTA => '0',
+ RSTALLCARRYIN => '0',
+ RSTALUMODE => '0',
+ RSTB => '0',
+ RSTC => '0',
+ RSTCTRL => '0',
+ RSTD => '0',
+ RSTINMODE => '0',
+ RSTM => '0',
+ RSTP => '0'
+ );
+
+ m_out.result(127 downto 64) <= (others => '0'); -- upper half of the result field is always zero
+ m_out.result(63 downto 40) <= m11_p(23 downto 0); -- top 24 product bits from m11
+ m_out.result(39 downto 23) <= m10_p(16 downto 0); -- middle 17 product bits from m10
+ m_out.result(22 downto 0) <= product_lo; -- low 23 product bits, registered below
+
+ m_out.overflow <= not ((p0_pat and p1_pat) or (p0_patb and p1_patb)); -- overflow unless both slices match the pattern or both match its complement
+
+ process(clk) -- output registers; they update only while stall is '0'
+ begin
+ if rising_edge(clk) and stall = '0' then
+ m_out.valid <= m_in.valid;
+ product_lo <= m01_p(5 downto 0) & m00_p(16 downto 0); -- 6 bits from m01 above 17 bits from m00
+ end if;
+ end process;
+
+end architecture behaviour;