execute1: Add a pipelined 33-bit signed multiplier

author Paul Mackerras <paulus@ozlabs.org>

Sat, 16 Jul 2022 01:49:28 +0000 (11:49 +1000)

committer Paul Mackerras <paulus@ozlabs.org>

Tue, 9 Aug 2022 10:16:28 +0000 (20:16 +1000)
author Paul Mackerras <paulus@ozlabs.org>
Sat, 16 Jul 2022 01:49:28 +0000 (11:49 +1000)
committer Paul Mackerras <paulus@ozlabs.org>
Tue, 9 Aug 2022 10:16:28 +0000 (20:16 +1000)
diff --git a/Makefile b/Makefile

index ebb1b79b304887039ea20ab28c9d655fce02630d..794cbc1085580f5b8cfa51ba8fca7fd239a5840c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -60,9 +60,9 @@ core_files = decode_types.vhdl common.vhdl wishbone_types.vhdl fetch1.vhdl \
         decode1.vhdl helpers.vhdl insn_helpers.vhdl \
         control.vhdl decode2.vhdl register_file.vhdl \
         cr_file.vhdl crhelpers.vhdl ppc_fx_insns.vhdl rotator.vhdl \
-       logical.vhdl countbits.vhdl multiply.vhdl divider.vhdl execute1.vhdl \
-       loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl core_debug.vhdl \
-       core.vhdl fpu.vhdl pmu.vhdl
+       logical.vhdl countbits.vhdl multiply.vhdl multiply-32s.vhdl divider.vhdl \
+       execute1.vhdl loadstore1.vhdl mmu.vhdl dcache.vhdl writeback.vhdl \
+       core_debug.vhdl core.vhdl fpu.vhdl pmu.vhdl
  
  soc_files = wishbone_arbiter.vhdl wishbone_bram_wrapper.vhdl sync_fifo.vhdl \
         wishbone_debug_master.vhdl xics.vhdl syscon.vhdl gpio.vhdl soc.vhdl \
diff --git a/execute1.vhdl b/execute1.vhdl

index 92da2ee39c42ebf1b7b7668e8f26944e5df76957..948bdd613760e3f40ae7593b80df7ce9bd0804a1 100644 (file)
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -85,6 +85,7 @@ architecture behaviour of execute1 is
          write_pmuspr : std_ulogic;
          ramspr_write_even : std_ulogic;
          ramspr_write_odd : std_ulogic;
+        mult_32s : std_ulogic;
      end record;
      constant side_effect_init : side_effect_type := (others => '0');
  
@@ -203,6 +204,8 @@ architecture behaviour of execute1 is
      -- multiply signals
      signal x_to_multiply: MultiplyInputType;
      signal multiply_to_x: MultiplyOutputType;
+    signal x_to_mult_32s: MultiplyInputType;
+    signal mult_32s_to_x: MultiplyOutputType;
  
      -- divider signals
      signal x_to_divider: Execute1ToDividerType;
@@ -411,6 +414,14 @@ begin
              m_out => multiply_to_x
              );
  
+    mult_32s_0: entity work.multiply_32s
+        port map (
+            clk => clk,
+            stall => stage2_stall,
+            m_in => x_to_mult_32s,
+            m_out => mult_32s_to_x
+            );
+
      divider_0: if not HAS_FPU generate
          div_0: entity work.divider
              port map (
@@ -730,14 +741,14 @@ begin
              addend := not addend;
          end if;
  
+        x_to_multiply.data1 <= std_ulogic_vector(abs1);
+        x_to_multiply.data2 <= std_ulogic_vector(abs2);
         x_to_multiply.is_32bit <= e_in.is_32bit;
          x_to_multiply.not_result <= sign1 xor sign2;
          x_to_multiply.addend <= addend;
          x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
          if e_in.is_32bit = '0' then
              -- 64-bit forms
-            x_to_multiply.data1 <= std_ulogic_vector(abs1);
-            x_to_multiply.data2 <= std_ulogic_vector(abs2);
              if e_in.insn_type = OP_DIVE then
                  x_to_divider.is_extended <= '1';
              end if;
@@ -745,8 +756,6 @@ begin
              x_to_divider.divisor <= std_ulogic_vector(abs2);
          else
              -- 32-bit forms
-            x_to_multiply.data1 <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
-            x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
              x_to_divider.is_extended <= '0';
              if e_in.insn_type = OP_DIVE then   -- extended forms
                  x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
@@ -756,6 +765,14 @@ begin
              x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
          end if;
  
+        -- signals to 32-bit multiplier
+        x_to_mult_32s.data1 <= 31x"0" & (a_in(31) and e_in.is_signed) & a_in(31 downto 0);
+        x_to_mult_32s.data2 <= 31x"0" & (b_in(31) and e_in.is_signed) & b_in(31 downto 0);
+        -- The following are unused, but set here to avoid X states
+        x_to_mult_32s.is_32bit <= '1';
+        x_to_mult_32s.not_result <= '0';
+        x_to_mult_32s.addend <= (others => '0');
+
          shortmul_result <= std_ulogic_vector(resize(signed(mshort_p), 64));
          case ex1.mul_select is
              when "00" =>
@@ -1271,7 +1288,11 @@ begin
                 v.se.icache_inval := '1';
  
             when OP_MUL_L64 =>
-                if HAS_SHORT_MULT and e_in.reg_valid3 = '0' and
+                if e_in.is_32bit = '1' then
+                    v.se.mult_32s := '1';
+                    v.res2_sel := "00";
+                    slow_op := '1';
+                elsif HAS_SHORT_MULT and e_in.reg_valid3 = '0' and
                      fits_in_n_bits(a_in, 16) and fits_in_n_bits(b_in, 16) then
                      -- Operands fit into 16 bits, so use short multiplier
                      if e_in.oe = '1' then
@@ -1285,11 +1306,16 @@ begin
                      owait := '1';
                  end if;
  
-           when OP_MUL_H64 | OP_MUL_H32 =>
+           when OP_MUL_H64 =>
                  v.start_mul := '1';
                  slow_op := '1';
                  owait := '1';
  
+            when OP_MUL_H32 =>
+                v.se.mult_32s := '1';
+                v.res2_sel := "01";
+                slow_op := '1';
+
             when OP_DIV | OP_DIVE | OP_MOD =>
                  if not HAS_FPU then
                      v.start_div := '1';
@@ -1370,6 +1396,7 @@ begin
          fv := Execute1ToFPUInit;
  
          x_to_multiply.valid <= '0';
+        x_to_mult_32s.valid <= '0';
          x_to_divider.valid <= '0';
          v.ext_interrupt := '0';
          v.taken_branch_event := '0';
@@ -1456,6 +1483,7 @@ begin
              v.res2_sel := actions.res2_sel;
              v.msr := actions.new_msr;
              x_to_multiply.valid <= actions.start_mul;
+            x_to_mult_32s.valid <= actions.se.mult_32s;
              v.mul_in_progress := actions.start_mul;
              x_to_divider.valid <= actions.start_div;
              v.div_in_progress := actions.start_div;
@@ -1624,11 +1652,6 @@ begin
      -- Second execute stage control
      execute2_1: process(all)
         variable v : reg_stage2_type;
-       variable overflow : std_ulogic;
-        variable lv : Execute1ToLoadstore1Type;
-        variable fv : Execute1ToFPUType;
-        variable k : integer;
-        variable go : std_ulogic;
          variable bypass_valid : std_ulogic;
          variable rcresult : std_ulogic_vector(63 downto 0);
          variable sprres : std_ulogic_vector(63 downto 0);
@@ -1647,6 +1670,14 @@ begin
              v.br_mispredict := ex1.br_mispredict;
          end if;
  
+        if ex1.se.mult_32s = '1' and ex1.oe = '1' then
+            v.e.xerc.ov := mult_32s_to_x.overflow;
+            v.e.xerc.ov32 := mult_32s_to_x.overflow;
+            if mult_32s_to_x.overflow = '1' then
+                v.e.xerc.so := '1';
+            end if;
+        end if;
+
         ctrl_tmp <= ctrl;
         -- FIXME: run at 512MHz not core freq
         ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
@@ -1667,24 +1698,34 @@ begin
              v.e.write_xerc_enable := '0';
              v.e.redirect := '0';
              v.e.br_last := '0';
-            v.se := side_effect_init;
              v.taken_branch_event := '0';
              v.br_mispredict := '0';
          end if;
          if flush_in = '1' then
              v.e.valid := '0';
              v.e.interrupt := '0';
+            v.se := side_effect_init;
              v.ext_interrupt := '0';
          end if;
  
          -- This is split like this because mfspr doesn't have an Rc bit,
          -- and we don't want the zero-detect logic to be after the
          -- SPR mux for timing reasons.
-        if ex1.res2_sel(0) = '0' then
+        if ex1.se.mult_32s = '1' then
+            if ex1.res2_sel(0) = '0' then
+                rcresult := mult_32s_to_x.result(63 downto 0);
+            else
+                rcresult := mult_32s_to_x.result(63 downto 32) &
+                            mult_32s_to_x.result(63 downto 32);
+            end if;
+        elsif ex1.res2_sel(0) = '0' then
              rcresult := ex1.e.write_data;
-            sprres := spr_result;
          else
              rcresult := countbits_result;
+        end if;
+        if ex1.res2_sel(0) = '0' then
+            sprres := spr_result;
+        else
              sprres := pmu_to_x.spr_val;
          end if;
          if ex1.res2_sel(1) = '0' then
@@ -1708,7 +1749,7 @@ begin
              cr_res(31) := sign;
              cr_res(30) := not (sign or zero);
              cr_res(29) := zero;
-            cr_res(28) := ex1.e.xerc.so;
+            cr_res(28) := v.e.xerc.so;
              cr_mask(7) := '1';
          end if;
  
diff --git a/microwatt.core b/microwatt.core

index 4c8695ee1793489399078fde730bbe356ed5220b..b817901f68fea24205267ed6a8e8e974dfcc0f9f 100644 (file)
--- a/microwatt.core
+++ b/microwatt.core
@@ -66,6 +66,7 @@ filesets:
    xilinx_specific:
      files:
        - xilinx-mult.vhdl : {file_type : vhdlSource-2008}
+      - xilinx-mult-32s.vhdl : {file_type : vhdlSource-2008}
        - fpga/fpga-random.vhdl : {file_type : vhdlSource-2008}
        - fpga/fpga-random.xdc : {file_type : xdc}
  
diff --git a/multiply-32s.vhdl b/multiply-32s.vhdl

new file mode 100644 (file)

index 0000000..0639dbf
--- /dev/null
+++ b/multiply-32s.vhdl
@@ -0,0 +1,55 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+-- Signed 33-bit x 33-bit multiplier giving a 64-bit product, with no addend
+-- and a fixed 1-cycle latency (one register stage, held while stall = '1').
+
+entity multiply_32s is
+    port (
+        clk   : in std_logic;
+        stall : in std_ulogic;            -- '1' freezes the internal register
+
+        m_in  : in MultiplyInputType;     -- only valid and bits 32:0 of data1/data2 are read
+        m_out : out MultiplyOutputType    -- result, overflow and valid are driven
+        );
+end entity multiply_32s;
+
+architecture behaviour of multiply_32s is
+    type reg_type is record
+        valid     : std_ulogic;  -- delayed copy of m_in.valid
+        data      : signed(65 downto 0);  -- full product of two 33-bit signed operands
+    end record;
+    constant reg_type_init : reg_type := (valid => '0', data => (others => '0'));
+
+    signal r, rin : reg_type := reg_type_init;  -- r: registered state, rin: next state
+begin
+    multiply_0: process(clk)  -- state register; r keeps its value while stall = '1'
+    begin
+        if rising_edge(clk) and stall = '0' then
+            r <= rin;
+        end if;
+    end process;
+
+    multiply_1: process(all)  -- combinational: next-state product and outputs from r
+        variable v : reg_type;
+        variable d : std_ulogic_vector(63 downto 0);
+       variable ov : std_ulogic;
+    begin
+        v.valid := m_in.valid;
+        v.data := signed(m_in.data1(32 downto 0)) * signed(m_in.data2(32 downto 0));  -- 33b x 33b signed
+
+        d := std_ulogic_vector(r.data(63 downto 0));  -- low 64 bits of the registered product
+
+        ov := (or d(63 downto 31)) and not (and d(63 downto 31));  -- '1' unless bits 63:31 are all 0 or all 1
+
+        m_out.result <= 64x"0" & d;  -- upper 64 bits of the 128-bit result are always zero
+        m_out.overflow <= ov;
+        m_out.valid <= r.valid;  -- valid follows m_in.valid by one unstalled clock
+
+        rin <= v;
+    end process;
+end architecture behaviour;
diff --git a/xilinx-mult-32s.vhdl b/xilinx-mult-32s.vhdl

new file mode 100644 (file)

index 0000000..fde19ae
--- /dev/null
+++ b/xilinx-mult-32s.vhdl
@@ -0,0 +1,293 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+
+library unisim;
+use unisim.vcomponents.all;
+
+-- Signed 33-bit x 33-bit multiplier giving a 64-bit product, with no addend (DSP48E1-based).
+
+entity multiply_32s is
+    port (
+        clk   : in std_logic;
+        stall : in std_ulogic;            -- '1' blocks the register updates and clock enables
+
+        m_in  : in MultiplyInputType;     -- only valid and bits 32:0 of data1/data2 are read
+        m_out : out MultiplyOutputType    -- result, overflow and valid are driven
+        );
+end entity multiply_32s;
+
+architecture behaviour of multiply_32s is
+    signal clocken : std_ulogic;  -- m_in.valid and not stall; drives CEM/CEC on m10 and CEM on m11
+    signal data1 : std_ulogic_vector(52 downto 0);  -- operand 1 sign-extended; split as 22:0 and 52:23
+    signal data2 : std_ulogic_vector(34 downto 0);  -- operand 2 sign-extended; split as 16:0 and 34:17
+    signal m00_p, m01_p : std_ulogic_vector(47 downto 0);
+    signal m00_pc : std_ulogic_vector(47 downto 0);  -- cascade output of m00, fed to PCIN of m01
+    signal m10_p, m11_p : std_ulogic_vector(47 downto 0);
+    signal m10_pc : std_ulogic_vector(47 downto 0);  -- cascade output of m10, fed to PCIN of m11
+    signal p0_pat, p0_patb : std_ulogic;  -- pattern-detect outputs of m10
+    signal p1_pat, p1_patb : std_ulogic;  -- pattern-detect outputs of m11
+    signal product_lo : std_ulogic_vector(22 downto 0);  -- registered result bits 22:0
+
+begin
+    -- sign extend the 33-bit operands to the widths that are split across the DSP slices
+    data1 <= std_ulogic_vector(resize(signed(m_in.data1(32 downto 0)), 53));
+    data2 <= std_ulogic_vector(resize(signed(m_in.data2(32 downto 0)), 35));
+
+    clocken <= m_in.valid and not stall;
+
+    m00: DSP48E1  -- data1(22:0) x data2(16:0); m00_p(16:0) becomes result bits 16:0
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "0000000" & data1(22 downto 0),  -- low part of data1, zero-extended
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & data2(16 downto 0),  -- low part of data2, zero-extended
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0110101",  -- NOTE(review): presumably selects A*B + C - confirm against UG479
+            P => m00_p,
+            PCIN => (others => '0'),
+            PCOUT => m00_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m01: DSP48E1  -- data1(22:0) x data2(34:17) plus cascade from m00; m01_p(5:0) becomes result bits 22:17
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MREG => 0,
+            OPMODEREG => 0,
+            PREG => 0
+            )
+        port map (
+            A => "0000000" & data1(22 downto 0),  -- low part of data1, zero-extended
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => data2(34 downto 17),  -- high part of data2, carries the sign
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => '0',
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "1010101",  -- NOTE(review): presumably selects A*B + (PCIN >> 17) - confirm against UG479
+            P => m01_p,
+            PCIN => m00_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m10: DSP48E1  -- data1(52:23) x data2(16:0) plus m01_p(38:6) via C; m10_p(16:0) becomes result bits 39:23
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 1,
+            INMODEREG => 0,
+            MASK => x"fffffffe00ff",  -- NOTE(review): presumably leaves only P bits 16:8 in the pattern compare - confirm
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_PATTERN_DETECT => "PATDET"
+            )
+        port map (
+            A => data1(52 downto 23),  -- high part of data1, carries the sign
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => '0' & data2(16 downto 0),  -- low part of data2, zero-extended
+            BCIN => (others => '0'),
+            C => std_ulogic_vector(resize(signed(m01_p(38 downto 6)), 48)),  -- sign-extended upper bits of the m01 sum
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => clocken,  -- C register (CREG => 1) loads only for a valid, unstalled input
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => clocken,  -- MREG is not set in this generic map, so the primitive default applies
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "0110101",  -- NOTE(review): presumably selects A*B + C - confirm against UG479
+            P => m10_p,
+            PATTERNDETECT => p0_pat,
+            PATTERNBDETECT => p0_patb,
+            PCIN => (others => '0'),
+            PCOUT => m10_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m11: DSP48E1  -- data1(52:23) x data2(34:17) plus cascade from m10; m11_p(23:0) becomes result bits 63:40
+        generic map (
+            ACASCREG => 0,
+            ALUMODEREG => 0,
+            AREG => 0,
+            BCASCREG => 0,
+            BREG => 0,
+            CARRYINREG => 0,
+            CARRYINSELREG => 0,
+            CREG => 0,
+            INMODEREG => 0,
+            MASK => x"fffffc000000",  -- NOTE(review): presumably leaves only P bits 25:0 in the pattern compare - confirm
+            OPMODEREG => 0,
+            PREG => 0,
+            USE_PATTERN_DETECT => "PATDET"
+            )
+        port map (
+            A => data1(52 downto 23),  -- high part of data1, carries the sign
+            ACIN => (others => '0'),
+            ALUMODE => "0000",
+            B => data2(34 downto 17),  -- high part of data2, carries the sign
+            BCIN => (others => '0'),
+            C => (others => '0'),
+            CARRYCASCIN => '0',
+            CARRYIN => '0',
+            CARRYINSEL => "000",
+            CEA1 => '0',
+            CEA2 => '0',
+            CEAD => '0',
+            CEALUMODE => '0',
+            CEB1 => '0',
+            CEB2 => '0',
+            CEC => '0',
+            CECARRYIN => '0',
+            CECTRL => '0',
+            CED => '0',
+            CEINMODE => '0',
+            CEM => clocken,  -- MREG is not set in this generic map, so the primitive default applies
+            CEP => '0',
+            CLK => clk,
+            D => (others => '0'),
+            INMODE => "00000",
+            MULTSIGNIN => '0',
+            OPMODE => "1010101",  -- NOTE(review): presumably selects A*B + (PCIN >> 17) - confirm against UG479
+            P => m11_p,
+            PATTERNDETECT => p1_pat,
+            PATTERNBDETECT => p1_patb,
+            PCIN => m10_pc,
+            RSTA => '0',
+            RSTALLCARRYIN => '0',
+            RSTALUMODE => '0',
+            RSTB => '0',
+            RSTC => '0',
+            RSTCTRL => '0',
+            RSTD => '0',
+            RSTINMODE => '0',
+            RSTM => '0',
+            RSTP => '0'
+            );
+
+    m_out.result(127 downto 64) <= (others => '0');  -- only a 64-bit product is produced
+    m_out.result(63 downto 40) <= m11_p(23 downto 0);
+    m_out.result(39 downto 23) <= m10_p(16 downto 0);
+    m_out.result(22 downto 0)  <= product_lo;
+
+    m_out.overflow <= not ((p0_pat and p1_pat) or (p0_patb and p1_patb));  -- clear only if both slices flag PATTERNDETECT or both flag PATTERNBDETECT
+
+    process(clk)  -- registers valid and the low product bits; both hold while stall = '1'
+    begin
+        if rising_edge(clk) and stall = '0' then
+            m_out.valid <= m_in.valid;
+            product_lo <= m01_p(5 downto 0) & m00_p(16 downto 0);  -- result bits 22:17 and 16:0
+        end if;
+    end process;
+
+end architecture behaviour;
author	Paul Mackerras <paulus@ozlabs.org>
	Sat, 16 Jul 2022 01:49:28 +0000 (11:49 +1000)
committer	Paul Mackerras <paulus@ozlabs.org>
	Tue, 9 Aug 2022 10:16:28 +0000 (20:16 +1000)
Makefile		patch \| blob \| history
execute1.vhdl		patch \| blob \| history
microwatt.core		patch \| blob \| history
multiply-32s.vhdl	[new file with mode: 0644]	patch \| blob
xilinx-mult-32s.vhdl	[new file with mode: 0644]	patch \| blob