FPU: Implement fmul[s]

author Paul Mackerras <paulus@ozlabs.org>

Mon, 27 Jul 2020 08:27:50 +0000 (18:27 +1000)

committer Paul Mackerras <paulus@ozlabs.org>

Thu, 3 Sep 2020 07:45:07 +0000 (17:45 +1000)
author Paul Mackerras <paulus@ozlabs.org>
Mon, 27 Jul 2020 08:27:50 +0000 (18:27 +1000)
committer Paul Mackerras <paulus@ozlabs.org>
Thu, 3 Sep 2020 07:45:07 +0000 (17:45 +1000)
diff --git a/decode1.vhdl b/decode1.vhdl

index 737d83cba33657fbe84403124d327b66ad8fe791..721c478361833f92cee46ccf8b453e3696a7a7d3 100644 (file)
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -418,6 +418,7 @@ architecture behaviour of decode1 is
          2#01110#  =>  (FPU,   OP_FPOP_I,     NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fcfid[u]s
          2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
          2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
+        2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
          others => illegal_inst
          );
  
@@ -470,6 +471,7 @@ architecture behaviour of decode1 is
          --                         op                               in   out   A   out  in    out  len        ext                                pipe
          2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
          2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
+        2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
          others => illegal_inst
          );
  
diff --git a/decode2.vhdl b/decode2.vhdl

index ec8232f570a1226db6d9170af3dffd68f2e039a2..9443212bef583e0672b91a7473543c3762d10193 100644 (file)
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -152,6 +152,12 @@ architecture behaviour of decode2 is
                  else
                      return ('0', (others => '0'), (others => '0'));
                  end if;
+            when FRC =>
+                if HAS_FPU then
+                    return ('1', fpr_to_gspr(insn_frc(insn_in)), reg_data);
+                else
+                    return ('0', (others => '0'), (others => '0'));
+                end if;
              when NONE =>
                  return ('0', (others => '0'), (others => '0'));
          end case;
@@ -308,6 +314,7 @@ begin
                         else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU
                         else gpr_to_gspr(insn_rb(d_in.insn));
      r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR
+                       else fpr_to_gspr(insn_frc(d_in.insn)) when d_in.decode.input_reg_c = FRC and HAS_FPU
                         else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU
                         else gpr_to_gspr(insn_rs(d_in.insn));
  
diff --git a/decode_types.vhdl b/decode_types.vhdl

index 08fdc4a2fc95420d5ab88287c7d118336882b3da..72609bf07a5e6d4ab10e628c7a987bce17777d6f 100644 (file)
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -26,7 +26,7 @@ package decode_types is
      type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA);
      type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD,
                             CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB);
-    type input_reg_c_t is (NONE, RS, RCR, FRS);
+    type input_reg_c_t is (NONE, RS, RCR, FRC, FRS);
      type output_reg_a_t is (NONE, RT, RA, SPR, FRT);
      type rc_t is (NONE, ONE, RC);
      type carry_in_t is (ZERO, CA, OV, ONE);
diff --git a/fpu.vhdl b/fpu.vhdl

index e9edfb45f4c79949c45a422318bbc8459d97f79e..209daa0735acecf3ac2e23e39b153f09bcce9049 100644 (file)
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -40,15 +40,18 @@ architecture behaviour of fpu is
                       DO_FMR, DO_FMRG,
                       DO_FCFID, DO_FCTI,
                       DO_FRSP, DO_FRI,
-                     DO_FADD,
+                     DO_FADD, DO_FMUL,
                       FRI_1,
                       ADD_SHIFT, ADD_2, ADD_3,
+                     MULT_1,
                       INT_SHIFT, INT_ROUND, INT_ISHIFT,
                       INT_FINAL, INT_CHECK, INT_OFLOW,
                       FINISH, NORMALIZE,
                       ROUND_UFLOW, ROUND_OFLOW,
                       ROUNDING, ROUNDING_2, ROUNDING_3,
-                     DENORM);
+                     DENORM,
+                     RENORM_A, RENORM_A2,
+                     RENORM_C, RENORM_C2);
  
      type reg_type is record
          state        : state_t;
@@ -65,8 +68,10 @@ architecture behaviour of fpu is
          fpscr        : std_ulogic_vector(31 downto 0);
          a            : fpu_reg_type;
          b            : fpu_reg_type;
+        c            : fpu_reg_type;
          r            : std_ulogic_vector(63 downto 0);  -- 10.54 format
          x            : std_ulogic;
+        p            : std_ulogic_vector(63 downto 0);  -- 8.56 format
          result_sign  : std_ulogic;
          result_class : fp_number_class;
          result_exp   : signed(EXP_BITS-1 downto 0);
@@ -84,6 +89,8 @@ architecture behaviour of fpu is
          is_subtract  : std_ulogic;
          exp_cmp      : std_ulogic;
          add_bsmall   : std_ulogic;
+        is_multiply  : std_ulogic;
+        first        : std_ulogic;
      end record;
  
      signal r, rin : reg_type;
@@ -103,11 +110,17 @@ architecture behaviour of fpu is
      signal r_hi_nz       : std_ulogic;
      signal r_lo_nz       : std_ulogic;
      signal misc_sel      : std_ulogic_vector(3 downto 0);
+    signal f_to_multiply : MultiplyInputType;
+    signal multiply_to_f : MultiplyOutputType;
+    signal msel_1        : std_ulogic_vector(1 downto 0);
+    signal msel_2        : std_ulogic_vector(1 downto 0);
+    signal msel_inv      : std_ulogic;
  
      -- opsel values
      constant AIN_R    : std_ulogic_vector(1 downto 0) := "00";
      constant AIN_A    : std_ulogic_vector(1 downto 0) := "01";
      constant AIN_B    : std_ulogic_vector(1 downto 0) := "10";
+    constant AIN_C    : std_ulogic_vector(1 downto 0) := "11";
  
      constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
      constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
@@ -115,8 +128,17 @@ architecture behaviour of fpu is
  
      constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
      constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
+    constant RES_MULT  : std_ulogic_vector(1 downto 0) := "10";
      constant RES_MISC  : std_ulogic_vector(1 downto 0) := "11";
  
+    -- msel values
+    constant MUL1_A : std_ulogic_vector(1 downto 0) := "00";
+    constant MUL1_B : std_ulogic_vector(1 downto 0) := "01";
+    constant MUL1_R : std_ulogic_vector(1 downto 0) := "11";
+
+    constant MUL2_C   : std_ulogic_vector(1 downto 0) := "00";
+    constant MUL2_R   : std_ulogic_vector(1 downto 0) := "11";
+
      -- Left and right shifter with 120 bit input and 64 bit output.
      -- Shifts inp left by shift bits and returns the upper 64 bits of
      -- the result.  The shift parameter is interpreted as a signed
@@ -313,6 +335,13 @@ architecture behaviour of fpu is
      end;
  
  begin
+    fpu_multiply_0: entity work.multiply
+        port map (
+            clk => clk,
+            m_in => f_to_multiply,
+            m_out => multiply_to_f
+            );
+
      fpu_0: process(clk)
      begin
          if rising_edge(clk) then
@@ -347,6 +376,7 @@ begin
          variable v           : reg_type;
          variable adec        : fpu_reg_type;
          variable bdec        : fpu_reg_type;
+        variable cdec        : fpu_reg_type;
          variable fpscr_mask  : std_ulogic_vector(31 downto 0);
          variable illegal     : std_ulogic;
          variable j, k        : integer;
@@ -377,6 +407,10 @@ begin
          variable is_add      : std_ulogic;
          variable qnan_result : std_ulogic;
          variable longmask    : std_ulogic;
+        variable set_a       : std_ulogic;
+        variable set_c       : std_ulogic;
+        variable px_nz       : std_ulogic;
+        variable maddend     : std_ulogic_vector(127 downto 0);
      begin
          v := r;
          illegal := '0';
@@ -407,11 +441,15 @@ begin
              v.denorm := '0';
              v.round_mode := '0' & r.fpscr(FPSCR_RN+1 downto FPSCR_RN);
              v.is_subtract := '0';
+            v.is_multiply := '0';
              v.add_bsmall := '0';
              adec := decode_dp(e_in.fra, int_input);
              bdec := decode_dp(e_in.frb, int_input);
+            cdec := decode_dp(e_in.frc, int_input);
              v.a := adec;
              v.b := bdec;
+            v.c := cdec;
+
              v.exp_cmp := '0';
              if adec.exponent > bdec.exponent then
                  v.exp_cmp := '1';
@@ -440,10 +478,14 @@ begin
              exp_huge := '1';
          end if;
  
+        -- Compare P with zero
+        px_nz := or (r.p(57 downto 4));
+
          v.writing_back := '0';
          v.instr_done := '0';
          v.update_fprf := '0';
          v.shift := to_signed(0, EXP_BITS);
+        v.first := '0';
          opsel_a <= AIN_R;
          opsel_ainv <= '0';
          opsel_amask <= '0';
@@ -460,6 +502,13 @@ begin
          set_x := '0';
          qnan_result := '0';
          longmask := r.single_prec;
+        set_a := '0';
+        set_c := '0';
+        f_to_multiply.is_32bit <= '0';
+        f_to_multiply.valid <= '0';
+        msel_1 <= MUL1_A;
+        msel_2 <= MUL2_C;
+        msel_inv <= '0';
  
          case r.state is
              when IDLE =>
@@ -503,6 +552,9 @@ begin
                              v.state := DO_FCTI;
                          when "10100" | "10101" =>
                              v.state := DO_FADD;
+                        when "11001" =>
+                            v.is_multiply := '1';
+                            v.state := DO_FMUL;
                          when others =>
                              illegal := '1';
                      end case;
@@ -795,6 +847,81 @@ begin
                      arith_done := '1';
                  end if;
  
+            when DO_FMUL =>
+                -- fmul[s]
+                opsel_a <= AIN_A;
+                v.result_sign := r.a.negative;
+                v.result_class := r.a.class;
+                v.result_exp := r.a.exponent;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.a.class = FINITE and r.c.class = FINITE then
+                    v.result_sign := r.a.negative xor r.c.negative;
+                    v.result_exp := r.a.exponent + r.c.exponent;
+                    -- Renormalize denorm operands
+                    if r.a.mantissa(54) = '0' then
+                        v.state := RENORM_A;
+                    elsif r.c.mantissa(54) = '0' then
+                        opsel_a <= AIN_C;
+                        v.state := RENORM_C;
+                    else
+                        f_to_multiply.valid <= '1';
+                        v.state := MULT_1;
+                    end if;
+                else
+                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
+                        (r.c.class = NAN and r.c.mantissa(53) = '0') then
+                        -- Signalling NAN
+                        v.fpscr(FPSCR_VXSNAN) := '1';
+                        invalid := '1';
+                    end if;
+                    if r.a.class = NAN then
+                    -- result is A
+                    elsif r.c.class = NAN then
+                        v.result_class := NAN;
+                        v.result_sign := r.c.negative;
+                        opsel_a <= AIN_C;
+                    elsif (r.a.class = INFINITY and r.c.class = ZERO) or
+                        (r.a.class = ZERO and r.c.class = INFINITY) then
+                        -- invalid operation, construct QNaN
+                        v.fpscr(FPSCR_VXIMZ) := '1';
+                        qnan_result := '1';
+                    elsif r.a.class = ZERO or r.a.class = INFINITY then
+                        -- result is +/- A
+                        v.result_sign := r.a.negative xor r.c.negative;
+                    else
+                        -- r.c.class is ZERO or INFINITY
+                        v.result_class := r.c.class;
+                        v.result_sign := r.a.negative xor r.c.negative;
+                    end if;
+                    arith_done := '1';
+                end if;
+
+            when RENORM_A =>
+                renormalize := '1';
+                v.state := RENORM_A2;
+
+            when RENORM_A2 =>
+                set_a := '1';
+                v.result_exp := new_exp;
+                opsel_a <= AIN_C;
+                if r.c.mantissa(54) = '1' then
+                    v.first := '1';
+                    v.state := MULT_1;
+                else
+                    v.state := RENORM_C;
+                end if;
+
+            when RENORM_C =>
+                renormalize := '1';
+                v.state := RENORM_C2;
+
+            when RENORM_C2 =>
+                set_c := '1';
+                v.result_exp := new_exp;
+                v.first := '1';
+                v.state := MULT_1;
+
              when ADD_SHIFT =>
                  opsel_r <= RES_SHIFT;
                  set_x := '1';
@@ -848,6 +975,13 @@ begin
                      v.state := NORMALIZE;
                  end if;
  
+            when MULT_1 =>
+                f_to_multiply.valid <= r.first;
+                opsel_r <= RES_MULT;
+                if multiply_to_f.valid = '1' then
+                    v.state := FINISH;
+                end if;
+
              when INT_SHIFT =>
                  opsel_r <= RES_SHIFT;
                  set_x := '1';
@@ -930,6 +1064,9 @@ begin
                  v.state := ROUNDING;
  
              when FINISH =>
+                if r.is_multiply = '1' and px_nz = '1' then
+                    v.x := '1';
+                end if;
                  if r.r(63 downto 54) /= "0000000001" then
                      renormalize := '1';
                      v.state := NORMALIZE;
@@ -1099,6 +1236,32 @@ begin
              update_fx := '1';
          end if;
  
+        -- Multiplier data path
+        case msel_1 is
+            when MUL1_A =>
+                f_to_multiply.data1 <= r.a.mantissa(61 downto 0) & "00";
+            when MUL1_B =>
+                f_to_multiply.data1 <= r.b.mantissa(61 downto 0) & "00";
+            when others =>
+                f_to_multiply.data1 <= r.r(61 downto 0) & "00";
+        end case;
+        case msel_2 is
+            when MUL2_C =>
+                f_to_multiply.data2 <= r.c.mantissa(61 downto 0) & "00";
+            when others =>
+                f_to_multiply.data2 <= r.r(61 downto 0) & "00";
+        end case;
+        maddend := (others => '0');
+        if msel_inv = '1' then
+            f_to_multiply.addend <= not maddend;
+        else
+            f_to_multiply.addend <= maddend;
+        end if;
+        f_to_multiply.not_result <= msel_inv;
+        if multiply_to_f.valid = '1' then
+            v.p := multiply_to_f.result(63 downto 0);
+        end if;
+
          -- Data path.
          -- This has A and B input multiplexers, an adder, a shifter,
          -- count-leading-zeroes logic, and a result mux.
@@ -1119,8 +1282,10 @@ begin
                  in_a0 := r.r;
              when AIN_A =>
                  in_a0 := r.a.mantissa;
-            when others =>
+            when AIN_B =>
                  in_a0 := r.b.mantissa;
+            when others =>
+                in_a0 := r.c.mantissa;
          end case;
          if (or (mask and in_a0)) = '1' and set_x = '1' then
              v.x := '1';
@@ -1157,6 +1322,8 @@ begin
                  result <= std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
              when RES_SHIFT =>
                  result <= shift_res;
+            when RES_MULT =>
+                result <= multiply_to_f.result(121 downto 58);
              when others =>
                  case misc_sel is
                      when "0000" =>
@@ -1207,6 +1374,15 @@ begin
          end case;
          v.r := result;
  
+        if set_a = '1' then
+            v.a.exponent := new_exp;
+            v.a.mantissa := shift_res;
+        end if;
+        if set_c = '1' then
+            v.c.exponent := new_exp;
+            v.c.mantissa := shift_res;
+        end if;
+
          if opsel_r = RES_SHIFT then
              v.result_exp := new_exp;
          end if;
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c

index 8f7407a7b46996ee9b98742304b0167ab3794e1d..305359a67c996d4b3d326ca70ff386d09de87648 100644 (file)
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -205,6 +205,7 @@ struct sp_dp_equiv {
         { 0x00200000, 0x37f0000000000000 },
         { 0x00000002, 0x36b0000000000000 },
         { 0x00000001, 0x36a0000000000000 },
+       { 0x7f7fffff, 0x47efffffe0000000 },
  };
  
  int sp_to_dp(long arg)
@@ -995,6 +996,83 @@ int fpu_test_14(void)
         return trapit(0, test14);
  }
  
+struct mulvals {  /* one double-precision fmul test vector, stored as raw 64-bit patterns */
+       unsigned long val_a;  /* first factor; test15 loads it with lfd from offset 0 */
+       unsigned long val_b;  /* second factor; test15 loads it with lfd from offset 8 of &val_a (assumes 8-byte unsigned long) */
+       unsigned long prod;  /* bit pattern the stored fmul result is compared against */
+} mulvals[] = {
+       { 0x0000000000000000, 0x0000000000000000, 0x0000000000000000 },  /* +0 * +0 = +0 */
+       { 0x8000000000000000, 0x8000000000000000, 0x0000000000000000 },  /* -0 * -0 = +0 */
+       { 0x3ff0000000000000, 0x3ff0000000000000, 0x3ff0000000000000 },  /* 1.0 * 1.0 = 1.0 */
+       { 0xbff0000000000000, 0x3ff0000000000000, 0xbff0000000000000 },  /* -1.0 * 1.0 = -1.0 */
+       { 0xbf4fff801fffffff, 0x6d7fffff8000007f, 0xecdfff7fa001fffe },  /* general normal operands with opposite signs; result is negative */
+       { 0x3fbd50275a65ed80, 0x0010000000000000, 0x0001d50275a65ed8 },  /* fraction times smallest normal; expected result is a denormal */
+};
+
+int test15(long arg)  /* checks fmul against every mulvals[] entry; arg is not used */
+{
+       long i;
+       unsigned long result;  /* written by the stfd in the asm below */
+       struct mulvals *vp = mulvals;
+
+       set_fpscr(FPS_RN_NEAR);  /* presumably selects round-to-nearest; FPS_RN_NEAR is defined elsewhere */
+       for (i = 0; i < sizeof(mulvals) / sizeof(mulvals[0]); ++i, ++vp) {
+               asm("lfd 5,0(%0); lfd 6,8(%0); fmul 7,5,6; stfd 7,0(%1)"  /* f5 = val_a, f6 = val_b, f7 = f5 * f6, result = f7 */
+                   : : "b" (&vp->val_a), "b" (&result) : "memory");  /* NOTE(review): only "memory" is listed as a clobber; f5-f7 are not declared - confirm this is intended */
+               if (result != vp->prod) {
+                       print_hex(i, 2, " ");  /* report the index of the failing vector */
+                       print_hex(result, 16, " ");  /* report the value actually produced */
+                       return i + 1;  /* non-zero: 1-based index of the first mismatch */
+               }
+       }
+       return 0;  /* every vector matched */
+}
+
+int fpu_test_15(void)  /* entry point handed to do_test(15, ...) in main */
+{
+       enable_fp();  /* defined elsewhere; presumably enables floating-point instructions before the test runs */
+       return trapit(0, test15);  /* runs test15 through trapit with argument 0 and returns trapit's result */
+}
+
+struct mulvals_sp {  /* one single-precision fmuls test vector, stored as raw 32-bit patterns */
+       unsigned int val_a;  /* first factor; test16 loads it with lfs from offset 0 */
+       unsigned int val_b;  /* second factor; test16 loads it with lfs from offset 4 of &val_a (assumes 4-byte unsigned int) */
+       unsigned int prod;  /* bit pattern the stored fmuls result is compared against */
+} mulvals_sp[] = {
+       { 0x00000000, 0x00000000, 0x00000000 },  /* +0 * +0 = +0 */
+       { 0x80000000, 0x80000000, 0x00000000 },  /* -0 * -0 = +0 */
+       { 0x3f800000, 0x3f800000, 0x3f800000 },  /* 1.0 * 1.0 = 1.0 */
+       { 0xbf800000, 0x3f800000, 0xbf800000 },  /* -1.0 * 1.0 = -1.0 */
+       { 0xbe7ff801, 0x6d7fffff, 0xec7ff800 },  /* general normal operands with opposite signs; result is negative */
+       { 0xc100003d, 0xfe803ff8, 0x7f800000 },  /* two large-magnitude negatives; expected result is +infinity (overflow) */
+       { 0x4f780080, 0x389003ff, 0x488b8427 },  /* general positive normal operands */
+};
+
+int test16(long arg)  /* checks fmuls against every mulvals_sp[] entry; arg is not used */
+{
+       long i;
+       unsigned int result;  /* written by the stfs in the asm below */
+       struct mulvals_sp *vp = mulvals_sp;
+
+       set_fpscr(FPS_RN_NEAR);  /* presumably selects round-to-nearest; FPS_RN_NEAR is defined elsewhere */
+       for (i = 0; i < sizeof(mulvals_sp) / sizeof(mulvals_sp[0]); ++i, ++vp) {
+               asm("lfs 5,0(%0); lfs 6,4(%0); fmuls 7,5,6; stfs 7,0(%1)"  /* f5 = val_a, f6 = val_b, f7 = f5 * f6, result = f7 */
+                   : : "b" (&vp->val_a), "b" (&result) : "memory");  /* NOTE(review): only "memory" is listed as a clobber; f5-f7 are not declared - confirm this is intended */
+               if (result != vp->prod) {
+                       print_hex(i, 2, " ");  /* report the index of the failing vector */
+                       print_hex(result, 8, " ");  /* report the value actually produced */
+                       return i + 1;  /* non-zero: 1-based index of the first mismatch */
+               }
+       }
+       return 0;  /* every vector matched */
+}
+
+int fpu_test_16(void)  /* entry point handed to do_test(16, ...) in main */
+{
+       enable_fp();  /* defined elsewhere; presumably enables floating-point instructions before the test runs */
+       return trapit(0, test16);  /* runs test16 through trapit with argument 0 and returns trapit's result */
+}
+
  int fail = 0;
  
  void do_test(int num, int (*test)(void))
@@ -1034,6 +1112,8 @@ int main(void)
         do_test(12, fpu_test_12);
         do_test(13, fpu_test_13);
         do_test(14, fpu_test_14);
+       do_test(15, fpu_test_15);
+       do_test(16, fpu_test_16);
  
         return fail;
  }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin

index 623db3f690b0aad69472a9a8740964239799983c..1e0e29e0c174fe7fcc993071b35fef6086d1a0c0 100755 (executable)

Binary files a/tests/test_fpu.bin and b/tests/test_fpu.bin differ
diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out

index 440cd77cf854e440012ed590b16b74ef2ae35383..04c6c08ad2d8a821705a21cfa699d6fbef092b25 100644 (file)
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -12,3 +12,5 @@ test 11:PASS
  test 12:PASS\r
  test 13:PASS\r
  test 14:PASS\r
+test 15:PASS\r
+test 16:PASS\r
author	Paul Mackerras <paulus@ozlabs.org>
	Mon, 27 Jul 2020 08:27:50 +0000 (18:27 +1000)
committer	Paul Mackerras <paulus@ozlabs.org>
	Thu, 3 Sep 2020 07:45:07 +0000 (17:45 +1000)
decode1.vhdl		patch \| blob \| history
decode2.vhdl		patch \| blob \| history
decode_types.vhdl		patch \| blob \| history
fpu.vhdl		patch \| blob \| history
tests/fpu/fpu.c		patch \| blob \| history
tests/test_fpu.bin		patch \| blob \| history
tests/test_fpu.console_out		patch \| blob \| history