FPU: Implement fsqrt[s] and add a test for fsqrt

author Paul Mackerras <paulus@ozlabs.org>

Fri, 31 Jul 2020 02:02:55 +0000 (12:02 +1000)

committer Paul Mackerras <paulus@ozlabs.org>

Thu, 3 Sep 2020 07:45:34 +0000 (17:45 +1000)
author Paul Mackerras <paulus@ozlabs.org>
Fri, 31 Jul 2020 02:02:55 +0000 (12:02 +1000)
committer Paul Mackerras <paulus@ozlabs.org>
Thu, 3 Sep 2020 07:45:34 +0000 (17:45 +1000)
diff --git a/decode1.vhdl b/decode1.vhdl

index 7163ff99b0d2ebfe80b25d093bfb01b1cff9b481..e821469432cb1399860de2bcb3b3091a5062de3d 100644 (file)
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -419,6 +419,7 @@ architecture behaviour of decode1 is
          2#10010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fdivs
          2#10100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsubs
          2#10101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fadds
+        2#10110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fsqrts
          2#11000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fres
          2#11001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- fmuls
          2#11010#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- frsqrtes
@@ -477,6 +478,7 @@ architecture behaviour of decode1 is
          2#0010#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fdiv
          2#0100#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsub
          2#0101#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fadd
+        2#0110#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsqrt
          2#0111#  =>  (FPU,   OP_FPOP,       FRA,  FRB,  FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fsel
          2#1000#  =>  (FPU,   OP_FPOP,       NONE, FRB,  NONE, FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fre
          2#1001#  =>  (FPU,   OP_FPOP,       FRA,  NONE, FRC,  FRT,  '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- fmul
diff --git a/fpu.vhdl b/fpu.vhdl

index 0cbd43fabd91194229c13df1f54444521180145f..244454ef7e0e1b349c6975e78dd9268be7585a39 100644 (file)
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -40,7 +40,7 @@ architecture behaviour of fpu is
                       DO_FMR, DO_FMRG, DO_FCMP,
                       DO_FCFID, DO_FCTI,
                       DO_FRSP, DO_FRI,
-                     DO_FADD, DO_FMUL, DO_FDIV,
+                     DO_FADD, DO_FMUL, DO_FDIV, DO_FSQRT,
                       DO_FRE, DO_FRSQRTE,
                       DO_FSEL,
                       FRI_1,
@@ -51,6 +51,9 @@ architecture behaviour of fpu is
                       DIV_2, DIV_3, DIV_4, DIV_5, DIV_6,
                       FRE_1,
                       RSQRT_1,
+                     SQRT_1, SQRT_2, SQRT_3, SQRT_4,
+                     SQRT_5, SQRT_6, SQRT_7, SQRT_8,
+                     SQRT_9, SQRT_10, SQRT_11, SQRT_12,
                       INT_SHIFT, INT_ROUND, INT_ISHIFT,
                       INT_FINAL, INT_CHECK, INT_OFLOW,
                       FINISH, NORMALIZE,
@@ -140,6 +143,7 @@ architecture behaviour of fpu is
      constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
      constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
      constant BIN_MASK : std_ulogic_vector(1 downto 0) := "10";
+    constant BIN_PS6  : std_ulogic_vector(1 downto 0) := "11";
  
      constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
      constant RES_SHIFT : std_ulogic_vector(1 downto 0) := "01";
@@ -604,6 +608,7 @@ begin
          variable pshift      : std_ulogic;
          variable renorm_sqrt : std_ulogic;
          variable sqrt_exp    : signed(EXP_BITS-1 downto 0);
+        variable shiftin     : std_ulogic;
      begin
          v := r;
          illegal := '0';
@@ -717,6 +722,7 @@ begin
          set_y := '0';
          pshift := '0';
          renorm_sqrt := '0';
+        shiftin := '0';
          case r.state is
              when IDLE =>
                  if e_in.valid = '1' then
@@ -765,6 +771,9 @@ begin
                              v.state := DO_FDIV;
                          when "10100" | "10101" =>
                              v.state := DO_FADD;
+                        when "10110" =>
+                            v.is_sqrt := '1';
+                            v.state := DO_FSQRT;
                          when "10111" =>
                              v.state := DO_FSEL;
                          when "11000" =>
@@ -1248,6 +1257,43 @@ begin
                  v.quieten_nan := '0';
                  arith_done := '1';
  
+            when DO_FSQRT =>
+                opsel_a <= AIN_B;
+                v.result_class := r.b.class;
+                v.result_sign := r.b.negative;
+                v.fpscr(FPSCR_FR) := '0';
+                v.fpscr(FPSCR_FI) := '0';
+                if r.b.class = NAN and r.b.mantissa(53) = '0' then
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                case r.b.class is
+                    when FINITE =>
+                        v.result_exp := r.b.exponent;
+                        if r.b.negative = '1' then
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                            arith_done := '1';
+                        elsif r.b.mantissa(54) = '0' then
+                            v.state := RENORM_B;
+                        elsif r.b.exponent(0) = '0' then
+                            v.state := SQRT_1;
+                        else
+                            v.shift := to_signed(1, EXP_BITS);
+                            v.state := RENORM_B2;
+                        end if;
+                    when NAN | ZERO =>
+                        -- result is B
+                        arith_done := '1';
+                    when INFINITY =>
+                        if r.b.negative = '1' then
+                            v.fpscr(FPSCR_VXSQRT) := '1';
+                            qnan_result := '1';
+                        -- else result is B
+                        end if;
+                        arith_done := '1';
+                end case;
+
              when DO_FRE =>
                  opsel_a <= AIN_B;
                  v.result_class := r.b.class;
@@ -1454,7 +1500,11 @@ begin
                  -- wait one cycle for inverse_table[B] lookup
                  v.first := '1';
                  if r.insn(4) = '0' then
-                    v.state := DIV_2;
+                    if r.insn(3) = '0' then
+                        v.state := DIV_2;
+                    else
+                        v.state := SQRT_1;
+                    end if;
                  elsif r.insn(2) = '0' then
                      v.state := FRE_1;
                  else
@@ -1545,6 +1595,156 @@ begin
                  v.shift := to_signed(1, EXP_BITS);
                  v.state := NORMALIZE;
  
+            when SQRT_1 =>
+                -- put invsqr[B] in R and compute P = invsqr[B] * B
+                -- also transfer B (in R) to A
+                set_a := '1';
+                opsel_r <= RES_MISC;
+                misc_sel <= "0111";
+                msel_1 <= MUL1_B;
+                msel_2 <= MUL2_LUT;
+                f_to_multiply.valid <= '1';
+                v.shift := to_signed(-1, EXP_BITS);
+                v.count := "00";
+                v.state := SQRT_2;
+
+            when SQRT_2 =>
+                -- shift R right one place
+                -- not expecting multiplier result yet
+                opsel_r <= RES_SHIFT;
+                v.first := '1';
+                v.state := SQRT_3;
+
+            when SQRT_3 =>
+                -- put R into Y, wait for product from multiplier
+                msel_2 <= MUL2_R;
+                set_y := r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    -- put result into R
+                    opsel_r <= RES_MULT;
+                    v.first := '1';
+                    v.state := SQRT_4;
+                end if;
+
+            when SQRT_4 =>
+                -- compute 1.5 - Y * P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                msel_add <= MULADD_CONST;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    v.state := SQRT_5;
+                end if;
+
+            when SQRT_5 =>
+                -- compute Y = Y * P
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= '1';
+                v.first := '1';
+                v.state := SQRT_6;
+
+            when SQRT_6 =>
+                -- pipeline in R = R * P
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_P;
+                f_to_multiply.valid <= r.first;
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.state := SQRT_7;
+                end if;
+
+            when SQRT_7 =>
+                -- first multiply is done, put result in Y
+                msel_2 <= MUL2_P;
+                set_y := r.first;
+                -- wait for second multiply (should be here already)
+                pshift := '1';
+                if multiply_to_f.valid = '1' then
+                    -- put result into R
+                    opsel_r <= RES_MULT;
+                    v.first := '1';
+                    v.count := r.count + 1;
+                    if r.count < 2 then
+                        v.state := SQRT_4;
+                    else
+                        v.first := '1';
+                        v.state := SQRT_8;
+                    end if;
+                end if;
+
+            when SQRT_8 =>
+                -- compute P = A - R * R, which can be +ve or -ve
+                -- we arranged for B to be put into A earlier
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.first := '1';
+                    v.state := SQRT_9;
+                end if;
+
+            when SQRT_9 =>
+                -- compute P = P * Y
+                -- since Y is an estimate of 1/sqrt(B), this makes P an
+                -- estimate of the adjustment needed to R.  Since the error
+                -- could be negative and we have an unsigned multiplier, the
+                -- upper bits can be wrong, but it turns out the lowest 8 bits
+                -- are correct and are all we need (given 3 iterations through
+                -- SQRT_4 to SQRT_7).
+                msel_1 <= MUL1_Y;
+                msel_2 <= MUL2_P;
+                pshift := '1';
+                f_to_multiply.valid <= r.first;
+                if multiply_to_f.valid = '1' then
+                    v.state := SQRT_10;
+                end if;
+
+            when SQRT_10 =>
+                -- Add the bottom 8 bits of P, sign-extended,
+                -- divided by 4, onto R.
+                -- The division by 4 is because R is 10.54 format
+                -- whereas P is 8.56 format.
+                opsel_b <= BIN_PS6;
+                sqrt_exp := r.b.exponent(EXP_BITS-1) & r.b.exponent(EXP_BITS-1 downto 1);
+                v.result_exp := sqrt_exp;
+                v.shift := to_signed(1, EXP_BITS);
+                v.first := '1';
+                v.state := SQRT_11;
+
+            when SQRT_11 =>
+                -- compute P = A - R * R (remainder)
+                -- also put 2 * R + 1 into B for comparison with P
+                msel_1 <= MUL1_R;
+                msel_2 <= MUL2_R;
+                msel_add <= MULADD_A;
+                msel_inv <= '1';
+                f_to_multiply.valid <= r.first;
+                shiftin := '1';
+                set_b := r.first;
+                if multiply_to_f.valid = '1' then
+                    v.state := SQRT_12;
+                end if;
+
+            when SQRT_12 =>
+                -- test if remainder is 0 or >= B = 2*R + 1
+                if pcmpb_lt = '1' then
+                    -- square root is correct, set X if remainder non-zero
+                    v.x := r.p(58) or px_nz;
+                else
+                    -- square root needs to be incremented by 1
+                    carry_in <= '1';
+                    v.x := not pcmpb_eq;
+                end if;
+                v.state := FINISH;
+
              when INT_SHIFT =>
                  opsel_r <= RES_SHIFT;
                  set_x := '1';
@@ -1828,8 +2028,12 @@ begin
          maddend := (others => '0');
          case msel_add is
              when MULADD_CONST =>
-                -- addend is 2.0 in 16.112 format
-                maddend(113) := '1';                -- 2.0
+                -- addend is 2.0 or 1.5 in 16.112 format
+                if r.is_sqrt = '0' then
+                    maddend(113) := '1';                -- 2.0
+                else
+                    maddend(112 downto 111) := "11";    -- 1.5
+                end if;
              when MULADD_A =>
                  -- addend is A in 16.112 format
                  maddend(121 downto 58) := r.a.mantissa;
@@ -1895,14 +2099,15 @@ begin
              when BIN_MASK =>
                  in_b0 := mask;
              when others =>
-                in_b0 := (others => '0');
+                -- BIN_PS6, 6 LSBs of P/4 sign-extended to 64
+                in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 2)), 64));
          end case;
          if opsel_binv = '1' then
              in_b0 := not in_b0;
          end if;
          in_b <= in_b0;
          if r.shift >= to_signed(-64, EXP_BITS) and r.shift <= to_signed(63, EXP_BITS) then
-            shift_res := shifter_64(r.r & x"00000000000000",
+            shift_res := shifter_64(r.r & shiftin & 55x"00000000000000",
                                      std_ulogic_vector(r.shift(6 downto 0)));
          else
              shift_res := (others => '0');
diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c

index d9c5c06b4f59880620511fd4155a8a0fd8bf6941..b72b01ea56841a3635f36cafdb93f6a2ef53ed41 100644 (file)
--- a/tests/fpu/fpu.c
+++ b/tests/fpu/fpu.c
@@ -1291,6 +1291,53 @@ int fpu_test_21(void)
         return trapit(0, test21);
  }
  
+struct sqrtvals {
+       unsigned long val;
+       unsigned long inv;
+} sqrtvals[] = {
+       { 0x0000000000000000, 0x0000000000000000 },
+       { 0x8000000000000000, 0x8000000000000000 },
+       { 0xfff0000000000000, 0x7ff8000000000000 },
+       { 0x7ff0000000000000, 0x7ff0000000000000 },
+       { 0xfff123456789abcd, 0xfff923456789abcd },
+       { 0x3ff0000000000000, 0x3ff0000000000000 },
+       { 0x4000000000000000, 0x3ff6a09e667f3bcd },
+       { 0x4010000000000000, 0x4000000000000000 },
+       { 0xbff0000000000000, 0x7ff8000000000000 },
+       { 0x4008000000000000, 0x3ffbb67ae8584caa },
+       { 0x7fd0000000000000, 0x5fe0000000000000 },
+       { 0x0008000000000000, 0x1ff6a09e667f3bcd },
+       { 0x0004000000000000, 0x1ff0000000000000 },
+       { 0x0002000000000000, 0x1fe6a09e667f3bcd },
+       { 0x0000000000000002, 0x1e66a09e667f3bcd },
+       { 0x0000000000000001, 0x1e60000000000000 },
+};
+
+int test22(long arg)
+{
+       long i;
+       unsigned long result;
+       struct sqrtvals *vp = sqrtvals;
+
+       set_fpscr(FPS_RN_NEAR);
+       for (i = 0; i < sizeof(sqrtvals) / sizeof(sqrtvals[0]); ++i, ++vp) {
+               asm("lfd 6,0(%0); fsqrt 7,6; stfd 7,0(%1)"
+                   : : "b" (&vp->val), "b" (&result) : "memory");
+               if (result != vp->inv) {
+                       print_hex(i, 2, " ");
+                       print_hex(result, 16, " ");
+                       return i + 1;
+               }
+       }
+       return 0;
+}
+
+int fpu_test_22(void)
+{
+       enable_fp();
+       return trapit(0, test22);
+}
+
  int fail = 0;
  
  void do_test(int num, int (*test)(void))
@@ -1337,6 +1384,7 @@ int main(void)
         do_test(19, fpu_test_19);
         do_test(20, fpu_test_20);
         do_test(21, fpu_test_21);
+       do_test(22, fpu_test_22);
  
         return fail;
  }
diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin

index 0253720609c301172916dc31fbe1acd21c7c41bd..e3783415a22e72bae0935c2959ab8c4a5bf5a316 100755 (executable)

Binary files a/tests/test_fpu.bin and b/tests/test_fpu.bin differ
diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out

index b6bc73384d76c14d6fcb6d22873dc9268ab73407..9b97cb5f61f7fe595bcb68c0b304072a4d0a4676 100644 (file)
--- a/tests/test_fpu.console_out
+++ b/tests/test_fpu.console_out
@@ -19,3 +19,4 @@ test 18:PASS
  test 19:PASS\r
  test 20:PASS\r
  test 21:PASS\r
+test 22:PASS\r
author	Paul Mackerras <paulus@ozlabs.org>
	Fri, 31 Jul 2020 02:02:55 +0000 (12:02 +1000)
committer	Paul Mackerras <paulus@ozlabs.org>
	Thu, 3 Sep 2020 07:45:34 +0000 (17:45 +1000)
decode1.vhdl		patch \| blob \| history
fpu.vhdl		patch \| blob \| history
tests/fpu/fpu.c		patch \| blob \| history
tests/test_fpu.bin		patch \| blob \| history
tests/test_fpu.console_out		patch \| blob \| history