From: Paul Mackerras Date: Wed, 15 Jul 2020 04:28:06 +0000 (+1000) Subject: FPU: Implement fmr and related instructions X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=b628af6176bd0bfa0289fa823ec205f48988ec53;p=microwatt.git FPU: Implement fmr and related instructions This implements fmr, fneg, fabs, fnabs and fcpsgn and adds tests for them. This adds logic to unpack and repack floating-point data from the 64-bit packed form (as stored in memory and the register file) into the unpacked form in the fpu_reg_type record. This is not strictly necessary for fmr et al., but will be useful for when we do actual arithmetic. Signed-off-by: Paul Mackerras --- diff --git a/decode1.vhdl b/decode1.vhdl index 343c0c3..5f5fb80 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -428,6 +428,11 @@ architecture behaviour of decode1 is 2#011000100# => (FPU, OP_FPOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/6=mtfsfi 2#011110010# => (FPU, OP_FPOP_I, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 18/7=mffs family 2#011110110# => (FPU, OP_FPOP_I, NONE, FRB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 22/7=mtfsf + 2#100000000# => (FPU, OP_FPOP, FRA, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 0/8=fcpsgn + 2#100000001# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 1/8=fneg + 2#100000010# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 2/8=fmr + 2#100000100# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- 4/8=fnabs + 2#100001000# => (FPU, OP_FPOP, NONE, FRB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', 
'0', '0', '0', RC, '0', '0'), -- 8/8=fabs others => illegal_inst ); diff --git a/decode2.vhdl b/decode2.vhdl index 8b2ab8c..ec8232f 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -80,6 +80,8 @@ architecture behaviour of decode2 is return (is_fast_spr(ispr), ispr, reg_data); elsif t = CIA then return ('0', (others => '0'), instr_addr); + elsif HAS_FPU and t = FRA then + return ('1', fpr_to_gspr(insn_fra(insn_in)), reg_data); else return ('0', (others => '0'), (others => '0')); end if; @@ -300,6 +302,7 @@ begin end process; r_out.read1_reg <= d_in.ispr1 when d_in.decode.input_reg_a = SPR + else fpr_to_gspr(insn_fra(d_in.insn)) when d_in.decode.input_reg_a = FRA and HAS_FPU else gpr_to_gspr(insn_ra(d_in.insn)); r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR else fpr_to_gspr(insn_frb(d_in.insn)) when d_in.decode.input_reg_b = FRB and HAS_FPU diff --git a/decode_types.vhdl b/decode_types.vhdl index 5eaef50..08fdc4a 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -23,7 +23,7 @@ package decode_types is OP_BCD, OP_ADDG6S, OP_FETCH_FAILED ); - type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA); + type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA, FRA); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR, FRB); type input_reg_c_t is (NONE, RS, RCR, FRS); diff --git a/fpu.vhdl b/fpu.vhdl index 047bf2d..3711b35 100644 --- a/fpu.vhdl +++ b/fpu.vhdl @@ -24,9 +24,20 @@ entity fpu is end entity fpu; architecture behaviour of fpu is + type fp_number_class is (ZERO, FINITE, INFINITY, NAN); + + constant EXP_BITS : natural := 13; + + type fpu_reg_type is record + class : fp_number_class; + negative : std_ulogic; + exponent : signed(EXP_BITS-1 downto 0); -- unbiased + mantissa : std_ulogic_vector(63 downto 0); -- 10.54 format + end record; type state_t is (IDLE, - DO_MCRFS, DO_MTFSB, DO_MTFSFI, DO_MFFS, DO_MTFSF); + DO_MCRFS, DO_MTFSB, 
DO_MTFSFI, DO_MFFS, DO_MTFSF, + DO_FMR); type reg_type is record state : state_t; @@ -41,9 +52,14 @@ architecture behaviour of fpu is is_cmp : std_ulogic; single_prec : std_ulogic; fpscr : std_ulogic_vector(31 downto 0); - b : std_ulogic_vector(63 downto 0); + a : fpu_reg_type; + b : fpu_reg_type; r : std_ulogic_vector(63 downto 0); + result_sign : std_ulogic; + result_class : fp_number_class; + result_exp : signed(EXP_BITS-1 downto 0); writing_back : std_ulogic; + int_result : std_ulogic; cr_result : std_ulogic_vector(3 downto 0); cr_mask : std_ulogic_vector(7 downto 0); end record; @@ -51,6 +67,72 @@ architecture behaviour of fpu is signal r, rin : reg_type; signal fp_result : std_ulogic_vector(63 downto 0); + signal opsel_r : std_ulogic_vector(1 downto 0); + signal result : std_ulogic_vector(63 downto 0); + + -- Split a DP floating-point number into components and work out its class. + -- If is_int = 1, the input is considered an integer + function decode_dp(fpr: std_ulogic_vector(63 downto 0); is_int: std_ulogic) return fpu_reg_type is + variable r : fpu_reg_type; + variable exp_nz : std_ulogic; + variable exp_ao : std_ulogic; + variable frac_nz : std_ulogic; + variable cls : std_ulogic_vector(2 downto 0); + begin + r.negative := fpr(63); + exp_nz := or (fpr(62 downto 52)); + exp_ao := and (fpr(62 downto 52)); + frac_nz := or (fpr(51 downto 0)); + if is_int = '0' then + r.exponent := signed(resize(unsigned(fpr(62 downto 52)), EXP_BITS)) - to_signed(1023, EXP_BITS); + if exp_nz = '0' then + r.exponent := to_signed(-1022, EXP_BITS); + end if; + r.mantissa := "000000000" & exp_nz & fpr(51 downto 0) & "00"; + cls := exp_ao & exp_nz & frac_nz; + case cls is + when "000" => r.class := ZERO; + when "001" => r.class := FINITE; -- denormalized + when "010" => r.class := FINITE; + when "011" => r.class := FINITE; + when "110" => r.class := INFINITY; + when others => r.class := NAN; + end case; + else + r.mantissa := fpr; + r.exponent := (others => '0'); + if (fpr(63) or 
exp_nz or frac_nz) = '1' then + r.class := FINITE; + else + r.class := ZERO; + end if; + end if; + return r; + end; + + -- Construct a DP floating-point result from components + function pack_dp(sign: std_ulogic; class: fp_number_class; exp: signed(EXP_BITS-1 downto 0); + mantissa: std_ulogic_vector) return std_ulogic_vector is + variable result : std_ulogic_vector(63 downto 0); + begin + result := (others => '0'); + result(63) := sign; + case class is + when ZERO => + when FINITE => + if mantissa(54) = '1' then + -- normalized number + result(62 downto 52) := std_ulogic_vector(resize(exp, 11) + 1023); + end if; + result(51 downto 0) := mantissa(53 downto 2); + when INFINITY => + result(62 downto 52) := "11111111111"; + when NAN => + result(62 downto 52) := "11111111111"; + result(51 downto 0) := mantissa(53 downto 2); + end case; + return result; + end; begin fpu_0: process(clk) @@ -85,14 +167,18 @@ begin fpu_1: process(all) variable v : reg_type; + variable adec : fpu_reg_type; + variable bdec : fpu_reg_type; variable fpscr_mask : std_ulogic_vector(31 downto 0); variable illegal : std_ulogic; variable j, k : integer; variable flm : std_ulogic_vector(7 downto 0); + variable int_input : std_ulogic; begin v := r; illegal := '0'; v.busy := '0'; + int_input := '0'; -- capture incoming instruction if e_in.valid = '1' then @@ -101,6 +187,7 @@ begin v.fe_mode := or (e_in.fe_mode); v.dest_fpr := e_in.frt; v.single_prec := e_in.single; + v.int_result := '0'; v.rc := e_in.rc; v.is_cmp := e_in.out_cr; if e_in.out_cr = '0' then @@ -108,11 +195,19 @@ begin else v.cr_mask := num_to_fxm(to_integer(unsigned(insn_bf(e_in.insn)))); end if; - v.b := e_in.frb; + int_input := '0'; + if e_in.op = OP_FPOP_I then + int_input := '1'; + end if; + adec := decode_dp(e_in.fra, int_input); + bdec := decode_dp(e_in.frb, int_input); + v.a := adec; + v.b := bdec; end if; v.writing_back := '0'; v.instr_done := '0'; + opsel_r <= "00"; fpscr_mask := (others => '1'); case r.state is @@ -133,6 +228,8 
@@ begin else v.state := DO_MTFSF; end if; + when "01000" => + v.state := DO_FMR; when others => illegal := '1'; end case; @@ -177,7 +274,9 @@ begin v.state := IDLE; when DO_MFFS => + v.int_result := '1'; v.writing_back := '1'; + opsel_r <= "10"; case r.insn(20 downto 16) is when "00000" => -- mffs @@ -191,7 +290,7 @@ begin -- mffscrn fpscr_mask := x"000000FF"; v.fpscr(FPSCR_RN+1 downto FPSCR_RN) := - r.b(FPSCR_RN+1 downto FPSCR_RN); + r.b.mantissa(FPSCR_RN+1 downto FPSCR_RN); when "10111" => -- mffscrni fpscr_mask := x"000000FF"; @@ -216,19 +315,48 @@ begin for i in 0 to 7 loop k := i * 4; if flm(i) = '1' then - v.fpscr(k + 3 downto k) := r.b(k + 3 downto k); + v.fpscr(k + 3 downto k) := r.b.mantissa(k + 3 downto k); end if; end loop; v.instr_done := '1'; v.state := IDLE; + when DO_FMR => + v.result_class := r.b.class; + v.result_exp := r.b.exponent; + if r.insn(9) = '1' then + v.result_sign := '0'; -- fabs + elsif r.insn(8) = '1' then + v.result_sign := '1'; -- fnabs + elsif r.insn(7) = '1' then + v.result_sign := r.b.negative; -- fmr + elsif r.insn(6) = '1' then + v.result_sign := not r.b.negative; -- fneg + else + v.result_sign := r.a.negative; -- fcpsgn + end if; + v.writing_back := '1'; + v.instr_done := '1'; + v.state := IDLE; + end case; -- Data path. - -- Just enough to read FPSCR for now. 
- v.r := x"00000000" & (r.fpscr and fpscr_mask); + case opsel_r is + when "00" => + result <= r.b.mantissa; + when "10" => + result <= x"00000000" & (r.fpscr and fpscr_mask); + when others => + result <= (others => '0'); + end case; + v.r := result; - fp_result <= r.r; + if r.int_result = '1' then + fp_result <= r.r; + else + fp_result <= pack_dp(r.result_sign, r.result_class, r.result_exp, r.r); + end if; v.fpscr(FPSCR_VX) := (or (v.fpscr(FPSCR_VXSNAN downto FPSCR_VXVC))) or (or (v.fpscr(FPSCR_VXSOFT downto FPSCR_VXCVI))); diff --git a/tests/fpu/fpu.c b/tests/fpu/fpu.c index f9c4245..46668f8 100644 --- a/tests/fpu/fpu.c +++ b/tests/fpu/fpu.c @@ -438,6 +438,39 @@ int fpu_test_5(void) return 0; } +#define SIGN 0x8000000000000000ul + +int test6(long arg) +{ + long i; + unsigned long results[6]; + unsigned long v; + + for (i = 0; i < sizeof(sp_dp_equiv) / sizeof(sp_dp_equiv[0]); ++i) { + v = sp_dp_equiv[i].dp; + asm("lfd%U0%X0 3,%0; fmr 6,3; fneg 7,3; stfd 6,0(%1); stfd 7,8(%1)" + : : "m" (sp_dp_equiv[i].dp), "b" (results) : "memory"); + asm("fabs 9,6; fnabs 10,6; stfd 9,16(%0); stfd 10,24(%0)" + : : "b" (results) : "memory"); + asm("fcpsgn 4,9,3; stfd 4,32(%0); fcpsgn 5,10,3; stfd 5,40(%0)" + : : "b" (results) : "memory"); + if (results[0] != v || + results[1] != (v ^ SIGN) || + results[2] != (v & ~SIGN) || + results[3] != (v | SIGN) || + results[4] != (v & ~SIGN) || + results[5] != (v | SIGN)) + return i + 1; + } + return 0; +} + +int fpu_test_6(void) +{ + enable_fp(); + return trapit(0, test6); +} + int fail = 0; void do_test(int num, int (*test)(void)) @@ -469,6 +502,7 @@ int main(void) do_test(3, fpu_test_3); do_test(4, fpu_test_4); do_test(5, fpu_test_5); + do_test(6, fpu_test_6); return fail; } diff --git a/tests/test_fpu.bin b/tests/test_fpu.bin index 6bac861..4fb260e 100755 Binary files a/tests/test_fpu.bin and b/tests/test_fpu.bin differ diff --git a/tests/test_fpu.console_out b/tests/test_fpu.console_out index 99d32e6..a49bb9b 100644 --- 
a/tests/test_fpu.console_out +++ b/tests/test_fpu.console_out @@ -3,3 +3,4 @@ test 02:PASS test 03:PASS test 04:PASS test 05:PASS +test 06:PASS