From: Paul Mackerras Date: Fri, 28 Aug 2020 03:35:05 +0000 (+1000) Subject: core: Add support for single-precision FP loads and stores X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=9d285a265cf9fab8f5f17d6d4588d9545e555e68;p=microwatt.git core: Add support for single-precision FP loads and stores This adds code to loadstore1 to convert between single-precision and double-precision formats, and implements the lfs* and stfs* instructions. The conversion processes are described in Power ISA v3.1 Book 1 sections 4.6.2 and 4.6.3. These conversions take one cycle, so lfs* and stfs* are one cycle slower than lfd* and stfd*. Signed-off-by: Paul Mackerras --- diff --git a/common.vhdl b/common.vhdl index 14bdcf7..e1ba844 100644 --- a/common.vhdl +++ b/common.vhdl @@ -287,6 +287,7 @@ package common is virt_mode : std_ulogic; -- do translation through TLB priv_mode : std_ulogic; -- privileged mode (MSR[PR] = 0) mode_32bit : std_ulogic; -- trim addresses to 32 bits + is_32bit : std_ulogic; end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', op => OP_ILLEGAL, ci => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, @@ -294,7 +295,7 @@ package common is nia => (others => '0'), insn => (others => '0'), addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), write_reg => (others => '0'), length => (others => '0'), - mode_32bit => '0', others => (others => '0')); + mode_32bit => '0', is_32bit => '0', others => (others => '0')); type Loadstore1ToExecute1Type is record busy : std_ulogic; diff --git a/countzero.vhdl b/countzero.vhdl index 18aa043..b46f108 100644 --- a/countzero.vhdl +++ b/countzero.vhdl @@ -3,6 +3,7 @@ use ieee.std_logic_1164.all; use ieee.numeric_std.all; library work; +use work.helpers.all; entity zero_counter is port ( @@ -15,42 +16,6 @@ entity zero_counter is end entity zero_counter; architecture behaviour of zero_counter is - -- Reverse the order of bits in a word - function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is - variable ret: std_ulogic_vector(a'left downto a'right); - begin - for i in a'right to a'left loop - ret(a'left + a'right - i) := a(i); - end loop; - return ret; - end; - - -- If there is only one bit set in a doubleword, return its bit number - -- (counting from the right). Each bit of the result is obtained by - -- ORing together 32 bits of the input: - -- bit 0 = a[1] or a[3] or a[5] or ... - -- bit 1 = a[2] or a[3] or a[6] or a[7] or ... - -- bit 2 = a[4..7] or a[12..15] or ... - -- bit 5 = a[32..63] ORed together - function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is - variable ret: std_ulogic_vector(5 downto 0); - variable stride: natural; - variable bit: std_ulogic; - variable k: natural; - begin - stride := 2; - for i in 0 to 5 loop - bit := '0'; - for j in 0 to (64 / stride) - 1 loop - k := j * stride; - bit := bit or (or a(k + stride - 1 downto k + (stride / 2))); - end loop; - ret(i) := bit; - stride := stride * 2; - end loop; - return ret; - end; - signal inp : std_ulogic_vector(63 downto 0); signal sum : std_ulogic_vector(64 downto 0); signal msb_r : std_ulogic; diff --git a/decode1.vhdl b/decode1.vhdl index 75da175..29f0e50 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -74,8 +74,8 @@ architecture behaviour of decode1 is 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu 50 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd 51 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu --- 48 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs --- 49 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu + 48 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs + 49 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha 43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau 40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz @@ -93,8 +93,8 @@ architecture behaviour of decode1 is 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu 54 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd 55 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu --- 52 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs --- 53 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu + 52 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs + 53 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw @@ -284,8 +284,8 @@ architecture behaviour of decode1 is 2#1001110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux 2#1101010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax 2#1101110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx --- 2#1000010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx --- 2#1000110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux + 2#1000010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx + 2#1000110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux 2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx 2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux 2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax @@ -367,8 +367,8 @@ architecture behaviour of decode1 is 2#1011010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx 2#1011110111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdux 2#1111010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx --- 2#1010010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx --- 2#1010110111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux + 2#1010010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx + 2#1010110111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx 2#1110110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- sthcx diff --git a/execute1.vhdl b/execute1.vhdl index 4d6a9cc..9d9b711 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -1259,6 +1259,7 @@ begin lv.virt_mode := ctrl.msr(MSR_DR); lv.priv_mode := not ctrl.msr(MSR_PR); lv.mode_32bit := not ctrl.msr(MSR_SF); + lv.is_32bit := e_in.is_32bit; -- Update registers rin <= v; diff --git a/helpers.vhdl b/helpers.vhdl index fe91938..834e386 100644 --- a/helpers.vhdl +++ b/helpers.vhdl @@ -25,6 +25,10 @@ package helpers is function byte_reverse(val: std_ulogic_vector(63 downto 0); size: integer) return std_ulogic_vector; function sign_extend(val: std_ulogic_vector(63 downto 0); size: natural) return std_ulogic_vector; + + function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector; + function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector; + function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector; end package helpers; package body helpers is @@ -206,4 +210,53 @@ package body helpers is return std_ulogic_vector(ret); end; + + -- Reverse the order of bits in a word + function bit_reverse(a: std_ulogic_vector) return std_ulogic_vector is + variable ret: std_ulogic_vector(a'left downto a'right); + begin + for i in a'right to a'left loop + ret(a'left + a'right - i) := a(i); + end loop; + return ret; + end; + + -- If there is only one bit set in a doubleword, return its bit number + -- (counting from the right). Each bit of the result is obtained by + -- ORing together 32 bits of the input: + -- bit 0 = a[1] or a[3] or a[5] or ... + -- bit 1 = a[2] or a[3] or a[6] or a[7] or ... + -- bit 2 = a[4..7] or a[12..15] or ... + -- bit 5 = a[32..63] ORed together + function bit_number(a: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is + variable ret: std_ulogic_vector(5 downto 0); + variable stride: natural; + variable bit: std_ulogic; + variable k: natural; + begin + stride := 2; + for i in 0 to 5 loop + bit := '0'; + for j in 0 to (64 / stride) - 1 loop + k := j * stride; + bit := bit or (or a(k + stride - 1 downto k + (stride / 2))); + end loop; + ret(i) := bit; + stride := stride * 2; + end loop; + return ret; + end; + + -- Count leading zeroes operation + -- Assumes the value passed in is not zero (if it is, zero is returned) + function count_left_zeroes(val: std_ulogic_vector) return std_ulogic_vector is + variable rev: std_ulogic_vector(val'left downto val'right); + variable sum: std_ulogic_vector(val'left downto val'right); + variable onehot: std_ulogic_vector(val'left downto val'right); + begin + rev := bit_reverse(val); + sum := std_ulogic_vector(- signed(rev)); + onehot := sum and rev; + return bit_number(std_ulogic_vector(resize(unsigned(onehot), 64))); + end; end package body helpers; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index ec20319..919ba0e 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -45,10 +45,12 @@ architecture behave of loadstore1 is -- State machine for unaligned loads/stores type state_t is (IDLE, -- ready for instruction + FPR_CONV, -- converting double to float for store SECOND_REQ, -- send 2nd request of unaligned xfer ACK_WAIT, -- waiting for ack from dcache MMU_LOOKUP, -- waiting for MMU to look up translation TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie + FINISH_LFS, -- write back converted SP data for lfs* COMPLETE -- extra cycle to complete an operation ); @@ -89,6 +91,11 @@ architecture behave of loadstore1 is do_update : std_ulogic; extra_cycle : std_ulogic; mode_32bit : std_ulogic; + load_sp : std_ulogic; + ld_sp_data : std_ulogic_vector(31 downto 0); + ld_sp_nz : std_ulogic; + ld_sp_lz : std_ulogic_vector(5 downto 0); + st_sp_data : std_ulogic_vector(31 downto 0); end record; type byte_sel_t is array(0 to 7) of std_ulogic; @@ -98,6 +105,9 @@ architecture behave of loadstore1 is signal r, rin : reg_stage_t; signal lsu_sum : std_ulogic_vector(63 downto 0); + signal store_sp_data : std_ulogic_vector(31 downto 0); + signal load_dp_data : std_ulogic_vector(63 downto 0); + -- Generate byte enables from sizes function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is begin @@ -128,6 +138,72 @@ architecture behave of loadstore1 is to_integer(unsigned(address)))); end function xfer_data_sel; + -- 23-bit right shifter for DP -> SP float conversions + function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0)) + return std_ulogic_vector is + variable fs1 : std_ulogic_vector(22 downto 0); + variable fs2 : std_ulogic_vector(22 downto 0); + begin + case shift(1 downto 0) is + when "00" => + fs1 := frac; + when "01" => + fs1 := '0' & frac(22 downto 1); + when "10" => + fs1 := "00" & frac(22 downto 2); + when others => + fs1 := "000" & frac(22 downto 3); + end case; + case shift(4 downto 2) is + when "000" => + fs2 := fs1; + when "001" => + fs2 := x"0" & fs1(22 downto 4); + when "010" => + fs2 := x"00" & fs1(22 downto 8); + when "011" => + fs2 := x"000" & fs1(22 downto 12); + when "100" => + fs2 := x"0000" & fs1(22 downto 16); + when others => + fs2 := x"00000" & fs1(22 downto 20); + end case; + return fs2; + end; + + -- 23-bit left shifter for SP -> DP float conversions + function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0)) + return std_ulogic_vector is + variable fs1 : std_ulogic_vector(22 downto 0); + variable fs2 : std_ulogic_vector(22 downto 0); + begin + case shift(1 downto 0) is + when "00" => + fs1 := frac; + when "01" => + fs1 := frac(21 downto 0) & '0'; + when "10" => + fs1 := frac(20 downto 0) & "00"; + when others => + fs1 := frac(19 downto 0) & "000"; + end case; + case shift(4 downto 2) is + when "000" => + fs2 := fs1; + when "001" => + fs2 := fs1(18 downto 0) & x"0" ; + when "010" => + fs2 := fs1(14 downto 0) & x"00"; + when "011" => + fs2 := fs1(10 downto 0) & x"000"; + when "100" => + fs2 := fs1(6 downto 0) & x"0000"; + when others => + fs2 := fs1(2 downto 0) & x"00000"; + end case; + return fs2; + end; + begin -- Calculate the address in the first cycle lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0'); @@ -145,6 +221,59 @@ begin end if; end process; + ls_fp_conv: if HAS_FPU generate + -- Convert DP data to SP for stfs + dp_to_sp: process(all) + variable exp : unsigned(10 downto 0); + variable frac : std_ulogic_vector(22 downto 0); + variable shift : unsigned(4 downto 0); + begin + store_sp_data(31) <= l_in.data(63); + store_sp_data(30 downto 0) <= (others => '0'); + exp := unsigned(l_in.data(62 downto 52)); + if exp > 896 then + store_sp_data(30) <= l_in.data(62); + store_sp_data(29 downto 0) <= l_in.data(58 downto 29); + elsif exp >= 874 then + -- denormalization required + frac := '1' & l_in.data(51 downto 30); + shift := 0 - exp(4 downto 0); + store_sp_data(22 downto 0) <= shifter_23r(frac, shift); + end if; + end process; + + -- Convert SP data to DP for lfs + sp_to_dp: process(all) + variable exp : unsigned(7 downto 0); + variable exp_dp : unsigned(10 downto 0); + variable exp_nz : std_ulogic; + variable exp_ao : std_ulogic; + variable frac : std_ulogic_vector(22 downto 0); + variable frac_shift : unsigned(4 downto 0); + begin + frac := r.ld_sp_data(22 downto 0); + exp := unsigned(r.ld_sp_data(30 downto 23)); + exp_nz := or (r.ld_sp_data(30 downto 23)); + exp_ao := and (r.ld_sp_data(30 downto 23)); + frac_shift := (others => '0'); + if exp_ao = '1' then + exp_dp := to_unsigned(2047, 11); -- infinity or NaN + elsif exp_nz = '1' then + exp_dp := 896 + resize(exp, 11); -- finite normalized value + elsif r.ld_sp_nz = '0' then + exp_dp := to_unsigned(0, 11); -- zero + else + -- denormalized SP operand, need to normalize + exp_dp := 896 - resize(unsigned(r.ld_sp_lz), 11); + frac_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1; + end if; + load_dp_data(63) <= r.ld_sp_data(31); + load_dp_data(62 downto 52) <= std_ulogic_vector(exp_dp); + load_dp_data(51 downto 29) <= shifter_23l(frac, frac_shift); + load_dp_data(28 downto 0) <= (others => '0'); + end process; + end generate; + loadstore1_1: process(all) variable v : reg_stage_t; variable brev_lenm1 : unsigned(2 downto 0); @@ -165,6 +294,9 @@ begin variable data_permuted : std_ulogic_vector(63 downto 0); variable data_trimmed : std_ulogic_vector(63 downto 0); variable store_data : std_ulogic_vector(63 downto 0); + variable data_in : std_ulogic_vector(63 downto 0); + variable byte_rev : std_ulogic; + variable length : std_ulogic_vector(3 downto 0); variable use_second : byte_sel_t; variable trim_ctl : trim_ctl_t; variable negative : std_ulogic; @@ -176,6 +308,8 @@ begin variable mmu_mtspr : std_ulogic; variable itlb_fault : std_ulogic; variable misaligned : std_ulogic; + variable fp_reg_conv : std_ulogic; + variable lfs_done : std_ulogic; begin v := r; req := '0'; @@ -185,8 +319,10 @@ begin sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10)); dsisr := (others => '0'); mmureq := '0'; + fp_reg_conv := '0'; write_enable := '0'; + lfs_done := '0'; do_update := r.do_update; v.do_update := '0'; @@ -245,19 +381,38 @@ begin end case; end loop; - -- Byte reversing and rotating for stores - -- Done in the first cycle (when l_in.valid = 1) + if HAS_FPU then + -- Single-precision FP conversion + v.st_sp_data := store_sp_data; + v.ld_sp_data := data_trimmed(31 downto 0); + v.ld_sp_nz := or (data_trimmed(22 downto 0)); + v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0)); + end if; + + -- Byte reversing and rotating for stores. + -- Done in the first cycle (when l_in.valid = 1) for integer stores + -- and DP float stores, and in the second cycle for SP float stores. store_data := r.store_data; - if l_in.valid = '1' then - byte_offset := unsigned(lsu_sum(2 downto 0)); + if l_in.valid = '1' or (HAS_FPU and r.state = FPR_CONV) then + if HAS_FPU and r.state = FPR_CONV then + data_in := x"00000000" & r.st_sp_data; + byte_offset := unsigned(r.addr(2 downto 0)); + byte_rev := r.byte_reverse; + length := r.length; + else + data_in := l_in.data; + byte_offset := unsigned(lsu_sum(2 downto 0)); + byte_rev := l_in.byte_reverse; + length := l_in.length; + end if; brev_lenm1 := "000"; - if l_in.byte_reverse = '1' then - brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1; + if byte_rev = '1' then + brev_lenm1 := unsigned(length(2 downto 0)) - 1; end if; for i in 0 to 7 loop k := (to_unsigned(i, 3) - byte_offset) xor brev_lenm1; j := to_integer(k) * 8; - store_data(i * 8 + 7 downto i * 8) := l_in.data(j + 7 downto j); + store_data(i * 8 + 7 downto i * 8) := data_in(j + 7 downto j); end loop; end if; v.store_data := store_data; @@ -292,6 +447,14 @@ begin case r.state is when IDLE => + when FPR_CONV => + req := '1'; + if r.second_bytes /= "00000000" then + v.state := SECOND_REQ; + else + v.state := ACK_WAIT; + end if; + when SECOND_REQ => req := '1'; v.state := ACK_WAIT; @@ -323,8 +486,13 @@ begin v.load_data := data_permuted; end if; else - write_enable := r.load; - if r.extra_cycle = '1' then + write_enable := r.load and not r.load_sp; + if HAS_FPU and r.load_sp = '1' then + -- SP to DP conversion takes a cycle + -- Write back rA update in this cycle if needed + do_update := r.update; + v.state := FINISH_LFS; + elsif r.extra_cycle = '1' then -- loads with rA update need an extra cycle v.state := COMPLETE; v.do_update := r.update; @@ -362,6 +530,9 @@ begin when TLBIE_WAIT => + when FINISH_LFS => + lfs_done := '1'; + when COMPLETE => exception := r.align_intr; @@ -395,6 +566,7 @@ begin v.nc := l_in.ci; v.virt_mode := l_in.virt_mode; v.priv_mode := l_in.priv_mode; + v.load_sp := '0'; v.wait_dcache := '0'; v.wait_mmu := '0'; v.do_update := '0'; @@ -436,14 +608,24 @@ begin v.dcbz := '1'; when OP_FPSTORE => if HAS_FPU then - req := '1'; + if l_in.is_32bit = '1' then + v.state := FPR_CONV; + fp_reg_conv := '1'; + else + req := '1'; + end if; end if; when OP_FPLOAD => if HAS_FPU then v.load := '1'; req := '1'; - -- Allow an extra cycle for RA update + -- Allow an extra cycle for SP->DP precision conversion + -- or RA update v.extra_cycle := l_in.update; + if l_in.is_32bit = '1' then + v.load_sp := '1'; + v.extra_cycle := '1'; + end if; end if; when OP_TLBIE => mmureq := '1'; @@ -500,7 +682,7 @@ begin end if; end if; - v.busy := req or mmureq or mmu_mtspr; + v.busy := req or mmureq or mmu_mtspr or fp_reg_conv; end if; -- Update outputs to dcache @@ -539,6 +721,10 @@ begin l_out.write_enable <= '1'; l_out.write_reg <= gpr_to_gspr(r.update_reg); l_out.write_data <= r.addr; + elsif lfs_done = '1' then + l_out.write_enable <= '1'; + l_out.write_reg <= r.write_reg; + l_out.write_data <= load_dp_data; else l_out.write_enable <= write_enable; l_out.write_reg <= r.write_reg;