From: Paul Mackerras Date: Fri, 28 Aug 2020 02:49:48 +0000 (+1000) Subject: core: Add support for floating-point loads and stores X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=45cd8f4fc375185544309ffd16d73a7dc5ce1dce;p=microwatt.git core: Add support for floating-point loads and stores This extends the register file so it can hold FPR values, and implements the FP loads and stores that do not require conversion between single and double precision. We now have the FP, FE0 and FE1 bits in MSR. FP loads and stores cause a FP unavailable interrupt if MSR[FP] = 0. The FPU facilities are optional and their presence is controlled by the HAS_FPU generic passed down from the top-level board file. It defaults to true for all except the A7-35 boards. Signed-off-by: Paul Mackerras --- diff --git a/common.vhdl b/common.vhdl index 1ca1178..14bdcf7 100644 --- a/common.vhdl +++ b/common.vhdl @@ -13,8 +13,11 @@ package common is constant MSR_SF : integer := (63 - 0); -- Sixty-Four bit mode constant MSR_EE : integer := (63 - 48); -- External interrupt Enable constant MSR_PR : integer := (63 - 49); -- PRoblem state + constant MSR_FP : integer := (63 - 50); -- Floating Point available + constant MSR_FE0 : integer := (63 - 52); -- Floating Exception mode constant MSR_SE : integer := (63 - 53); -- Single-step bit of TE field constant MSR_BE : integer := (63 - 54); -- Branch trace bit of TE field + constant MSR_FE1 : integer := (63 - 55); -- Floating Exception mode constant MSR_IR : integer := (63 - 58); -- Instruction Relocation constant MSR_DR : integer := (63 - 59); -- Data Relocation constant MSR_RI : integer := (63 - 62); -- Recoverable Interrupt @@ -53,8 +56,11 @@ package common is -- GPR indices in the register file (GPR only) subtype gpr_index_t is std_ulogic_vector(4 downto 0); - -- Extended GPR indice (can hold an SPR) - subtype gspr_index_t is std_ulogic_vector(5 downto 0); + -- Extended GPR index (can hold an SPR or a FPR) + subtype gspr_index_t is std_ulogic_vector(6 downto 0); + + -- FPR indices + subtype fpr_index_t is std_ulogic_vector(4 downto 0); -- Some SPRs are stored in the register file, they use the magic -- GPR numbers above 31. @@ -64,6 +70,9 @@ package common is -- indicates if this is indeed a fast SPR. If clear, then -- the SPR is not stored in the GPR file. -- + -- FPRs are also stored in the register file, using GSPR + -- numbers from 64 to 95. + -- function fast_spr_num(spr: spr_num_t) return gspr_index_t; -- Indices conversion functions @@ -71,6 +80,7 @@ package common is function gpr_to_gspr(i: gpr_index_t) return gspr_index_t; function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t; function is_fast_spr(s: gspr_index_t) return std_ulogic; + function fpr_to_gspr(f: fpr_index_t) return gspr_index_t; -- The XER is split: the common bits (CA, OV, SO, OV32 and CA32) are -- in the CR file as a kind of CR extension (with a separate write @@ -226,7 +236,7 @@ package common is read2_enable : std_ulogic; read2_reg : gspr_index_t; read3_enable : std_ulogic; - read3_reg : gpr_index_t; + read3_reg : gspr_index_t; end record; type RegisterFileToDecode2Type is record @@ -264,7 +274,7 @@ package common is addr1 : std_ulogic_vector(63 downto 0); addr2 : std_ulogic_vector(63 downto 0); data : std_ulogic_vector(63 downto 0); -- data to write, unused for read - write_reg : gpr_index_t; + write_reg : gspr_index_t; length : std_ulogic_vector(3 downto 0); ci : std_ulogic; -- cache-inhibited load/store byte_reverse : std_ulogic; @@ -282,7 +292,8 @@ package common is sign_extend => '0', update => '0', xerc => xerc_init, reserve => '0', rc => '0', virt_mode => '0', priv_mode => '0', nia => (others => '0'), insn => (others => '0'), - addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), length => (others => '0'), + addr1 => (others => '0'), addr2 => (others => '0'), data => (others => '0'), + write_reg => (others => '0'), length => (others => '0'), mode_32bit => '0', others => (others => '0')); type Loadstore1ToExecute1Type is record @@ -369,7 +380,7 @@ package common is type Loadstore1ToWritebackType is record valid : std_ulogic; write_enable: std_ulogic; - write_reg : gpr_index_t; + write_reg : gspr_index_t; write_data : std_ulogic_vector(63 downto 0); xerc : xer_common_t; rc : std_ulogic; @@ -473,10 +484,10 @@ package body common is n := 13; when others => n := 0; - return "000000"; + return "0000000"; end case; tmp := std_ulogic_vector(to_unsigned(n, 5)); - return "1" & tmp; + return "01" & tmp; end; function gspr_to_gpr(i: gspr_index_t) return gpr_index_t is @@ -486,7 +497,7 @@ package body common is function gpr_to_gspr(i: gpr_index_t) return gspr_index_t is begin - return "0" & i; + return "00" & i; end; function gpr_or_spr_to_gspr(g: gpr_index_t; s: gspr_index_t) return gspr_index_t is @@ -502,4 +513,9 @@ package body common is begin return s(5); end; + + function fpr_to_gspr(f: fpr_index_t) return gspr_index_t is + begin + return "10" & f; + end; end common; diff --git a/control.vhdl b/control.vhdl index d04576a..4f67ad4 100644 --- a/control.vhdl +++ b/control.vhdl @@ -34,7 +34,7 @@ entity control is gpr_b_read_in : in gspr_index_t; gpr_c_read_valid_in : in std_ulogic; - gpr_c_read_in : in gpr_index_t; + gpr_c_read_in : in gspr_index_t; cr_read_in : in std_ulogic; cr_write_in : in std_ulogic; @@ -70,7 +70,6 @@ architecture rtl of control is signal gpr_write_valid : std_ulogic := '0'; signal cr_write_valid : std_ulogic := '0'; - signal gpr_c_read_in_fmt : std_ulogic_vector(5 downto 0); begin gpr_hazard0: entity work.gpr_hazard generic map ( @@ -122,8 +121,6 @@ begin use_bypass => gpr_bypass_b ); - gpr_c_read_in_fmt <= "0" & gpr_c_read_in; - gpr_hazard2: entity work.gpr_hazard generic map ( PIPELINE_DEPTH => PIPELINE_DEPTH @@ -140,7 +137,7 @@ begin gpr_write_in => gpr_write_in, bypass_avail => gpr_bypassable, gpr_read_valid_in => gpr_c_read_valid_in, - gpr_read_in => gpr_c_read_in_fmt, + gpr_read_in => gpr_c_read_in, ugpr_write_valid => update_gpr_write_valid, ugpr_write_reg => update_gpr_write_reg, diff --git a/core.vhdl b/core.vhdl index c7dd3f6..81e11c8 100644 --- a/core.vhdl +++ b/core.vhdl @@ -11,6 +11,7 @@ entity core is SIM : boolean := false; DISABLE_FLATTEN : boolean := false; EX1_BYPASS : boolean := true; + HAS_FPU : boolean := true; ALT_RESET_ADDRESS : std_ulogic_vector(63 downto 0) := (others => '0'); LOG_LENGTH : natural := 512 ); @@ -244,6 +245,7 @@ begin decode2_0: entity work.decode2 generic map ( EX1_BYPASS => EX1_BYPASS, + HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH ) port map ( @@ -267,6 +269,7 @@ begin register_file_0: entity work.register_file generic map ( SIM => SIM, + HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH ) port map ( @@ -280,7 +283,7 @@ begin dbg_gpr_data => dbg_gpr_data, sim_dump => terminate, sim_dump_done => sim_cr_dump, - log_out => log_data(255 downto 185) + log_out => log_data(255 downto 184) ); cr_file_0: entity work.cr_file @@ -294,12 +297,13 @@ begin d_out => cr_file_to_decode2, w_in => writeback_to_cr_file, sim_dump => sim_cr_dump, - log_out => log_data(184 downto 172) + log_out => log_data(183 downto 171) ); execute1_0: entity work.execute1 generic map ( EX1_BYPASS => EX1_BYPASS, + HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH ) port map ( @@ -324,6 +328,7 @@ begin loadstore1_0: entity work.loadstore1 generic map ( + HAS_FPU => HAS_FPU, LOG_LENGTH => LOG_LENGTH ) port map ( @@ -368,7 +373,7 @@ begin stall_out => dcache_stall_out, wishbone_in => wishbone_data_in, wishbone_out => wishbone_data_out, - log_out => log_data(171 downto 152) + log_out => log_data(170 downto 151) ); writeback_0: entity work.writeback @@ -381,7 +386,7 @@ begin complete_out => complete ); - log_data(151 downto 150) <= "00"; + log_data(150) <= '0'; log_data(139 downto 135) <= "00000"; debug_0: entity work.core_debug diff --git a/decode1.vhdl b/decode1.vhdl index a7d5910..75da175 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -72,6 +72,10 @@ architecture behaviour of decode1 is 10 => (ALU, OP_CMP, RA, CONST_UI, NONE, NONE, '0', '1', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli 34 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lbz 35 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lbzu + 50 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfd + 51 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdu +-- 48 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfs +-- 49 => (LDST, OP_FPLOAD, RA_OR_ZERO, CONST_SI, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsu 42 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lha 43 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhau 40 => (LDST, OP_LOAD, RA_OR_ZERO, CONST_SI, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lhz @@ -87,6 +91,10 @@ architecture behaviour of decode1 is 17 => (ALU, OP_SC, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sc 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu + 54 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfd + 55 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdu +-- 52 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfs +-- 53 => (LDST, OP_FPSTORE, RA_OR_ZERO, CONST_SI, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsu 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw @@ -272,6 +280,12 @@ architecture behaviour of decode1 is 2#1101110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldcix 2#0000110101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- ldux 2#0000010101# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- ldx + 2#1001010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfdx + 2#1001110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- lfdux + 2#1101010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwax + 2#1101110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- lfiwzx +-- 2#1000010111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- lfsx +-- 2#1000110111# => (LDST, OP_FPLOAD, RA_OR_ZERO, RB, NONE, FRT, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- lfsux 2#0001110100# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- lharx 2#0101110111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '1', '0', '0', '0', NONE, '0', '0'), -- lhaux 2#0101010111# => (LDST, OP_LOAD, RA_OR_ZERO, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '0'), -- lhax @@ -350,6 +364,11 @@ architecture behaviour of decode1 is 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stdcx 2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux 2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx + 2#1011010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfdx + 2#1011110111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stfdux + 2#1111010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stfiwx +-- 2#1010010111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '1', '0', NONE, '0', '0'), -- stfsx +-- 2#1010110111# => (LDST, OP_FPSTORE, RA_OR_ZERO, RB, FRS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '1', '0', NONE, '0', '0'), -- stfsux 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx 2#1110110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthcix 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- sthcx diff --git a/decode2.vhdl b/decode2.vhdl index a2a602c..6cc74c7 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -11,6 +11,7 @@ use work.insn_helpers.all; entity decode2 is generic ( EX1_BYPASS : boolean := true; + HAS_FPU : boolean := true; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -73,7 +74,7 @@ architecture behaviour of decode2 is -- If it's all 0, we don't treat it as a dependency as slow SPRs -- operations are single issue. -- - assert is_fast_spr(ispr) = '1' or ispr = "000000" + assert is_fast_spr(ispr) = '1' or ispr = "0000000" report "Decode A says SPR but ISPR is invalid:" & to_hstring(ispr) severity failure; return (is_fast_spr(ispr), ispr, reg_data); @@ -118,7 +119,7 @@ architecture behaviour of decode2 is -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. -- If it's all 0, we don't treat it as a dependency as slow SPRs -- operations are single issue. - assert is_fast_spr(ispr) = '1' or ispr = "000000" + assert is_fast_spr(ispr) = '1' or ispr = "0000000" report "Decode B says SPR but ISPR is invalid:" & to_hstring(ispr) severity failure; ret := (is_fast_spr(ispr), ispr, reg_data); @@ -137,6 +138,12 @@ architecture behaviour of decode2 is return ('1', gpr_to_gspr(insn_rs(insn_in)), reg_data); when RCR => return ('1', gpr_to_gspr(insn_rcreg(insn_in)), reg_data); + when FRS => + if HAS_FPU then + return ('1', fpr_to_gspr(insn_frt(insn_in)), reg_data); + else + return ('0', (others => '0'), (others => '0')); + end if; when NONE => return ('0', (others => '0'), (others => '0')); end case; @@ -150,16 +157,22 @@ architecture behaviour of decode2 is return ('1', gpr_to_gspr(insn_rt(insn_in))); when RA => return ('1', gpr_to_gspr(insn_ra(insn_in))); + when FRT => + if HAS_FPU then + return ('1', fpr_to_gspr(insn_frt(insn_in))); + else + return ('0', "0000000"); + end if; when SPR => -- ISPR must be either a valid fast SPR number or all 0 for a slow SPR. -- If it's all 0, we don't treat it as a dependency as slow SPRs -- operations are single issue. - assert is_fast_spr(ispr) = '1' or ispr = "000000" + assert is_fast_spr(ispr) = '1' or ispr = "0000000" report "Decode B says SPR but ISPR is invalid:" & to_hstring(ispr) severity failure; return (is_fast_spr(ispr), ispr); when NONE => - return ('0', "000000"); + return ('0', "0000000"); end case; end; @@ -212,7 +225,7 @@ architecture behaviour of decode2 is signal gpr_b_bypass : std_ulogic; signal gpr_c_read_valid : std_ulogic; - signal gpr_c_read : gpr_index_t; + signal gpr_c_read : gspr_index_t; signal gpr_c_bypass : std_ulogic; signal cr_write_valid : std_ulogic; @@ -284,8 +297,9 @@ begin else gpr_to_gspr(insn_ra(d_in.insn)); r_out.read2_reg <= d_in.ispr2 when d_in.decode.input_reg_b = SPR else gpr_to_gspr(insn_rb(d_in.insn)); - r_out.read3_reg <= insn_rcreg(d_in.insn) when d_in.decode.input_reg_c = RCR - else insn_rs(d_in.insn); + r_out.read3_reg <= gpr_to_gspr(insn_rcreg(d_in.insn)) when d_in.decode.input_reg_c = RCR + else fpr_to_gspr(insn_frt(d_in.insn)) when d_in.decode.input_reg_c = FRS and HAS_FPU + else gpr_to_gspr(insn_rs(d_in.insn)); c_out.read <= d_in.decode.input_cr; @@ -394,7 +408,7 @@ begin gpr_b_read <= decoded_reg_b.reg; gpr_c_read_valid <= decoded_reg_c.reg_valid; - gpr_c_read <= gspr_to_gpr(decoded_reg_c.reg); + gpr_c_read <= decoded_reg_c.reg; cr_write_valid <= d_in.decode.output_cr or decode_rc(d_in.decode.rc, d_in.insn); cr_bypass_avail <= '0'; diff --git a/decode_types.vhdl b/decode_types.vhdl index ef654c3..8c20441 100644 --- a/decode_types.vhdl +++ b/decode_types.vhdl @@ -10,6 +10,7 @@ package decode_types is OP_DCBZ, OP_DIV, OP_DIVE, OP_EXTS, OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC, OP_LOAD, OP_STORE, + OP_FPLOAD, OP_FPSTORE, OP_MCRXRX, OP_MFCR, OP_MFMSR, OP_MFSPR, OP_MOD, OP_MTCRF, OP_MTMSRD, OP_MTSPR, OP_MUL_L64, OP_MUL_H64, OP_MUL_H32, OP_OR, @@ -24,8 +25,8 @@ package decode_types is type input_reg_a_t is (NONE, RA, RA_OR_ZERO, SPR, CIA); type input_reg_b_t is (NONE, RB, CONST_UI, CONST_SI, CONST_SI_HI, CONST_UI_HI, CONST_LI, CONST_BD, CONST_DXHI4, CONST_DS, CONST_M1, CONST_SH, CONST_SH32, SPR); - type input_reg_c_t is (NONE, RS, RCR); - type output_reg_a_t is (NONE, RT, RA, SPR); + type input_reg_c_t is (NONE, RS, RCR, FRS); + type output_reg_a_t is (NONE, RT, RA, SPR, FRT); type rc_t is (NONE, ONE, RC); type carry_in_t is (ZERO, CA, OV, ONE); diff --git a/execute1.vhdl b/execute1.vhdl index 04cc970..4d6a9cc 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -13,6 +13,7 @@ use work.ppc_fx_insns.all; entity execute1 is generic ( EX1_BYPASS : boolean := true; + HAS_FPU : boolean := true; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -542,6 +543,9 @@ begin ctrl_tmp.msr(MSR_PR) <= '0'; ctrl_tmp.msr(MSR_SE) <= '0'; ctrl_tmp.msr(MSR_BE) <= '0'; + ctrl_tmp.msr(MSR_FP) <= '0'; + ctrl_tmp.msr(MSR_FE0) <= '0'; + ctrl_tmp.msr(MSR_FE1) <= '0'; ctrl_tmp.msr(MSR_IR) <= '0'; ctrl_tmp.msr(MSR_DR) <= '0'; ctrl_tmp.msr(MSR_RI) <= '0'; @@ -578,7 +582,19 @@ begin -- set bit 45 to indicate privileged instruction type interrupt ctrl_tmp.srr1(63 - 45) <= '1'; report "privileged instruction"; - + + elsif not HAS_FPU and valid_in = '1' and + (e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then + -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations + illegal := '1'; + + elsif HAS_FPU and valid_in = '1' and ctrl.msr(MSR_FP) = '0' and + (e_in.insn_type = OP_FPLOAD or e_in.insn_type = OP_FPSTORE) then + -- generate a floating-point unavailable interrupt + exception := '1'; + v.f.redirect_nia := std_logic_vector(to_unsigned(16#800#, 64)); + report "FP unavailable interrupt"; + elsif valid_in = '1' and e_in.unit = ALU then report "execute nia " & to_hstring(e_in.nia); @@ -1225,7 +1241,7 @@ begin lv.addr1 := a_in; lv.addr2 := b_in; lv.data := c_in; - lv.write_reg := gspr_to_gpr(e_in.write_reg); + lv.write_reg := e_in.write_reg; lv.length := e_in.data_len; lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE); lv.sign_extend := e_in.sign_extend; diff --git a/fpga/top-arty.vhdl b/fpga/top-arty.vhdl index a4d253d..8a3dc7a 100644 --- a/fpga/top-arty.vhdl +++ b/fpga/top-arty.vhdl @@ -14,6 +14,7 @@ entity toplevel is RAM_INIT_FILE : string := "firmware.hex"; RESET_LOW : boolean := true; CLK_FREQUENCY : positive := 100000000; + HAS_FPU : boolean := true; USE_LITEDRAM : boolean := false; NO_BRAM : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; @@ -168,6 +169,7 @@ begin RAM_INIT_FILE => RAM_INIT_FILE, SIM => false, CLK_FREQ => CLK_FREQUENCY, + HAS_FPU => HAS_FPU, HAS_DRAM => USE_LITEDRAM, DRAM_SIZE => 256 * 1024 * 1024, DRAM_INIT_SIZE => PAYLOAD_SIZE, diff --git a/fpga/top-generic.vhdl b/fpga/top-generic.vhdl index 2300456..2ad0dd3 100644 --- a/fpga/top-generic.vhdl +++ b/fpga/top-generic.vhdl @@ -11,6 +11,7 @@ entity toplevel is RESET_LOW : boolean := true; CLK_INPUT : positive := 100000000; CLK_FREQUENCY : positive := 100000000; + HAS_FPU : boolean := true; DISABLE_FLATTEN_CORE : boolean := false; UART_IS_16550 : boolean := true ); @@ -68,6 +69,7 @@ begin RAM_INIT_FILE => RAM_INIT_FILE, SIM => false, CLK_FREQ => CLK_FREQUENCY, + HAS_FPU => HAS_FPU, DISABLE_FLATTEN_CORE => DISABLE_FLATTEN_CORE, UART0_IS_16550 => UART_IS_16550 ) diff --git a/fpga/top-nexys-video.vhdl b/fpga/top-nexys-video.vhdl index 745ef79..1942b10 100644 --- a/fpga/top-nexys-video.vhdl +++ b/fpga/top-nexys-video.vhdl @@ -14,6 +14,7 @@ entity toplevel is RAM_INIT_FILE : string := "firmware.hex"; RESET_LOW : boolean := true; CLK_FREQUENCY : positive := 100000000; + HAS_FPU : boolean := true; USE_LITEDRAM : boolean := false; NO_BRAM : boolean := false; DISABLE_FLATTEN_CORE : boolean := false; @@ -120,6 +121,7 @@ begin RAM_INIT_FILE => RAM_INIT_FILE, SIM => false, CLK_FREQ => CLK_FREQUENCY, + HAS_FPU => HAS_FPU, HAS_DRAM => USE_LITEDRAM, DRAM_SIZE => 512 * 1024 * 1024, DRAM_INIT_SIZE => PAYLOAD_SIZE, diff --git a/gpr_hazard.vhdl b/gpr_hazard.vhdl index 0fa66c5..fec03c7 100644 --- a/gpr_hazard.vhdl +++ b/gpr_hazard.vhdl @@ -2,6 +2,9 @@ library ieee; use ieee.std_logic_1164.all; use ieee.numeric_std.all; +library work; +use work.common.all; + entity gpr_hazard is generic ( PIPELINE_DEPTH : natural := 1 @@ -15,13 +18,13 @@ entity gpr_hazard is issuing : in std_ulogic; gpr_write_valid_in : in std_ulogic; - gpr_write_in : in std_ulogic_vector(5 downto 0); + gpr_write_in : in gspr_index_t; bypass_avail : in std_ulogic; gpr_read_valid_in : in std_ulogic; - gpr_read_in : in std_ulogic_vector(5 downto 0); + gpr_read_in : in gspr_index_t; ugpr_write_valid : in std_ulogic; - ugpr_write_reg : in std_ulogic_vector(5 downto 0); + ugpr_write_reg : in gspr_index_t; stall_out : out std_ulogic; use_bypass : out std_ulogic @@ -31,9 +34,9 @@ architecture behaviour of gpr_hazard is type pipeline_entry_type is record valid : std_ulogic; bypass : std_ulogic; - gpr : std_ulogic_vector(5 downto 0); + gpr : gspr_index_t; ugpr_valid : std_ulogic; - ugpr : std_ulogic_vector(5 downto 0); + ugpr : gspr_index_t; end record; constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'), ugpr_valid => '0', ugpr => (others => '0')); diff --git a/insn_helpers.vhdl b/insn_helpers.vhdl index 592acb0..be3892a 100644 --- a/insn_helpers.vhdl +++ b/insn_helpers.vhdl @@ -37,6 +37,10 @@ package insn_helpers is function insn_sh (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_me (insn_in : std_ulogic_vector) return std_ulogic_vector; function insn_mb (insn_in : std_ulogic_vector) return std_ulogic_vector; + function insn_frt (insn_in : std_ulogic_vector) return std_ulogic_vector; + function insn_fra (insn_in : std_ulogic_vector) return std_ulogic_vector; + function insn_frb (insn_in : std_ulogic_vector) return std_ulogic_vector; + function insn_frc (insn_in : std_ulogic_vector) return std_ulogic_vector; end package insn_helpers; package body insn_helpers is @@ -214,4 +218,24 @@ package body insn_helpers is begin return insn_in(5) & insn_in(10 downto 6); end; + + function insn_frt(insn_in : std_ulogic_vector) return std_ulogic_vector is + begin + return insn_in(25 downto 21); + end; + + function insn_fra(insn_in : std_ulogic_vector) return std_ulogic_vector is + begin + return insn_in(20 downto 16); + end; + + function insn_frb(insn_in : std_ulogic_vector) return std_ulogic_vector is + begin + return insn_in(15 downto 11); + end; + + function insn_frc(insn_in : std_ulogic_vector) return std_ulogic_vector is + begin + return insn_in(10 downto 6); + end; end package body insn_helpers; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index e36025c..ec20319 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -5,12 +5,15 @@ use ieee.numeric_std.all; library work; use work.decode_types.all; use work.common.all; +use work.insn_helpers.all; +use work.helpers.all; -- 2 cycle LSU -- We calculate the address in the first cycle entity loadstore1 is generic ( + HAS_FPU : boolean := true; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -58,7 +61,7 @@ architecture behave of loadstore1 is addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); load_data : std_ulogic_vector(63 downto 0); - write_reg : gpr_index_t; + write_reg : gspr_index_t; length : std_ulogic_vector(3 downto 0); byte_reverse : std_ulogic; sign_extend : std_ulogic; @@ -431,6 +434,17 @@ begin v.align_intr := v.nc; req := '1'; v.dcbz := '1'; + when OP_FPSTORE => + if HAS_FPU then + req := '1'; + end if; + when OP_FPLOAD => + if HAS_FPU then + v.load := '1'; + req := '1'; + -- Allow an extra cycle for RA update + v.extra_cycle := l_in.update; + end if; when OP_TLBIE => mmureq := '1'; v.tlbie := '1'; @@ -523,7 +537,7 @@ begin l_out.write_data <= r.sprval; elsif do_update = '1' then l_out.write_enable <= '1'; - l_out.write_reg <= r.update_reg; + l_out.write_reg <= gpr_to_gspr(r.update_reg); l_out.write_data <= r.addr; else l_out.write_enable <= write_enable; diff --git a/microwatt.core b/microwatt.core index cd24a06..3b47339 100644 --- a/microwatt.core +++ b/microwatt.core @@ -132,6 +132,7 @@ targets: - disable_flatten_core - log_length=2048 - uart_is_16550 + - has_fpu tools: vivado: {part : xc7a100tcsg324-1} toplevel : toplevel @@ -215,6 +216,7 @@ targets: - spi_flash_offset=10485760 - log_length=2048 - uart_is_16550 + - has_fpu tools: vivado: {part : xc7a200tsbg484-1} toplevel : toplevel @@ -231,6 +233,7 @@ targets: - spi_flash_offset=10485760 - log_length=2048 - uart_is_16550 + - has_fpu generate: [litedram_nexys_video] tools: vivado: {part : xc7a200tsbg484-1} @@ -249,6 +252,7 @@ targets: - log_length=512 - uart_is_16550 - has_uart1 + - has_fpu=false tools: vivado: {part : xc7a35ticsg324-1L} toplevel : toplevel @@ -267,6 +271,7 @@ targets: - log_length=512 - uart_is_16550 - has_uart1 + - has_fpu=false generate: [litedram_arty, liteeth_arty] tools: vivado: {part : xc7a35ticsg324-1L} @@ -285,6 +290,7 @@ targets: - log_length=2048 - uart_is_16550 - has_uart1 + - has_fpu tools: vivado: {part : xc7a100ticsg324-1L} toplevel : toplevel @@ -303,6 +309,7 @@ targets: - log_length=2048 - uart_is_16550 - has_uart1 + - has_fpu generate: [litedram_arty, liteeth_arty] tools: vivado: {part : xc7a100ticsg324-1L} @@ -320,6 +327,7 @@ targets: - disable_flatten_core - log_length=512 - uart_is_16550 + - has_fpu=false tools: vivado: {part : xc7a35tcpg236-1} toplevel : toplevel @@ -380,6 +388,12 @@ parameters: paramtype : generic default : 100000000 + has_fpu: + datatype : bool + description : Include a floating-point unit in the core + paramtype : generic + default : true + disable_flatten_core: datatype : bool description : Prevent Vivado from flattening the main core components diff --git a/register_file.vhdl b/register_file.vhdl index 10f28a4..32c8490 100644 --- a/register_file.vhdl +++ b/register_file.vhdl @@ -8,6 +8,7 @@ use work.common.all; entity register_file is generic ( SIM : boolean := false; + HAS_FPU : boolean := true; -- Non-zero to enable log data collection LOG_LENGTH : natural := 0 ); @@ -28,12 +29,12 @@ entity register_file is sim_dump : in std_ulogic; sim_dump_done : out std_ulogic; - log_out : out std_ulogic_vector(70 downto 0) + log_out : out std_ulogic_vector(71 downto 0) ); end entity register_file; architecture behaviour of register_file is - type regfile is array(0 to 63) of std_ulogic_vector(63 downto 0); + type regfile is array(0 to 127) of std_ulogic_vector(63 downto 0); signal registers : regfile := (others => (others => '0')); signal rd_port_b : std_ulogic_vector(63 downto 0); signal dbg_data : std_ulogic_vector(63 downto 0); @@ -41,53 +42,73 @@ architecture behaviour of register_file is begin -- synchronous writes register_write_0: process(clk) + variable w_addr : gspr_index_t; begin if rising_edge(clk) then if w_in.write_enable = '1' then - if w_in.write_reg(5) = '0' then - report "Writing GPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); - else - report "Writing GSPR " & to_hstring(w_in.write_reg) & " " & to_hstring(w_in.write_data); - end if; + w_addr := w_in.write_reg; + if HAS_FPU and w_addr(6) = '1' then + report "Writing FPR " & to_hstring(w_addr(4 downto 0)) & " " & to_hstring(w_in.write_data); + else + w_addr(6) := '0'; + if w_addr(5) = '0' then + report "Writing GPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data); + else + report "Writing GSPR " & to_hstring(w_addr) & " " & to_hstring(w_in.write_data); + end if; + end if; assert not(is_x(w_in.write_data)) and not(is_x(w_in.write_reg)) severity failure; - registers(to_integer(unsigned(w_in.write_reg))) <= w_in.write_data; + registers(to_integer(unsigned(w_addr))) <= w_in.write_data; end if; end if; end process register_write_0; -- asynchronous reads register_read_0: process(all) - variable b_addr : gspr_index_t; + variable a_addr, b_addr, c_addr : gspr_index_t; + variable w_addr : gspr_index_t; begin + a_addr := d_in.read1_reg; + b_addr := d_in.read2_reg; + c_addr := d_in.read3_reg; + w_addr := w_in.write_reg; + if not HAS_FPU then + -- Make it obvious that we only want 64 GSPRs for a no-FPU implementation + a_addr(6) := '0'; + b_addr(6) := '0'; + c_addr(6) := '0'; + w_addr(6) := '0'; + end if; if d_in.read1_enable = '1' then - report "Reading GPR " & to_hstring(d_in.read1_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read1_reg)))); + report "Reading GPR " & to_hstring(a_addr) & " " & to_hstring(registers(to_integer(unsigned(a_addr)))); end if; if d_in.read2_enable = '1' then - report "Reading GPR " & to_hstring(d_in.read2_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read2_reg)))); + report "Reading GPR " & to_hstring(b_addr) & " " & to_hstring(registers(to_integer(unsigned(b_addr)))); end if; if d_in.read3_enable = '1' then - report "Reading GPR " & to_hstring(d_in.read3_reg) & " " & to_hstring(registers(to_integer(unsigned(d_in.read3_reg)))); + report "Reading GPR " & to_hstring(c_addr) & " " & to_hstring(registers(to_integer(unsigned(c_addr)))); end if; - d_out.read1_data <= registers(to_integer(unsigned(d_in.read1_reg))); + d_out.read1_data <= registers(to_integer(unsigned(a_addr))); -- B read port is multiplexed with reads from the debug circuitry if d_in.read2_enable = '0' and dbg_gpr_req = '1' and dbg_ack = '0' then b_addr := dbg_gpr_addr; - else - b_addr := d_in.read2_reg; + if not HAS_FPU then + b_addr(6) := '0'; + end if; end if; rd_port_b <= registers(to_integer(unsigned(b_addr))); d_out.read2_data <= rd_port_b; - d_out.read3_data <= registers(to_integer(unsigned(gpr_to_gspr(d_in.read3_reg)))); + d_out.read3_data <= registers(to_integer(unsigned(c_addr))); -- Forward any written data if w_in.write_enable = '1' then - if d_in.read1_reg = w_in.write_reg then + if a_addr = w_addr then d_out.read1_data <= w_in.write_data; end if; - if d_in.read2_reg = w_in.write_reg then + if b_addr = w_addr then d_out.read2_data <= w_in.write_data; end if; - if gpr_to_gspr(d_in.read3_reg) = w_in.write_reg then + if c_addr = w_addr then d_out.read3_data <= w_in.write_data; end if; end if; @@ -136,7 +157,7 @@ begin end generate; rf_log: if LOG_LENGTH > 0 generate - signal log_data : std_ulogic_vector(70 downto 0); + signal log_data : std_ulogic_vector(71 downto 0); begin reg_log: process(clk) begin diff --git a/scripts/fmt_log/fmt_log.c b/scripts/fmt_log/fmt_log.c index 146346d..eca4bf0 100644 --- a/scripts/fmt_log/fmt_log.c +++ b/scripts/fmt_log/fmt_log.c @@ -58,7 +58,7 @@ struct log_entry { u64 ls_lo_valid: 1; u64 ls_eo_except: 1; u64 ls_stall_out: 1; - u64 pad2: 2; + u64 pad2: 1; u64 dc_state: 3; u64 dc_ra_valid: 1; u64 dc_tlb_way: 3; @@ -74,7 +74,7 @@ struct log_entry { u64 cr_wr_mask: 8; u64 cr_wr_data: 4; u64 cr_wr_enable: 1; - u64 reg_wr_reg: 6; + u64 reg_wr_reg: 7; u64 reg_wr_enable: 1; u64 reg_wr_data; @@ -90,11 +90,11 @@ const char *ops[64] = "illegal", "nop ", "add ", "and ", "attn ", "b ", "bc ", "bcreg ", "bperm ", "cmp ", "cmpb ", "cmpeqb ", "cmprb ", "cntz ", "crop ", "darn ", "dcbf ", "dcbst ", "dcbt ", "dcbtst ", "dcbz ", "div ", "dive ", "exts ", - "extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "mcrxrx ", - "mfcr ", "mfmsr ", "mfspr ", "mod ", "mtcrf ", "mtmsr ", "mtspr ", "mull64 ", - "mulh64 ", "mulh32 ", "or ", "popcnt ", "prty ", "rfid ", "rlc ", "rlcl ", - "rlcr ", "sc ", "setb ", "shl ", "shr ", "sync ", "tlbie ", "trap ", - "xor ", "bcd ", "addg6s ", "ffail ", "?60 ", "?61 ", "?62 ", "?63 " + "extswsl", "icbi ", "icbt ", "isel ", "isync ", "ld ", "st ", "fpload ", + "fpstore", "mcrxrx ", "mfcr ", "mfmsr ", "mfspr ", "mod ", "mtcrf ", "mtmsr ", + "mtspr ", "mull64 ", "mulh64 ", "mulh32 ", "or ", "popcnt ", "prty ", "rfid ", + "rlc ", "rlcl ", "rlcr ", "sc ", "setb ", "shl ", "shr ", "sync ", + "tlbie ", "trap ", "xor ", "bcd ", "addg6s ", "ffail ", "?62 ", "?63 " }; const char *spr_names[13] = diff --git a/soc.vhdl b/soc.vhdl index 0a70026..7ab146f 100644 --- a/soc.vhdl +++ b/soc.vhdl @@ -52,6 +52,7 @@ entity soc is RAM_INIT_FILE : string; CLK_FREQ : positive; SIM : boolean; + HAS_FPU : boolean := true; DISABLE_FLATTEN_CORE : boolean := false; HAS_DRAM : boolean := false; DRAM_SIZE : integer := 0; @@ -253,6 +254,7 @@ begin processor: entity work.core generic map( SIM => SIM, + HAS_FPU => HAS_FPU, DISABLE_FLATTEN => DISABLE_FLATTEN_CORE, ALT_RESET_ADDRESS => (23 downto 0 => '0', others => '1'), LOG_LENGTH => LOG_LENGTH diff --git a/writeback.vhdl b/writeback.vhdl index 053a8ba..d0230d8 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -80,7 +80,7 @@ begin end if; if l_in.write_enable = '1' then - w_out.write_reg <= gpr_to_gspr(l_in.write_reg); + w_out.write_reg <= l_in.write_reg; w_out.write_data <= l_in.write_data; w_out.write_enable <= '1'; end if;