use ieee.numeric_std.all;
library work;
+use work.decode_types.all;
use work.common.all;
+use work.insn_helpers.all;
use work.helpers.all;
-- 2 cycle LSU
-- We calculate the address in the first cycle
entity loadstore1 is
+ generic (
+ -- When true, the single-precision FP conversion logic (ls_fp_conv) is generated
+ HAS_FPU : boolean := true;
+ -- Non-zero to enable log data collection
+ LOG_LENGTH : natural := 0
+ );
port (
clk : in std_ulogic;
+ -- Synchronous reset: clears the state machine, busy and do_update
+ rst : in std_ulogic;
+
+ -- Request from execute1, busy/exception status back to execute1,
+ -- and register write-back results
+ l_in : in Execute1ToLoadstore1Type;
+ e_out : out Loadstore1ToExecute1Type;
+ l_out : out Loadstore1ToWritebackType;
+
+ -- Request to and response from the dcache
+ d_out : out Loadstore1ToDcacheType;
+ d_in : in DcacheToLoadstore1Type;
- l_in : in Decode2ToLoadstore1Type;
+ -- Request to and response from the MMU
+ m_out : out Loadstore1ToMmuType;
+ m_in : in MmuToLoadstore1Type;
- l_out : out Loadstore1ToDcacheType
+ -- NOTE(review): dc_stall is not read by the architecture as shown here
+ dc_stall : in std_ulogic;
+
+ -- Log record; only driven when LOG_LENGTH > 0 (see l1_log)
+ log_out : out std_ulogic_vector(9 downto 0)
);
end loadstore1;
+-- Note, we don't currently use the stall output from the dcache because
+-- we know it can take two requests without stalling when idle, we are
+-- its only user, and we know it never stalls when idle.
+
architecture behave of loadstore1 is
- signal r, rin : Loadstore1ToDcacheType;
+
+ -- State machine for unaligned loads/stores
+ type state_t is (IDLE, -- ready for instruction
+ SECOND_REQ, -- send 2nd request of unaligned xfer
+ ACK_WAIT, -- waiting for ack from dcache
+ MMU_LOOKUP, -- waiting for MMU to look up translation
+ TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie
+ FINISH_LFS, -- write back converted SP data for lfs*
+ COMPLETE -- extra cycle to complete an operation
+ );
+
+ -- Per-destination-byte controls for the load data formatter:
+ -- byte_index_t selects which source byte of d_in.data feeds each byte,
+ -- trim_ctl_t selects between saved data, new data, sign fill and zero.
+ type byte_index_t is array(0 to 7) of unsigned(2 downto 0);
+ subtype byte_trim_t is std_ulogic_vector(1 downto 0);
+ type trim_ctl_t is array(0 to 7) of byte_trim_t;
+
+ -- All registered state of the unit (signal r); rin is its next value.
+ type reg_stage_t is record
+ -- latch most of the input request
+ load : std_ulogic;
+ tlbie : std_ulogic;
+ dcbz : std_ulogic;
+ addr : std_ulogic_vector(63 downto 0);
+ store_data : std_ulogic_vector(63 downto 0);
+ -- permuted data from the first dword of a two-dword load
+ load_data : std_ulogic_vector(63 downto 0);
+ write_reg : gspr_index_t;
+ length : std_ulogic_vector(3 downto 0);
+ byte_reverse : std_ulogic;
+ -- store formatting controls: rotate amount and byte-reverse XOR mask
+ byte_offset : unsigned(2 downto 0);
+ brev_mask : unsigned(2 downto 0);
+ sign_extend : std_ulogic;
+ update : std_ulogic;
+ update_reg : gpr_index_t;
+ xerc : xer_common_t;
+ reserve : std_ulogic;
+ atomic : std_ulogic;
+ atomic_last : std_ulogic;
+ rc : std_ulogic;
+ nc : std_ulogic; -- non-cacheable access
+ virt_mode : std_ulogic;
+ priv_mode : std_ulogic;
+ state : state_t;
+ -- set once the first of two dword responses has been received
+ dwords_done : std_ulogic;
+ -- '0' while the response to the first of two dword requests is pending
+ last_dword : std_ulogic;
+ -- byte selects for the first and second dword (from xfer_data_sel)
+ first_bytes : std_ulogic_vector(7 downto 0);
+ second_bytes : std_ulogic_vector(7 downto 0);
+ -- DAR and DSISR values, written by mtspr or when an exception is raised
+ dar : std_ulogic_vector(63 downto 0);
+ dsisr : std_ulogic_vector(31 downto 0);
+ instr_fault : std_ulogic;
+ align_intr : std_ulogic;
+ -- value to be written back for mfspr
+ sprval : std_ulogic_vector(63 downto 0);
+ -- busy is dropped when the event selected by wait_dcache / wait_mmu arrives
+ busy : std_ulogic;
+ wait_dcache : std_ulogic;
+ wait_mmu : std_ulogic;
+ -- write back the rA update in the following cycle
+ do_update : std_ulogic;
+ -- operation needs a further cycle after the dcache response
+ extra_cycle : std_ulogic;
+ mode_32bit : std_ulogic;
+ -- load formatter controls, computed one cycle ahead of their use
+ byte_index : byte_index_t;
+ use_second : std_ulogic_vector(7 downto 0);
+ trim_ctl : trim_ctl_t;
+ -- single-precision load: formatted data and pre-computed facts about its fraction
+ load_sp : std_ulogic;
+ ld_sp_data : std_ulogic_vector(31 downto 0);
+ ld_sp_nz : std_ulogic;
+ ld_sp_lz : std_ulogic_vector(5 downto 0);
+ -- selects the source of the writeback data (see the case on r.wr_sel)
+ wr_sel : std_ulogic_vector(1 downto 0);
+ end record;
+
+ signal r, rin : reg_stage_t;
signal lsu_sum : std_ulogic_vector(63 downto 0);
+
+ -- Outputs of the dp_to_sp and sp_to_dp conversion processes
+ signal store_sp_data : std_ulogic_vector(31 downto 0);
+ signal load_dp_data : std_ulogic_vector(63 downto 0);
+
+ -- Translate a transfer size in bytes (1, 2, 4 or 8, one-hot encoded) into a
+ -- right-justified 8-bit byte-enable mask. Any other size selects no bytes.
+ function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
+ begin
+     if length = "0001" then
+         return "00000001";
+     elsif length = "0010" then
+         return "00000011";
+     elsif length = "0100" then
+         return "00001111";
+     elsif length = "1000" then
+         return "11111111";
+     else
+         return "00000000";
+     end if;
+ end function length_to_sel;
+
+ -- Byte enables for a possibly unaligned transfer.
+ -- The mask for the given size is shifted left by the byte offset within the
+ -- doubleword, giving 16 select bits: bits 7..0 apply to the first transfer
+ -- and bits 15..8 to a second transfer when the access spills over.
+ function xfer_data_sel(size : in std_logic_vector(3 downto 0);
+                        address : in std_logic_vector(2 downto 0))
+     return std_ulogic_vector is
+     variable wide_sel : unsigned(15 downto 0);
+     variable offset : natural;
+ begin
+     offset := to_integer(unsigned(address));
+     -- zero-extend the 8-bit mask to 16 bits before shifting
+     wide_sel := resize(unsigned(length_to_sel(size)), 16);
+     return std_ulogic_vector(shift_left(wide_sel, offset));
+ end function xfer_data_sel;
+
+ -- 23-bit logical right shifter used by the DP -> SP float conversion.
+ -- Works in two stages: shift(1 downto 0) moves by 0..3 bits, then
+ -- shift(4 downto 2) moves by a further 0, 4, 8, 12 or 16 bits; every
+ -- remaining encoding of shift(4 downto 2) moves by 20 bits.
+ function shifter_23r(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+     return std_ulogic_vector is
+     variable fine : std_ulogic_vector(22 downto 0);
+     variable coarse : std_ulogic_vector(22 downto 0);
+ begin
+     -- fine stage: vacated high bits stay at the zero default
+     fine := (others => '0');
+     case shift(1 downto 0) is
+         when "00" =>
+             fine := frac;
+         when "01" =>
+             fine(21 downto 0) := frac(22 downto 1);
+         when "10" =>
+             fine(20 downto 0) := frac(22 downto 2);
+         when others =>
+             fine(19 downto 0) := frac(22 downto 3);
+     end case;
+     -- coarse stage: multiples of four bits
+     coarse := (others => '0');
+     case shift(4 downto 2) is
+         when "000" =>
+             coarse := fine;
+         when "001" =>
+             coarse(18 downto 0) := fine(22 downto 4);
+         when "010" =>
+             coarse(14 downto 0) := fine(22 downto 8);
+         when "011" =>
+             coarse(10 downto 0) := fine(22 downto 12);
+         when "100" =>
+             coarse(6 downto 0) := fine(22 downto 16);
+         when others =>
+             coarse(2 downto 0) := fine(22 downto 20);
+     end case;
+     return coarse;
+ end;
+
+ -- 23-bit logical left shifter used by the SP -> DP float conversion.
+ -- Works in two stages: shift(1 downto 0) moves by 0..3 bits, then
+ -- shift(4 downto 2) moves by a further 0, 4, 8, 12 or 16 bits; every
+ -- remaining encoding of shift(4 downto 2) moves by 20 bits.
+ function shifter_23l(frac: std_ulogic_vector(22 downto 0); shift: unsigned(4 downto 0))
+     return std_ulogic_vector is
+     variable fine : std_ulogic_vector(22 downto 0);
+     variable coarse : std_ulogic_vector(22 downto 0);
+ begin
+     -- fine stage: vacated low bits stay at the zero default
+     fine := (others => '0');
+     case shift(1 downto 0) is
+         when "00" =>
+             fine := frac;
+         when "01" =>
+             fine(22 downto 1) := frac(21 downto 0);
+         when "10" =>
+             fine(22 downto 2) := frac(20 downto 0);
+         when others =>
+             fine(22 downto 3) := frac(19 downto 0);
+     end case;
+     -- coarse stage: multiples of four bits
+     coarse := (others => '0');
+     case shift(4 downto 2) is
+         when "000" =>
+             coarse := fine;
+         when "001" =>
+             coarse(22 downto 4) := fine(18 downto 0);
+         when "010" =>
+             coarse(22 downto 8) := fine(14 downto 0);
+         when "011" =>
+             coarse(22 downto 12) := fine(10 downto 0);
+         when "100" =>
+             coarse(22 downto 16) := fine(6 downto 0);
+         when others =>
+             coarse(22 downto 20) := fine(2 downto 0);
+     end case;
+     return coarse;
+ end;
+
begin
-- Calculate the address in the first cycle
+ -- The sum of the two address operands; forced to all zeros whenever
+ -- l_in.valid is not '1'.
lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when l_in.valid = '1' else (others => '0');
+ -- State register: r takes the value of rin on every rising clock edge.
+ -- The reset is synchronous and only assigns state, busy and do_update;
+ -- the other fields of r are not assigned while rst is asserted.
loadstore1_0: process(clk)
begin
if rising_edge(clk) then
- r <= rin;
+ if rst = '1' then
+ r.state <= IDLE;
+ r.busy <= '0';
+ r.do_update <= '0';
+ else
+ r <= rin;
+ end if;
end if;
end process;
+ ls_fp_conv: if HAS_FPU generate
+     -- Double-precision to single-precision conversion of the store data
+     -- (used for stfs). The result is assembled in a local variable and
+     -- driven onto store_sp_data in one assignment.
+     dp_to_sp: process(all)
+         variable dp_exp : unsigned(10 downto 0);
+         variable mant : std_ulogic_vector(22 downto 0);
+         variable denorm_shift : unsigned(4 downto 0);
+         variable sp : std_ulogic_vector(31 downto 0);
+     begin
+         dp_exp := unsigned(l_in.data(62 downto 52));
+         -- start from a signed zero; small exponents leave it that way
+         sp := (others => '0');
+         sp(31) := l_in.data(63);
+         if dp_exp > 896 then
+             -- representable as a normalized SP value (also covers inf/NaN)
+             sp(30) := l_in.data(62);
+             sp(29 downto 0) := l_in.data(58 downto 29);
+         elsif dp_exp >= 874 then
+             -- denormalization required
+             mant := '1' & l_in.data(51 downto 30);
+             denorm_shift := 0 - dp_exp(4 downto 0);
+             sp(22 downto 0) := shifter_23r(mant, denorm_shift);
+         end if;
+         store_sp_data <= sp;
+     end process;
+
+     -- Single-precision to double-precision conversion of the load data
+     -- (used for lfs), working from the registered r.ld_sp_* fields.
+     sp_to_dp: process(all)
+         variable sp_exp : unsigned(7 downto 0);
+         variable dp_exp : unsigned(10 downto 0);
+         variable exp_any : std_ulogic;
+         variable exp_all : std_ulogic;
+         variable mant : std_ulogic_vector(22 downto 0);
+         variable norm_shift : unsigned(4 downto 0);
+         variable dp : std_ulogic_vector(63 downto 0);
+     begin
+         sp_exp := unsigned(r.ld_sp_data(30 downto 23));
+         mant := r.ld_sp_data(22 downto 0);
+         exp_any := or (r.ld_sp_data(30 downto 23));
+         exp_all := and (r.ld_sp_data(30 downto 23));
+         norm_shift := (others => '0');
+         if exp_all = '1' then
+             -- infinity or NaN
+             dp_exp := to_unsigned(2047, 11);
+         elsif exp_any = '1' then
+             -- finite normalized value
+             dp_exp := 896 + resize(sp_exp, 11);
+         elsif r.ld_sp_nz = '0' then
+             -- zero
+             dp_exp := to_unsigned(0, 11);
+         else
+             -- denormalized SP operand, need to normalize
+             dp_exp := 896 - resize(unsigned(r.ld_sp_lz), 11);
+             norm_shift := unsigned(r.ld_sp_lz(4 downto 0)) + 1;
+         end if;
+         -- low 29 fraction bits of the DP result are always zero
+         dp := (others => '0');
+         dp(63) := r.ld_sp_data(31);
+         dp(62 downto 52) := std_ulogic_vector(dp_exp);
+         dp(51 downto 29) := shifter_23l(mant, norm_shift);
+         load_dp_data <= dp;
+     end process;
+ end generate;
+
+ -- Combinational next-state and output logic: computes rin from r and the
+ -- current inputs, and drives d_out, m_out, l_out and e_out.
loadstore1_1: process(all)
- variable v : Loadstore1ToDcacheType;
+ variable v : reg_stage_t;
+ variable brev_lenm1 : unsigned(2 downto 0);
+ variable byte_offset : unsigned(2 downto 0);
+ variable j : integer;
+ variable k : unsigned(2 downto 0);
+ variable kk : unsigned(3 downto 0);
+ variable long_sel : std_ulogic_vector(15 downto 0);
+ variable byte_sel : std_ulogic_vector(7 downto 0);
+ variable req : std_ulogic;
+ variable busy : std_ulogic;
+ variable addr : std_ulogic_vector(63 downto 0);
+ variable maddr : std_ulogic_vector(63 downto 0);
+ variable wdata : std_ulogic_vector(63 downto 0);
+ variable write_enable : std_ulogic;
+ variable do_update : std_ulogic;
+ variable done : std_ulogic;
+ variable data_permuted : std_ulogic_vector(63 downto 0);
+ variable data_trimmed : std_ulogic_vector(63 downto 0);
+ variable store_data : std_ulogic_vector(63 downto 0);
+ variable byte_rev : std_ulogic;
+ variable length : std_ulogic_vector(3 downto 0);
+ variable negative : std_ulogic;
+ variable sprn : std_ulogic_vector(9 downto 0);
+ variable exception : std_ulogic;
+ variable next_addr : std_ulogic_vector(63 downto 0);
+ variable mmureq : std_ulogic;
+ variable dsisr : std_ulogic_vector(31 downto 0);
+ variable mmu_mtspr : std_ulogic;
+ variable itlb_fault : std_ulogic;
+ variable misaligned : std_ulogic;
begin
v := r;
+ req := '0';
+ mmu_mtspr := '0';
+ itlb_fault := '0';
+ sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
+ dsisr := (others => '0');
+ mmureq := '0';
+ -- default writeback source for the next cycle: formatted load data
+ -- (the "others" branch of the case on r.wr_sel further down)
+ v.wr_sel := "11";
+
+ write_enable := '0';
+
+ do_update := r.do_update;
+ v.do_update := '0';
+
+ -- load data formatting
+ -- shift and byte-reverse data bytes
+ for i in 0 to 7 loop
+ j := to_integer(r.byte_index(i)) * 8;
+ data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
+ end loop;
+
+ -- Work out the sign bit for sign extension.
+ -- For unaligned loads crossing two dwords, the sign bit is in the
+ -- first dword for big-endian (byte_reverse = 1), or the second dword
+ -- for little-endian.
+ if r.dwords_done = '1' and r.byte_reverse = '1' then
+ negative := (r.length(3) and r.load_data(63)) or
+ (r.length(2) and r.load_data(31)) or
+ (r.length(1) and r.load_data(15)) or
+ (r.length(0) and r.load_data(7));
+ else
+ negative := (r.length(3) and data_permuted(63)) or
+ (r.length(2) and data_permuted(31)) or
+ (r.length(1) and data_permuted(15)) or
+ (r.length(0) and data_permuted(7));
+ end if;
+
+ -- trim and sign-extend
+ -- trim_ctl per byte: "11" = saved first-dword data, "10" = data arriving
+ -- now, "01" = sign fill, "00" = zero
+ for i in 0 to 7 loop
+ case r.trim_ctl(i) is
+ when "11" =>
+ data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
+ when "10" =>
+ data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
+ when "01" =>
+ data_trimmed(i * 8 + 7 downto i * 8) := (others => negative);
+ when others =>
+ data_trimmed(i * 8 + 7 downto i * 8) := x"00";
+ end case;
+ end loop;
- v.valid := l_in.valid;
- v.load := l_in.load;
- v.data := l_in.data;
- v.write_reg := l_in.write_reg;
- v.length := l_in.length;
- v.byte_reverse := l_in.byte_reverse;
- v.sign_extend := l_in.sign_extend;
- v.update := l_in.update;
- v.update_reg := l_in.update_reg;
-
- -- XXX Temporary hack. Mark the op as non-cachable if the address
- -- is the form 0xc-------
- --
- -- This will have to be replaced by a combination of implementing the
- -- proper HV CI load/store instructions and having an MMU to get the I
- -- bit otherwise.
- if lsu_sum(31 downto 28) = "1100" then
- v.nc := '1';
- else
- v.nc := '0';
- end if;
-
- -- XXX Do length_to_sel here ?
-
- -- byte reverse stores in the first cycle
- if v.load = '0' and l_in.byte_reverse = '1' then
- v.data := byte_reverse(l_in.data, to_integer(unsigned(l_in.length)));
+ if HAS_FPU then
+ -- Single-precision FP conversion for loads
+ v.ld_sp_data := data_trimmed(31 downto 0);
+ v.ld_sp_nz := or (data_trimmed(22 downto 0));
+ v.ld_sp_lz := count_left_zeroes(data_trimmed(22 downto 0));
+ end if;
+
+ -- Byte reversing and rotating for stores.
+ -- Done in the second cycle (the cycle after l_in.valid = 1).
+ for i in 0 to 7 loop
+ k := (to_unsigned(i, 3) - r.byte_offset) xor r.brev_mask;
+ j := to_integer(k) * 8;
+ store_data(i * 8 + 7 downto i * 8) := r.store_data(j + 7 downto j);
+ end loop;
+
+ -- compute (addr + 8) & ~7 for the second doubleword when unaligned
+ next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
+
+ -- Busy calculation.
+ -- We need to minimize the delay from clock to busy valid because it
+ -- gates the start of execution of the next instruction.
+ busy := r.busy and not ((r.wait_dcache and d_in.valid) or (r.wait_mmu and m_in.done));
+ v.busy := busy;
+
+ -- an operation completes in the cycle where busy is low while the
+ -- state machine is not in IDLE
+ done := '0';
+ if r.state /= IDLE and busy = '0' then
+ done := '1';
+ end if;
+ exception := '0';
+
+ if r.dwords_done = '1' or r.state = SECOND_REQ then
+ addr := next_addr;
+ byte_sel := r.second_bytes;
+ else
+ addr := r.addr;
+ byte_sel := r.first_bytes;
+ end if;
+ if r.mode_32bit = '1' then
+ addr(63 downto 32) := (others => '0');
end if;
+ maddr := addr;
- v.addr := lsu_sum;
+ case r.state is
+ when IDLE =>
+
+ when SECOND_REQ =>
+ req := '1';
+ v.state := ACK_WAIT;
+ v.last_dword := '0';
+
+ when ACK_WAIT =>
+ -- r.wr_sel gets set one cycle after we come into ACK_WAIT state,
+ -- which is OK because the dcache always takes at least two cycles.
+ if r.update = '1' and (r.load = '0' or (HAS_FPU and r.load_sp = '1')) then
+ v.wr_sel := "01";
+ end if;
+ if d_in.error = '1' then
+ -- dcache will discard the second request if it
+ -- gets an error on the 1st of two requests
+ if d_in.cache_paradox = '1' then
+ -- signal an interrupt straight away
+ exception := '1';
+ dsisr(63 - 38) := not r.load;
+ -- XXX there is no architected bit for this
+ dsisr(63 - 35) := d_in.cache_paradox;
+ else
+ -- Look up the translation for TLB miss
+ -- and also for permission error and RC error
+ -- in case the PTE has been updated.
+ mmureq := '1';
+ v.state := MMU_LOOKUP;
+ end if;
+ end if;
+ if d_in.valid = '1' then
+ if r.last_dword = '0' then
+ -- response to the first of two dwords: save it and wait for the second
+ v.dwords_done := '1';
+ v.last_dword := '1';
+ if r.load = '1' then
+ v.load_data := data_permuted;
+ end if;
+ else
+ write_enable := r.load and not r.load_sp;
+ if HAS_FPU and r.load_sp = '1' then
+ -- SP to DP conversion takes a cycle
+ -- Write back rA update in this cycle if needed
+ do_update := r.update;
+ v.wr_sel := "10";
+ v.state := FINISH_LFS;
+ elsif r.extra_cycle = '1' then
+ -- loads with rA update need an extra cycle
+ v.wr_sel := "01";
+ v.state := COMPLETE;
+ v.do_update := r.update;
+ else
+ -- stores write back rA update in this cycle
+ do_update := r.update;
+ end if;
+ v.busy := '0';
+ end if;
+ end if;
+ -- r.wait_dcache gets set one cycle after we come into ACK_WAIT state,
+ -- which is OK because the dcache always takes at least two cycles.
+ v.wait_dcache := r.last_dword and not r.extra_cycle;
+
+ when MMU_LOOKUP =>
+ if m_in.done = '1' then
+ if r.instr_fault = '0' then
+ -- retry the request now that the MMU has installed a TLB entry
+ req := '1';
+ if r.last_dword = '0' then
+ v.state := SECOND_REQ;
+ else
+ v.state := ACK_WAIT;
+ end if;
+ end if;
+ end if;
+ if m_in.err = '1' then
+ exception := '1';
+ dsisr(63 - 33) := m_in.invalid;
+ dsisr(63 - 36) := m_in.perm_error;
+ dsisr(63 - 38) := not r.load;
+ dsisr(63 - 44) := m_in.badtree;
+ dsisr(63 - 45) := m_in.rc_error;
+ end if;
+
+ when TLBIE_WAIT =>
+ -- no per-state action; the state is left through the done logic below
+
+ when FINISH_LFS =>
+ -- no per-state action; the state is left through the done logic below
+
+ when COMPLETE =>
+ exception := r.align_intr;
+
+ end case;
+
+ if done = '1' or exception = '1' then
+ v.state := IDLE;
+ v.busy := '0';
+ end if;
+
+ -- Note that l_in.valid is gated with busy inside execute1
+ if l_in.valid = '1' then
+ v.mode_32bit := l_in.mode_32bit;
+ v.load := '0';
+ v.dcbz := '0';
+ v.tlbie := '0';
+ v.instr_fault := '0';
+ v.align_intr := '0';
+ v.dwords_done := '0';
+ v.last_dword := '1';
+ v.write_reg := l_in.write_reg;
+ v.length := l_in.length;
+ v.byte_reverse := l_in.byte_reverse;
+ v.sign_extend := l_in.sign_extend;
+ v.update := l_in.update;
+ v.update_reg := l_in.update_reg;
+ v.xerc := l_in.xerc;
+ v.reserve := l_in.reserve;
+ v.rc := l_in.rc;
+ v.nc := l_in.ci;
+ v.virt_mode := l_in.virt_mode;
+ v.priv_mode := l_in.priv_mode;
+ v.load_sp := '0';
+ v.wait_dcache := '0';
+ v.wait_mmu := '0';
+ v.do_update := '0';
+ v.extra_cycle := '0';
+
+ if HAS_FPU and l_in.is_32bit = '1' then
+ v.store_data := x"00000000" & store_sp_data;
+ else
+ v.store_data := l_in.data;
+ end if;
+
+ addr := lsu_sum;
+ if l_in.second = '1' then
+ -- for the second half of a 16-byte transfer, use next_addr
+ addr := next_addr;
+ end if;
+ if l_in.mode_32bit = '1' then
+ addr(63 downto 32) := (others => '0');
+ end if;
+ v.addr := addr;
+ maddr := l_in.addr2; -- address from RB for tlbie
+
+ -- XXX Temporary hack. Mark the op as non-cachable if the address
+ -- is the form 0xc------- for a real-mode access.
+ if addr(31 downto 28) = "1100" and l_in.virt_mode = '0' then
+ v.nc := '1';
+ end if;
+
+ if l_in.second = '0' then
+ -- Do length_to_sel and work out if we are doing 2 dwords
+ long_sel := xfer_data_sel(l_in.length, lsu_sum(2 downto 0));
+ byte_sel := long_sel(7 downto 0);
+ v.first_bytes := byte_sel;
+ v.second_bytes := long_sel(15 downto 8);
+ else
+ byte_sel := r.first_bytes;
+ long_sel := r.second_bytes & r.first_bytes;
+ end if;
+
+ -- check alignment for larx/stcx
+ misaligned := or (std_ulogic_vector(unsigned(l_in.length(2 downto 0)) - 1) and addr(2 downto 0));
+ v.align_intr := l_in.reserve and misaligned;
+ if l_in.repeat = '1' and l_in.second = '0' and addr(3) = '1' then
+ -- length is really 16 not 8
+ -- Make misaligned lq cause an alignment interrupt in LE mode,
+ -- in order to avoid the case with RA = RT + 1 where the second half
+ -- faults but the first doesn't (and updates RT+1, destroying RA).
+ -- The equivalent BE case doesn't occur because RA = RT is illegal.
+ misaligned := '1';
+ if l_in.reserve = '1' or (l_in.op = OP_LOAD and l_in.byte_reverse = '0') then
+ v.align_intr := '1';
+ end if;
+ end if;
+
+ v.atomic := not misaligned;
+ v.atomic_last := not misaligned and (l_in.second or not l_in.repeat);
+
+ case l_in.op is
+ when OP_STORE =>
+ req := '1';
+ when OP_LOAD =>
+ req := '1';
+ v.load := '1';
+ -- Allow an extra cycle for RA update on loads
+ v.extra_cycle := l_in.update;
+ if HAS_FPU and l_in.is_32bit = '1' then
+ -- Allow an extra cycle for SP->DP precision conversion
+ v.load_sp := '1';
+ v.extra_cycle := '1';
+ end if;
+ when OP_DCBZ =>
+ v.align_intr := v.nc;
+ req := '1';
+ v.dcbz := '1';
+ when OP_TLBIE =>
+ mmureq := '1';
+ v.tlbie := '1';
+ v.state := TLBIE_WAIT;
+ v.wait_mmu := '1';
+ when OP_MFSPR =>
+ v.wr_sel := "00";
+ -- partial decode on SPR number should be adequate given
+ -- the restricted set that get sent down this path
+ if sprn(9) = '0' and sprn(5) = '0' then
+ if sprn(0) = '0' then
+ v.sprval := x"00000000" & r.dsisr;
+ else
+ v.sprval := r.dar;
+ end if;
+ else
+ -- reading one of the SPRs in the MMU
+ v.sprval := m_in.sprval;
+ end if;
+ v.state := COMPLETE;
+ when OP_MTSPR =>
+ if sprn(9) = '0' and sprn(5) = '0' then
+ if sprn(0) = '0' then
+ v.dsisr := l_in.data(31 downto 0);
+ else
+ v.dar := l_in.data;
+ end if;
+ v.state := COMPLETE;
+ else
+ -- writing one of the SPRs in the MMU
+ mmu_mtspr := '1';
+ v.state := TLBIE_WAIT;
+ v.wait_mmu := '1';
+ end if;
+ when OP_FETCH_FAILED =>
+ -- send it to the MMU to do the radix walk
+ maddr := l_in.nia;
+ v.instr_fault := '1';
+ mmureq := '1';
+ v.state := MMU_LOOKUP;
+ v.wait_mmu := '1';
+ when others =>
+ assert false report "unknown op sent to loadstore1";
+ end case;
+
+ if req = '1' then
+ if v.align_intr = '1' then
+ v.state := COMPLETE;
+ elsif long_sel(15 downto 8) = "00000000" then
+ v.state := ACK_WAIT;
+ else
+ v.state := SECOND_REQ;
+ end if;
+ end if;
+
+ -- busy only if something was sent to the dcache or the MMU; ops that
+ -- set none of these (mfspr, mtspr to DAR/DSISR) leave busy clear
+ v.busy := req or mmureq or mmu_mtspr;
+ end if;
+
+ -- Work out controls for store formatting
+ if l_in.valid = '1' then
+ byte_offset := unsigned(lsu_sum(2 downto 0));
+ byte_rev := l_in.byte_reverse;
+ length := l_in.length;
+ brev_lenm1 := "000";
+ if byte_rev = '1' then
+ brev_lenm1 := unsigned(length(2 downto 0)) - 1;
+ end if;
+ v.byte_offset := byte_offset;
+ v.brev_mask := brev_lenm1;
+ end if;
+
+ -- Work out load formatter controls for next cycle
+ byte_offset := unsigned(v.addr(2 downto 0));
+ brev_lenm1 := "000";
+ if v.byte_reverse = '1' then
+ brev_lenm1 := unsigned(v.length(2 downto 0)) - 1;
+ end if;
+
+ -- kk(3) is the carry out of the 3-bit byte index, recorded in use_second
+ for i in 0 to 7 loop
+ kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
+ v.use_second(i) := kk(3);
+ v.byte_index(i) := kk(2 downto 0);
+ end loop;
+
+ for i in 0 to 7 loop
+ if i < to_integer(unsigned(v.length)) then
+ if v.dwords_done = '1' then
+ v.trim_ctl(i) := '1' & not v.use_second(i);
+ else
+ v.trim_ctl(i) := "10";
+ end if;
+ else
+ v.trim_ctl(i) := '0' & v.sign_extend;
+ end if;
+ end loop;
+
+ -- Update outputs to dcache
+ d_out.valid <= req and not v.align_intr;
+ d_out.load <= v.load;
+ d_out.dcbz <= v.dcbz;
+ d_out.nc <= v.nc;
+ d_out.reserve <= v.reserve;
+ d_out.atomic <= v.atomic;
+ d_out.atomic_last <= v.atomic_last;
+ d_out.addr <= addr;
+ d_out.data <= store_data;
+ d_out.byte_sel <= byte_sel;
+ d_out.virt_mode <= v.virt_mode;
+ d_out.priv_mode <= v.priv_mode;
+
+ -- Update outputs to MMU
+ m_out.valid <= mmureq;
+ m_out.iside <= v.instr_fault;
+ m_out.load <= r.load;
+ m_out.priv <= r.priv_mode;
+ m_out.tlbie <= v.tlbie;
+ m_out.mtspr <= mmu_mtspr;
+ m_out.sprn <= sprn;
+ m_out.addr <= maddr;
+ m_out.slbia <= l_in.insn(7);
+ m_out.rs <= l_in.data;
+
+ -- Update outputs to writeback
+ -- Multiplex either cache data to the destination GPR or
+ -- the address for the rA update.
+ -- r.wr_sel: "00" = SPR value (mfspr), "01" = rA update with r.addr,
+ -- "10" = converted single-precision load data, others = formatted load data
+ l_out.valid <= done;
+ case r.wr_sel is
+ when "00" =>
+ l_out.write_enable <= '1';
+ l_out.write_reg <= r.write_reg;
+ l_out.write_data <= r.sprval;
+ when "01" =>
+ l_out.write_enable <= do_update;
+ l_out.write_reg <= gpr_to_gspr(r.update_reg);
+ l_out.write_data <= r.addr;
+ when "10" =>
+ l_out.write_enable <= '1';
+ l_out.write_reg <= r.write_reg;
+ l_out.write_data <= load_dp_data;
+ when others =>
+ l_out.write_enable <= write_enable;
+ l_out.write_reg <= r.write_reg;
+ l_out.write_data <= data_trimmed;
+ end case;
+ l_out.xerc <= r.xerc;
+ l_out.rc <= r.rc and done;
+ l_out.store_done <= d_in.store_done;
+
+ -- update exception info back to execute1
+ e_out.busy <= busy;
+ e_out.exception <= exception;
+ e_out.alignment <= r.align_intr;
+ e_out.instr_fault <= r.instr_fault;
+ e_out.invalid <= m_in.invalid;
+ e_out.badtree <= m_in.badtree;
+ e_out.perm_error <= m_in.perm_error;
+ e_out.rc_error <= m_in.rc_error;
+ e_out.segment_fault <= m_in.segerr;
+ -- record the faulting address (and DSISR, except for segment faults and
+ -- alignment interrupts) for data-side exceptions only
+ if exception = '1' and r.instr_fault = '0' then
+ v.dar := addr;
+ if m_in.segerr = '0' and r.align_intr = '0' then
+ v.dsisr := dsisr;
+ end if;
+ end if;
-- Update registers
rin <= v;
- -- Update outputs
- l_out <= r;
end process;
+
+ l1_log: if LOG_LENGTH > 0 generate
+     signal log_data : std_ulogic_vector(9 downto 0);
+ begin
+     -- Registered snapshot of the unit's status, one field per bit:
+     -- bit 9 down to bit 3 are single flags, bits 2..0 hold the position
+     -- number of the current state in state_t.
+     ls1_log: process(clk)
+         variable state_code : std_ulogic_vector(2 downto 0);
+     begin
+         if rising_edge(clk) then
+             state_code := std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3));
+             log_data(9) <= e_out.busy;
+             log_data(8) <= e_out.exception;
+             log_data(7) <= l_out.valid;
+             log_data(6) <= m_out.valid;
+             log_data(5) <= d_out.valid;
+             log_data(4) <= m_in.done;
+             log_data(3) <= r.dwords_done;
+             log_data(2 downto 0) <= state_code;
+         end if;
+     end process;
+     log_out <= log_data;
+ end generate;
+
end;