carry: std_ulogic;
end record;
- type Fetch1ToFetch2Type is record
- nia: std_ulogic_vector(63 downto 0);
- end record;
-
- type Fetch2ToIcacheType is record
+ type Fetch1ToIcacheType is record
req: std_ulogic;
- addr: std_ulogic_vector(63 downto 0);
+ stop_mark: std_ulogic;
+ nia: std_ulogic_vector(63 downto 0);
end record;
type IcacheToFetch2Type is record
- ack: std_ulogic;
+ valid: std_ulogic;
+ stop_mark: std_ulogic;
+ nia: std_ulogic_vector(63 downto 0);
insn: std_ulogic_vector(31 downto 0);
end record;
architecture behave of core is
-- fetch signals
- signal fetch1_to_fetch2: Fetch1ToFetch2Type;
signal fetch2_to_decode1: Fetch2ToDecode1Type;
-- icache signals
- signal fetch2_to_icache : Fetch2ToIcacheType;
+ signal fetch1_to_icache : Fetch1ToIcacheType;
signal icache_to_fetch2 : IcacheToFetch2Type;
-- decode signals
-- local signals
signal fetch1_stall_in : std_ulogic;
+ signal icache_stall_out : std_ulogic;
signal fetch2_stall_in : std_ulogic;
- signal fetch2_stall_out : std_ulogic;
signal decode1_stall_in : std_ulogic;
signal decode2_stall_out : std_ulogic;
rst => core_rst,
stall_in => fetch1_stall_in,
flush_in => flush,
- e_in => execute1_to_fetch1,
- f_out => fetch1_to_fetch2
- );
-
- fetch1_stall_in <= fetch2_stall_out or decode2_stall_out;
-
- fetch2_0: entity work.fetch2
- port map (
- clk => clk,
- rst => core_rst,
- stall_in => fetch2_stall_in,
- stall_out => fetch2_stall_out,
- flush_in => flush,
- i_in => icache_to_fetch2,
- i_out => fetch2_to_icache,
stop_in => dbg_core_stop,
- f_in => fetch1_to_fetch2,
- f_out => fetch2_to_decode1
+ e_in => execute1_to_fetch1,
+ i_out => fetch1_to_icache
);
- fetch2_stall_in <= decode2_stall_out;
+ fetch1_stall_in <= icache_stall_out or decode2_stall_out;
icache_0: entity work.icache
generic map(
port map(
clk => clk,
rst => icache_rst,
- i_in => fetch2_to_icache,
+ i_in => fetch1_to_icache,
i_out => icache_to_fetch2,
+ flush_in => flush,
+ stall_out => icache_stall_out,
wishbone_out => wishbone_insn_out,
wishbone_in => wishbone_insn_in
);
- icache_rst <= rst or dbg_icache_rst;
+ icache_rst <= rst or dbg_icache_rst;
+
+ fetch2_0: entity work.fetch2
+ port map (
+ clk => clk,
+ rst => core_rst,
+ stall_in => fetch2_stall_in,
+ flush_in => flush,
+ i_in => icache_to_fetch2,
+ f_out => fetch2_to_decode1
+ );
+
+ fetch2_stall_in <= decode2_stall_out;
decode1_0: entity work.decode1
port map (
icache_rst => dbg_icache_rst,
terminate => terminate,
core_stopped => dbg_core_is_stopped,
- nia => fetch1_to_fetch2.nia,
+ nia => fetch1_to_icache.nia,
terminated_out => terminated_out
);
reg_write: process(clk)
begin
if rising_edge(clk) then
+ -- Reset the 1-cycle "do" signals
+ do_step <= '0';
+ do_reset <= '0';
+ do_icreset <= '0';
+
if (rst) then
stopping <= '0';
terminated <= '0';
else
- -- Reset the 1-cycle "do" signals
- do_step <= '0';
- do_reset <= '0';
- do_icreset <= '0';
-
-- Edge detect on dmi_req for 1-shot pulses
dmi_req_1 <= dmi_req;
if dmi_req = '1' and dmi_req_1 = '0' then
-- Control inputs:
stall_in : in std_ulogic;
flush_in : in std_ulogic;
+ stop_in : in std_ulogic;
-- redirect from execution unit
e_in : in Execute1ToFetch1Type;
- -- fetch data out
- f_out : out Fetch1ToFetch2Type
+ -- Request to icache
+ i_out : out Fetch1ToIcacheType
);
end entity fetch1;
architecture behaviour of fetch1 is
- signal r, r_next : Fetch1ToFetch2Type;
+ type stop_state_t is (RUNNING, STOPPED, RESTARTING);
+ type reg_internal_t is record
+ stop_state: stop_state_t;
+ end record;
+ signal r, r_next : Fetch1ToIcacheType;
+ signal r_int, r_next_int : reg_internal_t;
begin
regs : process(clk)
begin
if rising_edge(clk) then
- if rst = '1' or e_in.redirect = '1' or stall_in = '0' then
- r <= r_next;
+ if r /= r_next then
+ report "fetch1 rst:" & std_ulogic'image(rst) &
+ " R:" & std_ulogic'image(e_in.redirect) &
+ " S:" & std_ulogic'image(stall_in) &
+ " T:" & std_ulogic'image(stop_in) &
+ " nia:" & to_hstring(r_next.nia) &
+ " SM:" & std_ulogic'image(r_next.stop_mark);
end if;
+ r <= r_next;
+ r_int <= r_next_int;
end if;
end process;
comb : process(all)
- variable v : Fetch1ToFetch2Type;
+ variable v : Fetch1ToIcacheType;
+ variable v_int : reg_internal_t;
+ variable increment : boolean;
begin
v := r;
+ v_int := r_int;
if rst = '1' then
v.nia := RESET_ADDRESS;
+ v_int.stop_state := RUNNING;
elsif e_in.redirect = '1' then
v.nia := e_in.redirect_nia;
- else
- v.nia := std_logic_vector(unsigned(v.nia) + 4);
+ elsif stall_in = '0' then
+
+ -- For debug stop/step to work properly we need a little bit of
+ -- trickery here. If we just stop incrementing and send stop marks
+ -- when stop_in is set, then we'll increment on the cycle it clears
+ -- and end up never executing the instruction we were stopped on.
+ --
+ -- That problem, along with the opposite issue when stepping (stop is
+ -- cleared for only one cycle), is handled by the state machine below.
+ --
+ -- By default, increment addresses
+ increment := true;
+ case v_int.stop_state is
+ when RUNNING =>
+ -- If we are running and stop_in is set, then stop incrementing,
+ -- we are now stopped.
+ if stop_in = '1' then
+ increment := false;
+ v_int.stop_state := STOPPED;
+ end if;
+ when STOPPED =>
+ -- When stopped, never increment. If stop is cleared, go to state
+ -- "restarting" but still don't increment that cycle. stop_in is
+ -- now 0 so we'll send the NIA down without a stop mark.
+ increment := false;
+ if stop_in = '0' then
+ v_int.stop_state := RESTARTING;
+ end if;
+ when RESTARTING =>
+ -- We have just sent the NIA down, we can start incrementing again.
+ -- If stop_in is still not set, go back to running normally.
+ -- If stop_in is set again (that was a one-cycle "step"), go
+ -- back to "stopped" state which means we'll stop incrementing
+ -- on the next cycle. This ensures we increment the PC once after
+ -- sending one instruction without a stop mark. Since stop_in is
+ -- now set, the new PC will be sent with a stop mark and thus not
+ -- executed.
+ if stop_in = '0' then
+ v_int.stop_state := RUNNING;
+ else
+ v_int.stop_state := STOPPED;
+ end if;
+ end case;
+
+ if increment then
+ v.nia := std_logic_vector(unsigned(v.nia) + 4);
+ end if;
end if;
+ v.req := not rst;
+ v.stop_mark := stop_in;
+
r_next <= v;
+ r_next_int <= v_int;
-- Update outputs to the icache
- f_out <= r;
-
- report "fetch1 rst:" & std_ulogic'image(rst) &
- " R:" & std_ulogic'image(e_in.redirect) &
- " S:" & std_ulogic'image(stall_in) &
- " nia_next:" & to_hstring(r_next.nia) &
- " nia:" & to_hstring(r.nia);
+ i_out <= r;
end process;
rst : in std_ulogic;
stall_in : in std_ulogic;
- stall_out : out std_ulogic;
-
flush_in : in std_ulogic;
- stop_in : in std_ulogic;
+ -- Results from icache
i_in : in IcacheToFetch2Type;
- i_out : out Fetch2ToIcacheType;
-
- f_in : in Fetch1ToFetch2Type;
+ -- Output to decode
f_out : out Fetch2ToDecode1Type
);
end entity fetch2;
architecture behaviour of fetch2 is
+
+ -- The icache cannot stall, so we need to stash a cycle
+ -- of output from it when we stall.
+ type reg_internal_type is record
+ stash : IcacheToFetch2Type;
+ stash_valid : std_ulogic;
+ stopped : std_ulogic;
+ end record;
+
+ signal r_int, rin_int : reg_internal_type;
signal r, rin : Fetch2ToDecode1Type;
+
begin
regs : process(clk)
begin
if rising_edge(clk) then
+
+ if (r /= rin) then
+ report "fetch2 rst:" & std_ulogic'image(rst) &
+ " S:" & std_ulogic'image(stall_in) &
+ " F:" & std_ulogic'image(flush_in) &
+ " T:" & std_ulogic'image(rin.stop_mark) &
+ " V:" & std_ulogic'image(rin.valid) &
+ " nia:" & to_hstring(rin.nia);
+ end if;
+
-- Output state remains unchanged on stall, unless we are flushing
if rst = '1' or flush_in = '1' or stall_in = '0' then
r <= rin;
end if;
+
+ -- Internal state is updated on every clock
+ r_int <= rin_int;
end if;
end process;
comb : process(all)
- variable v : Fetch2ToDecode1Type;
+ variable v : Fetch2ToDecode1Type;
+ variable v_int : reg_internal_type;
+ variable v_i_in : IcacheToFetch2Type;
begin
v := r;
+ v_int := r_int;
- -- asynchronous icache lookup
- i_out.req <= '1';
- i_out.addr <= f_in.nia;
- v.valid := i_in.ack;
- v.nia := f_in.nia;
- v.insn := i_in.insn;
- stall_out <= stop_in or not i_in.ack;
+ -- If stalling, stash away the current input from the icache
+ if stall_in = '1' and v_int.stash_valid = '0' then
+ v_int.stash := i_in;
+ v_int.stash_valid := '1';
+ end if;
+
+ -- If unstalling, source input from the stash and invalidate it,
+ -- otherwise source normally from the icache.
+ --
+ v_i_in := i_in;
+ if v_int.stash_valid = '1' and stall_in = '0' then
+ v_i_in := v_int.stash;
+ v_int.stash_valid := '0';
+ end if;
+
+ v.valid := v_i_in.valid;
+ v.stop_mark := v_i_in.stop_mark;
+ v.nia := v_i_in.nia;
+ v.insn := v_i_in.insn;
+
+ -- Clear stash internal valid bit on flush. We still mark
+ -- the stash itself as valid since we still want to override
+ -- whatever comes from icache when unstalling, but we'll
+ -- override it with something invalid.
+ --
+ if flush_in = '1' then
+ v_int.stash.valid := '0';
+ end if;
+
+ -- If we are flushing or the instruction comes with a stop mark
+ -- we tag it as invalid so it doesn't get decoded and executed
+ if flush_in = '1' or v.stop_mark = '1' then
- if flush_in = '1' or stop_in = '1' then
v.valid := '0';
end if;
- v.stop_mark := stop_in;
+
+ -- Clear stash on reset
+ if rst = '1' then
+ v_int.stash_valid := '0';
+ end if;
-- Update registers
rin <= v;
+ rin_int <= v_int;
-- Update outputs
f_out <= r;
end process;
+
end architecture behaviour;
clk : in std_ulogic;
rst : in std_ulogic;
- i_in : in Fetch2ToIcacheType;
+ i_in : in Fetch1ToIcacheType;
i_out : out IcacheToFetch2Type;
+ stall_out : out std_ulogic;
+ flush_in : in std_ulogic;
+
wishbone_out : out wishbone_master_out;
wishbone_in : in wishbone_slave_out
);
subtype cacheline_tag_type is std_logic_vector(TAG_BITS-1 downto 0);
type cacheline_tag_array is array(0 to NUM_LINES-1) of cacheline_tag_type;
- signal cachelines : cacheline_array := (others => (others => '0'));
- signal tags : cacheline_tag_array := (others => (others => '0'));
- signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0) := (others => '0');
-
+ -- Storage. Hopefully "cachelines" is a BRAM, the rest is LUTs
+ signal cachelines : cacheline_array;
+ signal tags : cacheline_tag_array;
+ signal tags_valid : std_ulogic_vector(NUM_LINES-1 downto 0);
attribute ram_style : string;
attribute ram_style of cachelines : signal is "block";
-
attribute ram_decomp : string;
attribute ram_decomp of cachelines : signal is "power";
+ -- Cache reload state machine
type state_type is (IDLE, WAIT_ACK);
type reg_internal_type is record
- state : state_type;
- w : wishbone_master_out;
- store_index : integer range 0 to (NUM_LINES-1);
- store_word : integer range 0 to (LINE_SIZE-1);
+ -- Cache hit state (1 cycle BRAM access)
+ hit_line : cacheline_type;
+ hit_nia : std_ulogic_vector(63 downto 0);
+ hit_smark : std_ulogic;
+ hit_valid : std_ulogic;
+
+ -- Cache miss state (reload state machine)
+ state : state_type;
+ wb : wishbone_master_out;
+ store_index : integer range 0 to (NUM_LINES-1);
+ store_mask : std_ulogic_vector(LINE_SIZE_DW-1 downto 0);
end record;
signal r : reg_internal_type;
- signal read_index : integer range 0 to NUM_LINES-1;
- signal read_tag : std_ulogic_vector(63-OFFSET_BITS-INDEX_BITS downto 0);
- signal read_miss : boolean;
+ -- Async signals decoding incoming requests
+ signal req_index : integer range 0 to NUM_LINES-1;
+ signal req_tag : std_ulogic_vector(TAG_BITS-1 downto 0);
+ signal req_word : integer range 0 to LINE_SIZE_DW*2-1;
+ signal req_is_hit : std_ulogic;
+ -- Return the cache line index (tag index) for an address
function get_index(addr: std_ulogic_vector(63 downto 0)) return integer is
begin
return to_integer(unsigned(addr((OFFSET_BITS+INDEX_BITS-1) downto OFFSET_BITS)));
end;
- function get_word(addr: std_ulogic_vector(63 downto 0); data: cacheline_type) return std_ulogic_vector is
- variable word : integer;
+ -- Return the word index in a cache line for an address
+ function get_word(addr: std_ulogic_vector(63 downto 0)) return integer is
+ begin
+ return to_integer(unsigned(addr(OFFSET_BITS-1 downto 2)));
+ end;
+
+ -- Read a word in a cache line for an address
+ function read_word(word: integer; data: cacheline_type) return std_ulogic_vector is
begin
- word := to_integer(unsigned(addr(OFFSET_BITS-1 downto 2)));
- return data((word+1)*32-1 downto word*32);
+ return data((word+1)*32-1 downto word*32);
end;
+ -- Calculate the tag value from the address
function get_tag(addr: std_ulogic_vector(63 downto 0)) return std_ulogic_vector is
begin
return addr(63 downto OFFSET_BITS+INDEX_BITS);
end;
+
begin
assert ispow2(LINE_SIZE) report "LINE_SIZE not power of 2" severity FAILURE;
assert ispow2(NUM_LINES) report "NUM_LINES not power of 2" severity FAILURE;
- icache_read : process(all)
+ icache_comb : process(all)
begin
- read_index <= get_index(i_in.addr);
- read_tag <= get_tag(i_in.addr);
- read_miss <= false;
-
- i_out.ack <= '0';
- i_out.insn <= get_word(i_in.addr, cachelines(read_index));
-
- if i_in.req = '1' then
- if (tags_valid(read_index) = '1') and (tags(read_index) = read_tag) then
- -- report hit asynchronously
- i_out.ack <= '1';
- else
- read_miss <= true;
- end if;
- end if;
+ -- Calculate next index and tag index
+ req_index <= get_index(i_in.nia);
+ req_tag <= get_tag(i_in.nia);
+ req_word <= get_word(i_in.nia);
+
+ -- Test if pending request is a hit
+ if tags(req_index) = req_tag then
+ req_is_hit <= tags_valid(req_index);
+ else
+ req_is_hit <= '0';
+ end if;
+
+ -- Output instruction from current cache line
+ --
+ -- Note: This is a mild violation of our design principle of having pipeline
+ -- stages output from a clean latch. In this case we output the result
+ -- of a mux. The alternative would be to output an entire cache line
+ -- which I prefer not to do just yet.
+ --
+ i_out.valid <= r.hit_valid;
+ i_out.insn <= read_word(get_word(r.hit_nia), r.hit_line);
+ i_out.nia <= r.hit_nia;
+ i_out.stop_mark <= r.hit_smark;
+
+ -- This needs to match the latching of a new request in icache_hit
+ stall_out <= not req_is_hit;
+
+ -- Wishbone requests output (from the cache miss reload machine)
+ wishbone_out <= r.wb;
end process;
- wishbone_out <= r.w;
+ icache_hit : process(clk)
+ begin
+ if rising_edge(clk) then
+ -- Assume we have nothing valid first
+ r.hit_valid <= '0';
+
+ -- Are we free to latch a new request ?
+ --
+ -- Note: this test needs to match the equation for generating stall_out
+ --
+ if i_in.req = '1' and req_is_hit = '1' and flush_in = '0' then
+ -- Read the cache line (BRAM read port) and remember the NIA
+ r.hit_line <= cachelines(req_index);
+ r.hit_nia <= i_in.nia;
+ r.hit_smark <= i_in.stop_mark;
+ r.hit_valid <= '1';
+
+ report "cache hit nia:" & to_hstring(i_in.nia) &
+ " SM:" & std_ulogic'image(i_in.stop_mark) &
+ " idx:" & integer'image(req_index) &
+ " tag:" & to_hstring(req_tag);
+ end if;
- icache_write : process(clk)
+ -- Flush requested ? discard...
+ if flush_in then
+ r.hit_valid <= '0';
+ end if;
+ end if;
+ end process;
+
+ -- Cache miss handling: line reload state machine.
+ --
+ -- IDLE: a request that is not a hit invalidates the target line,
+ -- records its new tag and starts a wishbone read at the line-aligned
+ -- address.
+ -- WAIT_ACK: each acked dword is written into the slot selected by the
+ -- one-hot store_mask; once the last slot has been written the line is
+ -- marked valid and the wishbone cycle is dropped, otherwise the dword
+ -- address is advanced.
+ --
+ -- Reset has priority over the state machine: while rst is asserted no
+ -- reload can start, and a reload in flight cannot complete and
+ -- re-validate a line that the reset has just invalidated.
+ icache_miss : process(clk)
+ variable store_dword : std_ulogic_vector(OFFSET_BITS-4 downto 0);
begin
if rising_edge(clk) then
if rst = '1' then
tags_valid <= (others => '0');
+ r.store_mask <= (others => '0');
r.state <= IDLE;
- r.w.cyc <= '0';
- r.w.stb <= '0';
- end if;
+ r.wb.cyc <= '0';
+ r.wb.stb <= '0';
- r.w.dat <= (others => '0');
- r.w.sel <= "11111111";
- r.w.we <= '0';
+ -- We only ever do reads on wishbone
+ r.wb.dat <= (others => '0');
+ r.wb.sel <= "11111111";
+ r.wb.we <= '0';
+ else
+ -- State machine (only runs when not in reset, otherwise its
+ -- assignments would override the reset values above since the
+ -- last assignment to a signal in a process wins)
case r.state is
when IDLE =>
- if read_miss = true then
+ -- We need to read a cache line
+ if i_in.req = '1' and req_is_hit = '0' then
+
+ report "cache miss nia:" & to_hstring(i_in.nia) &
+ " SM:" & std_ulogic'image(i_in.stop_mark) &
+ " idx:" & integer'image(req_index) &
+ " tag:" & to_hstring(req_tag);
+
r.state <= WAIT_ACK;
- r.store_word <= 0;
- r.store_index <= read_index;
+ r.store_mask <= (0 => '1', others => '0');
+ r.store_index <= req_index;
- tags(read_index) <= read_tag;
- tags_valid(read_index) <= '0';
+ -- Force misses while reloading that line
+ tags_valid(req_index) <= '0';
+ tags(req_index) <= req_tag;
- r.w.adr <= i_in.addr(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0');
- r.w.cyc <= '1';
- r.w.stb <= '1';
+ -- Prep for first dword read
+ r.wb.adr <= i_in.nia(63 downto OFFSET_BITS) & (OFFSET_BITS-1 downto 0 => '0');
+ r.wb.cyc <= '1';
+ r.wb.stb <= '1';
end if;
when WAIT_ACK =>
if wishbone_in.ack = '1' then
- cachelines(r.store_index)((r.store_word+1)*64-1 downto ((r.store_word)*64)) <= wishbone_in.dat;
- r.store_word <= r.store_word + 1;
+ -- Store the current dword in the cache line
+ for i in 0 to LINE_SIZE_DW-1 loop
+ if r.store_mask(i) = '1' then
+ cachelines(r.store_index)(63 + i*64 downto i*64) <= wishbone_in.dat;
+ end if;
+ end loop;
- if r.store_word = (LINE_SIZE_DW-1) then
+ -- That was the last word ? We are done
+ if r.store_mask(LINE_SIZE_DW-1) = '1' then
r.state <= IDLE;
tags_valid(r.store_index) <= '1';
- r.w.cyc <= '0';
- r.w.stb <= '0';
+ r.wb.cyc <= '0';
+ r.wb.stb <= '0';
else
- r.w.adr(OFFSET_BITS-1 downto 3) <= std_ulogic_vector(to_unsigned(r.store_word+1, OFFSET_BITS-3));
+ store_dword := r.wb.adr(OFFSET_BITS-1 downto 3);
+ store_dword := std_ulogic_vector(unsigned(store_dword) + 1);
+ r.wb.adr(OFFSET_BITS-1 downto 3) <= store_dword;
end if;
+ -- Advance to next word
+ r.store_mask <= r.store_mask(LINE_SIZE_DW-2 downto 0) & '0';
end if;
end case;
+ end if;
end if;
signal clk : std_ulogic;
signal rst : std_ulogic;
- signal i_out : Fetch2ToIcacheType;
+ signal i_out : Fetch1ToIcacheType;
signal i_in : IcacheToFetch2Type;
signal wb_bram_in : wishbone_master_out;
rst => rst,
i_in => i_out,
i_out => i_in,
+ flush_in => '0',
wishbone_out => wb_bram_in,
wishbone_in => wb_bram_out
);
stim: process
begin
i_out.req <= '0';
- i_out.addr <= (others => '0');
+ i_out.nia <= (others => '0');
wait for 4*clk_period;
i_out.req <= '1';
- i_out.addr <= x"0000000000000004";
+ i_out.nia <= x"0000000000000004";
wait for 30*clk_period;
- assert i_in.ack = '1';
+ assert i_in.valid = '1';
assert i_in.insn = x"00000001";
i_out.req <= '0';
-- hit
i_out.req <= '1';
- i_out.addr <= x"0000000000000008";
- wait for clk_period/2;
- assert i_in.ack = '1';
+ i_out.nia <= x"0000000000000008";
+ wait for clk_period;
+ assert i_in.valid = '1';
assert i_in.insn = x"00000002";
- wait for clk_period/2;
+ wait for clk_period;
-- another miss
i_out.req <= '1';
- i_out.addr <= x"0000000000000040";
+ i_out.nia <= x"0000000000000040";
wait for 30*clk_period;
- assert i_in.ack = '1';
+ assert i_in.valid = '1';
assert i_in.insn = x"00000010";
-- test something that aliases
i_out.req <= '1';
- i_out.addr <= x"0000000000000100";
- wait for clk_period/2;
- assert i_in.ack = '0';
- wait for clk_period/2;
+ i_out.nia <= x"0000000000000100";
+ wait for clk_period;
+ assert i_in.valid = '0';
+ wait for clk_period;
wait for 30*clk_period;
- assert i_in.ack = '1';
+ assert i_in.valid = '1';
assert i_in.insn = x"00000040";
i_out.req <= '0';