--
-- Set associative dcache write-through
--
--- TODO (in no specific order):
---
--- * See list in icache.vhdl
--- * Complete load misses on the cycle when WB data comes instead of
--- at the end of line (this requires dealing with requests coming in
--- while not idle...)
--
library ieee;
use ieee.std_logic_1164.all;
-- L1 DTLB number of ways (associativity)
TLB_NUM_WAYS : positive := 2;
-- L1 DTLB log_2(page_size)
- TLB_LG_PGSZ : positive := 12
+ TLB_LG_PGSZ : positive := 12;
+ -- Non-zero to enable log data collection
+ LOG_LENGTH : natural := 0
);
port (
clk : in std_ulogic;
architecture rtl of dcache is
-- BRAM organisation: We never access more than wishbone_data_bits at
-- a time so to save resources we make the array only that wide, and
- -- use consecutive indices for to make a cache "line"
+ -- use consecutive indices to make a cache "line"
--
-- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
constant ROW_SIZE : natural := wishbone_data_bits / 8;
-- which means that the BRAM output is delayed by an extra cycle.
--
-- Thus, the dcache has a 2-stage internal pipeline for cache hits
- -- with no stalls.
+ -- with no stalls. Stores also complete in 2 cycles in most
+ -- circumstances.
+ --
+ -- A request proceeds through the pipeline as follows.
+ --
+ -- Cycle 0: Request is received from loadstore or mmu if either
+ -- d_in.valid or m_in.valid is 1 (not both). In this cycle portions
+ -- of the address are presented to the TLB tag RAM and data RAM
+ -- and the cache tag RAM and data RAM.
+ --
+ -- Clock edge between cycle 0 and cycle 1:
+ -- Request is stored in r0 (assuming r0_full was 0). TLB tag and
+ -- data RAMs are read, and the cache tag RAM is read. (Cache data
+ -- comes out a cycle later due to its output register, giving the
+ -- whole of cycle 1 to read the cache data RAM.)
+ --
+ -- Cycle 1: TLB and cache tag matching is done, the real address
+ -- (RA) for the access is calculated, and the type of operation is
+ -- determined (the OP_* values above). This gives the TLB way for
+ -- a TLB hit, and the cache way for a hit or the way to replace
+ -- for a load miss.
+ --
+ -- Clock edge between cycle 1 and cycle 2:
+ --     Request is stored in r1 (assuming r1.full was 0).
+ -- The state machine transitions out of IDLE state for a load miss,
+ -- a store, a dcbz, or a non-cacheable load. r1.full is set to 1
+ -- for a load miss, dcbz or non-cacheable load but not a store.
--
- -- All other operations are handled via stalling in the first stage.
+ -- Cycle 2: Completion signals are asserted for a load hit,
+ -- a store (excluding dcbz), a TLB operation, a conditional
+ -- store which failed due to no matching reservation, or an error
+ -- (cache hit on non-cacheable operation, TLB miss, or protection
+ -- fault).
--
- -- The second stage can thus complete a hit at the same time as the
- -- first stage emits a stall for a complex op.
+ -- For a load miss, store, or dcbz, the state machine initiates
+ -- a wishbone cycle, which takes at least 2 cycles. For a store,
+ -- if another store comes in with the same cache tag (therefore
+ -- in the same 4k page), it can be added on to the existing cycle,
+ -- subject to some constraints.
+ -- While r1.full = 1, no new requests can go from r0 to r1, but
+ -- requests can come in to r0 and be satisfied if they are
+ -- cacheable load hits or stores with the same cache tag.
--
+ -- Writing to the cache data RAM is done at the clock edge
+ -- at the end of cycle 2 for a store hit (excluding dcbz).
+ -- Stores that miss are not written to the cache data RAM
+ -- but just stored through to memory.
+ -- Dcbz is done like a cache miss, but the wishbone cycle
+ -- is a write rather than a read, and zeroes are written to
+ -- the cache data RAM. Thus dcbz will allocate the line in
+ -- the cache as well as zeroing memory.
+ --
+ -- Since stores are written to the cache data RAM at the end of
+ -- cycle 2, and loads can come in and hit on the data just stored,
+ -- there is a two-stage bypass from store data to load data to
+ -- make sure that loads always see previously-stored data even
+ -- if it has not yet made it to the cache data RAM.
+ --
+ -- Load misses read the requested dword of the cache line first in
+ -- the memory read request and then cycle around through the other
+ -- dwords. The load is completed on the cycle after the requested
+ -- dword comes back from memory (using a forwarding path, rather
+ -- than going via the cache data RAM). We maintain an array of
+ -- valid bits per dword for the line being refilled so that
+ -- subsequent load requests to the same line can be completed as
+ -- soon as the necessary data comes in from memory, without
+ -- waiting for the whole line to be read.
-- Stage 0 register, basically contains just the latched request
type reg_stage_0_t is record
req : Loadstore1ToDcacheType;
- tlbie : std_ulogic;
- doall : std_ulogic;
- tlbld : std_ulogic;
+ tlbie : std_ulogic; -- indicates a tlbie request (from MMU)
+ doall : std_ulogic; -- with tlbie, indicates flush whole TLB
+ tlbld : std_ulogic; -- indicates a TLB load request (from MMU)
mmu_req : std_ulogic; -- indicates source of request
end record;
type mem_access_request_t is record
op : op_t;
+ valid : std_ulogic;
dcbz : std_ulogic;
real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
data : std_ulogic_vector(63 downto 0);
byte_sel : std_ulogic_vector(7 downto 0);
hit_way : way_t;
same_tag : std_ulogic;
+ mmu_req : std_ulogic;
end record;
-- First stage register, contains state for stage 1 of load hits
hit_index : index_t;
cache_hit : std_ulogic;
+ -- TLB hit state
+ tlb_hit : std_ulogic;
+ tlb_hit_way : tlb_way_t;
+ tlb_hit_index : tlb_index_t;
+
-- 2-stage data buffer for data forwarded from writes to reads
forward_data1 : std_ulogic_vector(63 downto 0);
forward_data2 : std_ulogic_vector(63 downto 0);
end_row_ix : row_in_line_t;
rows_valid : row_per_line_valid_t;
acks_pending : unsigned(2 downto 0);
-
- -- Signals to complete with error
- error_done : std_ulogic;
+ inc_acks : std_ulogic;
+ dec_acks : std_ulogic;
+
+ -- Signals to complete (possibly with error)
+ ls_valid : std_ulogic;
+ ls_error : std_ulogic;
+ mmu_done : std_ulogic;
+ mmu_error : std_ulogic;
cache_paradox : std_ulogic;
-- Signal to complete a failed stcx.
stcx_fail : std_ulogic;
-
- -- completion signal for tlbie
- tlbie_done : std_ulogic;
end record;
signal r1 : reg_stage_1_t;
signal req_op : op_t;
signal req_data : std_ulogic_vector(63 downto 0);
signal req_same_tag : std_ulogic;
+ signal req_go : std_ulogic;
signal early_req_row : row_t;
ptes(j + TLB_PTE_BITS - 1 downto j) := newpte;
end;
- signal log_data : std_ulogic_vector(19 downto 0);
-
begin
assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
lru => tlb_plru_out
);
- process(tlb_req_index, tlb_hit, tlb_hit_way, tlb_plru_out)
+ process(all)
begin
-- PLRU interface
- if tlb_hit = '1' and tlb_req_index = i then
- tlb_plru_acc_en <= '1';
+ if r1.tlb_hit_index = i then
+ tlb_plru_acc_en <= r1.tlb_hit;
else
tlb_plru_acc_en <= '0';
end if;
- tlb_plru_acc <= std_ulogic_vector(to_unsigned(tlb_hit_way, TLB_WAY_BITS));
+ tlb_plru_acc <= std_ulogic_vector(to_unsigned(r1.tlb_hit_way, TLB_WAY_BITS));
tlb_plru_victim(i) <= tlb_plru_out;
end process;
end generate;
req_row <= get_row(r0.req.addr);
req_tag <= get_tag(ra);
- go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.error_done;
+ go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.ls_error;
-- Test if pending request is a hit on any way
-- In order to make timing in virtual mode, when we are using the TLB,
end if;
end if;
req_op <= op;
+ req_go <= go;
-- Version of the row number that is valid one cycle earlier
-- in the cases where we need to read the cache data BRAM.
-- XXX or if r0.req.nc = '1'
if r0.req.load = '1' then
-- load with reservation
- set_rsrv <= '1';
+ set_rsrv <= r0.req.atomic_last;
else
-- store conditional
- clear_rsrv <= '1';
+ clear_rsrv <= r0.req.atomic_last;
if reservation.valid = '0' or
r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
cancel_store <= '1';
end if;
end loop;
- d_out.valid <= '0';
+ d_out.valid <= r1.ls_valid;
d_out.data <= data_out;
- d_out.store_done <= '0';
- d_out.error <= '0';
- d_out.cache_paradox <= '0';
+ d_out.store_done <= not r1.stcx_fail;
+ d_out.error <= r1.ls_error;
+ d_out.cache_paradox <= r1.cache_paradox;
-- Outputs to MMU
- m_out.done <= r1.tlbie_done;
- m_out.err <= '0';
+ m_out.done <= r1.mmu_done;
+ m_out.err <= r1.mmu_error;
m_out.data <= data_out;
-- We have a valid load or store hit or we just completed a slow
-- Load hit case is the standard path
if r1.hit_load_valid = '1' then
report "completing load hit data=" & to_hstring(data_out);
- d_out.valid <= '1';
end if;
-- error cases complete without stalling
- if r1.error_done = '1' then
+ if r1.ls_error = '1' then
report "completing ld/st with error";
- d_out.error <= '1';
- d_out.cache_paradox <= r1.cache_paradox;
- d_out.valid <= '1';
end if;
-- Slow ops (load miss, NC, stores)
if r1.slow_valid = '1' then
- d_out.store_done <= '1';
report "completing store or load miss data=" & to_hstring(data_out);
- d_out.valid <= '1';
- end if;
-
- if r1.stcx_fail = '1' then
- d_out.store_done <= '0';
- d_out.valid <= '1';
end if;
else
-- Request came from MMU
if r1.hit_load_valid = '1' then
report "completing load hit to MMU, data=" & to_hstring(m_out.data);
- m_out.done <= '1';
end if;
-- error cases complete without stalling
- if r1.error_done = '1' then
+ if r1.mmu_error = '1' then
report "completing MMU ld with error";
- m_out.err <= '1';
- m_out.done <= '1';
end if;
-- Slow ops (i.e. load miss)
if r1.slow_valid = '1' then
report "completing MMU load miss, data=" & to_hstring(m_out.data);
- m_out.done <= '1';
end if;
end if;
if req_op = OP_BAD then
report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) &
" rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok);
- r1.error_done <= '1';
+ r1.ls_error <= not r0.mmu_req;
+ r1.mmu_error <= r0.mmu_req;
r1.cache_paradox <= access_ok;
else
- r1.error_done <= '0';
+ r1.ls_error <= '0';
+ r1.mmu_error <= '0';
r1.cache_paradox <= '0';
end if;
r1.stcx_fail <= '0';
end if;
- -- complete tlbies and TLB loads in the third cycle
- r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld);
+ -- Record TLB hit information for updating TLB PLRU
+ r1.tlb_hit <= tlb_hit;
+ r1.tlb_hit_way <= tlb_hit_way;
+ r1.tlb_hit_index <= tlb_req_index;
+
end if;
end process;
r1.slow_valid <= '0';
r1.wb.cyc <= '0';
r1.wb.stb <= '0';
+ r1.ls_valid <= '0';
+ r1.mmu_done <= '0';
-- Not useful normally but helps avoid tons of sim warnings
r1.wb.adr <= (others => '0');
-- One cycle pulses reset
r1.slow_valid <= '0';
r1.write_bram <= '0';
+ r1.inc_acks <= '0';
+ r1.dec_acks <= '0';
+
+ r1.ls_valid <= '0';
+ -- complete tlbies and TLB loads in the third cycle
+ r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld);
+ if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then
+ if r0.mmu_req = '0' then
+ r1.ls_valid <= '1';
+ else
+ r1.mmu_done <= '1';
+ end if;
+ end if;
if r1.write_tag = '1' then
-- Store new tag in selected way
req := r1.req;
else
req.op := req_op;
+ req.valid := req_go;
+ req.mmu_req := r0.mmu_req;
req.dcbz := r0.req.dcbz;
req.real_addr := ra;
- req.data := r0.req.data;
- req.byte_sel := r0.req.byte_sel;
+ -- Force data to 0 for dcbz
+ if r0.req.dcbz = '0' then
+ req.data := d_in.data;
+ else
+ req.data := (others => '0');
+ end if;
+ -- Select all bytes for dcbz and for cacheable loads
+ if r0.req.dcbz = '1' or (r0.req.load = '1' and r0.req.nc = '0') then
+ req.byte_sel := (others => '1');
+ else
+ req.byte_sel := r0.req.byte_sel;
+ end if;
req.hit_way := req_hit_way;
req.same_tag := req_same_tag;
case r1.state is
when IDLE =>
r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0);
- r1.dcbz <= '0';
+ r1.wb.sel <= req.byte_sel;
+ r1.wb.dat <= req.data;
+ r1.dcbz <= req.dcbz;
-- Keep track of our index and way for subsequent stores.
r1.store_index <= get_index(req.real_addr);
" tag:" & to_hstring(get_tag(req.real_addr));
-- Start the wishbone cycle
- r1.wb.sel <= (others => '1');
r1.wb.we <= '0';
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.write_tag <= '1';
when OP_LOAD_NC =>
- r1.wb.sel <= req.byte_sel;
r1.wb.cyc <= '1';
r1.wb.stb <= '1';
r1.wb.we <= '0';
when OP_STORE_HIT | OP_STORE_MISS =>
if req.dcbz = '0' then
- r1.wb.sel <= req.byte_sel;
- r1.wb.dat <= req.data;
r1.state <= STORE_WAIT_ACK;
r1.acks_pending <= to_unsigned(1, 3);
r1.full <= '0';
r1.slow_valid <= '1';
+ if req.mmu_req = '0' then
+ r1.ls_valid <= '1';
+ else
+ r1.mmu_done <= '1';
+ end if;
if req.op = OP_STORE_HIT then
r1.write_bram <= '1';
end if;
else
-- dcbz is handled much like a load miss except
-- that we are writing to memory instead of reading
-
- -- Start the wishbone writes
- r1.wb.sel <= (others => '1');
- r1.wb.dat <= (others => '0');
-
- -- Handle the rest like a load miss
r1.state <= RELOAD_WAIT_ACK;
if req.op = OP_STORE_MISS then
r1.write_tag <= '1';
end if;
- r1.dcbz <= '1';
end if;
r1.wb.we <= '1';
r1.wb.cyc <= '1';
r1.store_row = get_row(r1.req.real_addr) then
r1.full <= '0';
r1.slow_valid <= '1';
+ if r1.mmu_req = '0' then
+ r1.ls_valid <= '1';
+ else
+ r1.mmu_done <= '1';
+ end if;
r1.forward_sel <= (others => '1');
r1.use_forward1 <= '1';
end if;
when STORE_WAIT_ACK =>
stbs_done := r1.wb.stb = '0';
acks := r1.acks_pending;
+ if r1.inc_acks /= r1.dec_acks then
+ if r1.inc_acks = '1' then
+ acks := acks + 1;
+ else
+ acks := acks - 1;
+ end if;
+ end if;
+ r1.acks_pending <= acks;
-- Clear stb when slave accepted request
if wishbone_in.stall = '0' then
-- See if there is another store waiting to be done
-- which is in the same real page.
- if acks < 7 and req.same_tag = '1' and
- (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then
- r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0);
+ if req.valid = '1' then
+ r1.wb.adr(SET_SIZE_BITS - 1 downto 0) <=
+ req.real_addr(SET_SIZE_BITS - 1 downto 0);
r1.wb.dat <= req.data;
r1.wb.sel <= req.byte_sel;
+ end if;
+ if acks < 7 and req.same_tag = '1' and
+ (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then
r1.wb.stb <= '1';
stbs_done := false;
if req.op = OP_STORE_HIT then
end if;
r1.full <= '0';
r1.slow_valid <= '1';
- acks := acks + 1;
+ -- Store requests never come from the MMU
+ r1.ls_valid <= '1';
+ stbs_done := false;
+ r1.inc_acks <= '1';
else
r1.wb.stb <= '0';
stbs_done := true;
r1.wb.cyc <= '0';
r1.wb.stb <= '0';
end if;
- acks := acks - 1;
+ r1.dec_acks <= '1';
end if;
- r1.acks_pending <= acks;
when NC_LOAD_WAIT_ACK =>
-- Clear stb when slave accepted request
r1.state <= IDLE;
r1.full <= '0';
r1.slow_valid <= '1';
+ if r1.mmu_req = '0' then
+ r1.ls_valid <= '1';
+ else
+ r1.mmu_done <= '1';
+ end if;
r1.forward_sel <= (others => '1');
r1.use_forward1 <= '1';
r1.wb.cyc <= '0';
end if;
end process;
- dcache_log: process(clk)
+ dc_log: if LOG_LENGTH > 0 generate
+ signal log_data : std_ulogic_vector(19 downto 0);
begin
- if rising_edge(clk) then
- log_data <= r1.wb.adr(5 downto 3) &
- wishbone_in.stall &
- wishbone_in.ack &
- r1.wb.stb & r1.wb.cyc &
- d_out.error &
- d_out.valid &
- std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) &
- stall_out &
- std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) &
- valid_ra &
- std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3));
- end if;
- end process;
- log_out <= log_data;
+ dcache_log: process(clk)
+ begin
+ if rising_edge(clk) then
+ log_data <= r1.wb.adr(5 downto 3) &
+ wishbone_in.stall &
+ wishbone_in.ack &
+ r1.wb.stb & r1.wb.cyc &
+ d_out.error &
+ d_out.valid &
+ std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) &
+ stall_out &
+ std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) &
+ valid_ra &
+ std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3));
+ end if;
+ end process;
+ log_out <= log_data;
+ end generate;
end;