From a4500c63a281a57752edbfdd4d9033974a98c8c8 Mon Sep 17 00:00:00 2001
From: Paul Mackerras
Date: Sat, 13 Jun 2020 23:00:13 +1000
Subject: [PATCH] dcache: Reduce back-to-back store latency from 3 cycles to 2

This uses the machinery we already had for comparing the real address
of a new request with the tag of a previous request (r1.reload_tag) to
get better timing on comparing the address of a second store with the
one in progress. The comparison is now on the set size rather than the
page size, but since set size can't be larger than the page size (and
usually will equal the page size), that is OK.

The same comparison can also be used to tell when we can satisfy a
load miss during a cache line refill.

Signed-off-by: Paul Mackerras
---
 dcache.vhdl | 27 +++++++++++++--------------
 1 file changed, 13 insertions(+), 14 deletions(-)

diff --git a/dcache.vhdl b/dcache.vhdl
index bc351b0..9ecb6a9 100644
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -232,6 +232,7 @@ architecture rtl of dcache is
         byte_sel  : std_ulogic_vector(7 downto 0);
         hit_way   : way_t;
         repl_way  : way_t;
+        same_tag  : std_ulogic;
     end record;
 
     -- First stage register, contains state for stage 1 of load hits
@@ -301,6 +302,7 @@ architecture rtl of dcache is
     signal req_tag     : cache_tag_t;
     signal req_op      : op_t;
     signal req_data    : std_ulogic_vector(63 downto 0);
+    signal req_same_tag : std_ulogic;
 
     signal early_req_row  : row_t;
 
@@ -777,6 +779,7 @@ begin
                rel_match := '1';
            end if;
        end if;
+       req_same_tag <= rel_match;
 
        -- See if the request matches the line currently being reloaded
        if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and
@@ -1222,6 +1225,7 @@ begin
            req.byte_sel := r0.req.byte_sel;
            req.hit_way := req_hit_way;
            req.repl_way := replace_way;
+           req.same_tag := req_same_tag;
 
            -- Store the incoming request from r0, if it is a slow request
            -- Note that r1.full = 1 implies req_op = OP_NONE
@@ -1243,6 +1247,7 @@ begin
                r1.store_row <= get_row(req.real_addr);
                r1.end_row_ix <= get_row_of_line(get_row(req.real_addr)) - 1;
                r1.reload_tag <= get_tag(req.real_addr);
+               r1.req.same_tag <= '1';
 
                if req.op = OP_STORE_HIT then
                    r1.store_way <= req.hit_way;
@@ -1346,11 +1351,10 @@ begin
                    -- complete the request next cycle.
                    -- Compare the whole address in case the request in
                    -- r1.req is not the one that started this refill.
-                   if r1.full = '1' and
+                   if r1.full = '1' and r1.req.same_tag = '1' and
                        ((r1.dcbz = '1' and r1.req.dcbz = '1') or
                         (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS)) and
-                       r1.store_row = get_row(r1.req.real_addr) and
-                       r1.reload_tag = get_tag(r1.req.real_addr) then
+                       r1.store_row = get_row(r1.req.real_addr) then
                        r1.full <= '0';
                        r1.slow_valid <= '1';
                        r1.forward_sel <= (others => '1');
@@ -1379,19 +1383,14 @@ begin
                if wishbone_in.stall = '0' then
                    -- See if there is another store waiting to be done
                    -- which is in the same real page.
-                   -- Using r1.req rather than req here limits us to one
-                   -- store every two cycles, but helps timing in that we
-                   -- don't depend on req_op or ra.
-                   if r1.full = '1' and acks < 7 and
-                       (r1.req.op = OP_STORE_MISS or r1.req.op = OP_STORE_HIT) and
-                       (r1.req.real_addr(r1.wb.adr'left downto TLB_LG_PGSZ) =
-                        r1.wb.adr(r1.wb.adr'left downto TLB_LG_PGSZ)) then
-                       r1.wb.adr <= r1.req.real_addr(r1.wb.adr'left downto 0);
-                       r1.wb.dat <= r1.req.data;
-                       r1.wb.sel <= r1.req.byte_sel;
+                   if acks < 7 and req.same_tag = '1' and
+                       (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then
+                       r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0);
+                       r1.wb.dat <= req.data;
+                       r1.wb.sel <= req.byte_sel;
                        r1.wb.stb <= '1';
                        stbs_done := false;
-                       if r1.req.op = OP_STORE_HIT then
+                       if req.op = OP_STORE_HIT then
                            r1.write_bram <= '1';
                        end if;
                        r1.full <= '0';
-- 
2.30.2
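
As background for the comparison this patch reuses: the store-issue path now
relies on the existing tag match (req_same_tag, derived from r1.reload_tag)
instead of a fresh page-granularity compare against r1.wb.adr. A minimal,
standalone VHDL sketch of that comparison follows; the entity name and the
REAL_ADDR_BITS / SET_SIZE_BITS values here are illustrative assumptions, not
the actual microwatt dcache.vhdl definitions.

library ieee;
use ieee.std_logic_1164.all;

entity same_tag_sketch is
    generic (
        -- Assumed, illustrative widths; the real values come from dcache.vhdl
        REAL_ADDR_BITS : positive := 56;  -- real address width
        SET_SIZE_BITS  : positive := 12   -- log2(set size), <= log2(page size)
    );
    port (
        addr_a   : in  std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
        addr_b   : in  std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
        same_tag : out std_ulogic
    );
end entity same_tag_sketch;

architecture rtl of same_tag_sketch is
begin
    -- Two addresses share a cache tag iff they agree on every bit above the
    -- set index; this is the check req_same_tag / r1.reload_tag provide.
    same_tag <= '1' when addr_a(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS) =
                         addr_b(REAL_ADDR_BITS - 1 downto SET_SIZE_BITS)
                else '0';
end architecture rtl;

Because the set size can never exceed the page size, two addresses that agree
on all bits above the set index necessarily lie in the same page, so the tag
match is a safe (if possibly stricter) stand-in for the page compare the patch
removes, as the commit message notes.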