From: Paul Mackerras Date: Tue, 28 Apr 2020 08:11:52 +0000 (+1000) Subject: dcache: Implement the dcbz instruction X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=041d6bef60956849364c1540e7eecb6fdca77497;p=microwatt.git dcache: Implement the dcbz instruction This adds logic to dcache and loadstore1 to implement dcbz. For now it zeroes a single cache line (by default 64 bytes), not 128 bytes like IBM Power processors do. The dcbz operation is performed much like a load miss, except that we are writing zeroes to memory instead of reading. As each ack comes back, we write zeroes to the BRAM instead of data from memory. In this way we zero the line in memory and also zero the line of cache memory, establishing the line in the cache if it wasn't already resident. If it was already resident then we overwrite the existing line in the cache. Signed-off-by: Paul Mackerras --- diff --git a/common.vhdl b/common.vhdl index 65e40c1..61252bd 100644 --- a/common.vhdl +++ b/common.vhdl @@ -236,6 +236,7 @@ package common is type Loadstore1ToDcacheType is record valid : std_ulogic; load : std_ulogic; + dcbz : std_ulogic; nc : std_ulogic; reserve : std_ulogic; addr : std_ulogic_vector(63 downto 0); diff --git a/dcache.vhdl b/dcache.vhdl index 7e553bf..550298b 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -581,8 +581,12 @@ begin wr_data <= r0.data; wr_sel <= r0.byte_sel; else - -- Otherwise, we might be doing a reload - wr_data <= wishbone_in.dat; + -- Otherwise, we might be doing a reload or a DCBZ + if r1.req.dcbz = '1' then + wr_data <= (others => '0'); + else + wr_data <= wishbone_in.dat; + end if; wr_sel <= (others => '1'); wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS)); end if; @@ -718,18 +722,54 @@ begin r1.wb.we <= '0'; r1.state <= NC_LOAD_WAIT_ACK; - when OP_STORE_HIT | OP_STORE_MISS => - r1.wb.sel <= r0.byte_sel; - r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; - r1.wb.dat <= r0.data; - if cancel_store = '0' then + when OP_STORE_HIT | OP_STORE_MISS => + if r0.dcbz = '0' then + r1.wb.sel <= r0.byte_sel; + r1.wb.adr <= r0.addr(r1.wb.adr'left downto 3) & "000"; + r1.wb.dat <= r0.data; + if cancel_store = '0' then + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + r1.wb.we <= '1'; + r1.state <= STORE_WAIT_ACK; + else + r1.stcx_fail <= '1'; + r1.state <= IDLE; + end if; + else + -- dcbz is handled much like a load miss except + -- that we are writing to memory instead of reading + r1.store_index <= req_index; + r1.store_row <= get_row(req_laddr); + + if req_op = OP_STORE_HIT then + r1.store_way <= req_hit_way; + else + r1.store_way <= replace_way; + + -- Force misses on the victim way while zeroing + cache_valids(req_index)(replace_way) <= '0'; + + -- Store new tag in selected way + for i in 0 to NUM_WAYS-1 loop + if i = replace_way then + tagset := cache_tags(req_index); + write_tag(i, tagset, req_tag); + cache_tags(req_index) <= tagset; + end if; + end loop; + end if; + + -- Set up for wishbone writes + r1.wb.adr <= req_laddr(r1.wb.adr'left downto 0); + r1.wb.sel <= (others => '1'); + r1.wb.we <= '1'; + r1.wb.dat <= (others => '0'); r1.wb.cyc <= '1'; r1.wb.stb <= '1'; - r1.wb.we <= '1'; - r1.state <= STORE_WAIT_ACK; - else - r1.stcx_fail <= '1'; - r1.state <= IDLE; + + -- Handle the rest like a load miss + r1.state <= RELOAD_WAIT_ACK; end if; -- OP_NONE and OP_BAD do nothing @@ -766,7 +806,7 @@ begin -- not idle, which we don't currently know how to deal -- with. -- - if r1.store_row = get_row(r1.req.addr) then + if r1.store_row = get_row(r1.req.addr) and r1.req.dcbz = '0' then r1.slow_data <= wishbone_in.dat; end if; diff --git a/decode1.vhdl b/decode1.vhdl index 8c7d5f2..785b669 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -164,7 +164,7 @@ architecture behaviour of decode1 is 2#0000110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbst 2#0100010110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbt 2#0011110110# => (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- dcbtst - -- 2#1111110110# dcbz + 2#1111110110# => (LDST, OP_DCBZ, RA_OR_ZERO, RB, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- dcbz 2#0110001001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdeu 2#1110001001# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- divdeuo 2#0110001011# => (ALU, OP_DIVE, RA, RB, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- divweu diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 664e396..90650db 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -43,6 +43,7 @@ architecture behave of loadstore1 is type reg_stage_t is record -- latch most of the input request load : std_ulogic; + dcbz : std_ulogic; addr : std_ulogic_vector(63 downto 0); store_data : std_ulogic_vector(63 downto 0); load_data : std_ulogic_vector(63 downto 0); @@ -198,8 +199,11 @@ begin when IDLE => if l_in.valid = '1' then v.load := '0'; + v.dcbz := '0'; if l_in.op = OP_LOAD then v.load := '1'; + elsif l_in.op = OP_DCBZ then + v.dcbz := '1'; end if; v.addr := lsu_sum; v.write_reg := l_in.write_reg; @@ -293,6 +297,7 @@ begin -- Update outputs to dcache d_out.valid <= req; d_out.load <= v.load; + d_out.dcbz <= v.dcbz; d_out.nc <= v.nc; d_out.reserve <= v.reserve; d_out.addr <= addr;