From: Paul Mackerras Date: Thu, 27 Feb 2020 21:09:08 +0000 (+1100) Subject: dcache: Implement load-reserve and store-conditional instructions X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=5d85ede97dfe13b6762c47a894edb49ceeb7f26c;p=microwatt.git dcache: Implement load-reserve and store-conditional instructions This involves plumbing the (existing) 'reserve' and 'rc' bits in the decode tables down to dcache, and 'rc' and 'store_done' bits from dcache to writeback. It turns out that we had 'RC' set in the 'rc' column for several ordinary stores and for the attn instruction. This corrects them to 'NONE', and sets the 'rc' column to 'ONE' for the conditional stores. In writeback we now have logic to set CR0 when the input from dcache has rc = 1. In dcache we have the reservation itself, which has a valid bit and the address down to cache line granularity. We don't currently store the reservation length. For a store conditional which fails, we set a 'cancel_store' signal which inhibits the write to the cache and prevents the state machine from starting a bus cycle or going to the STORE_WAIT_ACK state. Instead we set r1.stcx_fail which causes the instruction to complete in the next cycle with rc=1 and store_done=0. Signed-off-by: Paul Mackerras --- diff --git a/common.vhdl b/common.vhdl index ffddb0b..84bbc47 100644 --- a/common.vhdl +++ b/common.vhdl @@ -130,12 +130,13 @@ package common is byte_reverse : std_ulogic; sign_extend : std_ulogic; -- do we need to sign extend? update : std_ulogic; -- is this an update instruction? + reserve : std_ulogic; -- set for larx/stcx end record; constant Decode2ToExecute1Init : Decode2ToExecute1Type := (valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0', lr => '0', rc => '0', oe => '0', invert_a => '0', invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0', - is_32bit => '0', is_signed => '0', xerc => xerc_init, + is_32bit => '0', is_signed => '0', xerc => xerc_init, reserve => '0', byte_reverse => '0', sign_extend => '0', update => '0', others => (others => '0')); type Execute1ToMultiplyType is record @@ -206,10 +207,12 @@ package common is update : std_ulogic; -- is this an update instruction? update_reg : gpr_index_t; -- if so, the register to update xerc : xer_common_t; + reserve : std_ulogic; -- set for larx/stcx. + rc : std_ulogic; -- set for stcx. end record; constant Execute1ToLoadstore1Init : Execute1ToLoadstore1Type := (valid => '0', load => '0', byte_reverse => '0', sign_extend => '0', update => '0', xerc => xerc_init, - others => (others => '0')); + reserve => '0', rc => '0', others => (others => '0')); type Loadstore1ToDcacheType is record valid : std_ulogic; @@ -224,6 +227,8 @@ package common is update : std_ulogic; update_reg : gpr_index_t; xerc : xer_common_t; + reserve : std_ulogic; + rc : std_ulogic; end record; type DcacheToWritebackType is record @@ -237,10 +242,12 @@ package common is byte_reverse : std_ulogic; second_word : std_ulogic; xerc : xer_common_t; + rc : std_ulogic; + store_done : std_ulogic; end record; constant DcacheToWritebackInit : DcacheToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', byte_reverse => '0', second_word => '0', xerc => xerc_init, - others => (others => '0')); + rc => '0', store_done => '0', others => (others => '0')); type Execute1ToWritebackType is record valid: std_ulogic; diff --git a/dcache.vhdl b/dcache.vhdl index 5bf477b..75b10c7 100644 --- a/dcache.vhdl +++ b/dcache.vhdl @@ -171,6 +171,9 @@ architecture rtl of dcache is slow_data : std_ulogic_vector(63 downto 0); slow_valid : std_ulogic; + -- Signal to complete a failed stcx. + stcx_fail : std_ulogic; + -- Cache miss state (reload state machine) state : state_t; wb : wishbone_master_out; @@ -199,6 +202,15 @@ architecture rtl of dcache is signal r2 : reg_stage_2_t; + -- Reservation information + -- + type reservation_t is record + valid : std_ulogic; + addr : std_ulogic_vector(63 downto LINE_OFF_BITS); + end record; + + signal reservation : reservation_t; + -- Async signals on incoming request signal req_index : index_t; signal req_row : row_t; @@ -210,6 +222,10 @@ architecture rtl of dcache is signal req_laddr : std_ulogic_vector(63 downto 0); signal req_sel : std_ulogic_vector(7 downto 0); + signal cancel_store : std_ulogic; + signal set_rsrv : std_ulogic; + signal clear_rsrv : std_ulogic; + -- Cache RAM interface type cache_ram_out_t is array(way_t) of cache_row_t; signal cache_out : cache_ram_out_t; @@ -481,6 +497,41 @@ begin -- Generate stalls from stage 1 state machine stall_out <= '1' when r1.state /= IDLE else '0'; + -- Handle load-with-reservation and store-conditional instructions + reservation_comb: process(all) + begin + cancel_store <= '0'; + set_rsrv <= '0'; + clear_rsrv <= '0'; + if d_in.valid = '1' and d_in.reserve = '1' then + -- XXX generate alignment interrupt if address is not aligned + -- XXX or if d_in.nc = '1' + if d_in.load = '1' then + -- load with reservation + set_rsrv <= '1'; + else + -- store conditional + clear_rsrv <= '1'; + if reservation.valid = '0' or + d_in.addr(63 downto LINE_OFF_BITS) /= reservation.addr then + cancel_store <= '1'; + end if; + end if; + end if; + end process; + + reservation_reg: process(clk) + begin + if rising_edge(clk) then + if rst = '1' or clear_rsrv = '1' then + reservation.valid <= '0'; + elsif set_rsrv = '1' then + reservation.valid <= '1'; + reservation.addr <= d_in.addr(63 downto LINE_OFF_BITS); + end if; + end if; + end process; + -- Writeback (loads and reg updates) & completion control logic -- writeback_control: process(all) @@ -497,6 +548,8 @@ begin d_out.byte_reverse <= r2.byte_reverse; d_out.second_word <= r2.second_dword; d_out.xerc <= r2.xerc; + d_out.rc <= '0'; -- loads never have rc=1 + d_out.store_done <= '0'; -- We have a valid load or store hit or we just completed a slow -- op such as a load miss, a NC load or a store @@ -512,11 +565,14 @@ begin assert (r1.update_valid and r2.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with update_valid" severity FAILURE; - assert (r1.slow_valid and r2.hit_load_valid) /= '1' report + assert (r1.slow_valid and r1.stcx_fail) /= '1' report + "unexpected slow_valid collision with stcx_fail" + severity FAILURE; + assert ((r1.slow_valid or r1.stcx_fail) and r2.hit_load_valid) /= '1' report "unexpected hit_load_delayed collision with slow_valid" severity FAILURE; - assert (r1.slow_valid and r1.update_valid) /= '1' report - "unexpected update_valid collision with slow_valid" + assert ((r1.slow_valid or r1.stcx_fail) and r1.update_valid) /= '1' report + "unexpected update_valid collision with slow_valid or stcx_fail" severity FAILURE; -- Delayed load hit case is the standard path @@ -551,6 +607,8 @@ begin d_out.xerc <= r1.req.xerc; d_out.second_word <= r1.second_dword; end if; + d_out.rc <= r1.req.rc; + d_out.store_done <= '1'; -- If it's a store or a non-update load form, complete now -- unless we need to do another dword transfer @@ -561,6 +619,12 @@ begin end if; end if; + if r1.stcx_fail = '1' then + d_out.rc <= r1.req.rc; + d_out.store_done <= '0'; + d_out.valid <= '1'; + end if; + -- We have a register update to do. if r1.update_valid = '1' then d_out.write_enable <= '1'; @@ -657,7 +721,7 @@ begin if reloading and wishbone_in.ack = '1' and r1.store_way = i then do_write <= '1'; end if; - if req_op = OP_STORE_HIT and req_hit_way = i then + if req_op = OP_STORE_HIT and req_hit_way = i and cancel_store = '0' then assert not reloading report "Store hit while in state:" & state_t'image(r1.state) severity FAILURE; @@ -753,6 +817,7 @@ begin -- One cycle pulses reset r1.slow_valid <= '0'; r1.update_valid <= '0'; + r1.stcx_fail <= '0'; -- We cannot currently process a new request when not idle assert d_in.valid = '0' or r1.state = IDLE report "request " & @@ -832,10 +897,15 @@ begin r1.wb.sel <= req_sel; r1.wb.adr <= req_addr(r1.wb.adr'left downto 3) & "000"; r1.wb.dat <= req_data; - r1.wb.cyc <= '1'; - r1.wb.stb <= '1'; - r1.wb.we <= '1'; - r1.state <= STORE_WAIT_ACK; + if cancel_store = '0' then + r1.wb.cyc <= '1'; + r1.wb.stb <= '1'; + r1.wb.we <= '1'; + r1.state <= STORE_WAIT_ACK; + else + r1.stcx_fail <= '1'; + r1.state <= IDLE; + end if; -- OP_NONE and OP_BAD do nothing when OP_NONE => @@ -932,7 +1002,7 @@ begin r1.wb.cyc <= '0'; r1.wb.stb <= '0'; end if; - end case; + end case; end if; end if; end process; diff --git a/decode1.vhdl b/decode1.vhdl index bca7c2a..349aa7e 100644 --- a/decode1.vhdl +++ b/decode1.vhdl @@ -60,8 +60,8 @@ architecture behaviour of decode1 is 20 => (ALU, OP_RLC, RA, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwimi 21 => (ALU, OP_RLC, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwinm 23 => (ALU, OP_RLC, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- rlwnm - 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- stb - 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '0'), -- stbu + 38 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stb + 39 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbu 44 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sth 45 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthu 36 => (LDST, OP_STORE, RA_OR_ZERO, CONST_SI, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stw @@ -278,19 +278,19 @@ architecture behaviour of decode1 is 2#1100111000# => (ALU, OP_SHR, NONE, CONST_SH32, RS, RA, '0', '0', '0', '0', ZERO, '1', NONE, '0', '0', '0', '0', '1', '1', RC, '0', '0'), -- srawi 2#1000011011# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- srd 2#1000011000# => (ALU, OP_SHR, NONE, RB, RS, RA, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC, '0', '0'), -- srw - 2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', RC, '0', '0'), -- stbcx - 2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', RC, '0', '0'), -- stbux - 2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- stbx + 2#1010110110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stbcx + 2#0011110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stbux + 2#0011010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stbx 2#1010010100# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdbrx - 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- stdcx + 2#0011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stdcx 2#0010110101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stdux 2#0010010101# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is8B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stdx 2#1110010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthbrx - 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- sthcx + 2#1011010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- sthcx 2#0110110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- sthux 2#0110010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is2B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- sthx 2#1010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '1', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwbrx - 2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', NONE, '0', '0'), -- stwcx + 2#0010010110# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '1', '0', '0', ONE, '0', '0'), -- stwcx 2#0010110111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '1', '0', '0', '0', NONE, '0', '0'), -- stwux 2#0010010111# => (LDST, OP_STORE, RA_OR_ZERO, RB, RS, NONE, '0', '0', '0', '0', ZERO, '0', is4B, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- stwx 2#0000101000# => (ALU, OP_ADD, RA, RB, NONE, RT, '0', '0', '1', '0', ONE, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '0'), -- subf @@ -329,7 +329,7 @@ architecture behaviour of decode1 is -- unit internal in1 in2 in3 out CR CR inv inv cry cry ldst BR sgn upd rsrv 32b sgn rc lk sgl -- op in out A out in out len ext pipe - constant attn_instr : decode_rom_t := (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC, '0', '1'); + constant attn_instr : decode_rom_t := (ALU, OP_ILLEGAL, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'); constant nop_instr : decode_rom_t := (ALU, OP_NOP, NONE, NONE, NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'); constant sim_cfg_instr : decode_rom_t := (ALU, OP_SIM_CONFIG,NONE, NONE, NONE, RT, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'); diff --git a/decode2.vhdl b/decode2.vhdl index 3d6b7d8..ff773aa 100644 --- a/decode2.vhdl +++ b/decode2.vhdl @@ -334,6 +334,7 @@ begin v.e.byte_reverse := d_in.decode.byte_reverse; v.e.sign_extend := d_in.decode.sign_extend; v.e.update := d_in.decode.update; + v.e.reserve := d_in.decode.reserve; -- issue control control_valid_in <= d_in.valid; diff --git a/execute1.vhdl b/execute1.vhdl index c536a27..b1662b7 100644 --- a/execute1.vhdl +++ b/execute1.vhdl @@ -759,6 +759,8 @@ begin lv.update := e_in.update; lv.update_reg := gspr_to_gpr(e_in.read_reg1); lv.xerc := v.e.xerc; + lv.reserve := e_in.reserve; + lv.rc := e_in.rc; -- Update registers rin <= v; diff --git a/loadstore1.vhdl b/loadstore1.vhdl index 9e038e1..a0c0beb 100644 --- a/loadstore1.vhdl +++ b/loadstore1.vhdl @@ -51,6 +51,8 @@ begin v.update := l_in.update; v.update_reg := l_in.update_reg; v.xerc := l_in.xerc; + v.reserve := l_in.reserve; + v.rc := l_in.rc; -- XXX Temporary hack. Mark the op as non-cachable if the address -- is the form 0xc------- diff --git a/writeback.vhdl b/writeback.vhdl index b924ee0..0151561 100644 --- a/writeback.vhdl +++ b/writeback.vhdl @@ -63,6 +63,7 @@ begin variable xe: xer_common_t; variable zero : std_ulogic; variable sign : std_ulogic; + variable scf : std_ulogic_vector(3 downto 0); begin x(0) := e_in.valid; y(0) := l_in.valid; @@ -124,6 +125,17 @@ begin w_out.write_enable <= not partial_write or second_word; end if; + if l_in.rc = '1' then + -- st*cx. instructions + scf(3) := '0'; + scf(2) := '0'; + scf(1) := l_in.store_done; + scf(0) := xe.so; + c_out.write_cr_enable <= '1'; + c_out.write_cr_mask <= num_to_fxm(0); + c_out.write_cr_data(31 downto 28) <= scf; + end if; + -- shift and byte-reverse data bytes for i in 0 to 7 loop k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);