execute1: Simplify the interrupt logic a little
[microwatt.git] / loadstore1.vhdl
1 library ieee;
2 use ieee.std_logic_1164.all;
3 use ieee.numeric_std.all;
4
5 library work;
6 use work.common.all;
7 use work.helpers.all;
8
9 -- 2 cycle LSU
10 -- We calculate the address in the first cycle
11
-- Load/store unit: accepts requests from execute1, drives the dcache and
-- hands results to writeback.
entity loadstore1 is
    port (
        -- Clock and reset (reset is sampled on the rising clock edge)
        clk       : in  std_ulogic;
        rst       : in  std_ulogic;

        -- Request from execute1 and result to writeback
        l_in      : in  Execute1ToLoadstore1Type;
        l_out     : out Loadstore1ToWritebackType;

        -- Request to and response from the data cache
        d_out     : out Loadstore1ToDcacheType;
        d_in      : in  DcacheToLoadstore1Type;

        -- dc_stall is the dcache stall indication (not read by this
        -- architecture); stall_out is high while this unit is busy
        dc_stall  : in  std_ulogic;
        stall_out : out std_ulogic
        );
end entity loadstore1;
27
28 -- Note, we don't currently use the stall output from the dcache because
29 -- we know it can take two requests without stalling when idle, we are
30 -- its only user, and we know it never stalls when idle.
31
32 architecture behave of loadstore1 is
33
-- States of the load/store sequencer.  Accesses that cross a doubleword
-- boundary are split into two dcache requests, hence the extra states.
type state_t is (
    IDLE,            -- no operation in flight; a new request can be accepted
    SECOND_REQ,      -- issuing the second request of an unaligned transfer
    FIRST_ACK_WAIT,  -- waiting for the dcache to acknowledge the first request
    LAST_ACK_WAIT,   -- waiting for the final (or only) dcache acknowledge
    LD_UPDATE        -- extra cycle: write the computed address to rA for a load
    );
41
-- Registered state of the unit: a copy of the request being processed
-- plus the sequencer state.
type reg_stage_t is record
    load         : std_ulogic;                      -- '1' for a load, '0' for a store
    addr         : std_ulogic_vector(63 downto 0);  -- address of the access (addr1 + addr2)
    store_data   : std_ulogic_vector(63 downto 0);  -- store data, already byte-reversed/rotated
    load_data    : std_ulogic_vector(63 downto 0);  -- permuted data saved from the first dword
    write_reg    : gpr_index_t;                     -- destination register for load data
    length       : std_ulogic_vector(3 downto 0);   -- access size in bytes (1, 2, 4 or 8)
    byte_reverse : std_ulogic;                      -- byte-reverse the data
    sign_extend  : std_ulogic;                      -- sign-extend load data
    update       : std_ulogic;                      -- also write the address to update_reg
    update_reg   : gpr_index_t;                     -- register receiving the address on update
    xerc         : xer_common_t;                    -- passed through to writeback
    reserve      : std_ulogic;                      -- passed through to the dcache
    rc           : std_ulogic;                      -- passed to writeback when the op completes
    nc           : std_ulogic;                      -- non-cacheable access
    state        : state_t;                         -- sequencer state
    second_bytes : std_ulogic_vector(7 downto 0);   -- byte selects for the second dword
end record;
61
-- Per-byte helper types used when formatting load data
type byte_sel_t is array(0 to 7) of std_ulogic;         -- one flag per result byte
subtype byte_trim_t is std_ulogic_vector(1 downto 0);   -- 2-bit source select for a byte
type trim_ctl_t is array(0 to 7) of byte_trim_t;        -- one source select per result byte

-- r holds the registered state, rin is its next value
signal r       : reg_stage_t;
signal rin     : reg_stage_t;
-- Sum of the two address operands of the incoming request
signal lsu_sum : std_ulogic_vector(63 downto 0);
68
-- Translate an access size (1, 2, 4 or 8 bytes) into right-aligned byte
-- enables.  Any other size value yields no enabled bytes.
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
    if length = "0001" then
        return "00000001";
    elsif length = "0010" then
        return "00000011";
    elsif length = "0100" then
        return "00001111";
    elsif length = "1000" then
        return "11111111";
    end if;
    -- unsupported size: select nothing
    return "00000000";
end function length_to_sel;
85
-- Compute the byte enables for an access of the given size starting at
-- the given byte offset within a doubleword.
-- The result is 16 bits wide: bits 7..0 are the enables for the first
-- transfer and bits 15..8 those for a second transfer, which are non-zero
-- only when the access spills over into the next doubleword.
function xfer_data_sel(size : in std_logic_vector(3 downto 0);
                       address : in std_logic_vector(2 downto 0))
    return std_ulogic_vector is
    variable base_sel : unsigned(15 downto 0);
    variable shamt    : natural;
begin
    -- zero-extend the right-aligned enables to 16 bits
    base_sel := resize(unsigned(length_to_sel(size)), 16);
    -- then slide them up by the byte offset
    shamt := to_integer(unsigned(address));
    return std_ulogic_vector(shift_left(base_sel, shamt));
end function xfer_data_sel;
98
99 begin
-- Address calculation, done in the first cycle: the sum of the two
-- address operands while the request is valid, otherwise all zeroes.
lsu_addr_calc: process(all)
begin
    if l_in.valid = '1' then
        lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2));
    else
        lsu_sum <= (others => '0');
    end if;
end process lsu_addr_calc;
102
-- Register stage: on each rising clock edge either load the next state
-- computed by the combinational process, or, while rst is asserted, force
-- the sequencer back to IDLE (the other fields keep their values).
loadstore1_0: process(clk)
begin
    if rising_edge(clk) then
        if rst /= '1' then
            r <= rin;
        else
            r.state <= IDLE;
        end if;
    end if;
end process loadstore1_0;
113
-- Combinational part of the load/store unit.
--
-- Computes the next register state (rin) from the current state (r) and
-- the inputs, and drives the dcache request, the writeback result and
-- the stall output.
--
-- Sequencer summary:
--   IDLE           : on l_in.valid, latch the request, send the first dcache
--                    request and move to SECOND_REQ if the access spills
--                    into a second doubleword, else to LAST_ACK_WAIT.
--   SECOND_REQ     : send the request for the following doubleword.
--   FIRST_ACK_WAIT : wait for d_in.valid; for loads, save the permuted
--                    data of the first doubleword in load_data.
--   LAST_ACK_WAIT  : wait for d_in.valid; complete the operation, or go
--                    to LD_UPDATE for a load that also updates rA.
--   LD_UPDATE      : write the address to the update register and finish.
loadstore1_1: process(all)
    variable v : reg_stage_t;
    variable brev_lenm1 : unsigned(2 downto 0);
    variable byte_offset : unsigned(2 downto 0);
    variable j : integer;
    variable k : unsigned(2 downto 0);
    variable kk : unsigned(3 downto 0);
    variable long_sel : std_ulogic_vector(15 downto 0);
    variable byte_sel : std_ulogic_vector(7 downto 0);
    variable req : std_ulogic;
    variable stall : std_ulogic;
    variable addr : std_ulogic_vector(63 downto 0);
    variable write_enable : std_ulogic;
    variable do_update : std_ulogic;
    variable two_dwords : std_ulogic;
    variable done : std_ulogic;
    variable data_permuted : std_ulogic_vector(63 downto 0);
    variable data_trimmed : std_ulogic_vector(63 downto 0);
    variable use_second : byte_sel_t;
    variable trim_ctl : trim_ctl_t;
    variable negative : std_ulogic;
begin
    v := r;
    req := '0';
    stall := '0';
    done := '0';
    byte_sel := (others => '0');
    addr := lsu_sum;

    write_enable := '0';
    do_update := '0';
    -- a non-zero second_bytes means the access spans two doublewords
    two_dwords := or (r.second_bytes);

    -- Give the load formatting results a defined value on every
    -- evaluation.  data_trimmed is read below even when r.load = '0';
    -- without a default the variable would have to remember its previous
    -- value, which implies storage (a latch) in this combinational process.
    -- NOTE(review): assumes writeback ignores write_data when
    -- write_enable = '0' -- confirm against the writeback stage.
    data_permuted := (others => '0');
    data_trimmed := (others => '0');

    -- load data formatting
    if r.load = '1' then
        byte_offset := unsigned(r.addr(2 downto 0));
        brev_lenm1 := "000";
        if r.byte_reverse = '1' then
            brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
        end if;

        -- shift and byte-reverse data bytes
        -- kk(2 downto 0) is the source byte lane in d_in.data for result
        -- byte i; kk(3) is set when that byte lies in the second dword
        for i in 0 to 7 loop
            kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
            use_second(i) := kk(3);
            j := to_integer(kk(2 downto 0)) * 8;
            data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
        end loop;

        -- Work out the sign bit for sign extension.
        -- Assumes we are not doing both sign extension and byte reversal,
        -- in that for unaligned loads crossing two dwords we end up
        -- using a bit from the second dword, whereas for a byte-reversed
        -- (i.e. big-endian) load the sign bit would be in the first dword.
        negative := (r.length(3) and data_permuted(63)) or
                    (r.length(2) and data_permuted(31)) or
                    (r.length(1) and data_permuted(15)) or
                    (r.length(0) and data_permuted(7));

        -- trim and sign-extend
        -- trim_ctl(i) selects the source of result byte i:
        --   "11" byte saved from the first dword (r.load_data)
        --   "10" byte from the data currently returned by the dcache
        --   "01" 0xFF (sign extension of a negative value)
        --   "00" 0x00
        for i in 0 to 7 loop
            if i < to_integer(unsigned(r.length)) then
                if two_dwords = '1' then
                    trim_ctl(i) := '1' & not use_second(i);
                else
                    trim_ctl(i) := not use_second(i) & '0';
                end if;
            else
                trim_ctl(i) := '0' & (negative and r.sign_extend);
            end if;
            case trim_ctl(i) is
                when "11" =>
                    data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
                when "10" =>
                    data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
                when "01" =>
                    data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
                when others =>
                    data_trimmed(i * 8 + 7 downto i * 8) := x"00";
            end case;
        end loop;
    end if;

    case r.state is
        when IDLE =>
            if l_in.valid = '1' then
                -- latch the incoming request
                v.load := l_in.load;
                v.addr := lsu_sum;
                v.write_reg := l_in.write_reg;
                v.length := l_in.length;
                v.byte_reverse := l_in.byte_reverse;
                v.sign_extend := l_in.sign_extend;
                v.update := l_in.update;
                v.update_reg := l_in.update_reg;
                v.xerc := l_in.xerc;
                v.reserve := l_in.reserve;
                v.rc := l_in.rc;
                v.nc := l_in.ci;

                -- XXX Temporary hack. Mark the op as non-cachable if the address
                -- is the form 0xc-------
                --
                -- This will have to be replaced by a combination of implementing the
                -- proper HV CI load/store instructions and having an MMU to get the I
                -- bit otherwise.
                if lsu_sum(31 downto 28) = "1100" then
                    v.nc := '1';
                end if;

                -- Do length_to_sel and work out if we are doing 2 dwords
                long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
                byte_sel := long_sel(7 downto 0);
                v.second_bytes := long_sel(15 downto 8);

                -- Do byte reversing and rotating for stores in the first cycle
                if v.load = '0' then
                    byte_offset := unsigned(lsu_sum(2 downto 0));
                    brev_lenm1 := "000";
                    if l_in.byte_reverse = '1' then
                        brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
                    end if;
                    -- k wraps modulo 8, so bytes that belong to the second
                    -- dword land in the low byte lanes of store_data
                    for i in 0 to 7 loop
                        k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
                        j := to_integer(k) * 8;
                        v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
                    end loop;
                end if;

                req := '1';
                stall := '1';
                if long_sel(15 downto 8) = "00000000" then
                    v.state := LAST_ACK_WAIT;
                else
                    v.state := SECOND_REQ;
                end if;
            end if;

        when SECOND_REQ =>
            -- compute (addr + 8) & ~7 for the second doubleword when unaligned
            addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
            byte_sel := r.second_bytes;
            req := '1';
            stall := '1';
            v.state := FIRST_ACK_WAIT;

        when FIRST_ACK_WAIT =>
            stall := '1';
            if d_in.valid = '1' then
                v.state := LAST_ACK_WAIT;
                if r.load = '1' then
                    -- keep the first dword's bytes for merging later
                    v.load_data := data_permuted;
                end if;
            end if;

        when LAST_ACK_WAIT =>
            stall := '1';
            if d_in.valid = '1' then
                write_enable := r.load;
                if r.load = '1' and r.update = '1' then
                    -- loads with rA update need an extra cycle
                    v.state := LD_UPDATE;
                else
                    -- stores write back rA update in this cycle
                    do_update := r.update;
                    stall := '0';
                    done := '1';
                    v.state := IDLE;
                end if;
            end if;

        when LD_UPDATE =>
            do_update := '1';
            v.state := IDLE;
            done := '1';
    end case;

    -- Update outputs to dcache
    d_out.valid <= req;
    d_out.load <= v.load;
    d_out.nc <= v.nc;
    d_out.reserve <= v.reserve;
    d_out.addr <= addr;
    d_out.data <= v.store_data;
    d_out.byte_sel <= byte_sel;

    -- Update outputs to writeback
    -- Multiplex either cache data to the destination GPR or
    -- the address for the rA update.
    l_out.valid <= done;
    if do_update = '1' then
        l_out.write_enable <= '1';
        l_out.write_reg <= r.update_reg;
        l_out.write_data <= r.addr;
    else
        l_out.write_enable <= write_enable;
        l_out.write_reg <= r.write_reg;
        l_out.write_data <= data_trimmed;
    end if;
    l_out.xerc <= r.xerc;
    l_out.rc <= r.rc and done;
    l_out.store_done <= d_in.store_done;

    stall_out <= stall;

    -- Update registers
    rin <= v;

end process;
325
326 end;