Change the default cross compiler prefix to powerpc64le-linux-gnu-
[microwatt.git] / loadstore1.vhdl
1 library ieee;
2 use ieee.std_logic_1164.all;
3 use ieee.numeric_std.all;
4
5 library work;
6 use work.decode_types.all;
7 use work.common.all;
8 use work.helpers.all;
9
10 -- 2 cycle LSU
11 -- We calculate the address in the first cycle
12
-- Port interface of the load/store unit.
-- The record types are declared outside this file; their names indicate
-- which neighbouring unit each port connects to.
entity loadstore1 is
    port (
        -- Clock and synchronous reset
        clk       : in  std_ulogic;
        rst       : in  std_ulogic;

        -- Incoming load/store request and outgoing result for writeback
        l_in      : in  Execute1ToLoadstore1Type;
        l_out     : out Loadstore1ToWritebackType;

        -- Request to the dcache and its response
        d_out     : out Loadstore1ToDcacheType;
        d_in      : in  DcacheToLoadstore1Type;

        -- Stall input from the dcache, and this unit's own stall output
        dc_stall  : in  std_ulogic;
        stall_out : out std_ulogic
    );
end entity loadstore1;
28
-- Note: we don't currently use the stall output from the dcache (dc_stall).
-- We are the dcache's only user, and we know that when idle it can accept
-- two requests without stalling.
32
33 architecture behave of loadstore1 is
34
-- Sequencer states; the extra states handle accesses that are split
-- into two dcache requests and loads that also update rA.
type state_t is (
    IDLE,            -- ready for instruction
    SECOND_REQ,      -- send 2nd request of unaligned xfer
    FIRST_ACK_WAIT,  -- waiting for 1st ack from dcache
    LAST_ACK_WAIT,   -- waiting for last ack from dcache
    LD_UPDATE        -- writing rA with computed addr on load
);

-- Registered copy of the request being processed, plus sequencing state.
type reg_stage_t is record
    load         : std_ulogic;                      -- '1' when the op is OP_LOAD
    dcbz         : std_ulogic;                      -- '1' when the op is OP_DCBZ
    addr         : std_ulogic_vector(63 downto 0);  -- computed effective address
    store_data   : std_ulogic_vector(63 downto 0);  -- store data after byte rotate/reverse
    load_data    : std_ulogic_vector(63 downto 0);  -- permuted data from the first of two acks
    write_reg    : gpr_index_t;                     -- destination GPR for load data
    length       : std_ulogic_vector(3 downto 0);   -- access size in bytes
    byte_reverse : std_ulogic;
    sign_extend  : std_ulogic;
    update       : std_ulogic;                      -- write the address back to update_reg
    update_reg   : gpr_index_t;
    xerc         : xer_common_t;
    reserve      : std_ulogic;
    rc           : std_ulogic;
    nc           : std_ulogic;                      -- non-cacheable access
    state        : state_t;
    second_bytes : std_ulogic_vector(7 downto 0);   -- byte selects for the second request
end record reg_stage_t;

-- Per-byte helper types used when formatting load data
type byte_sel_t is array(0 to 7) of std_ulogic;
subtype byte_trim_t is std_ulogic_vector(1 downto 0);
type trim_ctl_t is array(0 to 7) of byte_trim_t;

-- Current (r) and next (rin) register values
signal r       : reg_stage_t;
signal rin     : reg_stage_t;

-- Sum of the two address operands of the incoming request
signal lsu_sum : std_ulogic_vector(63 downto 0);
70
-- Turn an access size into a right-justified byte-enable mask.
-- Sizes of 1, 2, 4 and 8 bytes give 1, 2, 4 and 8 low-order enables;
-- every other encoding gives an all-zero mask.
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
begin
    if length = "0001" then
        return "00000001";
    elsif length = "0010" then
        return "00000011";
    elsif length = "0100" then
        return "00001111";
    elsif length = "1000" then
        return "11111111";
    else
        return "00000000";
    end if;
end function length_to_sel;
87
-- Byte enables for a possibly unaligned access.
-- The size mask is widened to 16 bits and shifted left by the byte
-- offset within the doubleword, so the result carries the select signals
-- for two transfers.
function xfer_data_sel(size : in std_logic_vector(3 downto 0);
                       address : in std_logic_vector(2 downto 0))
    return std_ulogic_vector is
    variable widened  : unsigned(15 downto 0);
    variable shift_by : natural;
begin
    -- Zero-extend the 8-bit mask into the low half of a 16-bit value
    widened  := resize(unsigned(length_to_sel(size)), 16);
    shift_by := to_integer(unsigned(address));
    return std_ulogic_vector(shift_left(widened, shift_by));
end function xfer_data_sel;
100
101 begin
-- First-cycle address calculation: add the two address operands of a
-- valid request; drive zero when there is no valid request.
lsu_adder: process(all)
begin
    if l_in.valid = '1' then
        lsu_sum <= std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2));
    else
        lsu_sum <= (others => '0');
    end if;
end process lsu_adder;
104
-- Register stage: on each rising clock edge, load r from rin unless
-- reset is asserted.
loadstore1_0: process(clk)
begin
    if rising_edge(clk) then
        if rst /= '1' then
            r <= rin;
        else
            -- Reset only forces the state machine back to IDLE; the
            -- other fields of r are not assigned here.
            r.state <= IDLE;
        end if;
    end if;
end process loadstore1_0;
115
-- Combinational logic of the load/store unit.
--
-- Starting from the registered state r, this process:
--   * formats the data returned by the dcache (byte shift, optional byte
--     reversal, trimming to the access length and sign extension),
--   * runs the request state machine, producing the next register
--     value (rin),
--   * drives the request to the dcache (d_out), the result to writeback
--     (l_out) and the stall output (stall_out).
loadstore1_1: process(all)
    variable v : reg_stage_t;                      -- next value of r
    variable brev_lenm1 : unsigned(2 downto 0);    -- length-1 when byte reversing, else 0
    variable byte_offset : unsigned(2 downto 0);   -- address offset within the doubleword
    variable j : integer;
    variable k : unsigned(2 downto 0);
    variable kk : unsigned(3 downto 0);
    variable long_sel : std_ulogic_vector(15 downto 0);
    variable byte_sel : std_ulogic_vector(7 downto 0);
    variable req : std_ulogic;
    variable stall : std_ulogic;
    variable addr : std_ulogic_vector(63 downto 0);
    variable write_enable : std_ulogic;
    variable do_update : std_ulogic;
    variable two_dwords : std_ulogic;
    variable done : std_ulogic;
    variable data_permuted : std_ulogic_vector(63 downto 0);
    variable data_trimmed : std_ulogic_vector(63 downto 0);
    variable use_second : byte_sel_t;
    variable trim_ctl : trim_ctl_t;
    variable negative : std_ulogic;
begin
    -- Defaults: hold the registers, no request, no stall, nothing done
    v := r;
    req := '0';
    stall := '0';
    done := '0';
    byte_sel := (others => '0');
    addr := lsu_sum;

    write_enable := '0';
    do_update := '0';
    -- The access spans two doublewords if any second-request byte is selected
    two_dwords := or (r.second_bytes);

    -- load data formatting
    byte_offset := unsigned(r.addr(2 downto 0));
    brev_lenm1 := "000";
    if r.byte_reverse = '1' then
        brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
    end if;

    -- shift and byte-reverse data bytes
    -- For each result byte i, kk is the source byte position; the carry
    -- into bit 3 means the byte lies in the second doubleword.
    for i in 0 to 7 loop
        kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
        use_second(i) := kk(3);
        j := to_integer(kk(2 downto 0)) * 8;
        data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
    end loop;

    -- Work out the sign bit for sign extension.
    -- Assumes we are not doing both sign extension and byte reversal,
    -- in that for unaligned loads crossing two dwords we end up
    -- using a bit from the second dword, whereas for a byte-reversed
    -- (i.e. big-endian) load the sign bit would be in the first dword.
    negative := (r.length(3) and data_permuted(63)) or
                (r.length(2) and data_permuted(31)) or
                (r.length(1) and data_permuted(15)) or
                (r.length(0) and data_permuted(7));

    -- trim and sign-extend
    -- trim_ctl(i) selects the source of result byte i:
    --   "11" saved data from the first ack (r.load_data)
    --   "10" data from the current dcache response
    --   "01" 0xFF (sign fill)
    --   "00" 0x00
    for i in 0 to 7 loop
        if i < to_integer(unsigned(r.length)) then
            if two_dwords = '1' then
                trim_ctl(i) := '1' & not use_second(i);
            else
                trim_ctl(i) := not use_second(i) & '0';
            end if;
        else
            trim_ctl(i) := '0' & (negative and r.sign_extend);
        end if;
        case trim_ctl(i) is
            when "11" =>
                data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
            when "10" =>
                data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
            when "01" =>
                data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
            when others =>
                data_trimmed(i * 8 + 7 downto i * 8) := x"00";
        end case;
    end loop;

    case r.state is
        when IDLE =>
            if l_in.valid = '1' then
                -- Latch the incoming request
                v.load := '0';
                v.dcbz := '0';
                if l_in.op = OP_LOAD then
                    v.load := '1';
                elsif l_in.op = OP_DCBZ then
                    v.dcbz := '1';
                end if;
                v.addr := lsu_sum;
                v.write_reg := l_in.write_reg;
                v.length := l_in.length;
                v.byte_reverse := l_in.byte_reverse;
                v.sign_extend := l_in.sign_extend;
                v.update := l_in.update;
                v.update_reg := l_in.update_reg;
                v.xerc := l_in.xerc;
                v.reserve := l_in.reserve;
                v.rc := l_in.rc;
                v.nc := l_in.ci;

                -- XXX Temporary hack. Mark the op as non-cachable if the address
                -- is the form 0xc-------
                --
                -- This will have to be replaced by a combination of implementing the
                -- proper HV CI load/store instructions and having an MMU to get the I
                -- bit otherwise.
                if lsu_sum(31 downto 28) = "1100" then
                    v.nc := '1';
                end if;

                -- Do length_to_sel and work out if we are doing 2 dwords
                long_sel := xfer_data_sel(l_in.length, lsu_sum(2 downto 0));
                byte_sel := long_sel(7 downto 0);
                v.second_bytes := long_sel(15 downto 8);

                -- Do byte reversing and rotating for stores in the first cycle
                byte_offset := unsigned(lsu_sum(2 downto 0));
                brev_lenm1 := "000";
                if l_in.byte_reverse = '1' then
                    brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
                end if;
                for i in 0 to 7 loop
                    -- k is 3 bits wide, so the destination byte index wraps
                    -- around within the doubleword
                    k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
                    j := to_integer(k) * 8;
                    v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
                end loop;

                -- Issue the first (or only) request in this cycle
                req := '1';
                stall := '1';
                if long_sel(15 downto 8) = "00000000" then
                    v.state := LAST_ACK_WAIT;
                else
                    v.state := SECOND_REQ;
                end if;
            end if;

        when SECOND_REQ =>
            -- compute (addr + 8) & ~7 for the second doubleword when unaligned
            addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";
            byte_sel := r.second_bytes;
            req := '1';
            stall := '1';
            v.state := FIRST_ACK_WAIT;

        when FIRST_ACK_WAIT =>
            stall := '1';
            if d_in.valid = '1' then
                v.state := LAST_ACK_WAIT;
                if r.load = '1' then
                    -- Keep the first doubleword's bytes until the second ack arrives
                    v.load_data := data_permuted;
                end if;
            end if;

        when LAST_ACK_WAIT =>
            stall := '1';
            if d_in.valid = '1' then
                write_enable := r.load;
                if r.load = '1' and r.update = '1' then
                    -- loads with rA update need an extra cycle
                    v.state := LD_UPDATE;
                else
                    -- stores write back rA update in this cycle
                    do_update := r.update;
                    stall := '0';
                    done := '1';
                    v.state := IDLE;
                end if;
            end if;

        when LD_UPDATE =>
            do_update := '1';
            v.state := IDLE;
            done := '1';
    end case;

    -- Update outputs to dcache
    -- These use v rather than r so that a request issued from IDLE
    -- carries the attributes of the instruction arriving in this cycle.
    d_out.valid <= req;
    d_out.load <= v.load;
    d_out.dcbz <= v.dcbz;
    d_out.nc <= v.nc;
    d_out.reserve <= v.reserve;
    d_out.addr <= addr;
    d_out.data <= v.store_data;
    d_out.byte_sel <= byte_sel;

    -- Update outputs to writeback
    -- Multiplex either cache data to the destination GPR or
    -- the address for the rA update.
    l_out.valid <= done;
    if do_update = '1' then
        l_out.write_enable <= '1';
        l_out.write_reg <= r.update_reg;
        l_out.write_data <= r.addr;
    else
        l_out.write_enable <= write_enable;
        l_out.write_reg <= r.write_reg;
        l_out.write_data <= data_trimmed;
    end if;
    l_out.xerc <= r.xerc;
    l_out.rc <= r.rc and done;
    l_out.store_done <= d_in.store_done;

    stall_out <= stall;

    -- Update registers
    rin <= v;

end process loadstore1_1;
330
331 end;