loadstore1: Further tweaks to improve synthesis with yosys/nextpnr
[microwatt.git] / loadstore1.vhdl
1 library ieee;
2 use ieee.std_logic_1164.all;
3 use ieee.numeric_std.all;
4
5 library work;
6 use work.decode_types.all;
7 use work.common.all;
8
9 -- 2 cycle LSU
10 -- We calculate the address in the first cycle
11
-- Load/store unit entity.
-- Takes requests from execute1 on l_in, issues accesses to the dcache
-- (d_out/d_in) and the MMU (m_out/m_in), reports busy/exception status
-- on e_out and delivers results on l_out.
12 entity loadstore1 is
13 generic (
14 -- Non-zero to enable log data collection
15 LOG_LENGTH : natural := 0
16 );
17 port (
18 clk : in std_ulogic;
-- Reset; sampled inside the rising-edge clocked register process
19 rst : in std_ulogic;
20
-- Request in from execute1, status back to execute1, result out
21 l_in : in Execute1ToLoadstore1Type;
22 e_out : out Loadstore1ToExecute1Type;
23 l_out : out Loadstore1ToWritebackType;
24
-- Request to, and response from, the dcache
25 d_out : out Loadstore1ToDcacheType;
26 d_in : in DcacheToLoadstore1Type;
27
-- Request to, and response from, the MMU
28 m_out : out Loadstore1ToMmuType;
29 m_in : in MmuToLoadstore1Type;
30
-- NOTE(review): not referenced by the architecture body in this file
31 dc_stall : in std_ulogic;
32
-- 10-bit log vector (see the l1_log generate block in the architecture)
33 log_out : out std_ulogic_vector(9 downto 0)
34 );
35 end loadstore1;
36
37 -- Note, we don't currently use the stall output from the dcache because
38 -- we know it can take two requests without stalling when idle, we are
39 -- its only user, and we know it never stalls when idle.
40
41 architecture behave of loadstore1 is
42
43 -- State machine for unaligned loads/stores
-- The declaration order is significant: the log process encodes the
-- current state as state_t'pos(r.state) in 3 bits, so reordering the
-- literals changes the logged values.
44 type state_t is (IDLE, -- ready for instruction
45 SECOND_REQ, -- send 2nd request of unaligned xfer
46 ACK_WAIT, -- waiting for ack from dcache
47 MMU_LOOKUP, -- waiting for MMU to look up translation
48 TLBIE_WAIT, -- waiting for MMU to finish doing a tlbie
49 COMPLETE -- extra cycle to complete an operation
50 );
51
-- Registered state of the load/store unit (signals r / rin).
-- Only state, busy and do_update are given values by the reset branch
-- of the register process; the rest are loaded from rin.
52 type reg_stage_t is record
53 -- latch most of the input request
54 load : std_ulogic; -- set for OP_LOAD requests
55 tlbie : std_ulogic; -- set for OP_TLBIE requests
56 dcbz : std_ulogic; -- set for OP_DCBZ requests
57 mfspr : std_ulogic; -- set for OP_MFSPR; defaults to '0' every cycle
58 addr : std_ulogic_vector(63 downto 0); -- lsu_sum latched when l_in.valid = '1'
59 store_data : std_ulogic_vector(63 downto 0); -- l_in.data after byte rotate/reverse
60 load_data : std_ulogic_vector(63 downto 0); -- permuted data saved from the first dword of a 2-dword load
61 write_reg : gpr_index_t; -- copied from l_in.write_reg
62 length : std_ulogic_vector(3 downto 0); -- copied from l_in.length
63 byte_reverse : std_ulogic; -- copied from l_in.byte_reverse
64 sign_extend : std_ulogic; -- copied from l_in.sign_extend
65 update : std_ulogic; -- copied from l_in.update
66 update_reg : gpr_index_t; -- copied from l_in.update_reg
67 xerc : xer_common_t; -- copied from l_in.xerc, passed to l_out.xerc
68 reserve : std_ulogic; -- copied from l_in.reserve
69 rc : std_ulogic; -- copied from l_in.rc
70 nc : std_ulogic; -- non-cacheable access
71 virt_mode : std_ulogic; -- copied from l_in.virt_mode
72 priv_mode : std_ulogic; -- copied from l_in.priv_mode
73 state : state_t; -- current state of the request state machine
74 dwords_done : std_ulogic; -- set when the first of two dcache acks has arrived
75 last_dword : std_ulogic; -- '1' on a new request; cleared in SECOND_REQ
76 first_bytes : std_ulogic_vector(7 downto 0); -- byte selects for the first dword
77 second_bytes : std_ulogic_vector(7 downto 0); -- byte selects for the second dword
78 dar : std_ulogic_vector(63 downto 0); -- written by mtspr or with the address on a data exception
79 dsisr : std_ulogic_vector(31 downto 0); -- written by mtspr or with fault bits on a data exception
80 instr_fault : std_ulogic; -- set for OP_FETCH_FAILED requests
81 sprval : std_ulogic_vector(63 downto 0); -- value latched for mfspr
82 busy : std_ulogic; -- an operation is in progress
83 wait_dcache : std_ulogic; -- busy may be cleared by d_in.valid
84 wait_mmu : std_ulogic; -- busy may be cleared by m_in.done
85 do_update : std_ulogic; -- write r.addr to update_reg in the following cycle
86 extra_cycle : std_ulogic; -- load with update: takes an extra COMPLETE cycle
87 end record;
88
-- One flag per byte lane (used for use_second in the load formatter)
89 type byte_sel_t is array(0 to 7) of std_ulogic;
-- 2-bit per-byte selector for the trim/sign-extend multiplexer
90 subtype byte_trim_t is std_ulogic_vector(1 downto 0);
91 type trim_ctl_t is array(0 to 7) of byte_trim_t;
92
-- r is the registered state, rin the next-state value computed combinationally
93 signal r, rin : reg_stage_t;
-- Sum of l_in.addr1 and l_in.addr2, forced to zero when l_in.valid /= '1'
94 signal lsu_sum : std_ulogic_vector(63 downto 0);
95
-- Translate a one-hot access size (1, 2, 4 or 8 bytes) into a
-- right-justified 8-bit byte-enable mask.  Any other encoding,
-- including metavalues, yields an all-zero mask.
function length_to_sel(length : in std_logic_vector(3 downto 0)) return std_ulogic_vector is
    -- Ascending range so the result has the same bounds as a returned
    -- string literal would have.
    variable sel : std_ulogic_vector(0 to 7) := "00000000";
begin
    if length = "0001" then
        sel := "00000001";
    elsif length = "0010" then
        sel := "00000011";
    elsif length = "0100" then
        sel := "00001111";
    elsif length = "1000" then
        sel := "11111111";
    end if;
    return sel;
end function length_to_sel;
112
-- Byte enables for a possibly unaligned transfer.
-- The 8-bit mask for the access size is zero-extended to 16 bits and
-- shifted left by the byte offset within the doubleword.  Bits 7..0
-- select bytes of the first transfer, bits 15..8 those of the second.
function xfer_data_sel(size : in std_logic_vector(3 downto 0);
                       address : in std_logic_vector(2 downto 0))
    return std_ulogic_vector is
    variable shamt : natural range 0 to 7;
    variable wide  : unsigned(15 downto 0);
begin
    shamt := to_integer(unsigned(address));
    wide  := resize(unsigned(length_to_sel(size)), 16);
    return std_ulogic_vector(wide sll shamt);
end function xfer_data_sel;
125
126 begin
-- First-cycle address generation: the sum of the two address operands,
-- forced to all zeros whenever there is no valid request.
with l_in.valid select lsu_sum <=
    std_ulogic_vector(unsigned(l_in.addr1) + unsigned(l_in.addr2)) when '1',
    (others => '0') when others;
129
-- State register.  On each rising clock edge either load the
-- combinationally computed next state, or, under reset, force the
-- state machine idle and clear the busy and pending-update flags.
-- The remaining fields of r are left untouched while rst is '1'.
loadstore1_0: process(clk)
begin
    if rising_edge(clk) then
        case rst is
            when '1' =>
                r.busy <= '0';
                r.do_update <= '0';
                r.state <= IDLE;
            when others =>
                r <= rin;
        end case;
    end if;
end process;
142
-- Combinational next-state and output logic of the load/store unit.
--
-- Every evaluation starts from the registered state r, then:
--   1. formats the data returned by the dcache (byte permute, trim and
--      sign-extend);
--   2. computes busy/done and advances the state machine for the
--      operation in flight;
--   3. if execute1 presents a new request (l_in.valid = '1'), latches it
--      and starts it in the same cycle;
--   4. drives the dcache, MMU, writeback and execute1 outputs;
--   5. hands the next register value to rin.
--
-- Statement order matters: later assignments to v and to the local
-- variables deliberately override earlier ones (for example the new
-- request handling overrides the state chosen by the done/exception
-- logic).
loadstore1_1: process(all)
    variable v : reg_stage_t;
    variable brev_lenm1 : unsigned(2 downto 0);
    variable byte_offset : unsigned(2 downto 0);
    variable j : integer;
    variable k : unsigned(2 downto 0);
    variable kk : unsigned(3 downto 0);
    variable long_sel : std_ulogic_vector(15 downto 0);
    variable byte_sel : std_ulogic_vector(7 downto 0);
    variable req : std_ulogic;
    variable busy : std_ulogic;
    variable addr : std_ulogic_vector(63 downto 0);
    variable maddr : std_ulogic_vector(63 downto 0);
    variable write_enable : std_ulogic;
    variable do_update : std_ulogic;
    variable done : std_ulogic;
    variable data_permuted : std_ulogic_vector(63 downto 0);
    variable data_trimmed : std_ulogic_vector(63 downto 0);
    variable use_second : byte_sel_t;
    variable trim_ctl : trim_ctl_t;
    variable negative : std_ulogic;
    variable sprn : std_ulogic_vector(9 downto 0);
    variable exception : std_ulogic;
    variable next_addr : std_ulogic_vector(63 downto 0);
    variable mmureq : std_ulogic;
    variable dsisr : std_ulogic_vector(31 downto 0);
    variable mmu_mtspr : std_ulogic;
begin
    -- Defaults for this evaluation
    v := r;
    req := '0';
    v.mfspr := '0';
    mmu_mtspr := '0';
    sprn := std_ulogic_vector(to_unsigned(decode_spr_num(l_in.insn), 10));
    dsisr := (others => '0');
    mmureq := '0';

    write_enable := '0';

    -- r.do_update requests an rA update writeback in this cycle; it is
    -- a one-shot, so clear it in the next state by default.
    do_update := r.do_update;
    v.do_update := '0';

    -- load data formatting
    byte_offset := unsigned(r.addr(2 downto 0));
    brev_lenm1 := "000";
    if r.byte_reverse = '1' then
        brev_lenm1 := unsigned(r.length(2 downto 0)) - 1;
    end if;

    -- shift and byte-reverse data bytes
    -- Result byte i is taken from dcache byte
    -- ((i xor brev_lenm1) + byte_offset) mod 8; the carry out of that
    -- addition (kk(3)) is recorded in use_second(i).
    for i in 0 to 7 loop
        kk := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
        use_second(i) := kk(3);
        j := to_integer(kk(2 downto 0)) * 8;
        data_permuted(i * 8 + 7 downto i * 8) := d_in.data(j + 7 downto j);
    end loop;

    -- Work out the sign bit for sign extension.
    -- Assumes we are not doing both sign extension and byte reversal,
    -- in that for unaligned loads crossing two dwords we end up
    -- using a bit from the second dword, whereas for a byte-reversed
    -- (i.e. big-endian) load the sign bit would be in the first dword.
    negative := (r.length(3) and data_permuted(63)) or
                (r.length(2) and data_permuted(31)) or
                (r.length(1) and data_permuted(15)) or
                (r.length(0) and data_permuted(7));

    -- trim and sign-extend
    -- Per-byte selector trim_ctl(i):
    --   "11" byte saved in r.load_data from the first dword
    --   "10" byte from the data currently returned by the dcache
    --   "01" sign-extension fill (0xFF)
    --   "00" zero fill
    for i in 0 to 7 loop
        if i < to_integer(unsigned(r.length)) then
            if r.dwords_done = '1' then
                trim_ctl(i) := '1' & not use_second(i);
            else
                trim_ctl(i) := "10";
            end if;
        else
            trim_ctl(i) := '0' & (negative and r.sign_extend);
        end if;
        case trim_ctl(i) is
            when "11" =>
                data_trimmed(i * 8 + 7 downto i * 8) := r.load_data(i * 8 + 7 downto i * 8);
            when "10" =>
                data_trimmed(i * 8 + 7 downto i * 8) := data_permuted(i * 8 + 7 downto i * 8);
            when "01" =>
                data_trimmed(i * 8 + 7 downto i * 8) := x"FF";
            when others =>
                data_trimmed(i * 8 + 7 downto i * 8) := x"00";
        end case;
    end loop;

    -- compute (addr + 8) & ~7 for the second doubleword when unaligned
    next_addr := std_ulogic_vector(unsigned(r.addr(63 downto 3)) + 1) & "000";

    -- Busy calculation.
    -- We need to minimize the delay from clock to busy valid because it
    -- gates the start of execution of the next instruction.
    busy := r.busy and not ((r.wait_dcache and d_in.valid) or (r.wait_mmu and m_in.done));
    v.busy := busy;

    -- An operation completes in the cycle in which the state machine is
    -- active and busy has dropped.
    done := '0';
    if r.state /= IDLE and busy = '0' then
        done := '1';
    end if;
    exception := '0';

    -- Address and byte enables for the in-flight operation: the second
    -- doubleword once the first is done or while sending the second
    -- request, otherwise the first.
    if r.dwords_done = '1' or r.state = SECOND_REQ then
        maddr := next_addr;
        byte_sel := r.second_bytes;
    else
        maddr := r.addr;
        byte_sel := r.first_bytes;
    end if;
    addr := maddr;

    case r.state is
        when IDLE =>

        when SECOND_REQ =>
            req := '1';
            v.state := ACK_WAIT;
            v.last_dword := '0';

        when ACK_WAIT =>
            if d_in.error = '1' then
                -- dcache will discard the second request if it
                -- gets an error on the 1st of two requests
                if d_in.cache_paradox = '1' then
                    -- signal an interrupt straight away
                    exception := '1';
                    dsisr(63 - 38) := not r.load;
                    -- XXX there is no architected bit for this
                    dsisr(63 - 35) := d_in.cache_paradox;
                else
                    -- Look up the translation for TLB miss
                    -- and also for permission error and RC error
                    -- in case the PTE has been updated.
                    mmureq := '1';
                    v.state := MMU_LOOKUP;
                end if;
            end if;
            if d_in.valid = '1' then
                if r.last_dword = '0' then
                    -- first of two acks: remember the permuted data and
                    -- keep waiting for the second
                    v.dwords_done := '1';
                    v.last_dword := '1';
                    if r.load = '1' then
                        v.load_data := data_permuted;
                    end if;
                else
                    write_enable := r.load;
                    if r.extra_cycle = '1' then
                        -- loads with rA update need an extra cycle
                        v.state := COMPLETE;
                        v.do_update := r.update;
                    else
                        -- stores write back rA update in this cycle
                        do_update := r.update;
                    end if;
                    v.busy := '0';
                end if;
            end if;
            -- r.wait_dcache gets set one cycle after we come into ACK_WAIT state,
            -- which is OK because the dcache always takes at least two cycles.
            v.wait_dcache := r.last_dword and not r.extra_cycle;

        when MMU_LOOKUP =>
            if m_in.done = '1' then
                if r.instr_fault = '0' then
                    -- retry the request now that the MMU has installed a TLB entry
                    req := '1';
                    if r.last_dword = '0' then
                        v.state := SECOND_REQ;
                    else
                        v.state := ACK_WAIT;
                    end if;
                end if;
            end if;
            if m_in.err = '1' then
                exception := '1';
                dsisr(63 - 33) := m_in.invalid;
                dsisr(63 - 36) := m_in.perm_error;
                dsisr(63 - 38) := not r.load;
                dsisr(63 - 44) := m_in.badtree;
                dsisr(63 - 45) := m_in.rc_error;
            end if;

        -- These two states take no action of their own; they are left
        -- through the done/exception logic below.
        when TLBIE_WAIT =>

        when COMPLETE =>

    end case;

    if done = '1' or exception = '1' then
        v.state := IDLE;
        v.busy := '0';
    end if;

    -- Note that l_in.valid is gated with busy inside execute1
    if l_in.valid = '1' then
        -- Latch the new request; this overrides the state chosen above.
        v.addr := lsu_sum;
        v.load := '0';
        v.dcbz := '0';
        v.tlbie := '0';
        v.instr_fault := '0';
        v.dwords_done := '0';
        v.last_dword := '1';
        v.write_reg := l_in.write_reg;
        v.length := l_in.length;
        v.byte_reverse := l_in.byte_reverse;
        v.sign_extend := l_in.sign_extend;
        v.update := l_in.update;
        v.update_reg := l_in.update_reg;
        v.xerc := l_in.xerc;
        v.reserve := l_in.reserve;
        v.rc := l_in.rc;
        v.nc := l_in.ci;
        v.virt_mode := l_in.virt_mode;
        v.priv_mode := l_in.priv_mode;
        v.wait_dcache := '0';
        v.wait_mmu := '0';
        v.do_update := '0';
        v.extra_cycle := '0';

        addr := lsu_sum;
        maddr := l_in.addr2;    -- address from RB for tlbie

        -- XXX Temporary hack.  Mark the op as non-cachable if the address
        -- is the form 0xc------- for a real-mode access.
        if lsu_sum(31 downto 28) = "1100" and l_in.virt_mode = '0' then
            v.nc := '1';
        end if;

        -- Do length_to_sel and work out if we are doing 2 dwords
        long_sel := xfer_data_sel(l_in.length, v.addr(2 downto 0));
        byte_sel := long_sel(7 downto 0);
        v.first_bytes := byte_sel;
        v.second_bytes := long_sel(15 downto 8);

        -- Do byte reversing and rotating for stores in the first cycle
        byte_offset := unsigned(lsu_sum(2 downto 0));
        brev_lenm1 := "000";
        if l_in.byte_reverse = '1' then
            brev_lenm1 := unsigned(l_in.length(2 downto 0)) - 1;
        end if;
        -- Source byte i goes to destination byte
        -- ((i xor brev_lenm1) + byte_offset) mod 8.
        for i in 0 to 7 loop
            k := (to_unsigned(i, 3) xor brev_lenm1) + byte_offset;
            j := to_integer(k) * 8;
            v.store_data(j + 7 downto j) := l_in.data(i * 8 + 7 downto i * 8);
        end loop;

        case l_in.op is
            when OP_STORE =>
                req := '1';
            when OP_LOAD =>
                req := '1';
                v.load := '1';
                -- Allow an extra cycle for RA update on loads
                v.extra_cycle := l_in.update;
            when OP_DCBZ =>
                req := '1';
                v.dcbz := '1';
            when OP_TLBIE =>
                mmureq := '1';
                v.tlbie := '1';
                v.state := TLBIE_WAIT;
                v.wait_mmu := '1';
            when OP_MFSPR =>
                v.mfspr := '1';
                -- partial decode on SPR number should be adequate given
                -- the restricted set that get sent down this path
                if sprn(9) = '0' and sprn(5) = '0' then
                    if sprn(0) = '0' then
                        v.sprval := x"00000000" & r.dsisr;
                    else
                        v.sprval := r.dar;
                    end if;
                else
                    -- reading one of the SPRs in the MMU
                    v.sprval := m_in.sprval;
                end if;
                v.state := COMPLETE;
            when OP_MTSPR =>
                if sprn(9) = '0' and sprn(5) = '0' then
                    if sprn(0) = '0' then
                        v.dsisr := l_in.data(31 downto 0);
                    else
                        v.dar := l_in.data;
                    end if;
                    v.state := COMPLETE;
                else
                    -- writing one of the SPRs in the MMU
                    mmu_mtspr := '1';
                    v.state := TLBIE_WAIT;
                    v.wait_mmu := '1';
                end if;
            when OP_FETCH_FAILED =>
                -- send it to the MMU to do the radix walk
                maddr := l_in.nia;
                v.instr_fault := '1';
                mmureq := '1';
                v.state := MMU_LOOKUP;
                v.wait_mmu := '1';
            when others =>
                assert false report "unknown op sent to loadstore1";
        end case;

        -- A dcache request needs a second transfer when any byte enable
        -- falls in the upper half of the 16-bit select.
        if req = '1' then
            if long_sel(15 downto 8) = "00000000" then
                v.state := ACK_WAIT;
            else
                v.state := SECOND_REQ;
            end if;
        end if;

        v.busy := req or mmureq or mmu_mtspr;
    end if;

    -- Update outputs to dcache
    d_out.valid <= req;
    d_out.load <= v.load;
    d_out.dcbz <= v.dcbz;
    d_out.nc <= v.nc;
    d_out.reserve <= v.reserve;
    d_out.addr <= addr;
    d_out.data <= v.store_data;
    d_out.byte_sel <= byte_sel;
    d_out.virt_mode <= v.virt_mode;
    d_out.priv_mode <= v.priv_mode;

    -- Update outputs to MMU
    -- NOTE(review): load and priv are taken from the registered state r
    -- while iside and tlbie use the next-state v, so for a request
    -- started in this cycle they reflect the previous operation.
    -- Confirm against the MMU that this is intended.
    m_out.valid <= mmureq;
    m_out.iside <= v.instr_fault;
    m_out.load <= r.load;
    m_out.priv <= r.priv_mode;
    m_out.tlbie <= v.tlbie;
    m_out.mtspr <= mmu_mtspr;
    m_out.sprn <= sprn;
    m_out.addr <= maddr;
    m_out.slbia <= l_in.insn(7);
    m_out.rs <= l_in.data;

    -- Update outputs to writeback
    -- Multiplex either cache data to the destination GPR or
    -- the address for the rA update.
    l_out.valid <= done;
    if r.mfspr = '1' then
        l_out.write_enable <= '1';
        l_out.write_reg <= r.write_reg;
        l_out.write_data <= r.sprval;
    elsif do_update = '1' then
        l_out.write_enable <= '1';
        l_out.write_reg <= r.update_reg;
        l_out.write_data <= r.addr;
    else
        l_out.write_enable <= write_enable;
        l_out.write_reg <= r.write_reg;
        l_out.write_data <= data_trimmed;
    end if;
    l_out.xerc <= r.xerc;
    l_out.rc <= r.rc and done;
    l_out.store_done <= d_in.store_done;

    -- update exception info back to execute1
    e_out.busy <= busy;
    e_out.exception <= exception;
    e_out.instr_fault <= r.instr_fault;
    e_out.invalid <= m_in.invalid;
    e_out.badtree <= m_in.badtree;
    e_out.perm_error <= m_in.perm_error;
    e_out.rc_error <= m_in.rc_error;
    e_out.segment_fault <= m_in.segerr;
    -- Data-side exceptions record the faulting address in DAR and,
    -- unless the MMU reports a segment error, the fault bits in DSISR.
    if exception = '1' and r.instr_fault = '0' then
        v.dar := addr;
        if m_in.segerr = '0' then
            v.dsisr := dsisr;
        end if;
    end if;

    -- Update registers
    rin <= v;

end process;
526
-- Optional log data collection.
-- When LOG_LENGTH > 0, register a 10-bit snapshot of the unit's status
-- every clock: busy, exception, writeback valid, MMU request valid,
-- dcache request valid, MMU done, dwords_done and the 3-bit position
-- of the current state in state_t.
l1_log: if LOG_LENGTH > 0 generate
    signal log_data : std_ulogic_vector(9 downto 0);
begin
    ls1_log: process(clk)
    begin
        if rising_edge(clk) then
            log_data <= e_out.busy &
                        e_out.exception &
                        l_out.valid &
                        m_out.valid &
                        d_out.valid &
                        m_in.done &
                        r.dwords_done &
                        std_ulogic_vector(to_unsigned(state_t'pos(r.state), 3));
        end if;
    end process;
    log_out <= log_data;
end generate;

-- With logging disabled, give log_out a defined driver so the port is
-- not left undriven ('U' in simulation, "no driver" in synthesis lint).
l1_nolog: if LOG_LENGTH = 0 generate
    log_out <= (others => '0');
end generate;
545
546 end;