constant SPR_SPRG3U : spr_num_t := 259;
constant SPR_HSPRG0 : spr_num_t := 304;
constant SPR_HSPRG1 : spr_num_t := 305;
+ constant SPR_PGTBL0 : spr_num_t := 720;  -- MMU SPR; 720 has bit 9 set, which loadstore1 tests to tell MMU SPRs from DAR/DSISR
-- GPR indices in the register file (GPR only)
subtype gpr_index_t is std_ulogic_vector(4 downto 0);
type Loadstore1ToMmuType is record  -- requests from loadstore1 to the MMU
valid : std_ulogic;  -- request strobe; the MMU only samples it in its IDLE state
tlbie : std_ulogic;  -- with valid: TLB invalidate request rather than a translation request
+ mtspr : std_ulogic;  -- write rs into an MMU SPR (raised by loadstore1 for mtspr when SPR number bit 9 is set)
+ sprn : std_ulogic_vector(3 downto 0);  -- low 4 bits of the SPR number being accessed
addr : std_ulogic_vector(63 downto 0);  -- address to translate, or address operand of the tlbie
rs : std_ulogic_vector(63 downto 0);  -- register operand: new SPR value for mtspr, forwarded as pte for tlbie
end record;
type MmuToLoadstore1Type is record  -- completion status and SPR read data from the MMU to loadstore1
- done : std_ulogic;
- error : std_ulogic;
+ done : std_ulogic;  -- MMU operation finished; invalid/badtree are sampled by loadstore1 when this is '1'
+ invalid : std_ulogic;  -- no translation found; loadstore1 copies this into DSISR bit (63 - 33)
+ badtree : std_ulogic;  -- tree walk failed (bad size field or dcache read error); copied into DSISR bit (63 - 44)
+ sprval : std_ulogic_vector(63 downto 0);  -- value of the MMU SPR being read (mfspr path)
end record;
type MmuToDcacheType is record  -- requests from the MMU to the dcache
valid : std_ulogic;  -- request strobe
tlbie : std_ulogic;  -- TLB invalidate request
+ tlbld : std_ulogic;  -- TLB load request: install pte for addr; when tlbie and tlbld are both '0' the request is a load
addr : std_ulogic_vector(63 downto 0);  -- tlbie/tlbld effective address, or the address of a page-table read
pte : std_ulogic_vector(63 downto 0);  -- entry to install for tlbld; zero for page-table reads
end record;
type DcacheToMmuType is record  -- responses from the dcache to the MMU
stall : std_ulogic;
done : std_ulogic;  -- the MMU's tlbie, TLB load or page-table read has completed
+ err : std_ulogic;  -- with done: the MMU's read completed with an error
+ data : std_ulogic_vector(63 downto 0);  -- doubleword read for the MMU; the MMU byte-swaps it before use
end record;
type Loadstore1ToWritebackType is record
type reg_stage_0_t is record  -- stage-0 latch of the incoming request (from loadstore1 or the MMU)
req : Loadstore1ToDcacheType;
tlbie : std_ulogic;  -- latched from m_in.tlbie; '0' for loadstore1 requests
+ tlbld : std_ulogic;  -- latched from m_in.tlbld; '0' for loadstore1 requests
+ mmu_req : std_ulogic; -- source of the request: '1' = MMU, '0' = loadstore1
end record;
signal r0 : reg_stage_0_t;
type reg_stage_1_t is record
-- Latch the complete request from ls1
req : Loadstore1ToDcacheType;
+ mmu_req : std_ulogic;
-- Cache hit state
hit_way : way_t;
"request collision loadstore vs MMU";
if m_in.valid = '1' then
r0.req.valid <= '1';
- r0.req.load <= '0';
+ r0.req.load <= not (m_in.tlbie or m_in.tlbld);
r0.req.dcbz <= '0';
r0.req.nc <= '0';
r0.req.reserve <= '0';
r0.req.data <= m_in.pte;
r0.req.byte_sel <= (others => '1');
r0.tlbie <= m_in.tlbie;
- assert m_in.tlbie = '1' report "unknown request from MMU";
+ r0.tlbld <= m_in.tlbld;
+ r0.mmu_req <= '1';
else
r0.req <= d_in;
r0.tlbie <= '0';
+ r0.tlbld <= '0';
+ r0.mmu_req <= '0';
end if;
end if;
end if;
end loop;
tlb_hit <= hit and r0_valid;
tlb_hit_way <= hitway;
- pte <= read_tlb_pte(hitway, tlb_pte_way);
+ if tlb_hit = '1' then
+ pte <= read_tlb_pte(hitway, tlb_pte_way);
+ else
+ pte <= (others => '0');
+ end if;
valid_ra <= tlb_hit or not r0.req.virt_mode;
if r0.req.virt_mode = '1' then
ra <= pte(REAL_ADDR_BITS - 1 downto TLB_LG_PGSZ) &
if rising_edge(clk) then
tlbie := '0';
tlbia := '0';
- tlbwe := '0';
+ tlbwe := r0_valid and r0.tlbld;
if r0_valid = '1' and r0.tlbie = '1' then
if r0.req.addr(11 downto 10) /= "00" then
tlbia := '1';
dtlb_ptes(tlb_req_index) <= pteset;
dtlb_valids(tlb_req_index)(repl_way) <= '1';
end if;
- m_out.done <= r0_valid and r0.tlbie;
end if;
end process;
req_tag <= get_tag(ra);
-- Only do anything if not being stalled by stage 1
- go := r0_valid and not r0.tlbie;
+ go := r0_valid and not (r0.tlbie or r0.tlbld);
-- Calculate address of beginning of cache line, will be
-- used for cache miss processing if needed
d_out.perm_error <= '0';
d_out.rc_error <= '0';
+ -- Outputs to MMU
+ m_out.done <= r1.tlbie_done;
+ m_out.err <= '0';
+ m_out.data <= cache_out(r1.hit_way);
+
-- We have a valid load or store hit or we just completed a slow
-- op such as a load miss, a NC load or a store
--
"unexpected hit_load_delayed collision with slow_valid"
severity FAILURE;
- -- Load hit case is the standard path
- if r1.hit_load_valid = '1' then
- report "completing load hit";
- d_out.valid <= '1';
- end if;
+ if r1.mmu_req = '0' then
+ -- Request came from loadstore1...
+ -- Load hit case is the standard path
+ if r1.hit_load_valid = '1' then
+ report "completing load hit";
+ d_out.valid <= '1';
+ end if;
- -- error cases complete without stalling
- if r1.error_done = '1' then
- report "completing ld/st with error";
- d_out.error <= '1';
- d_out.tlb_miss <= r1.tlb_miss;
- d_out.perm_error <= r1.perm_error;
- d_out.rc_error <= r1.rc_error;
- d_out.valid <= '1';
- end if;
+ -- error cases complete without stalling
+ if r1.error_done = '1' then
+ report "completing ld/st with error";
+ d_out.error <= '1';
+ d_out.tlb_miss <= r1.tlb_miss;
+ d_out.perm_error <= r1.perm_error;
+ d_out.rc_error <= r1.rc_error;
+ d_out.valid <= '1';
+ end if;
- -- Slow ops (load miss, NC, stores)
- if r1.slow_valid = '1' then
- -- If it's a load, enable register writeback and switch
- -- mux accordingly
- --
- if r1.req.load then
- -- Read data comes from the slow data latch
- d_out.data <= r1.slow_data;
- end if;
- d_out.store_done <= '1';
+ -- Slow ops (load miss, NC, stores)
+ if r1.slow_valid = '1' then
+ -- If it's a load, enable register writeback and switch
+ -- mux accordingly
+ --
+ if r1.req.load then
+ -- Read data comes from the slow data latch
+ d_out.data <= r1.slow_data;
+ end if;
+ d_out.store_done <= '1';
- report "completing store or load miss";
- d_out.valid <= '1';
- end if;
+ report "completing store or load miss";
+ d_out.valid <= '1';
+ end if;
+
+ if r1.stcx_fail = '1' then
+ d_out.store_done <= '0';
+ d_out.valid <= '1';
+ end if;
+
+ else
+ -- Request came from MMU
+ if r1.hit_load_valid = '1' then
+ report "completing load hit to MMU, data=" & to_hstring(m_out.data);
+ m_out.done <= '1';
+ end if;
- if r1.stcx_fail = '1' then
- d_out.store_done <= '0';
- d_out.valid <= '1';
+ -- error cases complete without stalling
+ if r1.error_done = '1' then
+ report "completing MMU ld with error";
+ m_out.err <= '1';
+ m_out.done <= '1';
+ end if;
+
+ -- Slow ops (i.e. load miss)
+ if r1.slow_valid = '1' then
+ -- Read data comes from the slow data latch
+ m_out.data <= r1.slow_data;
+ report "completing MMU load miss, data=" & to_hstring(m_out.data);
+ m_out.done <= '1';
+ end if;
end if;
end process;
if req_op /= OP_NONE and stall_out = '0' then
r1.req <= r0.req;
+ r1.mmu_req <= r0.mmu_req;
report "op:" & op_t'image(req_op) &
" addr:" & to_hstring(r0.req.addr) &
" nc:" & std_ulogic'image(r0.req.nc) &
end if;
if req_op = OP_BAD then
- report "Signalling ld/st error valid_ra=" & " rc_ok=" & std_ulogic'image(rc_ok) &
- " perm_ok=" & std_ulogic'image(perm_ok);
+ report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) &
+ " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok);
r1.error_done <= '1';
r1.tlb_miss <= not valid_ra;
r1.perm_error <= valid_ra and not perm_ok;
r1.error_done <= '0';
end if;
- -- complete tlbies in the third cycle
- r1.tlbie_done <= r0_valid and r0.tlbie;
+ -- complete tlbies and TLB loads in the third cycle
+ r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld);
end if;
end process;
v.decode.sgl_pipe := '1';
-- send MMU-related SPRs to loadstore1
case sprn is
- when SPR_DAR | SPR_DSISR =>
+ when SPR_DAR | SPR_DSISR | SPR_PGTBL0 =>
v.decode.unit := LDST;
when others =>
end case;
variable next_addr : std_ulogic_vector(63 downto 0);
variable mmureq : std_ulogic;
variable dsisr : std_ulogic_vector(31 downto 0);
+ variable mmu_mtspr : std_ulogic;
begin
v := r;
req := '0';
byte_sel := (others => '0');
addr := lsu_sum;
mfspr := '0';
+ mmu_mtspr := '0';
+ sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10));
sprval := (others => '0'); -- avoid inferred latches
exception := '0';
dsisr := (others => '0');
mfspr := '1';
-- partial decode on SPR number should be adequate given
-- the restricted set that get sent down this path
- sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10));
- if sprn(0) = '0' then
- sprval := x"00000000" & r.dsisr;
+ if sprn(9) = '0' then
+ if sprn(0) = '0' then
+ sprval := x"00000000" & r.dsisr;
+ else
+ sprval := r.dar;
+ end if;
else
- sprval := r.dar;
+ -- reading one of the SPRs in the MMU
+ sprval := m_in.sprval;
end if;
when OP_MTSPR =>
done := '1';
- sprn := std_ulogic_vector(to_unsigned(l_in.spr_num, 10));
- if sprn(0) = '0' then
- v.dsisr := l_in.data(31 downto 0);
+ if sprn(9) = '0' then
+ if sprn(0) = '0' then
+ v.dsisr := l_in.data(31 downto 0);
+ else
+ v.dar := l_in.data;
+ end if;
else
- v.dar := l_in.data;
+ -- writing one of the SPRs in the MMU
+ mmu_mtspr := '1';
end if;
when others =>
assert false report "unknown op sent to loadstore1";
byte_sel := r.first_bytes;
end if;
if m_in.done = '1' then
- if m_in.error = '0' then
+ if m_in.invalid = '0' and m_in.badtree = '0' then
-- retry the request now that the MMU has installed a TLB entry
req := '1';
if r.state = MMU_LOOKUP_1ST then
end if;
else
exception := '1';
- dsisr(63 - 33) := '1';
+ dsisr(63 - 33) := m_in.invalid;
dsisr(63 - 38) := not r.load;
+ dsisr(63 - 44) := m_in.badtree;
v.state := IDLE;
end if;
end if;
-- Update outputs to MMU
m_out.valid <= mmureq;
m_out.tlbie <= v.tlbie;
+ m_out.mtspr <= mmu_mtspr;
+ m_out.sprn <= sprn(3 downto 0);
m_out.addr <= addr;
m_out.rs <= l_in.data;
architecture behave of mmu is
type state_t is (IDLE,
- TLBIE_WAIT,
- RADIX_LOOKUP_0
+ TLB_WAIT,  -- waiting for the dcache to complete a tlbie or a TLB load
+ RADIX_LOOKUP,  -- issue a read of the next tree entry to the dcache
+ RADIX_READ_WAIT,  -- waiting for that read; then decode valid/leaf bits
+ RADIX_LOAD_TLB,  -- leaf entry found: send the TLB load to the dcache
+ RADIX_NO_TRANS,  -- complete with invalid = '1'
+ RADIX_BAD_TREE  -- complete with badtree = '1'
);
type reg_stage_t is record
-- latched request from loadstore1
valid : std_ulogic;  -- pulsed for one cycle when a translation request is accepted (used for the trace message)
addr : std_ulogic_vector(63 downto 0);  -- address captured when a request is accepted in IDLE
+ -- internal state
state : state_t;
+ pgtbl0 : std_ulogic_vector(63 downto 0);  -- SPR written via mtspr (l_in.rs) and returned on l_out.sprval
+ shift : unsigned(5 downto 0);  -- address bits still to be translated below the current level; drives addrshifter and finalmask
+ mask_size : unsigned(4 downto 0);  -- number of index bits at the current tree level
+ pgbase : std_ulogic_vector(55 downto 0);  -- base address of the current tree level (low 8 bits forced to zero)
+ pde : std_ulogic_vector(63 downto 0);  -- last tree entry read, already byte-swapped
end record;
signal r, rin : reg_stage_t;
+ signal addrsh : std_ulogic_vector(15 downto 0);
+ signal mask : std_ulogic_vector(15 downto 0);
+ signal finalmask : std_ulogic_vector(43 downto 0);
+
begin
+ -- Multiplex internal SPR values back to loadstore1, selected
+ -- by l_in.sprn. Easy when there's only one...
+ l_out.sprval <= r.pgtbl0;
mmu_0: process(clk)  -- state register update plus simulation trace messages
begin  -- NOTE(review): no rising_edge(clk) test is visible in this excerpt, though the end-if count implies one; confirm against the full file
if rst = '1' then
r.state <= IDLE;
r.valid <= '0';
+ r.pgtbl0 <= (others => '0');  -- all-zero pgtbl0 gives mbits = 0, which mmu_1 treats as "no translation"
else
if rin.valid = '1' then
report "MMU got tlb miss for " & to_hstring(rin.addr);
end if;
if l_out.done = '1' then
- report "MMU completing miss with error=" & std_ulogic'image(l_out.error);
+ report "MMU completing op with invalid=" & std_ulogic'image(l_out.invalid) &
+ " badtree=" & std_ulogic'image(l_out.badtree);
+ end if;
+ if rin.state = RADIX_LOOKUP then  -- about to enter RADIX_LOOKUP: trace the parameters for the next level
+ report "radix lookup shift=" & integer'image(to_integer(rin.shift)) &
+ " msize=" & integer'image(to_integer(rin.mask_size));
+ end if;
+ if r.state = RADIX_LOOKUP then  -- currently in RADIX_LOOKUP: trace the read being sent to the dcache
+ report "send load addr=" & to_hstring(d_out.addr) &
+ " addrsh=" & to_hstring(addrsh) & " mask=" & to_hstring(mask);
end if;
r <= rin;  -- commit the next-state values computed by mmu_1
end if;
end if;
end process;
+ -- Shift address bits 61 downto 12 right by r.shift (0 to 47) bits and
+ -- supply the least significant 16 bits of the result on addrsh.
+ addrshifter: process(all)
+ variable sh1 : std_ulogic_vector(30 downto 0);  -- after the coarse stage (shift by 0, 16 or 32)
+ variable sh2 : std_ulogic_vector(18 downto 0);  -- after the middle stage (further shift by 0, 4, 8 or 12)
+ variable result : std_ulogic_vector(15 downto 0);  -- after the fine stage (further shift by 0 to 3)
+ begin
+ case r.shift(5 downto 4) is  -- coarse stage, selected by shift bits 5:4
+ when "00" =>
+ sh1 := r.addr(42 downto 12);  -- shift by 0
+ when "01" =>
+ sh1 := r.addr(58 downto 28);  -- shift by 16
+ when others =>
+ sh1 := "0000000000000" & r.addr(61 downto 44);  -- shift by 32, zero-filled above bit 61; "11" is handled the same as "10"
+ end case;
+ case r.shift(3 downto 2) is  -- middle stage, selected by shift bits 3:2
+ when "00" =>
+ sh2 := sh1(18 downto 0);  -- shift by 0
+ when "01" =>
+ sh2 := sh1(22 downto 4);  -- shift by 4
+ when "10" =>
+ sh2 := sh1(26 downto 8);  -- shift by 8
+ when others =>
+ sh2 := sh1(30 downto 12);  -- shift by 12
+ end case;
+ case r.shift(1 downto 0) is  -- fine stage, selected by shift bits 1:0
+ when "00" =>
+ result := sh2(15 downto 0);  -- shift by 0
+ when "01" =>
+ result := sh2(16 downto 1);  -- shift by 1
+ when "10" =>
+ result := sh2(17 downto 2);  -- shift by 2
+ when others =>
+ result := sh2(18 downto 3);  -- shift by 3
+ end case;
+ addrsh <= result;
+ end process;
+
+ -- generate mask for extracting address fields for PTE address generation:
+ addrmaskgen: process(all)  -- mask gets its low r.mask_size bits set, with a minimum of 5
+ variable m : std_ulogic_vector(15 downto 0);
+ begin
+ -- mask_size has to be >= 5, so the low 5 bits are always set
+ m := x"001f";
+ for i in 5 to 15 loop
+ if i < to_integer(r.mask_size) then  -- set bit i only when it lies below mask_size
+ m(i) := '1';
+ end if;
+ end loop;
+ mask <= m;
+ end process;
+
+ -- generate mask for extracting address bits to go in TLB entry
+ -- in order to support pages > 4kB: the low r.shift bits of finalmask are set
+ finalmaskgen: process(all)
+ variable m : std_ulogic_vector(43 downto 0);
+ begin
+ m := (others => '0');
+ for i in 0 to 43 loop
+ if i < to_integer(r.shift) then  -- r.shift values above 43 simply give an all-ones mask
+ m(i) := '1';
+ end if;
+ end loop;
+ finalmask <= m;
+ end process;
+
mmu_1: process(all)  -- combinational next-state and output logic for the MMU state machine
variable v : reg_stage_t;
variable dcreq : std_ulogic;
variable done : std_ulogic;
- variable err : std_ulogic;
+ variable invalid : std_ulogic;  -- drives l_out.invalid
+ variable badtree : std_ulogic;  -- drives l_out.badtree
+ variable tlb_load : std_ulogic;  -- drives d_out.tlbld
+ variable tlbie_req : std_ulogic;  -- drives d_out.tlbie
+ variable rts : unsigned(5 downto 0);
+ variable mbits : unsigned(5 downto 0);
+ variable pgtable_addr : std_ulogic_vector(63 downto 0);
+ variable pte : std_ulogic_vector(63 downto 0);
+ variable data : std_ulogic_vector(63 downto 0);  -- d_in.data with its bytes reversed
begin
- v.valid := l_in.valid;
- v.addr := l_in.addr;
- v.state := r.state;
+ v := r;  -- default: hold all registered state
+ v.valid := '0';  -- valid is a one-cycle pulse, set below only when a translation request is accepted
dcreq := '0';
done := '0';
- err := '0';
+ invalid := '0';
+ badtree := '0';
+ tlb_load := '0';
+ tlbie_req := '0';
+
+ -- Radix tree data structures in memory are big-endian,
+ -- so we need to byte-swap them
+ for i in 0 to 7 loop
+ data(i * 8 + 7 downto i * 8) := d_in.data((7 - i) * 8 + 7 downto (7 - i) * 8);
+ end loop;
case r.state is
when IDLE =>
+ -- rts == radix tree size, # address bits being translated
+ rts := unsigned('0' & r.pgtbl0(62 downto 61) & r.pgtbl0(7 downto 5)) + (31 - 12);
+ -- mbits == # address bits to index top level of tree
+ mbits := unsigned('0' & r.pgtbl0(4 downto 0));
+ v.shift := rts - mbits;  -- recomputed from pgtbl0 every idle cycle so it is ready when a request arrives
+ v.mask_size := mbits(4 downto 0);
+ v.pgbase := r.pgtbl0(55 downto 8) & x"00";
+
if l_in.valid = '1' then
+ v.addr := l_in.addr;
if l_in.tlbie = '1' then
dcreq := '1';
- v.state := TLBIE_WAIT;
+ tlbie_req := '1';
+ v.state := TLB_WAIT;
else
- v.state := RADIX_LOOKUP_0;
+ v.valid := '1';
+ -- for now, take RPDS = 0 to disable radix translation
+ if mbits = 0 then
+ v.state := RADIX_NO_TRANS;
+ elsif mbits < 5 or mbits > 16 or mbits > rts then  -- index width outside 5..16 or wider than the translated range
+ v.state := RADIX_BAD_TREE;
+ else
+ v.state := RADIX_LOOKUP;
+ end if;
end if;
end if;
+ if l_in.mtspr = '1' then  -- SPR writes are only accepted in IDLE; l_in.sprn is not decoded here
+ v.pgtbl0 := l_in.rs;
+ end if;
- when TLBIE_WAIT =>
+ when TLB_WAIT =>
if d_in.done = '1' then
done := '1';
v.state := IDLE;
end if;
- when RADIX_LOOKUP_0 =>
+ when RADIX_LOOKUP =>
+ dcreq := '1';  -- read request; the address is pgtable_addr (selected below)
+ v.state := RADIX_READ_WAIT;
+
+ when RADIX_READ_WAIT =>
+ if d_in.done = '1' then
+ if d_in.err = '0' then
+ v.pde := data;
+ -- test valid bit
+ if data(63) = '1' then
+ -- test leaf bit
+ if data(62) = '1' then
+ v.state := RADIX_LOAD_TLB;
+ else
+ mbits := unsigned('0' & data(4 downto 0));  -- index width of the next level, from the entry just read
+ if mbits < 5 or mbits > 16 or mbits > r.shift then
+ v.state := RADIX_BAD_TREE;
+ else
+ v.shift := v.shift - mbits;  -- descend one level: fewer address bits remain below it
+ v.mask_size := mbits(4 downto 0);
+ v.pgbase := data(55 downto 8) & x"00";
+ v.state := RADIX_LOOKUP;
+ end if;
+ end if;
+ else
+ -- non-present PTE, generate a DSI
+ v.state := RADIX_NO_TRANS;
+ end if;
+ else
+ v.state := RADIX_BAD_TREE;  -- dcache reported an error on the tree read
+ end if;
+ end if;
+
+ when RADIX_LOAD_TLB =>
+ tlb_load := '1';
+ dcreq := '1';
+ v.state := TLB_WAIT;  -- completion is signalled to loadstore1 from TLB_WAIT
+
+ when RADIX_NO_TRANS =>
+ done := '1';
+ invalid := '1';
+ v.state := IDLE;
+
+ when RADIX_BAD_TREE =>
done := '1';
- err := '1';
+ badtree := '1';
v.state := IDLE;
end case;
+ pgtable_addr := x"00" & r.pgbase(55 downto 19) &  -- address of the 8-byte entry to read:
+ ((r.pgbase(18 downto 3) and not mask) or (addrsh and mask)) &  -- index bits from the shifted address replace the low pgbase bits
+ "000";
+ pte := x"00" &  -- TLB entry: the PDE, with the low r.shift page-number bits
+ ((r.pde(55 downto 12) and not finalmask) or (r.addr(55 downto 12) and finalmask))  -- taken from the request address instead
+ & r.pde(11 downto 0);
+
-- update registers
rin <= v;
-- drive outputs
l_out.done <= done;
- l_out.error <= err;
+ l_out.invalid <= invalid;
+ l_out.badtree <= badtree;
d_out.valid <= dcreq;
- d_out.tlbie <= l_in.tlbie;
- d_out.addr <= l_in.addr;
- d_out.pte <= l_in.rs;
+ d_out.tlbie <= tlbie_req;
+ d_out.tlbld <= tlb_load;
+ if tlbie_req = '1' then  -- tlbie is forwarded in the same cycle, so use the live l_in values
+ d_out.addr <= l_in.addr;
+ d_out.pte <= l_in.rs;
+ elsif tlb_load = '1' then
+ d_out.addr <= r.addr(63 downto 12) & x"000";  -- latched request address with the low 12 bits cleared
+ d_out.pte <= pte;
+ else
+ d_out.addr <= pgtable_addr;  -- default: tree-read address (only meaningful when dcreq = '1')
+ d_out.pte <= (others => '0');
+ end if;
end process;
end;