dcache: Split PLRU into storage and logic
author Paul Mackerras <paulus@ozlabs.org>
Wed, 7 Sep 2022 10:21:42 +0000 (20:21 +1000)
committer Paul Mackerras <paulus@ozlabs.org>
Thu, 8 Sep 2022 09:28:10 +0000 (19:28 +1000)
Rather than having update and decode logic for each individual PLRU
as well as a register to store the current PLRU state, we now put the
PLRU state in a little RAM, which will typically use LUT RAM on FPGAs,
and have just a single copy of the logic to calculate the pseudo-LRU
way and to update the PLRU state.

The PLRU RAM that applies to the data storage (as opposed to the TLB)
is read asynchronously in the cycle after the cache tag matching is
done.  At the end of that cycle the PLRU RAM entry is updated if the
access was a cache hit, or a victim way is calculated and stored if
the access was a cache miss.  It is possible that a cache miss doesn't
start being handled until later, in which case the stored victim way
is used later when the miss gets handled.

Similarly for the TLB PLRU, the RAM is read asynchronously in the
cycle after a TLB lookup is done, and either updated at the end of
that cycle (for a hit), or a victim is chosen and stored for when the
TLB miss is satisfied.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
dcache.vhdl

index a29cf6f6c37c6036b251231d41e9920fa5708448..75c2ce00105bc313b42d62f31edfa7aec83d7dac 100644 (file)
@@ -317,6 +317,7 @@ architecture rtl of dcache is
         tlb_hit          : std_ulogic;
         tlb_hit_way      : tlb_way_sig_t;
         tlb_hit_index    : tlb_index_sig_t;
+        tlb_victim       : tlb_way_sig_t;
 
        -- data buffer for data forwarded from writes to reads
        forward_data     : std_ulogic_vector(63 downto 0);
@@ -342,6 +343,8 @@ architecture rtl of dcache is
         acks_pending     : unsigned(2 downto 0);
         inc_acks         : std_ulogic;
         dec_acks         : std_ulogic;
+        choose_victim    : std_ulogic;
+        victim_way       : way_t;
 
         -- Signals to complete (possibly with error)
         ls_valid         : std_ulogic;
@@ -398,8 +401,7 @@ architecture rtl of dcache is
     signal ram_wr_select : std_ulogic_vector(ROW_SIZE - 1 downto 0);
 
     -- PLRU output interface
-    type plru_out_t is array(0 to NUM_LINES-1) of std_ulogic_vector(WAY_BITS-1 downto 0);
-    signal plru_victim : plru_out_t;
+    signal plru_victim : way_t;
     signal replace_way : way_t;
 
     -- Wishbone read/write/cache write formatting signals
@@ -423,8 +425,7 @@ architecture rtl of dcache is
     signal tlb_miss : std_ulogic;
 
     -- TLB PLRU output interface
-    type tlb_plru_out_t is array(tlb_index_t) of std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
-    signal tlb_plru_victim : tlb_plru_out_t;
+    signal tlb_plru_victim : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
 
     signal snoop_tag_set : cache_tags_set_t;
     signal snoop_valid   : std_ulogic;
@@ -650,39 +651,49 @@ begin
     end process;
 
     -- Generate TLB PLRUs
-    maybe_tlb_plrus: if TLB_NUM_WAYS > 1 generate
+    maybe_tlb_plrus : if TLB_NUM_WAYS > 1 generate
+        type tlb_plru_array is array(tlb_index_t) of std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
+        signal tlb_plru_ram    : tlb_plru_array;
+        signal tlb_plru_cur    : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
+        signal tlb_plru_upd    : std_ulogic_vector(TLB_NUM_WAYS - 2 downto 0);
+        signal tlb_plru_acc    : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
+        signal tlb_plru_out    : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
     begin
-       tlb_plrus: for i in 0 to TLB_SET_SIZE - 1 generate
-           -- TLB PLRU interface
-           signal tlb_plru_acc    : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
-           signal tlb_plru_acc_en : std_ulogic;
-           signal tlb_plru_out    : std_ulogic_vector(TLB_WAY_BITS-1 downto 0);
-       begin
-           tlb_plru : entity work.plru
-               generic map (
-                   BITS => TLB_WAY_BITS
-                   )
-               port map (
-                   clk => clk,
-                   rst => rst,
-                   acc => tlb_plru_acc,
-                   acc_en => tlb_plru_acc_en,
-                   lru => tlb_plru_out
-                   );
-
-           process(all)
-           begin
-               -- PLRU interface
-               if not is_X(r1.tlb_hit_index) and r1.tlb_hit_index = i then
-                   tlb_plru_acc_en <= r1.tlb_hit;
-                    assert not is_X(r1.tlb_hit_way);
-               else
-                   tlb_plru_acc_en <= '0';
-               end if;
-               tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way);
-               tlb_plru_victim(i) <= tlb_plru_out;
-           end process;
-       end generate;
+        tlb_plru : entity work.plrufn
+            generic map (
+                BITS => TLB_WAY_BITS
+                )
+            port map (
+                acc      => tlb_plru_acc,
+                tree_in  => tlb_plru_cur,
+                tree_out => tlb_plru_upd,
+                lru      => tlb_plru_out
+                );
+
+        process(all)
+        begin
+            -- Read PLRU bits from array
+            if is_X(r1.tlb_hit_index) then
+                tlb_plru_cur <= (others => 'X');
+            else
+                tlb_plru_cur <= tlb_plru_ram(to_integer(r1.tlb_hit_index));
+            end if;
+
+            -- PLRU interface
+            tlb_plru_acc <= std_ulogic_vector(r1.tlb_hit_way);
+            tlb_plru_victim <= tlb_plru_out;
+        end process;
+
+        -- synchronous writes to TLB PLRU array
+        process(clk)
+        begin
+            if rising_edge(clk) then
+                if r1.tlb_hit = '1' then
+                    assert not is_X(r1.tlb_hit_index) severity failure;
+                    tlb_plru_ram(to_integer(r1.tlb_hit_index)) <= tlb_plru_upd;
+                end if;
+            end if;
+        end process;
     end generate;
 
     tlb_search : process(all)
@@ -753,7 +764,7 @@ begin
                     if tlb_hit = '1' then
                         repl_way := tlb_hit_way;
                     else
-                        repl_way := unsigned(tlb_plru_victim(to_integer(tlb_req_index)));
+                        repl_way := unsigned(r1.tlb_victim);
                     end if;
                     assert not is_X(repl_way);
                 end if;
@@ -770,39 +781,49 @@ begin
     end process;
 
     -- Generate PLRUs
-    maybe_plrus: if NUM_WAYS > 1 generate
+    maybe_plrus : if NUM_WAYS > 1 generate
+        type plru_array is array(0 to NUM_LINES-1) of std_ulogic_vector(NUM_WAYS - 2 downto 0);
+        signal plru_ram    : plru_array;
+        signal plru_cur    : std_ulogic_vector(NUM_WAYS - 2 downto 0);
+        signal plru_upd    : std_ulogic_vector(NUM_WAYS - 2 downto 0);
+        signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
+        signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
     begin
-       plrus: for i in 0 to NUM_LINES-1 generate
-           -- PLRU interface
-           signal plru_acc    : std_ulogic_vector(WAY_BITS-1 downto 0);
-           signal plru_acc_en : std_ulogic;
-           signal plru_out    : std_ulogic_vector(WAY_BITS-1 downto 0);
-           
-       begin
-           plru : entity work.plru
-               generic map (
-                   BITS => WAY_BITS
-                   )
-               port map (
-                   clk => clk,
-                   rst => rst,
-                   acc => plru_acc,
-                   acc_en => plru_acc_en,
-                   lru => plru_out
-                   );
-
-           process(all)
-           begin
-               -- PLRU interface
-               if not is_X(r1.hit_index) and r1.hit_index = to_unsigned(i, INDEX_BITS) then
-                   plru_acc_en <= r1.cache_hit;
-               else
-                   plru_acc_en <= '0';
-               end if;
-               plru_acc <= std_ulogic_vector(r1.hit_way);
-               plru_victim(i) <= plru_out;
-           end process;
-       end generate;
+        plru : entity work.plrufn
+            generic map (
+                BITS => WAY_BITS
+                )
+            port map (
+                acc      => plru_acc,
+                tree_in  => plru_cur,
+                tree_out => plru_upd,
+                lru      => plru_out
+                );
+
+        process(all)
+        begin
+            -- Read PLRU bits from array
+            if is_X(r1.hit_index) then
+                plru_cur <= (others => 'X');
+            else
+                plru_cur <= plru_ram(to_integer(r1.hit_index));
+            end if;
+
+            -- PLRU interface
+            plru_acc <= std_ulogic_vector(r1.hit_way);
+            plru_victim <= unsigned(plru_out);
+        end process;
+
+        -- synchronous writes to PLRU array
+        process(clk)
+        begin
+            if rising_edge(clk) then
+                if r1.cache_hit = '1' then
+                    assert not is_X(r1.hit_index) severity failure;
+                    plru_ram(to_integer(r1.hit_index)) <= plru_upd;
+                end if;
+            end if;
+        end process;
     end generate;
 
     -- Cache tag RAM read port
@@ -980,8 +1001,13 @@ begin
         replace_way <= to_unsigned(0, WAY_BITS);
         if NUM_WAYS > 1 then
             if r1.write_tag = '1' then
-                assert not is_X(r1.store_index);
-                replace_way <= unsigned(plru_victim(to_integer(r1.store_index)));
+                if r1.choose_victim = '1' then
+                    replace_way <= plru_victim;
+                else
+                    -- Cache victim way was chosen earlier,
+                    -- in the cycle after the miss was detected.
+                    replace_way <= r1.victim_way;
+                end if;
             else
                 replace_way <= r1.store_way;
             end if;
@@ -1305,8 +1331,6 @@ begin
             end if;
 
             -- Fast path for load/store hits. Set signals for the writeback controls.
-            r1.hit_way <= req_hit_way;
-            r1.hit_index <= req_index;
            if req_op = OP_LOAD_HIT then
                r1.hit_load_valid <= '1';
            else
@@ -1340,6 +1364,11 @@ begin
             r1.tlb_hit <= tlb_hit;
             r1.tlb_hit_way <= tlb_hit_way;
             r1.tlb_hit_index <= tlb_req_index;
+            -- determine victim way in the TLB in the cycle after
+            -- we detect the TLB miss
+            if r1.ls_error = '1' then
+                r1.tlb_victim <= unsigned(tlb_plru_victim);
+            end if;
 
        end if;
     end process;
@@ -1364,6 +1393,7 @@ begin
             ev.load_miss <= '0';
             ev.store_miss <= '0';
             ev.dtlb_miss <= tlb_miss;
+            r1.choose_victim <= '0';
 
            -- On reset, clear all valid bits to force misses
             if rst = '1' then
@@ -1460,6 +1490,17 @@ begin
                     end if;
                 end if;
 
+                -- Signals for PLRU update and victim selection
+                r1.hit_way <= req_hit_way;
+                r1.hit_index <= req_index;
+                -- Record victim way in the cycle after we see a load or dcbz miss
+                if r1.choose_victim = '1' then
+                    r1.victim_way <= plru_victim;
+                end if;
+                if req_op = OP_LOAD_MISS or (req_op = OP_STORE_MISS and r0.req.dcbz = '1') then
+                    r1.choose_victim <= '1';
+                end if;
+
                -- Main state machine
                case r1.state is
                 when IDLE =>