Fix typo

[microwatt.git] / dcache.vhdl
diff --git a/dcache.vhdl b/dcache.vhdl

index bc351b0dde74b686c72ee4db4e1fc458543936ef..bb93148c3b68f94f3dcdcb16894d36c422b2183e 100644 (file)
--- a/dcache.vhdl
+++ b/dcache.vhdl
@@ -1,12 +1,6 @@
  --
  -- Set associative dcache write-through
  --
  --
  -- Set associative dcache write-through
  --
--- TODO (in no specific order):
---
--- * See list in icache.vhdl
--- * Complete load misses on the cycle when WB data comes instead of
---   at the end of line (this requires dealing with requests coming in
---   while not idle...)
  --
  library ieee;
  use ieee.std_logic_1164.all;
  --
  library ieee;
  use ieee.std_logic_1164.all;
@@ -31,7 +25,9 @@ entity dcache is
          -- L1 DTLB number of sets
          TLB_NUM_WAYS : positive := 2;
          -- L1 DTLB log_2(page_size)
          -- L1 DTLB number of sets
          TLB_NUM_WAYS : positive := 2;
          -- L1 DTLB log_2(page_size)
-        TLB_LG_PGSZ : positive := 12
+        TLB_LG_PGSZ : positive := 12;
+        -- Non-zero to enable log data collection
+        LOG_LENGTH : natural := 0
          );
      port (
          clk          : in std_ulogic;
          );
      port (
          clk          : in std_ulogic;
@@ -55,7 +51,7 @@ end entity dcache;
  architecture rtl of dcache is
      -- BRAM organisation: We never access more than wishbone_data_bits at
      -- a time so to save resources we make the array only that wide, and
  architecture rtl of dcache is
      -- BRAM organisation: We never access more than wishbone_data_bits at
      -- a time so to save resources we make the array only that wide, and
-    -- use consecutive indices for to make a cache "line"
+    -- use consecutive indices to make a cache "line"
      --
      -- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
      constant ROW_SIZE      : natural := wishbone_data_bits / 8;
      --
      -- ROW_SIZE is the width in bytes of the BRAM (based on WB, so 64-bits)
      constant ROW_SIZE      : natural := wishbone_data_bits / 8;
@@ -204,21 +200,82 @@ architecture rtl of dcache is
      -- which means that the BRAM output is delayed by an extra cycle.
      --
      -- Thus, the dcache has a 2-stage internal pipeline for cache hits
      -- which means that the BRAM output is delayed by an extra cycle.
      --
      -- Thus, the dcache has a 2-stage internal pipeline for cache hits
-    -- with no stalls.
+    -- with no stalls.  Stores also complete in 2 cycles in most
+    -- circumstances.
+    --
+    -- A request proceeds through the pipeline as follows.
+    --
+    -- Cycle 0: Request is received from loadstore or mmu if either
+    -- d_in.valid or m_in.valid is 1 (not both).  In this cycle portions
+    -- of the address are presented to the TLB tag RAM and data RAM
+    -- and the cache tag RAM and data RAM.
+    --
+    -- Clock edge between cycle 0 and cycle 1:
+    -- Request is stored in r0 (assuming r0_full was 0).  TLB tag and
+    -- data RAMs are read, and the cache tag RAM is read.  (Cache data
+    -- comes out a cycle later due to its output register, giving the
+    -- whole of cycle 1 to read the cache data RAM.)
+    --
+    -- Cycle 1: TLB and cache tag matching is done, the real address
+    -- (RA) for the access is calculated, and the type of operation is
+    -- determined (the OP_* values above).  This gives the TLB way for
+    -- a TLB hit, and the cache way for a hit or the way to replace
+    -- for a load miss.
+    --
+    -- Clock edge between cycle 1 and cycle 2:
+    -- Request is stored in r1 (assuming r1.full was 0).
+    -- The state machine transitions out of IDLE state for a load miss,
+    -- a store, a dcbz, or a non-cacheable load.  r1.full is set to 1
+    -- for a load miss, dcbz or non-cacheable load but not a store.
+    --
+    -- Cycle 2: Completion signals are asserted for a load hit,
+    -- a store (excluding dcbz), a TLB operation, a conditional
+    -- store which failed due to no matching reservation, or an error
+    -- (cache hit on non-cacheable operation, TLB miss, or protection
+    -- fault).
      --
      --
-    -- All other operations are handled via stalling in the first stage.
+    -- For a load miss, store, or dcbz, the state machine initiates
+    -- a wishbone cycle, which takes at least 2 cycles.  For a store,
+    -- if another store comes in with the same cache tag (therefore
+    -- in the same 4k page), it can be added on to the existing cycle,
+    -- subject to some constraints.
+    -- While r1.full = 1, no new requests can go from r0 to r1, but
+    -- requests can come in to r0 and be satisfied if they are
+    -- cacheable load hits or stores with the same cache tag.
      --
      --
-    -- The second stage can thus complete a hit at the same time as the
-    -- first stage emits a stall for a complex op.
+    -- Writing to the cache data RAM is done at the clock edge
+    -- at the end of cycle 2 for a store hit (excluding dcbz).
+    -- Stores that miss are not written to the cache data RAM
+    -- but just stored through to memory.
+    -- Dcbz is done like a cache miss, but the wishbone cycle
+    -- is a write rather than a read, and zeroes are written to
+    -- the cache data RAM.  Thus dcbz will allocate the line in
+    -- the cache as well as zeroing memory.
      --
      --
+    -- Since stores are written to the cache data RAM at the end of
+    -- cycle 2, and loads can come in and hit on the data just stored,
+    -- there is a two-stage bypass from store data to load data to
+    -- make sure that loads always see previously-stored data even
+    -- if it has not yet made it to the cache data RAM.
+    --
+    -- Load misses read the requested dword of the cache line first in
+    -- the memory read request and then cycle around through the other
+    -- dwords.  The load is completed on the cycle after the requested
+    -- dword comes back from memory (using a forwarding path, rather
+    -- than going via the cache data RAM).  We maintain an array of
+    -- valid bits per dword for the line being refilled so that
+    -- subsequent load requests to the same line can be completed as
+    -- soon as the necessary data comes in from memory, without
+    -- waiting for the whole line to be read.
  
      -- Stage 0 register, basically contains just the latched request
      type reg_stage_0_t is record
          req   : Loadstore1ToDcacheType;
  
      -- Stage 0 register, basically contains just the latched request
      type reg_stage_0_t is record
          req   : Loadstore1ToDcacheType;
-        tlbie : std_ulogic;
-        doall : std_ulogic;
-        tlbld : std_ulogic;
+        tlbie : std_ulogic;     -- indicates a tlbie request (from MMU)
+        doall : std_ulogic;     -- with tlbie, indicates flush whole TLB
+        tlbld : std_ulogic;     -- indicates a TLB load request (from MMU)
          mmu_req : std_ulogic;   -- indicates source of request
          mmu_req : std_ulogic;   -- indicates source of request
+        d_valid : std_ulogic;   -- indicates req.data is valid now
      end record;
  
      signal r0 : reg_stage_0_t;
      end record;
  
      signal r0 : reg_stage_0_t;
@@ -226,12 +283,14 @@ architecture rtl of dcache is
  
      type mem_access_request_t is record
          op        : op_t;
  
      type mem_access_request_t is record
          op        : op_t;
+        valid     : std_ulogic;
          dcbz      : std_ulogic;
          real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
          data      : std_ulogic_vector(63 downto 0);
          byte_sel  : std_ulogic_vector(7 downto 0);
          hit_way   : way_t;
          dcbz      : std_ulogic;
          real_addr : std_ulogic_vector(REAL_ADDR_BITS - 1 downto 0);
          data      : std_ulogic_vector(63 downto 0);
          byte_sel  : std_ulogic_vector(7 downto 0);
          hit_way   : way_t;
-        repl_way  : way_t;
+        same_tag  : std_ulogic;
+        mmu_req   : std_ulogic;
      end record;
  
      -- First stage register, contains state for stage 1 of load hits
      end record;
  
      -- First stage register, contains state for stage 1 of load hits
@@ -246,6 +305,13 @@ architecture rtl of dcache is
         -- Cache hit state
         hit_way          : way_t;
         hit_load_valid   : std_ulogic;
         -- Cache hit state
         hit_way          : way_t;
         hit_load_valid   : std_ulogic;
+        hit_index        : index_t;
+        cache_hit        : std_ulogic;
+
+        -- TLB hit state
+        tlb_hit          : std_ulogic;
+        tlb_hit_way      : tlb_way_t;
+        tlb_hit_index    : tlb_index_t;
  
         -- 2-stage data buffer for data forwarded from writes to reads
         forward_data1    : std_ulogic_vector(63 downto 0);
  
         -- 2-stage data buffer for data forwarded from writes to reads
         forward_data1    : std_ulogic_vector(63 downto 0);
@@ -271,16 +337,18 @@ architecture rtl of dcache is
          end_row_ix       : row_in_line_t;
          rows_valid       : row_per_line_valid_t;
          acks_pending     : unsigned(2 downto 0);
          end_row_ix       : row_in_line_t;
          rows_valid       : row_per_line_valid_t;
          acks_pending     : unsigned(2 downto 0);
-
-        -- Signals to complete with error
-        error_done       : std_ulogic;
+        inc_acks         : std_ulogic;
+        dec_acks         : std_ulogic;
+
+        -- Signals to complete (possibly with error)
+        ls_valid         : std_ulogic;
+        ls_error         : std_ulogic;
+        mmu_done         : std_ulogic;
+        mmu_error        : std_ulogic;
          cache_paradox    : std_ulogic;
  
          -- Signal to complete a failed stcx.
          stcx_fail        : std_ulogic;
          cache_paradox    : std_ulogic;
  
          -- Signal to complete a failed stcx.
          stcx_fail        : std_ulogic;
-
-        -- completion signal for tlbie
-        tlbie_done       : std_ulogic;
      end record;
  
      signal r1 : reg_stage_1_t;
      end record;
  
      signal r1 : reg_stage_1_t;
@@ -301,6 +369,8 @@ architecture rtl of dcache is
      signal req_tag     : cache_tag_t;
      signal req_op      : op_t;
      signal req_data    : std_ulogic_vector(63 downto 0);
      signal req_tag     : cache_tag_t;
      signal req_op      : op_t;
      signal req_data    : std_ulogic_vector(63 downto 0);
+    signal req_same_tag : std_ulogic;
+    signal req_go      : std_ulogic;
  
      signal early_req_row  : row_t;
  
  
      signal early_req_row  : row_t;
  
@@ -453,8 +523,6 @@ architecture rtl of dcache is
          ptes(j + TLB_PTE_BITS - 1 downto j) := newpte;
      end;
  
          ptes(j + TLB_PTE_BITS - 1 downto j) := newpte;
      end;
  
-    signal log_data : std_ulogic_vector(19 downto 0);
-
  begin
  
      assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
  begin
  
      assert LINE_SIZE mod ROW_SIZE = 0 report "LINE_SIZE not multiple of ROW_SIZE" severity FAILURE;
@@ -497,17 +565,27 @@ begin
                  r.mmu_req := '1';
              else
                  r.req := d_in;
                  r.mmu_req := '1';
              else
                  r.req := d_in;
+                r.req.data := (others => '0');
                  r.tlbie := '0';
                  r.doall := '0';
                  r.tlbld := '0';
                  r.mmu_req := '0';
              end if;
                  r.tlbie := '0';
                  r.doall := '0';
                  r.tlbld := '0';
                  r.mmu_req := '0';
              end if;
+            r.d_valid := '0';
              if rst = '1' then
                  r0_full <= '0';
              if rst = '1' then
                  r0_full <= '0';
-            elsif r1.full = '0' or r0_full = '0' then
+            elsif (r1.full = '0' and d_in.hold = '0') or r0_full = '0' then
                  r0 <= r;
                  r0_full <= r.req.valid;
              end if;
                  r0 <= r;
                  r0_full <= r.req.valid;
              end if;
+            -- Sample data the cycle after a request comes in from loadstore1.
+            -- If another request has come in already then the data will get
+            -- put directly into req.data below.
+            if r0.req.valid = '1' and r.req.valid = '0' and r0.d_valid = '0' and
+                r0.mmu_req = '0' then
+                r0.req.data <= d_in.data;
+                r0.d_valid <= '1';
+            end if;
          end if;
      end process;
  
          end if;
      end process;
  
@@ -515,8 +593,8 @@ begin
      m_out.stall <= '0';
  
      -- Hold off the request in r0 when r1 has an uncompleted request
      m_out.stall <= '0';
  
      -- Hold off the request in r0 when r1 has an uncompleted request
-    r0_stall <= r0_full and r1.full;
-    r0_valid <= r0_full and not r1.full;
+    r0_stall <= r0_full and (r1.full or d_in.hold);
+    r0_valid <= r0_full and not r1.full and not d_in.hold;
      stall_out <= r0_stall;
  
      -- TLB
      stall_out <= r0_stall;
  
      -- TLB
@@ -564,15 +642,15 @@ begin
                     lru => tlb_plru_out
                     );
  
                     lru => tlb_plru_out
                     );
  
-           process(tlb_req_index, tlb_hit, tlb_hit_way, tlb_plru_out)
+           process(all)
             begin
                 -- PLRU interface
             begin
                 -- PLRU interface
-               if tlb_hit = '1' and tlb_req_index = i then
-                   tlb_plru_acc_en <= '1';
+               if r1.tlb_hit_index = i then
+                   tlb_plru_acc_en <= r1.tlb_hit;
                 else
                     tlb_plru_acc_en <= '0';
                 end if;
                 else
                     tlb_plru_acc_en <= '0';
                 end if;
-               tlb_plru_acc <= std_ulogic_vector(to_unsigned(tlb_hit_way, TLB_WAY_BITS));
+               tlb_plru_acc <= std_ulogic_vector(to_unsigned(r1.tlb_hit_way, TLB_WAY_BITS));
                 tlb_plru_victim(i) <= tlb_plru_out;
             end process;
         end generate;
                 tlb_plru_victim(i) <= tlb_plru_out;
             end process;
         end generate;
@@ -675,16 +753,15 @@ begin
                     lru => plru_out
                     );
  
                     lru => plru_out
                     );
  
-           process(req_index, req_op, req_hit_way, plru_out)
+           process(all)
             begin
                 -- PLRU interface
             begin
                 -- PLRU interface
-               if (req_op = OP_LOAD_HIT or
-                   req_op = OP_STORE_HIT) and req_index = i then
-                   plru_acc_en <= '1';
+               if r1.hit_index = i then
+                   plru_acc_en <= r1.cache_hit;
                 else
                     plru_acc_en <= '0';
                 end if;
                 else
                     plru_acc_en <= '0';
                 end if;
-               plru_acc <= std_ulogic_vector(to_unsigned(req_hit_way, WAY_BITS));
+               plru_acc <= std_ulogic_vector(to_unsigned(r1.hit_way, WAY_BITS));
                 plru_victim(i) <= plru_out;
             end process;
         end generate;
                 plru_victim(i) <= plru_out;
             end process;
         end generate;
@@ -728,7 +805,7 @@ begin
          req_row <= get_row(r0.req.addr);
          req_tag <= get_tag(ra);
  
          req_row <= get_row(r0.req.addr);
          req_tag <= get_tag(ra);
  
-        go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.error_done;
+        go := r0_valid and not (r0.tlbie or r0.tlbld) and not r1.ls_error;
  
         -- Test if pending request is a hit on any way
          -- In order to make timing in virtual mode, when we are using the TLB,
  
         -- Test if pending request is a hit on any way
          -- In order to make timing in virtual mode, when we are using the TLB,
@@ -777,6 +854,7 @@ begin
                  rel_match := '1';
              end if;
          end if;
                  rel_match := '1';
              end if;
          end if;
+        req_same_tag <= rel_match;
  
          -- See if the request matches the line currently being reloaded
          if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and
  
          -- See if the request matches the line currently being reloaded
          if r1.state = RELOAD_WAIT_ACK and req_index = r1.store_index and
@@ -785,7 +863,7 @@ begin
              -- since it will be by the time we perform the store.
              -- For a load, check the appropriate row valid bit.
              is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE);
              -- since it will be by the time we perform the store.
              -- For a load, check the appropriate row valid bit.
              is_hit := not r0.req.load or r1.rows_valid(req_row mod ROW_PER_LINE);
-            hit_way := r1.store_way;
+            hit_way := replace_way;
          end if;
  
          -- Whether to use forwarded data for a load or not
          end if;
  
          -- Whether to use forwarded data for a load or not
@@ -808,8 +886,12 @@ begin
         -- The way that matched on a hit               
         req_hit_way <= hit_way;
  
         -- The way that matched on a hit               
         req_hit_way <= hit_way;
  
-       -- The way to replace on a miss
-       replace_way <= to_integer(unsigned(plru_victim(req_index)));
+        -- The way to replace on a miss
+        if r1.write_tag = '1' then
+            replace_way <= to_integer(unsigned(plru_victim(r1.store_index)));
+        else
+            replace_way <= r1.store_way;
+        end if;
  
          -- work out whether we have permission for this access
          -- NB we don't yet implement AMR, thus no KUAP
  
          -- work out whether we have permission for this access
          -- NB we don't yet implement AMR, thus no KUAP
@@ -844,6 +926,7 @@ begin
              end if;
          end if;
         req_op <= op;
              end if;
          end if;
         req_op <= op;
+        req_go <= go;
  
          -- Version of the row number that is valid one cycle earlier
          -- in the cases where we need to read the cache data BRAM.
  
          -- Version of the row number that is valid one cycle earlier
          -- in the cases where we need to read the cache data BRAM.
@@ -874,10 +957,10 @@ begin
              -- XXX or if r0.req.nc = '1'
              if r0.req.load = '1' then
                  -- load with reservation
              -- XXX or if r0.req.nc = '1'
              if r0.req.load = '1' then
                  -- load with reservation
-                set_rsrv <= '1';
+                set_rsrv <= r0.req.atomic_last;
              else
                  -- store conditional
              else
                  -- store conditional
-                clear_rsrv <= '1';
+                clear_rsrv <= r0.req.atomic_last;
                  if reservation.valid = '0' or
                      r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
                      cancel_store <= '1';
                  if reservation.valid = '0' or
                      r0.req.addr(63 downto LINE_OFF_BITS) /= reservation.addr then
                      cancel_store <= '1';
@@ -925,15 +1008,15 @@ begin
              end if;
          end loop;
  
              end if;
          end loop;
  
-       d_out.valid <= '0';
+       d_out.valid <= r1.ls_valid;
         d_out.data <= data_out;
         d_out.data <= data_out;
-        d_out.store_done <= '0';
-        d_out.error <= '0';
-        d_out.cache_paradox <= '0';
+        d_out.store_done <= not r1.stcx_fail;
+        d_out.error <= r1.ls_error;
+        d_out.cache_paradox <= r1.cache_paradox;
  
          -- Outputs to MMU
  
          -- Outputs to MMU
-        m_out.done <= r1.tlbie_done;
-        m_out.err <= '0';
+        m_out.done <= r1.mmu_done;
+        m_out.err <= r1.mmu_error;
          m_out.data <= data_out;
  
         -- We have a valid load or store hit or we just completed a slow
          m_out.data <= data_out;
  
         -- We have a valid load or store hit or we just completed a slow
@@ -959,47 +1042,32 @@ begin
              -- Load hit case is the standard path
              if r1.hit_load_valid = '1' then
                  report "completing load hit data=" & to_hstring(data_out);
              -- Load hit case is the standard path
              if r1.hit_load_valid = '1' then
                  report "completing load hit data=" & to_hstring(data_out);
-                d_out.valid <= '1';
              end if;
  
              -- error cases complete without stalling
              end if;
  
              -- error cases complete without stalling
-            if r1.error_done = '1' then
+            if r1.ls_error = '1' then
                  report "completing ld/st with error";
                  report "completing ld/st with error";
-                d_out.error <= '1';
-                d_out.cache_paradox <= r1.cache_paradox;
-                d_out.valid <= '1';
              end if;
  
              -- Slow ops (load miss, NC, stores)
              if r1.slow_valid = '1' then
              end if;
  
              -- Slow ops (load miss, NC, stores)
              if r1.slow_valid = '1' then
-                d_out.store_done <= '1';
                  report "completing store or load miss data=" & to_hstring(data_out);
                  report "completing store or load miss data=" & to_hstring(data_out);
-                d_out.valid <= '1';
-            end if;
-
-            if r1.stcx_fail = '1' then
-                d_out.store_done <= '0';
-                d_out.valid <= '1';
              end if;
  
          else
              -- Request came from MMU
              if r1.hit_load_valid = '1' then
                  report "completing load hit to MMU, data=" & to_hstring(m_out.data);
              end if;
  
          else
              -- Request came from MMU
              if r1.hit_load_valid = '1' then
                  report "completing load hit to MMU, data=" & to_hstring(m_out.data);
-                m_out.done <= '1';
              end if;
  
              -- error cases complete without stalling
              end if;
  
              -- error cases complete without stalling
-            if r1.error_done = '1' then
+            if r1.mmu_error = '1' then
                  report "completing MMU ld with error";
                  report "completing MMU ld with error";
-                m_out.err <= '1';
-                m_out.done <= '1';
              end if;
  
              -- Slow ops (i.e. load miss)
              if r1.slow_valid = '1' then
                  report "completing MMU load miss, data=" & to_hstring(m_out.data);
              end if;
  
              -- Slow ops (i.e. load miss)
              if r1.slow_valid = '1' then
                  report "completing MMU load miss, data=" & to_hstring(m_out.data);
-                m_out.done <= '1';
              end if;
          end if;
  
              end if;
          end if;
  
@@ -1076,7 +1144,7 @@ begin
                  wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));
                  wr_sel <= (others => '1');
  
                  wr_addr <= std_ulogic_vector(to_unsigned(r1.store_row, ROW_BITS));
                  wr_sel <= (others => '1');
  
-                if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and r1.store_way = i then
+                if r1.state = RELOAD_WAIT_ACK and wishbone_in.ack = '1' and replace_way = i then
                      do_write <= '1';
                  end if;
             end if;
                      do_write <= '1';
                  end if;
             end if;
@@ -1110,20 +1178,28 @@ begin
              end if;
  
              -- Fast path for load/store hits. Set signals for the writeback controls.
              end if;
  
              -- Fast path for load/store hits. Set signals for the writeback controls.
+            r1.hit_way <= req_hit_way;
+            r1.hit_index <= req_index;
             if req_op = OP_LOAD_HIT then
             if req_op = OP_LOAD_HIT then
-               r1.hit_way <= req_hit_way;
                 r1.hit_load_valid <= '1';
             else
                 r1.hit_load_valid <= '0';
             end if;
                 r1.hit_load_valid <= '1';
             else
                 r1.hit_load_valid <= '0';
             end if;
+            if req_op = OP_LOAD_HIT or req_op = OP_STORE_HIT then
+                r1.cache_hit <= '1';
+            else
+                r1.cache_hit <= '0';
+            end if;
  
              if req_op = OP_BAD then
                  report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) &
                      " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok);
  
              if req_op = OP_BAD then
                  report "Signalling ld/st error valid_ra=" & std_ulogic'image(valid_ra) &
                      " rc_ok=" & std_ulogic'image(rc_ok) & " perm_ok=" & std_ulogic'image(perm_ok);
-                r1.error_done <= '1';
+                r1.ls_error <= not r0.mmu_req;
+                r1.mmu_error <= r0.mmu_req;
                  r1.cache_paradox <= access_ok;
              else
                  r1.cache_paradox <= access_ok;
              else
-                r1.error_done <= '0';
+                r1.ls_error <= '0';
+                r1.mmu_error <= '0';
                  r1.cache_paradox <= '0';
              end if;
  
                  r1.cache_paradox <= '0';
              end if;
  
@@ -1133,8 +1209,11 @@ begin
                  r1.stcx_fail <= '0';
              end if;
  
                  r1.stcx_fail <= '0';
              end if;
  
-            -- complete tlbies and TLB loads in the third cycle
-            r1.tlbie_done <= r0_valid and (r0.tlbie or r0.tlbld);
+            -- Record TLB hit information for updating TLB PLRU
+            r1.tlb_hit <= tlb_hit;
+            r1.tlb_hit_way <= tlb_hit_way;
+            r1.tlb_hit_index <= tlb_req_index;
+
         end if;
      end process;
  
         end if;
      end process;
  
@@ -1176,7 +1255,7 @@ begin
                      r1.forward_data1 <= wishbone_in.dat;
                  end if;
                  r1.forward_sel1 <= (others => '1');
                      r1.forward_data1 <= wishbone_in.dat;
                  end if;
                  r1.forward_sel1 <= (others => '1');
-                r1.forward_way1 <= r1.store_way;
+                r1.forward_way1 <= replace_way;
                  r1.forward_row1 <= r1.store_row;
                  r1.forward_valid1 <= '0';
              end if;
                  r1.forward_row1 <= r1.store_row;
                  r1.forward_valid1 <= '0';
              end if;
@@ -1191,6 +1270,8 @@ begin
                 r1.slow_valid <= '0';
                  r1.wb.cyc <= '0';
                  r1.wb.stb <= '0';
                 r1.slow_valid <= '0';
                  r1.wb.cyc <= '0';
                  r1.wb.stb <= '0';
+                r1.ls_valid <= '0';
+                r1.mmu_done <= '0';
  
                -- Not useful normally but helps avoid tons of sim warnings
                 r1.wb.adr <= (others => '0');
  
                -- Not useful normally but helps avoid tons of sim warnings
                 r1.wb.adr <= (others => '0');
@@ -1198,15 +1279,29 @@ begin
                 -- One cycle pulses reset
                 r1.slow_valid <= '0';
                  r1.write_bram <= '0';
                 -- One cycle pulses reset
                 r1.slow_valid <= '0';
                  r1.write_bram <= '0';
+                r1.inc_acks <= '0';
+                r1.dec_acks <= '0';
+
+                r1.ls_valid <= '0';
+                -- complete tlbies and TLB loads in the third cycle
+                r1.mmu_done <= r0_valid and (r0.tlbie or r0.tlbld);
+                if req_op = OP_LOAD_HIT or req_op = OP_STCX_FAIL then
+                    if r0.mmu_req = '0' then
+                        r1.ls_valid <= '1';
+                    else
+                        r1.mmu_done <= '1';
+                    end if;
+                end if;
  
                  if r1.write_tag = '1' then
                      -- Store new tag in selected way
                      for i in 0 to NUM_WAYS-1 loop
  
                  if r1.write_tag = '1' then
                      -- Store new tag in selected way
                      for i in 0 to NUM_WAYS-1 loop
-                        if i = r1.store_way then
+                        if i = replace_way then
                              cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                                  (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
                          end if;
                      end loop;
                              cache_tags(r1.store_index)((i + 1) * TAG_WIDTH - 1 downto i * TAG_WIDTH) <=
                                  (TAG_WIDTH - 1 downto TAG_BITS => '0') & r1.reload_tag;
                          end if;
                      end loop;
+                    r1.store_way <= replace_way;
                      r1.write_tag <= '0';
                  end if;
  
                      r1.write_tag <= '0';
                  end if;
  
@@ -1216,12 +1311,26 @@ begin
                      req := r1.req;
                  else
                      req.op := req_op;
                      req := r1.req;
                  else
                      req.op := req_op;
+                    req.valid := req_go;
+                    req.mmu_req := r0.mmu_req;
                      req.dcbz := r0.req.dcbz;
                      req.real_addr := ra;
                      req.dcbz := r0.req.dcbz;
                      req.real_addr := ra;
-                    req.data := r0.req.data;
-                    req.byte_sel := r0.req.byte_sel;
+                    -- Force data to 0 for dcbz
+                    if r0.req.dcbz = '1' then
+                        req.data := (others => '0');
+                    elsif r0.d_valid = '1' then
+                        req.data := r0.req.data;
+                    else
+                        req.data := d_in.data;
+                    end if;
+                    -- Select all bytes for dcbz and for cacheable loads
+                    if r0.req.dcbz = '1' or (r0.req.load = '1' and r0.req.nc = '0') then
+                        req.byte_sel := (others => '1');
+                    else
+                        req.byte_sel := r0.req.byte_sel;
+                    end if;
                      req.hit_way := req_hit_way;
                      req.hit_way := req_hit_way;
-                    req.repl_way := replace_way;
+                    req.same_tag := req_same_tag;
  
                      -- Store the incoming request from r0, if it is a slow request
                      -- Note that r1.full = 1 implies req_op = OP_NONE
  
                      -- Store the incoming request from r0, if it is a slow request
                      -- Note that r1.full = 1 implies req_op = OP_NONE
@@ -1236,18 +1345,19 @@ begin
                 case r1.state is
                  when IDLE =>
                      r1.wb.adr <= req.real_addr(r1.wb.adr'left downto 0);
-                    r1.dcbz <= '0';
+                    r1.wb.sel <= req.byte_sel;
+                    r1.wb.dat <= req.data;
+                    r1.dcbz <= req.dcbz;
  
                      -- Keep track of our index and way for subsequent stores.
                      r1.store_index <= get_index(req.real_addr);
                      r1.store_row <= get_row(req.real_addr);
                      r1.end_row_ix <= get_row_of_line(get_row(req.real_addr)) - 1;
                      r1.reload_tag <= get_tag(req.real_addr);
+                    r1.req.same_tag <= '1';
  
                      if req.op = OP_STORE_HIT then
                          r1.store_way <= req.hit_way;
-                    else
-                        r1.store_way <= req.repl_way;
                      end if;
  
                      -- Reset per-row valid bits, ready for handling OP_LOAD_MISS
@@ -1264,11 +1374,9 @@ begin
                         --
                         report "cache miss real addr:" & to_hstring(req.real_addr) &
                             " idx:" & integer'image(get_index(req.real_addr)) &
-                           " way:" & integer'image(req.repl_way) &
                             " tag:" & to_hstring(get_tag(req.real_addr));
  
                         -- Start the wishbone cycle
-                       r1.wb.sel <= (others => '1');
                         r1.wb.we  <= '0';
                          r1.wb.cyc <= '1';
                          r1.wb.stb <= '1';
@@ -1278,7 +1386,6 @@ begin
                          r1.write_tag <= '1';
  
                     when OP_LOAD_NC =>
-                        r1.wb.sel <= req.byte_sel;
                          r1.wb.cyc <= '1';
                          r1.wb.stb <= '1';
                         r1.wb.we <= '0';
@@ -1286,27 +1393,25 @@ begin
  
                      when OP_STORE_HIT | OP_STORE_MISS =>
                          if req.dcbz = '0' then
-                            r1.wb.sel <= req.byte_sel;
-                            r1.wb.dat <= req.data;
                              r1.state <= STORE_WAIT_ACK;
                              r1.acks_pending <= to_unsigned(1, 3);
                              r1.full <= '0';
                              r1.slow_valid <= '1';
+                            if req.mmu_req = '0' then
+                                r1.ls_valid <= '1';
+                            else
+                                r1.mmu_done <= '1';
+                            end if;
                              if req.op = OP_STORE_HIT then
                                  r1.write_bram <= '1';
                              end if;
                          else
                              -- dcbz is handled much like a load miss except
                              -- that we are writing to memory instead of reading
-
-                            -- Start the wishbone writes
-                            r1.wb.sel <= (others => '1');
-                            r1.wb.dat <= (others => '0');
-
-                            -- Handle the rest like a load miss
                              r1.state <= RELOAD_WAIT_ACK;
-                            r1.write_tag <= '1';
-                            r1.dcbz <= '1';
+                            if req.op = OP_STORE_MISS then
+                                r1.write_tag <= '1';
+                            end if;
                          end if;
                          r1.wb.we <= '1';
                          r1.wb.cyc <= '1';
@@ -1346,13 +1451,17 @@ begin
                          -- complete the request next cycle.
                          -- Compare the whole address in case the request in
                          -- r1.req is not the one that started this refill.
-                       if r1.full = '1' and
-                            ((r1.dcbz = '1' and r1.req.dcbz = '1') or
-                             (r1.dcbz = '0' and r1.req.op = OP_LOAD_MISS)) and
-                            r1.store_row = get_row(r1.req.real_addr) and
-                            r1.reload_tag = get_tag(r1.req.real_addr) then
+                       if req.valid = '1' and req.same_tag = '1' and
+                            ((r1.dcbz = '1' and req.dcbz = '1') or
+                             (r1.dcbz = '0' and req.op = OP_LOAD_MISS)) and
+                            r1.store_row = get_row(req.real_addr) then
                              r1.full <= '0';
                              r1.slow_valid <= '1';
+                            if r1.mmu_req = '0' then
+                                r1.ls_valid <= '1';
+                            else
+                                r1.mmu_done <= '1';
+                            end if;
                              r1.forward_sel <= (others => '1');
                              r1.use_forward1 <= '1';
                         end if;
@@ -1375,28 +1484,37 @@ begin
                  when STORE_WAIT_ACK =>
                     stbs_done := r1.wb.stb = '0';
                      acks := r1.acks_pending;
+                    if r1.inc_acks /= r1.dec_acks then
+                        if r1.inc_acks = '1' then
+                            acks := acks + 1;
+                        else
+                            acks := acks - 1;
+                        end if;
+                    end if;
+                    r1.acks_pending <= acks;
                     -- Clear stb when slave accepted request
                      if wishbone_in.stall = '0' then
                          -- See if there is another store waiting to be done
                          -- which is in the same real page.
-                        -- Using r1.req rather than req here limits us to one
-                        -- store every two cycles, but helps timing in that we
-                        -- don't depend on req_op or ra.
-                        if r1.full = '1' and acks < 7 and
-                            (r1.req.op = OP_STORE_MISS or r1.req.op = OP_STORE_HIT) and
-                            (r1.req.real_addr(r1.wb.adr'left downto TLB_LG_PGSZ) =
-                             r1.wb.adr(r1.wb.adr'left downto TLB_LG_PGSZ)) then
-                            r1.wb.adr <= r1.req.real_addr(r1.wb.adr'left downto 0);
-                            r1.wb.dat <= r1.req.data;
-                            r1.wb.sel <= r1.req.byte_sel;
+                        if req.valid = '1' then
+                            r1.wb.adr(SET_SIZE_BITS - 1 downto 0) <=
+                                req.real_addr(SET_SIZE_BITS - 1 downto 0);
+                            r1.wb.dat <= req.data;
+                            r1.wb.sel <= req.byte_sel;
+                        end if;
+                        if acks < 7 and req.same_tag = '1' and
+                            (req.op = OP_STORE_MISS or req.op = OP_STORE_HIT) then
                              r1.wb.stb <= '1';
                              stbs_done := false;
-                            if r1.req.op = OP_STORE_HIT then
+                            if req.op = OP_STORE_HIT then
                                  r1.write_bram <= '1';
                              end if;
                              r1.full <= '0';
                              r1.slow_valid <= '1';
-                            acks := acks + 1;
+                            -- Store requests never come from the MMU
+                            r1.ls_valid <= '1';
+                            stbs_done := false;
+                            r1.inc_acks <= '1';
                          else
                              r1.wb.stb <= '0';
                              stbs_done := true;
@@ -1410,9 +1528,8 @@ begin
                              r1.wb.cyc <= '0';
                              r1.wb.stb <= '0';
                          end if;
-                        acks := acks - 1;
+                        r1.dec_acks <= '1';
                     end if;
-                    r1.acks_pending <= acks;
  
                  when NC_LOAD_WAIT_ACK =>
                     -- Clear stb when slave accepted request
@@ -1425,6 +1542,11 @@ begin
                          r1.state <= IDLE;
                          r1.full <= '0';
                         r1.slow_valid <= '1';
+                        if r1.mmu_req = '0' then
+                            r1.ls_valid <= '1';
+                        else
+                            r1.mmu_done <= '1';
+                        end if;
                          r1.forward_sel <= (others => '1');
                          r1.use_forward1 <= '1';
                         r1.wb.cyc <= '0';
@@ -1435,21 +1557,25 @@ begin
         end if;
      end process;
  
-    dcache_log: process(clk)
+    dc_log: if LOG_LENGTH > 0 generate
+        signal log_data : std_ulogic_vector(19 downto 0);
      begin
-        if rising_edge(clk) then
-            log_data <= r1.wb.adr(5 downto 3) &
-                        wishbone_in.stall &
-                        wishbone_in.ack &
-                        r1.wb.stb & r1.wb.cyc &
-                        d_out.error &
-                        d_out.valid &
-                        std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) &
-                        stall_out &
-                        std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) &
-                        valid_ra &
-                        std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3));
-        end if;
-    end process;
-    log_out <= log_data;
+        dcache_log: process(clk)
+        begin
+            if rising_edge(clk) then
+                log_data <= r1.wb.adr(5 downto 3) &
+                            wishbone_in.stall &
+                            wishbone_in.ack &
+                            r1.wb.stb & r1.wb.cyc &
+                            d_out.error &
+                            d_out.valid &
+                            std_ulogic_vector(to_unsigned(op_t'pos(req_op), 3)) &
+                            stall_out &
+                            std_ulogic_vector(to_unsigned(tlb_hit_way, 3)) &
+                            valid_ra &
+                            std_ulogic_vector(to_unsigned(state_t'pos(r1.state), 3));
+            end if;
+        end process;
+        log_out <= log_data;
+    end generate;
  end;