execute: Implement bypass from output of execute1 to input
author Paul Mackerras <paulus@ozlabs.org>
Mon, 13 Jan 2020 02:23:42 +0000 (13:23 +1100)
committer Paul Mackerras <paulus@ozlabs.org>
Tue, 14 Jan 2020 11:42:50 +0000 (22:42 +1100)
This enables back-to-back execution of integer instructions where
the first instruction writes a GPR and the second reads the same
GPR.  This is done with a set of multiplexers at the start of
execute1 which enable any of the three input operands to be taken
from the output of execute1 (i.e. r.e.write_data) rather than the
input from decode2 (i.e. e_in.read_data[123]).

This also requires changes to the hazard detection and handling.
Decode2 generates a signal indicating that the GPR being written
is available for bypass, which is true for instructions that are
executed in execute1 (rather than loadstore1/dcache).  The
gpr_hazard module stores this "bypassable" bit, and if the same
GPR needs to be read by a subsequent instruction, it outputs a
"use_bypass" signal rather than generating a stall.  The
use_bypass signal is then latched at the output of decode2 and
passed down to execute1 to control the input multiplexer.

At the moment there is no bypass on the inputs to loadstore1, but that
is OK because all load and store instructions are marked as
single-issue.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
common.vhdl
control.vhdl
core.vhdl
decode2.vhdl
execute1.vhdl
gpr_hazard.vhdl

index 86123896e473b7b919403508cc6f64e47ca3571c..9c8a9420a1a01b2330876bc696e932cb82c184c7 100644 (file)
@@ -109,6 +109,9 @@ package common is
        read_data1: std_ulogic_vector(63 downto 0);
        read_data2: std_ulogic_vector(63 downto 0);
        read_data3: std_ulogic_vector(63 downto 0);
+        bypass_data1: std_ulogic;
+        bypass_data2: std_ulogic;
+        bypass_data3: std_ulogic;
        cr: std_ulogic_vector(31 downto 0);
        xerc: xer_common_t;
        lr: std_ulogic;
@@ -126,7 +129,8 @@ package common is
        data_len: std_ulogic_vector(3 downto 0);
     end record;
     constant Decode2ToExecute1Init : Decode2ToExecute1Type :=
-       (valid => '0', insn_type => OP_ILLEGAL, lr => '0', rc => '0', oe => '0', invert_a => '0',
+       (valid => '0', insn_type => OP_ILLEGAL, bypass_data1 => '0', bypass_data2 => '0', bypass_data3 => '0',
+         lr => '0', rc => '0', oe => '0', invert_a => '0',
         invert_out => '0', input_carry => ZERO, output_carry => '0', input_cr => '0', output_cr => '0',
         is_32bit => '0', is_signed => '0', xerc => xerc_init, others => (others => '0'));
 
index ead3c1ffe20ab79a2861680e73a33b65cd9518b3..064ff98529e79eab93680ed20c40c979789caa12 100644 (file)
@@ -21,6 +21,7 @@ entity control is
 
         gpr_write_valid_in  : in std_ulogic;
         gpr_write_in        : in gspr_index_t;
+        gpr_bypassable      : in std_ulogic;
 
         gpr_a_read_valid_in : in std_ulogic;
         gpr_a_read_in       : in gspr_index_t;
@@ -36,7 +37,11 @@ entity control is
 
         valid_out           : out std_ulogic;
         stall_out           : out std_ulogic;
-        stopped_out         : out std_ulogic
+        stopped_out         : out std_ulogic;
+
+        gpr_bypass_a        : out std_ulogic;
+        gpr_bypass_b        : out std_ulogic;
+        gpr_bypass_c        : out std_ulogic
         );
 end entity control;
 
@@ -71,10 +76,12 @@ begin
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
+            bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_a_read_valid_in,
             gpr_read_in        => gpr_a_read_in,
 
-            stall_out          => stall_a_out
+            stall_out          => stall_a_out,
+            use_bypass         => gpr_bypass_a
             );
 
     gpr_hazard1: entity work.gpr_hazard
@@ -87,10 +94,12 @@ begin
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
+            bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_b_read_valid_in,
             gpr_read_in        => gpr_b_read_in,
 
-            stall_out          => stall_b_out
+            stall_out          => stall_b_out,
+            use_bypass         => gpr_bypass_b
             );
 
     gpr_c_read_in_fmt <= "0" & gpr_c_read_in;
@@ -105,10 +114,12 @@ begin
 
             gpr_write_valid_in => gpr_write_valid,
             gpr_write_in       => gpr_write_in,
+            bypass_avail       => gpr_bypassable,
             gpr_read_valid_in  => gpr_c_read_valid_in,
             gpr_read_in        => gpr_c_read_in_fmt,
 
-            stall_out          => stall_c_out
+            stall_out          => stall_c_out,
+            use_bypass         => gpr_bypass_c
             );
 
     cr_hazard0: entity work.cr_hazard
index a38cf36b3030360b8a080bf36a48e2e51a181efd..aa86689729307f822f1597c6782809533f1f28a4 100644 (file)
--- a/core.vhdl
+++ b/core.vhdl
@@ -9,7 +9,8 @@ use work.wishbone_types.all;
 entity core is
     generic (
         SIM : boolean := false;
-       DISABLE_FLATTEN : boolean := false
+       DISABLE_FLATTEN : boolean := false;
+        EX1_BYPASS : boolean := true
         );
     port (
         clk          : in std_logic;
@@ -176,6 +177,9 @@ begin
     decode1_stall_in <= decode2_stall_out;
 
     decode2_0: entity work.decode2
+        generic map (
+            EX1_BYPASS => EX1_BYPASS
+            )
         port map (
             clk => clk,
             rst => core_rst,
@@ -220,6 +224,9 @@ begin
             );
 
     execute1_0: entity work.execute1
+        generic map (
+            EX1_BYPASS => EX1_BYPASS
+            )
         port map (
             clk => clk,
             rst => core_rst,
index 6cd457403e8ad1300fa9ddafa2443cdde36d71b5..6e3bd8ae03695034f7b2fa3a0836c95b681c4b6b 100644 (file)
@@ -9,6 +9,9 @@ use work.helpers.all;
 use work.insn_helpers.all;
 
 entity decode2 is
+        generic (
+                EX1_BYPASS : boolean := true
+        );
        port (
                clk   : in std_ulogic;
                rst   : in std_ulogic;
@@ -184,15 +187,19 @@ architecture behaviour of decode2 is
 
        signal gpr_write_valid : std_ulogic;
        signal gpr_write : gspr_index_t;
+        signal gpr_bypassable  : std_ulogic;
 
        signal gpr_a_read_valid : std_ulogic;
        signal gpr_a_read :gspr_index_t;
+        signal gpr_a_bypass : std_ulogic;
 
        signal gpr_b_read_valid : std_ulogic;
        signal gpr_b_read : gspr_index_t;
+        signal gpr_b_bypass : std_ulogic;
 
        signal gpr_c_read_valid : std_ulogic;
        signal gpr_c_read : gpr_index_t;
+        signal gpr_c_bypass : std_ulogic;
 
        signal cr_write_valid : std_ulogic;
 begin
@@ -213,6 +220,7 @@ begin
 
                gpr_write_valid_in => gpr_write_valid,
                gpr_write_in       => gpr_write,
+                gpr_bypassable     => gpr_bypassable,
 
                gpr_a_read_valid_in  => gpr_a_read_valid,
                gpr_a_read_in        => gpr_a_read,
@@ -228,7 +236,11 @@ begin
 
                valid_out   => control_valid_out,
                stall_out   => stall_out,
-               stopped_out => stopped_out
+               stopped_out => stopped_out,
+
+                gpr_bypass_a => gpr_a_bypass,
+                gpr_bypass_b => gpr_b_bypass,
+                gpr_bypass_c => gpr_c_bypass
        );
 
        decode2_0: process(clk)
@@ -295,9 +307,12 @@ begin
                v.e.insn_type := d_in.decode.insn_type;
                v.e.read_reg1 := decoded_reg_a.reg;
                v.e.read_data1 := decoded_reg_a.data;
+                v.e.bypass_data1 := gpr_a_bypass;
                v.e.read_reg2 := decoded_reg_b.reg;
                v.e.read_data2 := decoded_reg_b.data;
+                v.e.bypass_data2 := gpr_b_bypass;
                 v.e.read_data3 := decoded_reg_c.data;
+                v.e.bypass_data3 := gpr_c_bypass;
                v.e.write_reg := decoded_reg_o.reg;
                v.e.rc := decode_rc(d_in.decode.rc, d_in.insn);
                 if not (d_in.decode.insn_type = OP_MUL_H32 or d_in.decode.insn_type = OP_MUL_H64) then
@@ -342,6 +357,10 @@ begin
 
                gpr_write_valid <= decoded_reg_o.reg_valid;
                gpr_write <= decoded_reg_o.reg;
+                gpr_bypassable <= '0';
+                if EX1_BYPASS and d_in.decode.unit = ALU then
+                        gpr_bypassable <= '1';
+                end if;
 
                gpr_a_read_valid <= decoded_reg_a.reg_valid;
                gpr_a_read <= decoded_reg_a.reg;
index 5a626f882777172b7119b9f18259584e5d570b66..d63697cc29dad927d1be26c84bead0d68f270263 100644 (file)
@@ -11,6 +11,9 @@ use work.insn_helpers.all;
 use work.ppc_fx_insns.all;
 
 entity execute1 is
+    generic (
+        EX1_BYPASS : boolean := true
+        );
     port (
        clk   : in std_ulogic;
         rst   : in std_ulogic;
@@ -46,6 +49,8 @@ architecture behaviour of execute1 is
 
     signal r, rin : reg_type;
 
+    signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
+
     signal ctrl: ctrl_t := (others => (others => '0'));
     signal ctrl_tmp: ctrl_t := (others => (others => '0'));
 
@@ -109,9 +114,9 @@ begin
 
     rotator_0: entity work.rotator
        port map (
-           rs => e_in.read_data3,
-           ra => e_in.read_data1,
-           shift => e_in.read_data2(6 downto 0),
+           rs => c_in,
+           ra => a_in,
+           shift => b_in(6 downto 0),
            insn => e_in.insn,
            is_32bit => e_in.is_32bit,
            right_shift => right_shift,
@@ -124,8 +129,8 @@ begin
 
     logical_0: entity work.logical
        port map (
-           rs => e_in.read_data3,
-           rb => e_in.read_data2,
+           rs => c_in,
+           rb => b_in,
            op => e_in.insn_type,
            invert_in => e_in.invert_a,
            invert_out => e_in.invert_out,
@@ -137,7 +142,7 @@ begin
 
     countzero_0: entity work.zero_counter
        port map (
-           rs => e_in.read_data3,
+           rs => c_in,
            count_right => e_in.insn(10),
            is_32bit => e_in.is_32bit,
            result => countzero_result
@@ -158,6 +163,10 @@ begin
             d_out => divider_to_x
             );
 
+    a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1;
+    b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2;
+    c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3;
+
     execute1_0: process(clk)
     begin
        if rising_edge(clk) then
@@ -256,21 +265,21 @@ begin
 
        if e_in.is_32bit = '1' then
            if e_in.is_signed = '1' then
-               x_to_multiply.data1 <= (others => e_in.read_data1(31));
-               x_to_multiply.data1(31 downto 0) <= e_in.read_data1(31 downto 0);
-               x_to_multiply.data2 <= (others => e_in.read_data2(31));
-               x_to_multiply.data2(31 downto 0) <= e_in.read_data2(31 downto 0);
+               x_to_multiply.data1 <= (others => a_in(31));
+               x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0);
+               x_to_multiply.data2 <= (others => b_in(31));
+               x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0);
            else
-               x_to_multiply.data1 <= '0' & x"00000000" & e_in.read_data1(31 downto 0);
-               x_to_multiply.data2 <= '0' & x"00000000" & e_in.read_data2(31 downto 0);
+               x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0);
+               x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0);
            end if;
        else
            if e_in.is_signed = '1' then
-               x_to_multiply.data1 <= e_in.read_data1(63) & e_in.read_data1;
-               x_to_multiply.data2 <= e_in.read_data2(63) & e_in.read_data2;
+               x_to_multiply.data1 <= a_in(63) & a_in;
+               x_to_multiply.data2 <= b_in(63) & b_in;
            else
-               x_to_multiply.data1 <= '0' & e_in.read_data1;
-               x_to_multiply.data2 <= '0' & e_in.read_data2;
+               x_to_multiply.data1 <= '0' & a_in;
+               x_to_multiply.data2 <= '0' & b_in;
            end if;
        end if;
 
@@ -279,23 +288,23 @@ begin
         sign2 := '0';
         if e_in.is_signed = '1' then
             if e_in.is_32bit = '1' then
-                sign1 := e_in.read_data1(31);
-                sign2 := e_in.read_data2(31);
+                sign1 := a_in(31);
+                sign2 := b_in(31);
             else
-                sign1 := e_in.read_data1(63);
-                sign2 := e_in.read_data2(63);
+                sign1 := a_in(63);
+                sign2 := b_in(63);
             end if;
         end if;
         -- take absolute values
         if sign1 = '0' then
-            abs1 := signed(e_in.read_data1);
+            abs1 := signed(a_in);
         else
-            abs1 := - signed(e_in.read_data1);
+            abs1 := - signed(a_in);
         end if;
         if sign2 = '0' then
-            abs2 := signed(e_in.read_data2);
+            abs2 := signed(b_in);
         else
-            abs2 := - signed(e_in.read_data2);
+            abs2 := - signed(b_in);
         end if;
 
         x_to_divider <= Execute1ToDividerInit;
@@ -358,14 +367,14 @@ begin
                -- Do nothing
            when OP_ADD | OP_CMP =>
                if e_in.invert_a = '0' then
-                   a_inv := e_in.read_data1;
+                   a_inv := a_in;
                else
-                   a_inv := not e_in.read_data1;
+                   a_inv := not a_in;
                end if;
-               result_with_carry := ppc_adde(a_inv, e_in.read_data2,
+               result_with_carry := ppc_adde(a_inv, b_in,
                                              decode_input_carry(e_in.input_carry, v.e.xerc));
                result := result_with_carry(63 downto 0);
-                carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32);
+                carry_32 := result(32) xor a_inv(32) xor b_in(32);
                 carry_64 := result_with_carry(64);
                 if e_in.insn_type = OP_ADD then
                     if e_in.output_carry = '1' then
@@ -373,8 +382,8 @@ begin
                     end if;
                     if e_in.oe = '1' then
                         set_ov(v.e,
-                               calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)),
-                               calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31)));
+                               calc_ov(a_inv(63), b_in(63), carry_64, result_with_carry(63)),
+                               calc_ov(a_inv(31), b_in(31), carry_32, result_with_carry(31)));
                     end if;
                     result_en := '1';
                 else
@@ -385,20 +394,20 @@ begin
                     v.e.write_cr_enable := '1';
                     crnum := to_integer(unsigned(bf));
                     v.e.write_cr_mask := num_to_fxm(crnum);
-                    zerolo := not (or (e_in.read_data1(31 downto 0) xor e_in.read_data2(31 downto 0)));
-                    zerohi := not (or (e_in.read_data1(63 downto 32) xor e_in.read_data2(63 downto 32)));
+                    zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0)));
+                    zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32)));
                     if zerolo = '1' and (l = '0' or zerohi = '1') then
                         -- values are equal
                         newcrf := "001" & v.e.xerc.so;
                     else
                         if l = '1' then
                             -- 64-bit comparison
-                            msb_a := e_in.read_data1(63);
-                            msb_b := e_in.read_data2(63);
+                            msb_a := a_in(63);
+                            msb_b := b_in(63);
                         else
                             -- 32-bit comparison
-                            msb_a := e_in.read_data1(31);
-                            msb_b := e_in.read_data2(31);
+                            msb_a := a_in(31);
+                            msb_b := b_in(31);
                         end if;
                         if msb_a /= msb_b then
                             -- Subtraction might overflow, but
@@ -424,25 +433,25 @@ begin
            when OP_B =>
                f_out.redirect <= '1';
                if (insn_aa(e_in.insn)) then
-                   f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2));
+                   f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
                else
-                   f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2));
+                   f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
                end if;
            when OP_BC =>
                -- read_data1 is CTR
                bo := insn_bo(e_in.insn);
                bi := insn_bi(e_in.insn);
                if bo(4-2) = '0' then
-                   result := std_ulogic_vector(unsigned(e_in.read_data1) - 1);
+                   result := std_ulogic_vector(unsigned(a_in) - 1);
                    result_en := '1';
                    v.e.write_reg := fast_spr_num(SPR_CTR);
                end if;
-               if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then
+               if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
                    f_out.redirect <= '1';
                    if (insn_aa(e_in.insn)) then
-                       f_out.redirect_nia <= std_ulogic_vector(signed(e_in.read_data2));
+                       f_out.redirect_nia <= std_ulogic_vector(signed(b_in));
                    else
-                       f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(e_in.read_data2));
+                       f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
                    end if;
                end if;
            when OP_BCREG =>
@@ -451,40 +460,40 @@ begin
                bo := insn_bo(e_in.insn);
                bi := insn_bi(e_in.insn);
                if bo(4-2) = '0' and e_in.insn(10) = '0' then
-                   result := std_ulogic_vector(unsigned(e_in.read_data1) - 1);
+                   result := std_ulogic_vector(unsigned(a_in) - 1);
                    result_en := '1';
                    v.e.write_reg := fast_spr_num(SPR_CTR);
                end if;
-               if ppc_bc_taken(bo, bi, e_in.cr, e_in.read_data1) = 1 then
+               if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
                    f_out.redirect <= '1';
-                   f_out.redirect_nia <= e_in.read_data2(63 downto 2) & "00";
+                   f_out.redirect_nia <= b_in(63 downto 2) & "00";
                end if;
            when OP_CMPB =>
-               result := ppc_cmpb(e_in.read_data3, e_in.read_data2);
+               result := ppc_cmpb(c_in, b_in);
                result_en := '1';
            when OP_CNTZ =>
                result := countzero_result;
                result_en := '1';
             when OP_EXTS =>
                 -- note data_len is a 1-hot encoding
-               negative := (e_in.data_len(0) and e_in.read_data3(7)) or
-                           (e_in.data_len(1) and e_in.read_data3(15)) or
-                           (e_in.data_len(2) and e_in.read_data3(31));
+               negative := (e_in.data_len(0) and c_in(7)) or
+                           (e_in.data_len(1) and c_in(15)) or
+                           (e_in.data_len(2) and c_in(31));
                result := (others => negative);
                if e_in.data_len(2) = '1' then
-                   result(31 downto 16) := e_in.read_data3(31 downto 16);
+                   result(31 downto 16) := c_in(31 downto 16);
                end if;
                if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then
-                   result(15 downto 8) := e_in.read_data3(15 downto 8);
+                   result(15 downto 8) := c_in(15 downto 8);
                end if;
-               result(7 downto 0) := e_in.read_data3(7 downto 0);
+               result(7 downto 0) := c_in(7 downto 0);
                result_en := '1';
            when OP_ISEL =>
                crbit := to_integer(unsigned(insn_bc(e_in.insn)));
                if e_in.cr(31-crbit) = '1' then
-                   result := e_in.read_data1;
+                   result := a_in;
                else
-                   result := e_in.read_data2;
+                   result := b_in;
                end if;
                result_en := '1';
            when OP_MCRF =>
@@ -549,7 +558,7 @@ begin
                end if;
            when OP_MFSPR =>
                if is_fast_spr(e_in.read_reg1) then
-                   result := e_in.read_data1;
+                   result := a_in;
                    if decode_spr_num(e_in.insn) = SPR_XER then
                        -- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer
                        result(63 downto 32) := (others => '0');
@@ -596,19 +605,19 @@ begin
                    crnum := fxm_to_num(insn_fxm(e_in.insn));
                    v.e.write_cr_mask := num_to_fxm(crnum);
                end if;
-               v.e.write_cr_data := e_in.read_data3(31 downto 0);
+               v.e.write_cr_data := c_in(31 downto 0);
            when OP_MTSPR =>
                report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
-                   "=" & to_hstring(e_in.read_data3);
+                   "=" & to_hstring(c_in);
                if is_fast_spr(e_in.write_reg) then
-                   result := e_in.read_data3;
+                   result := c_in;
                    result_en := '1';
                    if decode_spr_num(e_in.insn) = SPR_XER then
-                       v.e.xerc.so := e_in.read_data3(63-32);
-                       v.e.xerc.ov := e_in.read_data3(63-33);
-                       v.e.xerc.ca := e_in.read_data3(63-34);
-                       v.e.xerc.ov32 := e_in.read_data3(63-44);
-                       v.e.xerc.ca32 := e_in.read_data3(63-45);
+                       v.e.xerc.so := c_in(63-32);
+                       v.e.xerc.ov := c_in(63-33);
+                       v.e.xerc.ca := c_in(63-34);
+                       v.e.xerc.ov32 := c_in(63-44);
+                       v.e.xerc.ca32 := c_in(63-45);
                        v.e.write_xerc_enable := '1';
                    end if;
                else
index 705e69d025753897ca272fabf953d2df30601f30..de4f7d29d2adef8da8621438f46604cc26b403a5 100644 (file)
@@ -12,18 +12,21 @@ entity gpr_hazard is
 
         gpr_write_valid_in : in std_ulogic;
         gpr_write_in       : in std_ulogic_vector(5 downto 0);
+        bypass_avail       : in std_ulogic;
         gpr_read_valid_in  : in std_ulogic;
         gpr_read_in        : in std_ulogic_vector(5 downto 0);
 
-        stall_out          : out std_ulogic
+        stall_out          : out std_ulogic;
+        use_bypass         : out std_ulogic
         );
 end entity gpr_hazard;
 architecture behaviour of gpr_hazard is
     type pipeline_entry_type is record
-        valid : std_ulogic;
-        gpr   : std_ulogic_vector(5 downto 0);
+        valid  : std_ulogic;
+        bypass : std_ulogic;
+        gpr    : std_ulogic_vector(5 downto 0);
     end record;
-    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', gpr => (others => '0'));
+    constant pipeline_entry_init : pipeline_entry_type := (valid => '0', bypass => '0', gpr => (others => '0'));
 
     type pipeline_t is array(0 to PIPELINE_DEPTH-1) of pipeline_entry_type;
     constant pipeline_t_init : pipeline_t := (others => pipeline_entry_init);
@@ -33,9 +36,7 @@ begin
     gpr_hazard0: process(clk)
     begin
         if rising_edge(clk) then
-           if stall_in = '0' then
-               r <= rin;
-           end if;
+            r <= rin;
         end if;
     end process;
 
@@ -45,22 +46,49 @@ begin
         v := r;
 
         stall_out <= '0';
-        loop_0: for i in 0 to PIPELINE_DEPTH-1 loop
-            if ((r(i).valid = gpr_read_valid_in) and r(i).gpr = gpr_read_in) then
-                stall_out <= '1';
+        use_bypass <= '0';
+        if gpr_read_valid_in = '1' then
+            if r(0).valid = '1' and r(0).gpr = gpr_read_in then
+                if r(0).bypass = '1' and stall_in = '0' then
+                    use_bypass <= '1';
+                else
+                    stall_out <= '1';
+                end if;
             end if;
-        end loop;
+            loop_0: for i in 1 to PIPELINE_DEPTH-1 loop
+                if r(i).valid = '1' and r(i).gpr = gpr_read_in then
+                    if r(i).bypass = '1' then
+                        use_bypass <= '1';
+                    else
+                        stall_out <= '1';
+                    end if;
+                end if;
+            end loop;
+        end if;
 
-        v(0).valid := gpr_write_valid_in;
-        v(0).gpr   := gpr_write_in;
-        loop_1: for i in 0 to PIPELINE_DEPTH-2 loop
-            -- propagate to next slot
-            v(i+1) := r(i);
-        end loop;
+        if stall_in = '0' then
+            v(0).valid  := gpr_write_valid_in;
+            v(0).bypass := bypass_avail;
+            v(0).gpr    := gpr_write_in;
+            loop_1: for i in 1 to PIPELINE_DEPTH-1 loop
+                -- propagate to next slot
+                v(i).valid  := r(i-1).valid;
+                v(i).bypass := r(i-1).bypass;
+                v(i).gpr    := r(i-1).gpr;
+            end loop;
 
-        -- asynchronous output
-        if gpr_read_valid_in = '0' then
-            stall_out <= '0';
+        else
+            -- stage 0 stalled, so stage 1 becomes empty
+            loop_1b: for i in 1 to PIPELINE_DEPTH-1 loop
+                -- propagate to next slot
+                if i = 1 then
+                    v(i).valid := '0';
+                else
+                    v(i).valid  := r(i-1).valid;
+                    v(i).bypass := r(i-1).bypass;
+                    v(i).gpr    := r(i-1).gpr;
+                end if;
+            end loop;
         end if;
 
         -- update registers