loadstore1: Improve timing of data path from cache RAM to writeback

[microwatt.git] / fpu.vhdl
diff --git a/fpu.vhdl b/fpu.vhdl

index 5e3038605d98171ab8c45dbb0126518d9b235535..2e8096a5bf6061cfe7d4bb50840775844938f37a 100644 (file)
--- a/fpu.vhdl
+++ b/fpu.vhdl
@@ -44,7 +44,7 @@ architecture behaviour of fpu is
                       DO_FRE, DO_FRSQRTE,
                       DO_FSEL,
                       FRI_1,
-                     ADD_SHIFT, ADD_2, ADD_3,
+                     ADD_1, ADD_SHIFT, ADD_2, ADD_3,
                       CMP_1, CMP_2,
                       MULT_1,
                       FMADD_1, FMADD_2, FMADD_3,
@@ -65,7 +65,8 @@ architecture behaviour of fpu is
                       DENORM,
                       RENORM_A, RENORM_A2,
                       RENORM_B, RENORM_B2,
-                     RENORM_C, RENORM_C2);
+                     RENORM_C, RENORM_C2,
+                     NAN_RESULT, EXC_RESULT);
  
      type reg_type is record
          state        : state_t;
@@ -111,6 +112,13 @@ architecture behaviour of fpu is
          first        : std_ulogic;
          count        : unsigned(1 downto 0);
          doing_ftdiv  : std_ulogic_vector(1 downto 0);
+        opsel_a      : std_ulogic_vector(1 downto 0);
+        use_a        : std_ulogic;
+        use_b        : std_ulogic;
+        use_c        : std_ulogic;
+        invalid      : std_ulogic;
+        negate       : std_ulogic;
+        longmask     : std_ulogic;
      end record;
  
      type lookup_table is array(0 to 1023) of std_ulogic_vector(17 downto 0);
@@ -118,12 +126,11 @@ architecture behaviour of fpu is
      signal r, rin : reg_type;
  
      signal fp_result     : std_ulogic_vector(63 downto 0);
-    signal opsel_a       : std_ulogic_vector(1 downto 0);
      signal opsel_b       : std_ulogic_vector(1 downto 0);
      signal opsel_r       : std_ulogic_vector(1 downto 0);
      signal opsel_s       : std_ulogic_vector(1 downto 0);
      signal opsel_ainv    : std_ulogic;
-    signal opsel_amask   : std_ulogic;
+    signal opsel_mask    : std_ulogic;
      signal opsel_binv    : std_ulogic;
      signal in_a          : std_ulogic_vector(63 downto 0);
      signal in_b          : std_ulogic_vector(63 downto 0);
@@ -150,7 +157,7 @@ architecture behaviour of fpu is
  
      constant BIN_ZERO : std_ulogic_vector(1 downto 0) := "00";
      constant BIN_R    : std_ulogic_vector(1 downto 0) := "01";
-    constant BIN_MASK : std_ulogic_vector(1 downto 0) := "10";
+    constant BIN_RND  : std_ulogic_vector(1 downto 0) := "10";
      constant BIN_PS6  : std_ulogic_vector(1 downto 0) := "11";
  
      constant RES_SUM   : std_ulogic_vector(1 downto 0) := "00";
@@ -609,7 +616,6 @@ begin
          variable need_check  : std_ulogic;
          variable msb         : std_ulogic;
          variable is_add      : std_ulogic;
-        variable longmask    : std_ulogic;
          variable set_a       : std_ulogic;
          variable set_b       : std_ulogic;
          variable set_c       : std_ulogic;
@@ -625,6 +631,8 @@ begin
          variable shiftin     : std_ulogic;
          variable mulexp      : signed(EXP_BITS-1 downto 0);
          variable maddend     : std_ulogic_vector(127 downto 0);
+        variable sum         : std_ulogic_vector(63 downto 0);
+        variable round_inc   : std_ulogic_vector(63 downto 0);
      begin
          v := r;
          illegal := '0';
@@ -638,6 +646,7 @@ begin
              v.fe_mode := or (e_in.fe_mode);
              v.dest_fpr := e_in.frt;
              v.single_prec := e_in.single;
+            v.longmask := e_in.single;
              v.int_result := '0';
              v.rc := e_in.rc;
              v.is_cmp := e_in.out_cr;
@@ -724,9 +733,9 @@ begin
          v.update_fprf := '0';
          v.shift := to_signed(0, EXP_BITS);
          v.first := '0';
-        opsel_a <= AIN_R;
+        v.opsel_a := AIN_R;
          opsel_ainv <= '0';
-        opsel_amask <= '0';
+        opsel_mask <= '0';
          opsel_b <= BIN_ZERO;
          opsel_binv <= '0';
          opsel_r <= RES_SUM;
@@ -741,7 +750,6 @@ begin
          renormalize := '0';
          set_x := '0';
          qnan_result := '0';
-        longmask := r.single_prec;
          set_a := '0';
          set_b := '0';
          set_c := '0';
@@ -758,6 +766,11 @@ begin
          shiftin := '0';
          case r.state is
              when IDLE =>
+                v.use_a := '0';
+                v.use_b := '0';
+                v.use_c := '0';
+                v.invalid := '0';
+                v.negate := '0';
                  if e_in.valid = '1' then
                      case e_in.insn(5 downto 1) is
                          when "00000" =>
@@ -770,6 +783,7 @@ begin
                              elsif e_in.insn(7) = '1' then
                                  v.state := DO_MCRFS;
                              else
+                                v.opsel_a := AIN_B;
                                  v.state := DO_FCMP;
                              end if;
                          when "00110" =>
@@ -789,14 +803,17 @@ begin
                                  v.state := DO_MTFSF;
                              end if;
                          when "01000" =>
+                            v.opsel_a := AIN_B;
                              if e_in.insn(9 downto 8) /= "11" then
                                  v.state := DO_FMR;
                              else
                                  v.state := DO_FRI;
                              end if;
                          when "01100" =>
+                            v.opsel_a := AIN_B;
                              v.state := DO_FRSP;
                          when "01110" =>
+                            v.opsel_a := AIN_B;
                              if int_input = '1' then
                                  -- fcfid[u][s]
                                  v.state := DO_FCFID;
@@ -805,25 +822,45 @@ begin
                              end if;
                          when "01111" =>
                              v.round_mode := "001";
+                            v.opsel_a := AIN_B;
                              v.state := DO_FCTI;
                          when "10010" =>
+                            v.opsel_a := AIN_A;
+                            if v.b.mantissa(54) = '0' and v.a.mantissa(54) = '1' then
+                                v.opsel_a := AIN_B;
+                            end if;
                              v.state := DO_FDIV;
                          when "10100" | "10101" =>
+                            v.opsel_a := AIN_A;
                              v.state := DO_FADD;
                          when "10110" =>
                              v.is_sqrt := '1';
+                            v.opsel_a := AIN_B;
                              v.state := DO_FSQRT;
                          when "10111" =>
                              v.state := DO_FSEL;
                          when "11000" =>
+                            v.opsel_a := AIN_B;
                              v.state := DO_FRE;
                          when "11001" =>
                              v.is_multiply := '1';
+                            v.opsel_a := AIN_A;
+                            if v.c.mantissa(54) = '0' and v.a.mantissa(54) = '1' then
+                                v.opsel_a := AIN_C;
+                            end if;
                              v.state := DO_FMUL;
                          when "11010" =>
                              v.is_sqrt := '1';
+                            v.opsel_a := AIN_B;
                              v.state := DO_FRSQRTE;
                          when "11100" | "11101" | "11110" | "11111" =>
+                            if v.a.mantissa(54) = '0' then
+                                v.opsel_a := AIN_A;
+                            elsif v.c.mantissa(54) = '0' then
+                                v.opsel_a := AIN_C;
+                            else
+                                v.opsel_a := AIN_B;
+                            end if;
                              v.state := DO_FMADD;
                          when others =>
                              illegal := '1';
@@ -880,11 +917,10 @@ begin
  
              when DO_FCMP =>
                  -- fcmp[uo]
+                -- r.opsel_a = AIN_B
                  v.instr_done := '1';
                  v.state := IDLE;
                  update_fx := '1';
-                opsel_a <= AIN_B;
-                opsel_r <= RES_SUM;
                  v.result_exp := r.b.exponent;
                  if (r.a.class = NAN and r.a.mantissa(53) = '0') or
                      (r.b.class = NAN and r.b.mantissa(53) = '0') then
@@ -930,6 +966,7 @@ begin
                      -- Prepare to subtract mantissas, put B in R
                      v.cr_result := "0000";
                      v.instr_done := '0';
+                    v.opsel_a := AIN_A;
                      v.state := CMP_1;
                  end if;
                  v.fpscr(FPSCR_FL downto FPSCR_FU) := v.cr_result;
@@ -1017,7 +1054,7 @@ begin
                  v.state := IDLE;
  
              when DO_FMR =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                  v.result_class := r.b.class;
                  v.result_exp := r.b.exponent;
                  v.quieten_nan := '0';
@@ -1037,7 +1074,7 @@ begin
                  v.state := IDLE;
  
              when DO_FRI =>    -- fri[nzpm]
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                  v.result_class := r.b.class;
                  v.result_sign := r.b.negative;
                  v.result_exp := r.b.exponent;
@@ -1062,7 +1099,7 @@ begin
                  end if;
  
              when DO_FRSP =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B, r.shift = 0
                  v.result_class := r.b.class;
                  v.result_sign := r.b.negative;
                  v.result_exp := r.b.exponent;
@@ -1081,7 +1118,6 @@ begin
                      elsif r.b.exponent > to_signed(127, EXP_BITS) then
                          v.state := ROUND_OFLOW;
                      else
-                        v.shift := to_signed(-2, EXP_BITS);
                          v.state := ROUNDING;
                      end if;
                  else
@@ -1092,7 +1128,7 @@ begin
                  -- instr bit 9: 1=dword 0=word
                  -- instr bit 8: 1=unsigned 0=signed
                  -- instr bit 1: 1=round to zero 0=use fpscr[RN]
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                  v.result_class := r.b.class;
                  v.result_sign := r.b.negative;
                  v.result_exp := r.b.exponent;
@@ -1130,8 +1166,8 @@ begin
                  end case;
  
              when DO_FCFID =>
+                -- r.opsel_a = AIN_B
                  v.result_sign := '0';
-                opsel_a <= AIN_B;
                  if r.insn(8) = '0' and r.b.negative = '1' then
                      -- fcfid[s] with negative operand, set R = -B
                      opsel_ainv <= '1';
@@ -1150,96 +1186,78 @@ begin
  
              when DO_FADD =>
                  -- fadd[s] and fsub[s]
-                opsel_a <= AIN_A;
+                -- r.opsel_a = AIN_A
                  v.result_sign := r.a.negative;
                  v.result_class := r.a.class;
                  v.result_exp := r.a.exponent;
                  v.fpscr(FPSCR_FR) := '0';
                  v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_b := '1';
                  is_add := r.a.negative xor r.b.negative xor r.insn(1);
                  if r.a.class = FINITE and r.b.class = FINITE then
                      v.is_subtract := not is_add;
                      v.add_bsmall := r.exp_cmp;
+                    v.opsel_a := AIN_B;
                      if r.exp_cmp = '0' then
                          v.shift := r.a.exponent - r.b.exponent;
                          v.result_sign := r.b.negative xnor r.insn(1);
                          if r.a.exponent = r.b.exponent then
                              v.state := ADD_2;
                          else
+                            v.longmask := '0';
                              v.state := ADD_SHIFT;
                          end if;
                      else
-                        opsel_a <= AIN_B;
-                        v.shift := r.b.exponent - r.a.exponent;
-                        v.result_exp := r.b.exponent;
-                        v.state := ADD_SHIFT;
+                        v.state := ADD_1;
                      end if;
                  else
-                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
-                        (r.b.class = NAN and r.b.mantissa(53) = '0') then
-                        -- Signalling NAN
-                        v.fpscr(FPSCR_VXSNAN) := '1';
-                        invalid := '1';
-                    end if;
-                    if r.a.class = NAN then
-                        -- nothing to do, result is A
-                    elsif r.b.class = NAN then
-                        v.result_class := NAN;
-                        v.result_sign := r.b.negative;
-                        opsel_a <= AIN_B;
+                    if r.a.class = NAN or r.b.class = NAN then
+                        v.state := NAN_RESULT;
                      elsif r.a.class = INFINITY and r.b.class = INFINITY and is_add = '0' then
                          -- invalid operation, construct QNaN
                          v.fpscr(FPSCR_VXISI) := '1';
                          qnan_result := '1';
+                        arith_done := '1';
                      elsif r.a.class = ZERO and r.b.class = ZERO and is_add = '0' then
                          -- return -0 for rounding to -infinity
                          v.result_sign := r.round_mode(1) and r.round_mode(0);
+                        arith_done := '1';
                      elsif r.a.class = INFINITY or r.b.class = ZERO then
-                        -- nothing to do, result is A
+                        -- result is A
+                        v.opsel_a := AIN_A;
+                        v.state := EXC_RESULT;
                      else
                          -- result is +/- B
-                        v.result_sign := r.b.negative xnor r.insn(1);
-                        v.result_class := r.b.class;
-                        v.result_exp := r.b.exponent;
-                        opsel_a <= AIN_B;
+                        v.opsel_a := AIN_B;
+                        v.negate := not r.insn(1);
+                        v.state := EXC_RESULT;
                      end if;
-                    arith_done := '1';
                  end if;
  
              when DO_FMUL =>
                  -- fmul[s]
-                opsel_a <= AIN_A;
-                v.result_sign := r.a.negative;
+                -- r.opsel_a = AIN_A unless C is denorm and A isn't
+                v.result_sign := r.a.negative xor r.c.negative;
                  v.result_class := r.a.class;
-                v.result_exp := r.a.exponent;
                  v.fpscr(FPSCR_FR) := '0';
                  v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_c := '1';
                  if r.a.class = FINITE and r.c.class = FINITE then
-                    v.result_sign := r.a.negative xor r.c.negative;
                      v.result_exp := r.a.exponent + r.c.exponent;
                      -- Renormalize denorm operands
                      if r.a.mantissa(54) = '0' then
                          v.state := RENORM_A;
                      elsif r.c.mantissa(54) = '0' then
-                        opsel_a <= AIN_C;
                          v.state := RENORM_C;
                      else
                          f_to_multiply.valid <= '1';
                          v.state := MULT_1;
                      end if;
                  else
-                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
-                        (r.c.class = NAN and r.c.mantissa(53) = '0') then
-                        -- Signalling NAN
-                        v.fpscr(FPSCR_VXSNAN) := '1';
-                        invalid := '1';
-                    end if;
-                    if r.a.class = NAN then
-                    -- result is A
-                    elsif r.c.class = NAN then
-                        v.result_class := NAN;
-                        v.result_sign := r.c.negative;
-                        opsel_a <= AIN_C;
+                    if r.a.class = NAN or r.c.class = NAN then
+                        v.state := NAN_RESULT;
                      elsif (r.a.class = INFINITY and r.c.class = ZERO) or
                          (r.a.class = ZERO and r.c.class = INFINITY) then
                          -- invalid operation, construct QNaN
@@ -1247,22 +1265,22 @@ begin
                          qnan_result := '1';
                      elsif r.a.class = ZERO or r.a.class = INFINITY then
                          -- result is +/- A
-                        v.result_sign := r.a.negative xor r.c.negative;
+                        arith_done := '1';
                      else
                          -- r.c.class is ZERO or INFINITY
-                        v.result_class := r.c.class;
-                        v.result_sign := r.a.negative xor r.c.negative;
+                        v.opsel_a := AIN_C;
+                        v.negate := r.a.negative;
+                        v.state := EXC_RESULT;
                      end if;
-                    arith_done := '1';
                  end if;
  
              when DO_FDIV =>
-                opsel_a <= AIN_A;
-                v.result_sign := r.a.negative;
+                -- r.opsel_a = AIN_A unless B is denorm and A isn't
                  v.result_class := r.a.class;
-                v.result_exp := r.a.exponent;
                  v.fpscr(FPSCR_FR) := '0';
                  v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_b := '1';
                  v.result_sign := r.a.negative xor r.b.negative;
                  v.result_exp := r.a.exponent - r.b.exponent;
                  v.count := "00";
@@ -1271,26 +1289,14 @@ begin
                      if r.a.mantissa(54) = '0' then
                          v.state := RENORM_A;
                      elsif r.b.mantissa(54) = '0' then
-                        opsel_a <= AIN_B;
                          v.state := RENORM_B;
                      else
                          v.first := '1';
                          v.state := DIV_2;
                      end if;
                  else
-                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
-                        (r.b.class = NAN and r.b.mantissa(53) = '0') then
-                        -- Signalling NAN
-                        v.fpscr(FPSCR_VXSNAN) := '1';
-                        invalid := '1';
-                    end if;
-                    if r.a.class = NAN then
-                        -- result is A
-                        v.result_sign := r.a.negative;
-                    elsif r.b.class = NAN then
-                        v.result_class := NAN;
-                        v.result_sign := r.b.negative;
-                        opsel_a <= AIN_B;
+                    if r.a.class = NAN or r.b.class = NAN then
+                        v.state := NAN_RESULT;
                      elsif r.b.class = INFINITY then
                          if r.a.class = INFINITY then
                              v.fpscr(FPSCR_VXIDI) := '1';
@@ -1298,6 +1304,7 @@ begin
                          else
                              v.result_class := ZERO;
                          end if;
+                        arith_done := '1';
                      elsif r.b.class = ZERO then
                          if r.a.class = ZERO then
                              v.fpscr(FPSCR_VXZDZ) := '1';
@@ -1308,46 +1315,36 @@ begin
                              end if;
                              v.result_class := INFINITY;
                          end if;
-                    -- else r.b.class = FINITE, result_class = r.a.class
+                        arith_done := '1';
+                    else -- r.b.class = FINITE, result_class = r.a.class
+                        arith_done := '1';
                      end if;
-                    arith_done := '1';
                  end if;
  
              when DO_FSEL =>
-                opsel_a <= AIN_A;
                  v.fpscr(FPSCR_FR) := '0';
                  v.fpscr(FPSCR_FI) := '0';
                  if r.a.class = ZERO or (r.a.negative = '0' and r.a.class /= NAN) then
-                    v.result_sign := r.c.negative;
-                    v.result_exp := r.c.exponent;
-                    v.result_class := r.c.class;
-                    opsel_a <= AIN_C;
+                    v.opsel_a := AIN_C;
                  else
-                    v.result_sign := r.b.negative;
-                    v.result_exp := r.b.exponent;
-                    v.result_class := r.b.class;
-                    opsel_a <= AIN_B;
+                    v.opsel_a := AIN_B;
                  end if;
                  v.quieten_nan := '0';
-                arith_done := '1';
+                v.state := EXC_RESULT;
  
              when DO_FSQRT =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                  v.result_class := r.b.class;
                  v.result_sign := r.b.negative;
                  v.fpscr(FPSCR_FR) := '0';
                  v.fpscr(FPSCR_FI) := '0';
-                if r.b.class = NAN and r.b.mantissa(53) = '0' then
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
+                v.use_b := '1';
                  case r.b.class is
                      when FINITE =>
                          v.result_exp := r.b.exponent;
                          if r.b.negative = '1' then
                              v.fpscr(FPSCR_VXSQRT) := '1';
                              qnan_result := '1';
-                            arith_done := '1';
                          elsif r.b.mantissa(54) = '0' then
                              v.state := RENORM_B;
                          elsif r.b.exponent(0) = '0' then
@@ -1356,7 +1353,9 @@ begin
                              v.shift := to_signed(1, EXP_BITS);
                              v.state := RENORM_B2;
                          end if;
-                    when NAN | ZERO =>
+                    when NAN =>
+                        v.state := NAN_RESULT;
+                    when ZERO =>
                          -- result is B
                          arith_done := '1';
                      when INFINITY =>
@@ -1369,15 +1368,12 @@ begin
                  end case;
  
              when DO_FRE =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                  v.result_class := r.b.class;
                  v.result_sign := r.b.negative;
                  v.fpscr(FPSCR_FR) := '0';
                  v.fpscr(FPSCR_FI) := '0';
-                if r.b.class = NAN and r.b.mantissa(53) = '0' then
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
+                v.use_b := '1';
                  case r.b.class is
                      when FINITE =>
                          v.result_exp := - r.b.exponent;
@@ -1387,8 +1383,7 @@ begin
                              v.state := FRE_1;
                          end if;
                      when NAN =>
-                        -- result is B
-                        arith_done := '1';
+                        v.state := NAN_RESULT;
                      when INFINITY =>
                          v.result_class := ZERO;
                          arith_done := '1';
@@ -1399,15 +1394,12 @@ begin
                  end case;
  
              when DO_FRSQRTE =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                  v.result_class := r.b.class;
                  v.result_sign := r.b.negative;
                  v.fpscr(FPSCR_FR) := '0';
                  v.fpscr(FPSCR_FI) := '0';
-                if r.b.class = NAN and r.b.mantissa(53) = '0' then
-                    v.fpscr(FPSCR_VXSNAN) := '1';
-                    invalid := '1';
-                end if;
+                v.use_b := '1';
                  v.shift := to_signed(1, EXP_BITS);
                  case r.b.class is
                      when FINITE =>
@@ -1415,7 +1407,6 @@ begin
                          if r.b.negative = '1' then
                              v.fpscr(FPSCR_VXSQRT) := '1';
                              qnan_result := '1';
-                            arith_done := '1';
                          elsif r.b.mantissa(54) = '0' then
                              v.state := RENORM_B;
                          elsif r.b.exponent(0) = '0' then
@@ -1424,8 +1415,7 @@ begin
                              v.state := RENORM_B2;
                          end if;
                      when NAN =>
-                        -- result is B
-                        arith_done := '1';
+                        v.state := NAN_RESULT;
                      when INFINITY =>
                          if r.b.negative = '1' then
                              v.fpscr(FPSCR_VXSQRT) := '1';
@@ -1442,25 +1432,26 @@ begin
  
              when DO_FMADD =>
                  -- fmadd, fmsub, fnmadd, fnmsub
-                opsel_a <= AIN_A;
+                -- r.opsel_a = AIN_A if A is denorm, else AIN_C if C is denorm,
+                -- else AIN_B
                  v.result_sign := r.a.negative;
                  v.result_class := r.a.class;
                  v.result_exp := r.a.exponent;
                  v.fpscr(FPSCR_FR) := '0';
                  v.fpscr(FPSCR_FI) := '0';
+                v.use_a := '1';
+                v.use_b := '1';
+                v.use_c := '1';
                  is_add := r.a.negative xor r.c.negative xor r.b.negative xor r.insn(1);
                  if r.a.class = FINITE and r.c.class = FINITE and
                      (r.b.class = FINITE or r.b.class = ZERO) then
                      v.is_subtract := not is_add;
                      mulexp := r.a.exponent + r.c.exponent;
                      v.result_exp := mulexp;
-                    opsel_a <= AIN_B;
                      -- Make sure A and C are normalized
                      if r.a.mantissa(54) = '0' then
-                        opsel_a <= AIN_A;
                          v.state := RENORM_A;
                      elsif r.c.mantissa(54) = '0' then
-                        opsel_a <= AIN_C;
                          v.state := RENORM_C;
                      elsif r.b.class = ZERO then
                          -- no addend, degenerates to multiply
@@ -1483,25 +1474,8 @@ begin
                          v.state := FMADD_2;
                      end if;
                  else
-                    if (r.a.class = NAN and r.a.mantissa(53) = '0') or
-                        (r.b.class = NAN and r.b.mantissa(53) = '0') or
-                        (r.c.class = NAN and r.c.mantissa(53) = '0') then
-                        -- Signalling NAN
-                        v.fpscr(FPSCR_VXSNAN) := '1';
-                        invalid := '1';
-                    end if;
-                    if r.a.class = NAN then
-                        -- nothing to do, result is A
-                    elsif r.b.class = NAN then
-                        -- result is B
-                        v.result_class := NAN;
-                        v.result_sign := r.b.negative;
-                        opsel_a <= AIN_B;
-                    elsif r.c.class = NAN then
-                        -- result is C
-                        v.result_class := NAN;
-                        v.result_sign := r.c.negative;
-                        opsel_a <= AIN_C;
+                    if r.a.class = NAN or r.b.class = NAN or r.c.class = NAN then
+                        v.state := NAN_RESULT;
                      elsif (r.a.class = ZERO and r.c.class = INFINITY) or
                          (r.a.class = INFINITY and r.c.class = ZERO) then
                          -- invalid operation, construct QNaN
@@ -1516,32 +1490,36 @@ begin
                              -- result is infinity
                              v.result_class := INFINITY;
                              v.result_sign := r.a.negative xor r.c.negative xor r.insn(2);
+                            arith_done := '1';
                          end if;
                      else
                          -- Here A is zero, C is zero, or B is infinity
                          -- Result is +/-B in all of those cases
-                        v.result_class := r.b.class;
-                        v.result_exp := r.b.exponent;
-                        if v.result_class /= ZERO or is_add = '1' then
-                            v.result_sign := not (r.b.negative xor r.insn(1) xor r.insn(2));
+                        v.opsel_a := AIN_B;
+                        if r.b.class /= ZERO or is_add = '1' then
+                            v.negate := not (r.insn(1) xor r.insn(2));
                          else
                              -- have to be careful about rule for 0 - 0 result sign
-                            v.result_sign := (r.round_mode(1) and r.round_mode(0)) xor r.insn(2);
+                            v.negate := r.b.negative xor (r.round_mode(1) and r.round_mode(0)) xor r.insn(2);
                          end if;
-                        opsel_a <= AIN_B;
+                        v.state := EXC_RESULT;
                      end if;
-                    arith_done := '1';
                  end if;
  
              when RENORM_A =>
                  renormalize := '1';
                  v.state := RENORM_A2;
+                if r.insn(4) = '1' then
+                    v.opsel_a := AIN_C;
+                else
+                    v.opsel_a := AIN_B;
+                end if;
  
              when RENORM_A2 =>
+                -- r.opsel_a = AIN_C for fmul/fmadd, AIN_B for fdiv
                  set_a := '1';
                  v.result_exp := new_exp;
                  if r.insn(4) = '1' then
-                    opsel_a <= AIN_C;
                      if r.c.mantissa(54) = '1' then
                          if r.insn(3) = '0' or r.b.class = ZERO then
                              v.first := '1';
@@ -1551,18 +1529,18 @@ begin
                              if new_exp + 1 >= r.b.exponent then
                                  v.madd_cmp := '1';
                              end if;
+                            v.opsel_a := AIN_B;
                              v.state := DO_FMADD;
                          end if;
                      else
                          v.state := RENORM_C;
                      end if;
                  else
-                        opsel_a <= AIN_B;
-                        if r.b.mantissa(54) = '1' then
-                            v.first := '1';
-                            v.state := DIV_2;
-                        else
-                            v.state := RENORM_B;
+                    if r.b.mantissa(54) = '1' then
+                        v.first := '1';
+                        v.state := DIV_2;
+                    else
+                        v.state := RENORM_B;
                      end if;
                  end if;
  
@@ -1578,6 +1556,7 @@ begin
                  else
                      v.result_exp := new_exp;
                  end if;
+                v.opsel_a := AIN_B;
                  v.state := LOOKUP;
  
              when RENORM_C =>
@@ -1595,22 +1574,32 @@ begin
                      if new_exp + 1 >= r.b.exponent then
                          v.madd_cmp := '1';
                      end if;
+                    v.opsel_a := AIN_B;
                      v.state := DO_FMADD;
                  end if;
  
+            when ADD_1 =>
+                -- transferring B to R
+                v.shift := r.b.exponent - r.a.exponent;
+                v.result_exp := r.b.exponent;
+                v.longmask := '0';
+                v.state := ADD_SHIFT;
+
              when ADD_SHIFT =>
+                -- r.shift = - exponent difference, r.longmask = 0
                  opsel_r <= RES_SHIFT;
                  v.x := s_nz;
                  set_x := '1';
-                longmask := '0';
-                v.state := ADD_2;
-
-            when ADD_2 =>
+                v.longmask := r.single_prec;
                  if r.add_bsmall = '1' then
-                    opsel_a <= AIN_A;
+                    v.opsel_a := AIN_A;
                  else
-                    opsel_a <= AIN_B;
+                    v.opsel_a := AIN_B;
                  end if;
+                v.state := ADD_2;
+
+            when ADD_2 =>
+                -- r.opsel_a = AIN_A if r.add_bsmall = 1 else AIN_B
                  opsel_b <= BIN_R;
                  opsel_binv <= r.is_subtract;
                  carry_in <= r.is_subtract and not r.x;
@@ -1619,6 +1608,7 @@ begin
  
              when ADD_3 =>
                  -- check for overflow or negative result (can't get both)
+                -- r.shift = -1
                  if r.r(63) = '1' then
                      -- result is opposite sign to expected
                      v.result_sign := not r.result_sign;
@@ -1629,7 +1619,6 @@ begin
                      -- sum overflowed, shift right
                      opsel_r <= RES_SHIFT;
                      set_x := '1';
-                    v.shift := to_signed(-2, EXP_BITS);
                      if exp_huge = '1' then
                          v.state := ROUND_OFLOW;
                      else
@@ -1637,7 +1626,6 @@ begin
                      end if;
                  elsif r.r(54) = '1' then
                      set_x := '1';
-                    v.shift := to_signed(-2, EXP_BITS);
                      v.state := ROUNDING;
                  elsif (r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then
                      -- r.x must be zero at this point
@@ -1653,7 +1641,7 @@ begin
                  end if;
  
              when CMP_1 =>
-                opsel_a <= AIN_A;
+                -- r.opsel_a = AIN_A
                  opsel_b <= BIN_R;
                  opsel_binv <= '1';
                  carry_in <= '1';
@@ -1689,17 +1677,20 @@ begin
                  set_s := '1';
                  f_to_multiply.valid <= r.first;
                  if multiply_to_f.valid = '1' then
+                    v.longmask := '0';
                      v.state := ADD_SHIFT;
                  end if;
  
              when FMADD_2 =>
                  -- Product is potentially bigger here
+                -- r.shift = addend exp - product exp + 64, r.r = r.b.mantissa
                  set_s := '1';
                  opsel_s <= S_SHIFT;
                  v.shift := r.shift - to_signed(64, EXP_BITS);
                  v.state := FMADD_3;
  
              when FMADD_3 =>
+                -- r.shift = addend exp - product exp
                  opsel_r <= RES_SHIFT;
                  v.first := '1';
                  v.state := FMADD_4;
@@ -1711,26 +1702,24 @@ begin
                  opsel_r <= RES_MULT;
                  opsel_s <= S_MULT;
                  set_s := '1';
-                v.shift := to_signed(56, EXP_BITS);
                  if multiply_to_f.valid = '1' then
-                    if multiply_to_f.result(121) = '1' then
-                        v.state := FMADD_5;
-                    else
-                        v.state := FMADD_6;
-                    end if;
+                    v.state := FMADD_5;
                  end if;
  
              when FMADD_5 =>
-                -- negate R:S:X
-                v.result_sign := not r.result_sign;
-                opsel_ainv <= '1';
-                carry_in <= not (s_nz or r.x);
-                opsel_s <= S_NEG;
-                set_s := '1';
+                -- negate R:S:X if negative
+                if r.r(63) = '1' then
+                    v.result_sign := not r.result_sign;
+                    opsel_ainv <= '1';
+                    carry_in <= not (s_nz or r.x);
+                    opsel_s <= S_NEG;
+                    set_s := '1';
+                end if;
                  v.shift := to_signed(56, EXP_BITS);
                  v.state := FMADD_6;
  
              when FMADD_6 =>
+                -- r.shift = 56 (or 0, but only if r is now nonzero)
                  if (r.r(56) or r_hi_nz or r_lo_nz or r.r(1) or r.r(0)) = '0' then
                      if s_nz = '0' then
                          -- must be a subtraction, and r.x must be zero
@@ -1752,7 +1741,7 @@ begin
                  end if;
  
              when LOOKUP =>
-                opsel_a <= AIN_B;
+                -- r.opsel_a = AIN_B
                  -- wait one cycle for inverse_table[B] lookup
                  v.first := '1';
                  if r.insn(4) = '0' then
@@ -1877,6 +1866,7 @@ begin
              when SQRT_2 =>
                  -- shift R right one place
                  -- not expecting multiplier result yet
+                -- r.shift = -1
                  opsel_r <= RES_SHIFT;
                  v.first := '1';
                  v.state := SQRT_3;
@@ -2012,12 +2002,14 @@ begin
                  v.state := FINISH;
  
              when INT_SHIFT =>
+                -- r.shift = b.exponent - 52
                  opsel_r <= RES_SHIFT;
                  set_x := '1';
                  v.state := INT_ROUND;
                  v.shift := to_signed(-2, EXP_BITS);
  
              when INT_ROUND =>
+                -- r.shift = -2
                  opsel_r <= RES_SHIFT;
                  round := fp_rounding(r.r, r.x, '0', r.round_mode, r.result_sign);
                  v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
@@ -2030,6 +2022,7 @@ begin
                  end if;
  
              when INT_ISHIFT =>
+                -- r.shift = b.exponent - 54
                  opsel_r <= RES_SHIFT;
                  v.state := INT_FINAL;
  
@@ -2087,9 +2080,9 @@ begin
                  arith_done := '1';
  
              when FRI_1 =>
+                -- r.shift = b.exponent - 52
                  opsel_r <= RES_SHIFT;
                  set_x := '1';
-                v.shift := to_signed(-2, EXP_BITS);
                  v.state := ROUNDING;
  
              when FINISH =>
@@ -2107,13 +2100,13 @@ begin
                      elsif exp_huge = '1' then
                          v.state := ROUND_OFLOW;
                      else
-                        v.shift := to_signed(-2, EXP_BITS);
                          v.state := ROUNDING;
                      end if;
                  end if;
  
              when NORMALIZE =>
                  -- Shift so we have 9 leading zeroes (we know R is non-zero)
+                -- r.shift = clz(r.r) - 9
                  opsel_r <= RES_SHIFT;
                  set_x := '1';
                  if exp_tiny = '1' then
@@ -2122,18 +2115,17 @@ begin
                  elsif exp_huge = '1' then
                      v.state := ROUND_OFLOW;
                  else
-                    v.shift := to_signed(-2, EXP_BITS);
                      v.state := ROUNDING;
                  end if;
  
              when ROUND_UFLOW =>
+                -- r.shift = - amount by which exponent underflows
                  v.tiny := '1';
                  if r.fpscr(FPSCR_UE) = '0' then
                      -- disabled underflow exception case
                      -- have to denormalize before rounding
                      opsel_r <= RES_SHIFT;
                      set_x := '1';
-                    v.shift := to_signed(-2, EXP_BITS);
                      v.state := ROUNDING;
                  else
                      -- enabled underflow exception case
@@ -2144,7 +2136,6 @@ begin
                          renormalize := '1';
                          v.state := NORMALIZE;
                      else
-                        v.shift := to_signed(-2, EXP_BITS);
                          v.state := ROUNDING;
                      end if;
                  end if;
@@ -2171,18 +2162,16 @@ begin
                  else
                      -- enabled overflow exception
                      v.result_exp := r.result_exp - bias_exp;
-                    v.shift := to_signed(-2, EXP_BITS);
                      v.state := ROUNDING;
                  end if;
  
              when ROUNDING =>
-                opsel_amask <= '1';
+                opsel_mask <= '1';
                  round := fp_rounding(r.r, r.x, r.single_prec, r.round_mode, r.result_sign);
                  v.fpscr(FPSCR_FR downto FPSCR_FI) := round;
                  if round(1) = '1' then
-                    -- set mask to increment the LSB for the precision
-                    opsel_b <= BIN_MASK;
-                    carry_in <= '1';
+                    -- increment the LSB for the precision
+                    opsel_b <= BIN_RND;
                      v.shift := to_signed(-1, EXP_BITS);
                      v.state := ROUNDING_2;
                  else
@@ -2204,6 +2193,7 @@ begin
  
              when ROUNDING_2 =>
                  -- Check for overflow during rounding
+                -- r.shift = -1
                  v.x := '0';
                  if r.r(55) = '1' then
                      opsel_r <= RES_SHIFT;
@@ -2221,6 +2211,7 @@ begin
                  end if;
  
              when ROUNDING_3 =>
+                -- r.shift = clz(r.r) - 9
                  mant_nz := r_hi_nz or (r_lo_nz and not r.single_prec);
                  if mant_nz = '0' then
                      v.result_class := ZERO;
@@ -2242,9 +2233,45 @@ begin
                  end if;
  
              when DENORM =>
+                -- r.shift = result_exp - (-1022)
                  opsel_r <= RES_SHIFT;
                  arith_done := '1';
  
+            when NAN_RESULT =>
+                if (r.use_a = '1' and r.a.class = NAN and r.a.mantissa(53) = '0') or
+                    (r.use_b = '1' and r.b.class = NAN and r.b.mantissa(53) = '0') or
+                    (r.use_c = '1' and r.c.class = NAN and r.c.mantissa(53) = '0') then
+                    -- Signalling NAN
+                    v.fpscr(FPSCR_VXSNAN) := '1';
+                    invalid := '1';
+                end if;
+                if r.use_a = '1' and r.a.class = NAN then
+                    v.opsel_a := AIN_A;
+                elsif r.use_b = '1' and r.b.class = NAN then
+                    v.opsel_a := AIN_B;
+                elsif r.use_c = '1' and r.c.class = NAN then
+                    v.opsel_a := AIN_C;
+                end if;
+                v.state := EXC_RESULT;
+
+            when EXC_RESULT =>
+                -- r.opsel_a = AIN_A, AIN_B or AIN_C according to which input is the result
+                case r.opsel_a is
+                    when AIN_B =>
+                        v.result_sign := r.b.negative xor r.negate;
+                        v.result_exp := r.b.exponent;
+                        v.result_class := r.b.class;
+                    when AIN_C =>
+                        v.result_sign := r.c.negative xor r.negate;
+                        v.result_exp := r.c.exponent;
+                        v.result_class := r.c.class;
+                    when others =>
+                        v.result_sign := r.a.negative xor r.negate;
+                        v.result_exp := r.a.exponent;
+                        v.result_class := r.a.class;
+                end case;
+                arith_done := '1';
+
          end case;
  
          if zero_divide = '1' then
@@ -2256,11 +2283,15 @@ begin
              v.result_sign := '0';
              misc_sel <= "0001";
              opsel_r <= RES_MISC;
+            arith_done := '1';
+        end if;
+        if invalid = '1' then
+            v.invalid := '1';
          end if;
          if arith_done = '1' then
              -- Enabled invalid exception doesn't write result or FPRF
              -- Neither does enabled zero-divide exception
-            if (invalid and r.fpscr(FPSCR_VE)) = '0' and
+            if (v.invalid and r.fpscr(FPSCR_VE)) = '0' and
                  (zero_divide and r.fpscr(FPSCR_ZE)) = '0' then
                  v.writing_back := '1';
                  v.update_fprf := '1';
@@ -2328,7 +2359,7 @@ begin
          -- Data path.
          -- This has A and B input multiplexers, an adder, a shifter,
          -- count-leading-zeroes logic, and a result mux.
-        if longmask = '1' then
+        if r.longmask = '1' then
              mshift := r.shift + to_signed(-29, EXP_BITS);
          else
              mshift := r.shift;
@@ -2340,7 +2371,7 @@ begin
          else
              mask := right_mask(unsigned(mshift(5 downto 0)));
          end if;
-        case opsel_a is
+        case r.opsel_a is
              when AIN_R =>
                  in_a0 := r.r;
              when AIN_A =>
@@ -2356,17 +2387,15 @@ begin
          if opsel_ainv = '1' then
              in_a0 := not in_a0;
          end if;
-        if opsel_amask = '1' then
-            in_a0 := in_a0 and not mask;
-        end if;
          in_a <= in_a0;
          case opsel_b is
              when BIN_ZERO =>
                  in_b0 := (others => '0');
              when BIN_R =>
                  in_b0 := r.r;
-            when BIN_MASK =>
-                in_b0 := mask;
+            when BIN_RND =>
+                round_inc := (31 => r.single_prec, 2 => not r.single_prec, others => '0');
+                in_b0 := round_inc;
              when others =>
                  -- BIN_PS6, 6 LSBs of P/4 sign-extended to 64
                  in_b0 := std_ulogic_vector(resize(signed(r.p(7 downto 2)), 64));
@@ -2381,9 +2410,16 @@ begin
          else
              shift_res := (others => '0');
          end if;
+        sum := std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
+        if opsel_mask = '1' then
+            sum(1 downto 0) := "00";
+            if r.single_prec = '1' then
+                sum(30 downto 2) := (others => '0');
+            end if;
+        end if;
          case opsel_r is
              when RES_SUM =>
-                result <= std_ulogic_vector(unsigned(in_a) + unsigned(in_b) + carry_in);
+                result <= sum;
              when RES_SHIFT =>
                  result <= shift_res;
              when RES_MULT =>