X-Git-Url: https://git.libre-soc.org/?a=blobdiff_plain;f=execute1.vhdl;h=875e22c95919ce4e8438bd52c8b597da0447243c;hb=3cd3449b4b88e025ff9412f82737747b0c6d938a;hp=abd4a18b0d8678714472d1323831a384b89c7133;hpb=167e37d6675136d26acdb6f7aba0a7f7ad1e60d8;p=microwatt.git

diff --git a/execute1.vhdl b/execute1.vhdl
index abd4a18..875e22c 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -12,70 +12,122 @@ use work.ppc_fx_insns.all;
 
 entity execute1 is
     generic (
-        EX1_BYPASS : boolean := true
+        EX1_BYPASS : boolean := true;  -- NOTE(review): the a_in/b_in/c_in bypass muxes that read this generic are removed by this patch; check whether it is still used
+        HAS_FPU : boolean := true;  -- when false, instructions with e_in.fac = FPU are flagged illegal
+        -- Non-zero to enable log data collection
+        LOG_LENGTH : natural := 0
         );
     port (
 	clk   : in std_ulogic;
         rst   : in std_ulogic;
 
 	-- asynchronous
-	flush_out : out std_ulogic;
-	stall_out : out std_ulogic;
+	flush_in : in std_ulogic;  -- '1' forces valid_in low
+	busy_out : out std_ulogic;  -- l_in.busy or r.busy or fp_in.busy
 
 	e_in  : in Decode2ToExecute1Type;
+        l_in  : in Loadstore1ToExecute1Type;  -- l_in.busy feeds busy_out
+        fp_in : in FPUToExecute1Type;  -- fp_in.busy feeds busy_out
+
+	ext_irq_in : std_ulogic;  -- no mode given, so "in"; with MSR[EE] = '1' and ctrl.dec(63) = '0' it selects vector 16#500#
+        interrupt_in : std_ulogic;  -- no mode given, so "in"
 
 	-- asynchronous
         l_out : out Execute1ToLoadstore1Type;
-	f_out : out Execute1ToFetch1Type;
+        fp_out : out Execute1ToFPUType;
 
 	e_out : out Execute1ToWritebackType;
+        bypass_data : out bypass_data_t;
+        bypass_cr_data : out cr_bypass_data_t;
+
+        dbg_msr_out : out std_ulogic_vector(63 downto 0);  -- copy of ctrl.msr
 
 	icache_inval : out std_ulogic;
-	terminate_out : out std_ulogic
+	terminate_out : out std_ulogic;  -- driven from the registered r.terminate
+
+        log_out : out std_ulogic_vector(14 downto 0);
+        log_rd_addr : out std_ulogic_vector(31 downto 0);  -- driven from r.log_addr_spr
+        log_rd_data : in std_ulogic_vector(63 downto 0);
+        log_wr_addr : in std_ulogic_vector(31 downto 0)
 	);
 end entity execute1;
 
 architecture behaviour of execute1 is
     type reg_type is record
 	e : Execute1ToWritebackType;
-	lr_update : std_ulogic;
-	next_lr : std_ulogic_vector(63 downto 0);
+        cur_instr : Decode2ToExecute1Type;  -- copy of e_in taken when an ALU instruction is accepted; read via "current" while busy = '1'
+        busy: std_ulogic;  -- feeds busy_out and selects cur_instr over e_in
+        terminate: std_ulogic;  -- drives terminate_out; set by a well-formed attn
+        fp_exception_next : std_ulogic;  -- if set (and HAS_FPU), a valid instruction with second = '0' takes a 16#700# interrupt instead
+        trace_next : std_ulogic;  -- if set, a valid instruction with second = '0' takes a 16#d00# trace interrupt instead
+        prev_op : insn_type_t;  -- insn_type of the last accepted instruction; picks the SRR1 load/store bits for the trace interrupt
+        br_taken : std_ulogic;
 	mul_in_progress : std_ulogic;
+        mul_finish : std_ulogic;
         div_in_progress : std_ulogic;
         cntz_in_progress : std_ulogic;
-	slow_op_dest : gpr_index_t;
-	slow_op_rc : std_ulogic;
-	slow_op_oe : std_ulogic;
-	slow_op_xerc : xer_common_t;
+        log_addr_spr : std_ulogic_vector(31 downto 0);  -- drives log_rd_addr
     end record;
     constant reg_type_init : reg_type :=
-        (e => Execute1ToWritebackInit, lr_update => '0',
-         mul_in_progress => '0', div_in_progress => '0', cntz_in_progress => '0',
-         slow_op_rc => '0', slow_op_oe => '0', slow_op_xerc => xerc_init,
+        (e => Execute1ToWritebackInit,  -- reset value of r; loaded in execute1_0 when rst = '1'
+         cur_instr => Decode2ToExecute1Init,
+         busy => '0', terminate => '0',
+         fp_exception_next => '0', trace_next => '0', prev_op => OP_ILLEGAL, br_taken => '0',
+         mul_in_progress => '0', mul_finish => '0', div_in_progress => '0', cntz_in_progress => '0',  -- log_addr_spr, the only field not named, is zeroed by "others" below
          others => (others => '0'));
 
     signal r, rin : reg_type;
 
     signal a_in, b_in, c_in : std_ulogic_vector(63 downto 0);
+    signal cr_in : std_ulogic_vector(31 downto 0);  -- e_in.cr
+    signal xerc_in : xer_common_t;  -- forwarded XER common bits: r.e.xerc or e_in.xerc
 
-    signal ctrl: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0'));
-    signal ctrl_tmp: ctrl_t := (irq_state => WRITE_SRR0, others => (others => '0'));
+    signal valid_in : std_ulogic;  -- e_in.valid gated by busy_out and flush_in
+    signal ctrl: ctrl_t := (others => (others => '0'));
+    signal ctrl_tmp: ctrl_t := (others => (others => '0'));  -- next value of ctrl, registered in execute1_0
     signal right_shift, rot_clear_left, rot_clear_right: std_ulogic;
+    signal rot_sign_ext: std_ulogic;  -- '1' when e_in.insn_type = OP_EXTSWSLI
     signal rotator_result: std_ulogic_vector(63 downto 0);
     signal rotator_carry: std_ulogic;
     signal logical_result: std_ulogic_vector(63 downto 0);
     signal countzero_result: std_ulogic_vector(63 downto 0);
-    signal popcnt_result: std_ulogic_vector(63 downto 0);
-    signal parity_result: std_ulogic_vector(63 downto 0);
+    signal alu_result: std_ulogic_vector(63 downto 0);  -- output of the result mux selected by current.result_sel
+    signal adder_result: std_ulogic_vector(63 downto 0);  -- result mux input "000"
+    signal misc_result: std_ulogic_vector(63 downto 0);  -- result mux default input
+    signal muldiv_result: std_ulogic_vector(63 downto 0);  -- result mux input "011"
+    signal spr_result: std_ulogic_vector(63 downto 0);  -- result mux input "101"
+    signal result_mux_sel: std_ulogic_vector(2 downto 0);
+    signal sub_mux_sel: std_ulogic_vector(2 downto 0);
+    signal next_nia : std_ulogic_vector(63 downto 0);  -- e_in.nia + 4; result mux input "110"
+    signal current: Decode2ToExecute1Type;  -- e_in, or r.cur_instr while r.busy = '1'
+
+    signal carry_32 : std_ulogic;  -- adder status, computed in execute1_dp
+    signal carry_64 : std_ulogic;
+    signal overflow_32 : std_ulogic;
+    signal overflow_64 : std_ulogic;
+
+    signal trapval : std_ulogic_vector(4 downto 0);  -- comparison of a_in against b_in, computed in execute1_dp
+
+    signal write_cr_mask : std_ulogic_vector(7 downto 0);  -- one bit per 4-bit CR field
+    signal write_cr_data : std_ulogic_vector(31 downto 0);
 
     -- multiply signals
-    signal x_to_multiply: Execute1ToMultiplyType;
-    signal multiply_to_x: MultiplyToExecute1Type;
+    signal x_to_multiply: MultiplyInputType;
+    signal multiply_to_x: MultiplyOutputType;
 
     -- divider signals
     signal x_to_divider: Execute1ToDividerType;
     signal divider_to_x: DividerToExecute1Type;
 
+    -- random number generator signals
+    signal random_raw  : std_ulogic_vector(63 downto 0);  -- "raw" output of random_0
+    signal random_cond : std_ulogic_vector(63 downto 0);  -- "data" output of random_0
+    signal random_err  : std_ulogic;  -- "err" output of random_0; makes darn return all ones
+
+    -- signals for logging
+    signal exception_log : std_ulogic;
+    signal irq_valid_log : std_ulogic;
+
     type privilege_level is (USER, SUPER);
     type op_privilege_array is array(insn_type_t) of privilege_level;
     constant op_privilege: op_privilege_array := (
@@ -83,6 +135,7 @@ architecture behaviour of execute1 is
         OP_MFMSR => SUPER,
         OP_MTMSRD => SUPER,
         OP_RFID => SUPER,
+        OP_TLBIE => SUPER,
         others => USER
         );
 
@@ -104,7 +157,6 @@ architecture behaviour of execute1 is
     begin
 	e.xerc.ca32 := carry32;
 	e.xerc.ca := carry;
-	e.write_xerc_enable := '1';
     end;
 
     procedure set_ov(e: inout Execute1ToWritebackType;
@@ -116,7 +168,6 @@ architecture behaviour of execute1 is
 	if ov = '1' then
 	    e.xerc.so := '1';
 	end if;
-	e.write_xerc_enable := '1';
     end;
 
     function calc_ov(msb_a : std_ulogic; msb_b: std_ulogic;
@@ -133,6 +184,8 @@ architecture behaviour of execute1 is
 	    return '0';
 	when CA =>
 	    return xerc.ca;
+        when OV =>
+            return xerc.ov;
 	when ONE =>
 	    return '1';
 	end case;
@@ -159,6 +212,11 @@ architecture behaviour of execute1 is
 	return msr_out;
     end;
 
+    -- Tell Vivado to keep the hierarchy for the random module so that the
+    -- net names in the XDC file match.
+    attribute keep_hierarchy : string;
+    attribute keep_hierarchy of random_0 : label is "yes";
+
 begin
 
     rotator_0: entity work.rotator
@@ -172,6 +230,7 @@ begin
 	    arith => e_in.is_signed,
 	    clear_left => rot_clear_left,
 	    clear_right => rot_clear_right,
+            sign_ext_rs => rot_sign_ext,
 	    result => rotator_result,
 	    carry_out => rotator_carry
 	    );
@@ -184,9 +243,7 @@ begin
 	    invert_in => e_in.invert_a,
 	    invert_out => e_in.invert_out,
 	    result => logical_result,
-            datalen => e_in.data_len,
-            popcnt => popcnt_result,
-            parity => parity_result
+            datalen => e_in.data_len
 	    );
 
     countzero_0: entity work.zero_counter
@@ -213,142 +270,119 @@ begin
             d_out => divider_to_x
             );
 
-    a_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data1 = '1' else e_in.read_data1;
-    b_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data2 = '1' else e_in.read_data2;
-    c_in <= r.e.write_data when EX1_BYPASS and e_in.bypass_data3 = '1' else e_in.read_data3;
+    random_0: entity work.random  -- random number source read by the darn case of misc_result
+        port map (
+            clk => clk,
+            data => random_cond,  -- returned by darn unless insn(17 downto 16) = "10" (low 32 bits only for "00")
+            raw => random_raw,  -- returned by darn when insn(17 downto 16) = "10"
+            err => random_err  -- when not '0', darn returns all ones
+            );
+
+    dbg_msr_out <= ctrl.msr;
+    log_rd_addr <= r.log_addr_spr;
+
+    a_in <= e_in.read_data1;  -- operands come straight from e_in; the EX1_BYPASS muxes on r.e.write_data are removed
+    b_in <= e_in.read_data2;
+    c_in <= e_in.read_data3;
+    cr_in <= e_in.cr;
+
+    -- XER forwarding. To avoid having to track XER hazards, we use
+    -- the previously latched value.  Since the XER common bits
+    -- (SO, OV[32] and CA[32]) are only modified by instructions that are
+    -- handled here, we can just forward the result being sent to
+    -- writeback.
+    xerc_in <= r.e.xerc when r.e.write_xerc_enable = '1' or r.busy = '1' else e_in.xerc;  -- the latched value is also used while busy
+
+    busy_out <= l_in.busy or r.busy or fp_in.busy;
+    valid_in <= e_in.valid and not busy_out and not flush_in;  -- accept only when nothing is busy and no flush is requested
+
+    terminate_out <= r.terminate;
+
+    current <= e_in when r.busy = '0' else r.cur_instr;  -- while busy, use the instruction latched in r.cur_instr
+
+    -- Result mux: current.result_sel picks which 64-bit result becomes alu_result
+    with current.result_sel select alu_result <=
+        adder_result       when "000",
+        logical_result     when "001",
+        rotator_result     when "010",
+        muldiv_result      when "011",
+        countzero_result   when "100",
+        spr_result         when "101",
+        next_nia           when "110",
+        misc_result        when others;
 
     execute1_0: process(clk)
     begin
 	if rising_edge(clk) then
             if rst = '1' then
                 r <= reg_type_init;
+                ctrl.tb <= (others => '0');  -- timebase is cleared by reset
+                ctrl.dec <= (others => '0');  -- decrementer is cleared by reset
                 ctrl.msr <= (MSR_SF => '1', MSR_LE => '1', others => '0');
-                ctrl.irq_state <= WRITE_SRR0;
             else
                 r <= rin;
                 ctrl <= ctrl_tmp;
-                assert not (r.lr_update = '1' and e_in.valid = '1')
-                    report "LR update collision with valid in EX1"
-                    severity failure;
-                if r.lr_update = '1' then
-                    report "LR update to " & to_hstring(r.next_lr);
+                if valid_in = '1' then  -- emit one report line per accepted instruction
+                    report "execute " & to_hstring(e_in.nia) & " op=" & insn_type_t'image(e_in.insn_type) &
+                        " wr=" & to_hstring(rin.e.write_reg) & " we=" & std_ulogic'image(rin.e.write_enable) &
+                        " tag=" & integer'image(rin.e.instr_tag.tag) & std_ulogic'image(rin.e.instr_tag.valid);
                 end if;
             end if;
 	end if;
     end process;
 
-    execute1_1: process(all)
-	variable v : reg_type;
+    -- Data path for integer instructions
+    execute1_dp: process(all)
 	variable a_inv : std_ulogic_vector(63 downto 0);
-	variable result : std_ulogic_vector(63 downto 0);
-	variable newcrf : std_ulogic_vector(3 downto 0);
-	variable result_with_carry : std_ulogic_vector(64 downto 0);
-	variable result_en : std_ulogic;
-	variable crnum : crnum_t;
+	variable b_or_m1 : std_ulogic_vector(63 downto 0);
+	variable sum_with_carry : std_ulogic_vector(64 downto 0);
+        variable sign1, sign2 : std_ulogic;
+        variable abs1, abs2 : signed(63 downto 0);
+        variable addend : std_ulogic_vector(127 downto 0);
+	variable addg6s : std_ulogic_vector(63 downto 0);
 	variable crbit : integer range 0 to 31;
-	variable scrnum : crnum_t;
+	variable isel_result : std_ulogic_vector(63 downto 0);
+	variable darn : std_ulogic_vector(63 downto 0);
+	variable setb_result : std_ulogic_vector(63 downto 0);
+	variable mfcr_result : std_ulogic_vector(63 downto 0);
 	variable lo, hi : integer;
-	variable sh, mb, me : std_ulogic_vector(5 downto 0);
-	variable sh32, mb32, me32 : std_ulogic_vector(4 downto 0);
-	variable bo, bi : std_ulogic_vector(4 downto 0);
-	variable bf, bfa : std_ulogic_vector(2 downto 0);
-	variable cr_op : std_ulogic_vector(9 downto 0);
-        variable cr_operands : std_ulogic_vector(1 downto 0);
-	variable bt, ba, bb : std_ulogic_vector(4 downto 0);
-	variable btnum, banum, bbnum : integer range 0 to 31;
-	variable crresult : std_ulogic;
 	variable l : std_ulogic;
-	variable next_nia : std_ulogic_vector(63 downto 0);
-        variable carry_32, carry_64 : std_ulogic;
-        variable sign1, sign2 : std_ulogic;
-        variable abs1, abs2 : signed(63 downto 0);
-	variable overflow : std_ulogic;
-	variable negative : std_ulogic;
         variable zerohi, zerolo : std_ulogic;
         variable msb_a, msb_b : std_ulogic;
         variable a_lt : std_ulogic;
-        variable lv : Execute1ToLoadstore1Type;
-	variable irq_valid : std_ulogic;
-	variable exception : std_ulogic;
-        variable exception_nextpc : std_ulogic;
-        variable trapval : std_ulogic_vector(4 downto 0);
-        variable illegal : std_ulogic;
+        variable a_lt_lo : std_ulogic;
+        variable a_lt_hi : std_ulogic;
+	variable newcrf : std_ulogic_vector(3 downto 0);
+	variable bf, bfa : std_ulogic_vector(2 downto 0);
+	variable crnum : crnum_t;
+	variable scrnum : crnum_t;
+        variable cr_operands : std_ulogic_vector(1 downto 0);
+	variable crresult : std_ulogic;
+	variable bt, ba, bb : std_ulogic_vector(4 downto 0);
+        variable btnum : integer range 0 to 3;
+	variable banum, bbnum : integer range 0 to 31;
+        variable j : integer;
     begin
-	result := (others => '0');
-	result_with_carry := (others => '0');
-	result_en := '0';
-	newcrf := (others => '0');
-
-	v := r;
-	v.e := Execute1ToWritebackInit;
-        lv := Execute1ToLoadstore1Init;
-
-	-- XER forwarding. To avoid having to track XER hazards, we
-	-- use the previously latched value.
-	--
-	-- If the XER was modified by a multiply or a divide, those are
-	-- single issue, we'll get the up to date value from decode2 from
-	-- the register file.
-	--
-	-- If it was modified by an instruction older than the previous
-	-- one in EX1, it will have also hit writeback and will be up
-	-- to date in decode2.
-	--
-	-- That leaves us with the case where it was updated by the previous
-	-- instruction in EX1. In that case, we can forward it back here.
-	--
-	-- This will break if we allow pipelining of multiply and divide,
-	-- but ideally, those should go via EX1 anyway and run as a state
-	-- machine from here.
-	--
-	-- One additional hazard to beware of is an XER:SO modifying instruction
-	-- in EX1 followed immediately by a store conditional. Due to our
-	-- writeback latency, the store will go down the LSU with the previous
-	-- XER value, thus the stcx. will set CR0:SO using an obsolete SO value.
-	--
-	-- We will need to handle that if we ever make stcx. not single issue
-	--
-	-- We always pass a valid XER value downto writeback even when
-	-- we aren't updating it, in order for XER:SO -> CR0:SO transfer
-	-- to work for RC instructions.
-	--
-	if r.e.write_xerc_enable = '1' then
-	    v.e.xerc := r.e.xerc;
-	else
-	    v.e.xerc := e_in.xerc;
-	end if;
-
-	v.lr_update := '0';
-	v.mul_in_progress := '0';
-        v.div_in_progress := '0';
-        v.cntz_in_progress := '0';
-
-	-- signals to multiply unit
-	x_to_multiply <= Execute1ToMultiplyInit;
-	x_to_multiply.insn_type <= e_in.insn_type;
-	x_to_multiply.is_32bit <= e_in.is_32bit;
-
-	if e_in.is_32bit = '1' then
-	    if e_in.is_signed = '1' then
-		x_to_multiply.data1 <= (others => a_in(31));
-		x_to_multiply.data1(31 downto 0) <= a_in(31 downto 0);
-		x_to_multiply.data2 <= (others => b_in(31));
-		x_to_multiply.data2(31 downto 0) <= b_in(31 downto 0);
-	    else
-		x_to_multiply.data1 <= '0' & x"00000000" & a_in(31 downto 0);
-		x_to_multiply.data2 <= '0' & x"00000000" & b_in(31 downto 0);
-	    end if;
-	else
-	    if e_in.is_signed = '1' then
-		x_to_multiply.data1 <= a_in(63) & a_in;
-		x_to_multiply.data2 <= b_in(63) & b_in;
-	    else
-		x_to_multiply.data1 <= '0' & a_in;
-		x_to_multiply.data2 <= '0' & b_in;
-	    end if;
-	end if;
-
-        -- signals to divide unit
+        -- Main adder: (a_in or not a_in) + (b_in or all ones) + the selected carry-in
+        if e_in.invert_a = '0' then
+            a_inv := a_in;
+        else
+            a_inv := not a_in;
+        end if;
+        if e_in.addm1 = '0' then
+            b_or_m1 := b_in;
+        else
+            b_or_m1 := (others => '1');  -- all ones, i.e. add -1
+        end if;
+        sum_with_carry := ppc_adde(a_inv, b_or_m1,  -- 65-bit result; bit 64 is taken as the carry out below
+                                   decode_input_carry(e_in.input_carry, xerc_in));
+        adder_result <= sum_with_carry(63 downto 0);
+        carry_32 <= sum_with_carry(32) xor a_inv(32) xor b_in(32);  -- carry into bit 32, recovered from sum and operand bits
+        carry_64 <= sum_with_carry(64);
+        overflow_32 <= calc_ov(a_inv(31), b_in(31), carry_32, sum_with_carry(31));  -- NOTE(review): carry_32 and both overflow terms use b_in, not b_or_m1; confirm this is intended when addm1 = '1'
+        overflow_64 <= calc_ov(a_inv(63), b_in(63), carry_64, sum_with_carry(63));
+
+        -- signals to multiply and divide units
         sign1 := '0';
         sign2 := '0';
         if e_in.is_signed = '1' then
@@ -372,15 +406,35 @@ begin
             abs2 := - signed(b_in);
         end if;
 
-        x_to_divider <= Execute1ToDividerInit;
+        -- Interface to multiply and divide units
         x_to_divider.is_signed <= e_in.is_signed;
 	x_to_divider.is_32bit <= e_in.is_32bit;
+        x_to_divider.is_extended <= '0';
+        x_to_divider.is_modulus <= '0';
         if e_in.insn_type = OP_MOD then
             x_to_divider.is_modulus <= '1';
         end if;
+
+        addend := (others => '0');
+        if e_in.insn(26) = '0' then
+            -- integer multiply-add, major op 4 (if it is a multiply)
+            addend(63 downto 0) := c_in;
+            if e_in.is_signed = '1' then
+                addend(127 downto 64) := (others => c_in(63));
+            end if;
+        end if;
+        if (sign1 xor sign2) = '1' then
+            addend := not addend;
+        end if;
+
+	x_to_multiply.is_32bit <= e_in.is_32bit;
+        x_to_multiply.not_result <= sign1 xor sign2;
+        x_to_multiply.addend <= addend;
         x_to_divider.neg_result <= sign1 xor (sign2 and not x_to_divider.is_modulus);
         if e_in.is_32bit = '0' then
             -- 64-bit forms
+            x_to_multiply.data1 <= std_ulogic_vector(abs1);
+            x_to_multiply.data2 <= std_ulogic_vector(abs2);
             if e_in.insn_type = OP_DIVE then
                 x_to_divider.is_extended <= '1';
             end if;
@@ -388,6 +442,8 @@ begin
             x_to_divider.divisor <= std_ulogic_vector(abs2);
         else
             -- 32-bit forms
+            x_to_multiply.data1 <= x"00000000" & std_ulogic_vector(abs1(31 downto 0));
+            x_to_multiply.data2 <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
             x_to_divider.is_extended <= '0';
             if e_in.insn_type = OP_DIVE then   -- extended forms
                 x_to_divider.dividend <= std_ulogic_vector(abs1(31 downto 0)) & x"00000000";
@@ -397,98 +453,361 @@ begin
             x_to_divider.divisor <= x"00000000" & std_ulogic_vector(abs2(31 downto 0));
         end if;
 
+        case current.sub_select(1 downto 0) is  -- pick which multiplier/divider output becomes muldiv_result
+            when "00" =>  -- low 64 bits of the multiplier result
+                muldiv_result <= multiply_to_x.result(63 downto 0);
+            when "01" =>  -- high 64 bits of the multiplier result
+                muldiv_result <= multiply_to_x.result(127 downto 64);
+            when "10" =>  -- bits 63..32 of the multiplier result, copied into both halves
+                muldiv_result <= multiply_to_x.result(63 downto 32) &
+                                 multiply_to_x.result(63 downto 32);
+            when others =>  -- divider result
+                muldiv_result <= divider_to_x.write_reg_data;
+        end case;
+
+        -- Compute misc_result; current.sub_select picks the operation
+        case current.sub_select is
+            when "000" =>
+                misc_result <= (others => '0');
+            when "001" =>
+                -- addg6s
+                addg6s := (others => '0');
+                for i in 0 to 14 loop  -- nibbles 0 to 14; nibble 15 is handled after the loop
+                    lo := i * 4;
+                    hi := (i + 1) * 4;
+                    if (a_in(hi) xor b_in(hi) xor sum_with_carry(hi)) = '0' then  -- xor of operand and sum bits at position hi is '0'
+                        addg6s(lo + 3 downto lo) := "0110";
+                    end if;
+                end loop;
+                if sum_with_carry(64) = '0' then  -- top nibble uses the adder's carry out
+                    addg6s(63 downto 60) := "0110";
+                end if;
+                misc_result <= addg6s;
+            when "010" =>
+                -- isel
+		crbit := to_integer(unsigned(insn_bc(e_in.insn)));
+		if cr_in(31-crbit) = '1' then  -- selected CR bit set: take a_in, otherwise b_in
+		    isel_result := a_in;
+		else
+		    isel_result := b_in;
+		end if;
+                misc_result <= isel_result;
+            when "011" =>
+                -- darn
+                darn := (others => '1');  -- all ones is returned when random_err is not '0'
+                if random_err = '0' then
+                    case e_in.insn(17 downto 16) is
+                        when "00" =>  -- low 32 bits of random_cond, zero-extended
+                            darn := x"00000000" & random_cond(31 downto 0);
+                        when "10" =>
+                            darn := random_raw;
+                        when others =>
+                            darn := random_cond;
+                    end case;
+                end if;
+                misc_result <= darn;
+            when "100" =>
+                -- mfmsr
+		misc_result <= ctrl.msr;
+            when "101" =>
+		if e_in.insn(20) = '0' then
+		    -- mfcr
+		    mfcr_result := x"00000000" & cr_in;
+		else
+		    -- mfocrf
+		    crnum := fxm_to_num(insn_fxm(e_in.insn));
+		    mfcr_result := (others => '0');
+		    for i in 0 to 7 loop  -- copy only the 4-bit field numbered crnum; all other bits stay '0'
+			lo := (7-i)*4;
+			hi := lo + 3;
+			if crnum = i then
+			    mfcr_result(hi downto lo) := cr_in(hi downto lo);
+			end if;
+		    end loop;
+		end if;
+                misc_result <= mfcr_result;
+            when "110" =>
+                -- setb
+                bfa := insn_bfa(e_in.insn);
+                crbit := to_integer(unsigned(bfa)) * 4;
+                setb_result := (others => '0');
+                if cr_in(31 - crbit) = '1' then  -- first bit of CR field bfa set: result is all ones
+                    setb_result := (others => '1');
+                elsif cr_in(30 - crbit) = '1' then  -- otherwise second bit set: result is 1
+                    setb_result(0) := '1';
+                end if;
+                misc_result <= setb_result;
+            when others =>
+                misc_result <= (others => '0');
+        end case;
+
+        -- compute comparison results
+        -- Note, we have done RB - RA, not RA - RB
+        if e_in.insn_type = OP_CMP then
+            l := insn_l(e_in.insn);  -- OP_CMP takes the width from the instruction's L field
+        else
+            l := not e_in.is_32bit;
+        end if;
+        zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0)));  -- '1' when the low words are equal
+        zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32)));  -- '1' when the high words are equal
+        if zerolo = '1' and (l = '0' or zerohi = '1') then
+            -- values are equal
+            trapval <= "00100";
+        else
+            a_lt_lo := '0';
+            a_lt_hi := '0';
+            if unsigned(a_in(30 downto 0)) < unsigned(b_in(30 downto 0)) then  -- bits 30..0
+                a_lt_lo := '1';
+            end if;
+            if unsigned(a_in(62 downto 31)) < unsigned(b_in(62 downto 31)) then  -- bits 62..31
+                a_lt_hi := '1';
+            end if;
+            if l = '1' then
+                -- 64-bit comparison
+                msb_a := a_in(63);
+                msb_b := b_in(63);
+                a_lt := a_lt_hi or (zerohi and (a_in(31) xnor b_in(31)) and a_lt_lo);  -- low bits decide only when bits 63..31 are equal
+            else
+                -- 32-bit comparison
+                msb_a := a_in(31);
+                msb_b := b_in(31);
+                a_lt := a_lt_lo;
+            end if;
+            if msb_a /= msb_b then
+                -- Comparison is clear from MSB difference.
+                -- for signed, 0 is greater; for unsigned, 1 is greater
+                trapval <= msb_a & msb_b & '0' & msb_b & msb_a;  -- bits 4..3: signed less/greater; bits 1..0: unsigned less/greater
+            else
+                -- MSBs are equal, so signed and unsigned comparisons give the
+                -- same answer.
+                trapval <= a_lt & not a_lt & '0' & a_lt & not a_lt;
+            end if;
+        end if;
+
+        -- CR result mux: builds the 4-bit newcrf, then write_cr_mask and write_cr_data
+        bf := insn_bf(e_in.insn);
+        crnum := to_integer(unsigned(bf));
+        newcrf := (others => '0');
+        case current.sub_select is
+            when "000" =>
+                -- CMP and CMPL instructions
+                if e_in.is_signed = '1' then
+                    newcrf := trapval(4 downto 2) & xerc_in.so;  -- signed: trapval bits 4..2, then SO
+                else
+                    newcrf := trapval(1 downto 0) & trapval(2) & xerc_in.so;  -- unsigned: trapval bits 1..0, then bit 2, then SO
+                end if;
+            when "001" =>
+                newcrf := ppc_cmprb(a_in, b_in, insn_l(e_in.insn));
+            when "010" =>
+                newcrf := ppc_cmpeqb(a_in, b_in);
+            when "011" =>
+                if current.insn(1) = '1' then
+                    -- CR logical instructions
+                    j := (7 - crnum) * 4;  -- index in cr_in of the lowest bit of field crnum
+                    newcrf := cr_in(j + 3 downto j);  -- start from the current field so the other three bits are kept
+                    bt := insn_bt(e_in.insn);
+                    ba := insn_ba(e_in.insn);
+                    bb := insn_bb(e_in.insn);
+                    btnum := 3 - to_integer(unsigned(bt(1 downto 0)));
+                    banum := 31 - to_integer(unsigned(ba));
+                    bbnum := 31 - to_integer(unsigned(bb));
+                    -- Bits 6-9 of the instruction word give the truth table
+                    -- of the requested logical operation
+                    cr_operands := cr_in(banum) & cr_in(bbnum);
+                    crresult := e_in.insn(6 + to_integer(unsigned(cr_operands)));
+                    for i in 0 to 3 loop  -- replace only bit btnum of the field
+                        if i = btnum then
+                            newcrf(i) := crresult;
+                        end if;
+                    end loop;
+                else
+                    -- MCRF
+                    bfa := insn_bfa(e_in.insn);
+                    scrnum := to_integer(unsigned(bfa));
+                    j := (7 - scrnum) * 4;
+                    newcrf := cr_in(j + 3 downto j);  -- copy source field bfa unchanged
+                end if;
+            when "100" =>
+                -- MCRXRX
+                newcrf := xerc_in.ov & xerc_in.ca & xerc_in.ov32 & xerc_in.ca32;
+            when others =>  -- newcrf stays all zeros
+        end case;
+        if current.insn_type = OP_MTCRF then
+            if e_in.insn(20) = '0' then
+                -- mtcrf
+                write_cr_mask <= insn_fxm(e_in.insn);
+            else
+                -- mtocrf: We require one-hot priority encoding here
+                crnum := fxm_to_num(insn_fxm(e_in.insn));
+                write_cr_mask <= num_to_fxm(crnum);
+            end if;
+            write_cr_data <= c_in(31 downto 0);
+        else
+            write_cr_mask <= num_to_fxm(crnum);  -- mask comes from the field number in BF
+            write_cr_data <= newcrf & newcrf & newcrf & newcrf &  -- newcrf repeated in all eight field positions
+                             newcrf & newcrf & newcrf & newcrf;
+        end if;
+
+    end process;
+
+    execute1_1: process(all)
+	variable v : reg_type;
+	variable lo, hi : integer;
+	variable sh, mb, me : std_ulogic_vector(5 downto 0);
+	variable bo, bi : std_ulogic_vector(4 downto 0);
+	variable overflow : std_ulogic;
+        variable lv : Execute1ToLoadstore1Type;
+	variable irq_valid : std_ulogic;
+	variable exception : std_ulogic;
+        variable illegal : std_ulogic;
+        variable is_branch : std_ulogic;
+        variable is_direct_branch : std_ulogic;
+        variable taken_branch : std_ulogic;
+        variable abs_branch : std_ulogic;
+        variable spr_val : std_ulogic_vector(63 downto 0);
+        variable do_trace : std_ulogic;
+        variable hold_wr_data : std_ulogic;
+        variable fv : Execute1ToFPUType;
+    begin
+        is_branch := '0';
+        is_direct_branch := '0';
+        taken_branch := '0';
+        abs_branch := '0';
+        hold_wr_data := '0';
+
+	v := r;
+	v.e := Execute1ToWritebackInit;
+        v.e.redir_mode := ctrl.msr(MSR_IR) & not ctrl.msr(MSR_PR) &
+                          not ctrl.msr(MSR_LE) & not ctrl.msr(MSR_SF);
+        v.e.xerc := xerc_in;
+
+        lv := Execute1ToLoadstore1Init;
+        fv := Execute1ToFPUInit;
+
+        x_to_multiply.valid <= '0';
+        x_to_divider.valid <= '0';
+	v.mul_in_progress := '0';
+        v.div_in_progress := '0';
+        v.cntz_in_progress := '0';
+        v.mul_finish := '0';
+
+        spr_result <= (others => '0');
+        spr_val := (others => '0');
+
 	ctrl_tmp <= ctrl;
 	-- FIXME: run at 512MHz not core freq
 	ctrl_tmp.tb <= std_ulogic_vector(unsigned(ctrl.tb) + 1);
 	ctrl_tmp.dec <= std_ulogic_vector(unsigned(ctrl.dec) - 1);
 
 	irq_valid := '0';
-	if ctrl.msr(MSR_EE) = '1' and ctrl.dec(63) = '1' then
-	    report "IRQ valid";
-	    irq_valid := '1';
+	if ctrl.msr(MSR_EE) = '1' then
+	    if ctrl.dec(63) = '1' then
+		v.e.intr_vec := 16#900#;
+		report "IRQ valid: DEC";
+		irq_valid := '1';
+	    elsif ext_irq_in = '1' then
+		v.e.intr_vec := 16#500#;
+		report "IRQ valid: External";
+		irq_valid := '1';
+	    end if;
 	end if;
 
-	terminate_out <= '0';
+	v.terminate := '0';
 	icache_inval <= '0';
-	stall_out <= '0';
-	f_out <= Execute1ToFetch1TypeInit;
+	v.busy := '0';
 
 	-- Next insn adder used in a couple of places
-	next_nia := std_ulogic_vector(unsigned(e_in.nia) + 4);
+	next_nia <= std_ulogic_vector(unsigned(e_in.nia) + 4);
 
 	-- rotator control signals
 	right_shift <= '1' when e_in.insn_type = OP_SHR else '0';
 	rot_clear_left <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCL else '0';
 	rot_clear_right <= '1' when e_in.insn_type = OP_RLC or e_in.insn_type = OP_RLCR else '0';
+        rot_sign_ext <= '1' when e_in.insn_type = OP_EXTSWSLI else '0';
 
-	ctrl_tmp.irq_state <= WRITE_SRR0;
+        v.e.srr1 := msr_copy(ctrl.msr);
 	exception := '0';
         illegal := '0';
-        exception_nextpc := '0';
-        v.e.exc_write_enable := '0';
-        v.e.exc_write_reg := fast_spr_num(SPR_SRR0);
-        v.e.exc_write_data := e_in.nia;
-
-	if ctrl.irq_state = WRITE_SRR1 then
-	    v.e.exc_write_reg := fast_spr_num(SPR_SRR1);
-	    v.e.exc_write_data := ctrl.srr1;
-            v.e.exc_write_enable := '1';
-            ctrl_tmp.msr(MSR_SF) <= '1';
-            ctrl_tmp.msr(MSR_EE) <= '0';
-            ctrl_tmp.msr(MSR_PR) <= '0';
-            ctrl_tmp.msr(MSR_IR) <= '0';
-            ctrl_tmp.msr(MSR_DR) <= '0';
-            ctrl_tmp.msr(MSR_RI) <= '0';
-            ctrl_tmp.msr(MSR_LE) <= '1';
-	    f_out.redirect <= '1';
-	    f_out.redirect_nia <= ctrl.irq_nia;
-	    v.e.valid := e_in.valid;
-	    report "Writing SRR1: " & to_hstring(ctrl.srr1);
-
-	elsif irq_valid = '1' then
-	    -- we need two cycles to write srr0 and 1
-	    -- will need more when we have to write DSISR, DAR and HIER
-            -- Don't deliver the interrupt until we have a valid instruction
-            -- coming in, so we have a valid NIA to put in SRR0.
-	    exception := e_in.valid;
-	    ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#900#, 64));
-	    ctrl_tmp.srr1 <= msr_copy(ctrl.msr);
-
-        elsif e_in.valid = '1' and ctrl.msr(MSR_PR) = '1' and
-            instr_is_privileged(e_in.insn_type, e_in.insn) then
-            -- generate a program interrupt
-            exception := '1';
-            ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#700#, 64));
-            ctrl_tmp.srr1 <= msr_copy(ctrl.msr);
-            -- set bit 45 to indicate privileged instruction type interrupt
-            ctrl_tmp.srr1(63 - 45) <= '1';
-            report "privileged instruction";
-            
-	elsif e_in.valid = '1' and e_in.unit = ALU then
+        if valid_in = '1' then
+            v.e.last_nia := e_in.nia;
+        else
+            v.e.last_nia := r.e.last_nia;
+        end if;
 
+        v.e.mode_32bit := not ctrl.msr(MSR_SF);
+        v.e.instr_tag := current.instr_tag;
+
+        do_trace := valid_in and ctrl.msr(MSR_SE);
+        if valid_in = '1' then
+            v.prev_op := e_in.insn_type;
+        end if;
+
+        -- Determine if there is any exception to be taken
+        -- before/instead of executing this instruction
+        if valid_in = '1' and e_in.second = '0' then
+            if HAS_FPU and r.fp_exception_next = '1' then
+                -- This is used for FP-type program interrupts that
+                -- become pending due to MSR[FE0,FE1] changing from 00 to non-zero.
+                exception := '1';
+                v.e.intr_vec := 16#700#;
+                v.e.srr1(63 - 43) := '1';
+                v.e.srr1(63 - 47) := '1';
+            elsif r.trace_next = '1' then
+                -- Generate a trace interrupt rather than executing the next instruction
+                -- or taking any asynchronous interrupt
+                exception := '1';
+                v.e.intr_vec := 16#d00#;
+                v.e.srr1(63 - 33) := '1';
+                if r.prev_op = OP_LOAD or r.prev_op = OP_ICBI or r.prev_op = OP_ICBT or
+                    r.prev_op = OP_DCBT or r.prev_op = OP_DCBST or r.prev_op = OP_DCBF then
+                    v.e.srr1(63 - 35) := '1';
+                elsif r.prev_op = OP_STORE or r.prev_op = OP_DCBZ or r.prev_op = OP_DCBTST then
+                    v.e.srr1(63 - 36) := '1';
+                end if;
+
+            elsif irq_valid = '1' then
+                -- Don't deliver the interrupt until we have a valid instruction
+                -- coming in, so we have a valid NIA to put in SRR0.
+                exception := '1';
+
+            elsif ctrl.msr(MSR_PR) = '1' and instr_is_privileged(e_in.insn_type, e_in.insn) then
+                -- generate a program interrupt
+                exception := '1';
+                v.e.intr_vec := 16#700#;
+                -- set bit 45 to indicate privileged instruction type interrupt
+                v.e.srr1(63 - 45) := '1';
+                report "privileged instruction";
+
+            elsif not HAS_FPU and e_in.fac = FPU then
+                -- make lfd/stfd/lfs/stfs etc. illegal in no-FPU implementations
+                illegal := '1';
+
+            elsif HAS_FPU and ctrl.msr(MSR_FP) = '0' and e_in.fac = FPU then
+                -- generate a floating-point unavailable interrupt
+                exception := '1';
+                v.e.intr_vec := 16#800#;
+                report "FP unavailable interrupt";
+            end if;
+        end if;
+
+	if valid_in = '1' and exception = '0' and illegal = '0' and e_in.unit = ALU then
+            v.cur_instr := e_in;
 	    v.e.valid := '1';
-	    v.e.write_reg := e_in.write_reg;
-	    v.slow_op_dest := gspr_to_gpr(e_in.write_reg);
-	    v.slow_op_rc := e_in.rc;
-	    v.slow_op_oe := e_in.oe;
-	    v.slow_op_xerc := v.e.xerc;
 
 	    case_0: case e_in.insn_type is
 
 	    when OP_ILLEGAL =>
 		-- we need two cycles to write srr0 and 1
-		-- will need more when we have to write DSISR, DAR and HIER
+		-- will need more when we have to write HEIR
 		illegal := '1';
 	    when OP_SC =>
 		-- check bit 1 of the instruction is 1 so we know this is sc;
                 -- 0 would mean scv, so generate an illegal instruction interrupt
 		-- we need two cycles to write srr0 and 1
-		-- will need more when we have to write DSISR, DAR and HIER
                 if e_in.insn(1) = '1' then
                     exception := '1';
-                    exception_nextpc := '1';
-                    ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#C00#, 64));
-                    ctrl_tmp.srr1 <= msr_copy(ctrl.msr);
+                    v.e.intr_vec := 16#C00#;
+                    v.e.last_nia := next_nia;
                     report "sc";
                 else
                     illegal := '1';
@@ -497,282 +816,155 @@ begin
                 -- check bits 1-10 of the instruction to make sure it's attn
                 -- if not then it is illegal
                 if e_in.insn(10 downto 1) = "0100000000" then
-                    terminate_out <= '1';
+                    v.terminate := '1';
                     report "ATTN";
                 else
                     illegal := '1';
                 end if;
-	    when OP_NOP =>
+	    when OP_NOP | OP_DCBF | OP_DCBST | OP_DCBT | OP_DCBTST | OP_ICBT =>
 		-- Do nothing
-	    when OP_ADD | OP_CMP | OP_TRAP =>
-		if e_in.invert_a = '0' then
-		    a_inv := a_in;
-		else
-		    a_inv := not a_in;
-		end if;
-		result_with_carry := ppc_adde(a_inv, b_in,
-					      decode_input_carry(e_in.input_carry, v.e.xerc));
-		result := result_with_carry(63 downto 0);
-                carry_32 := result(32) xor a_inv(32) xor b_in(32);
-                carry_64 := result_with_carry(64);
-                if e_in.insn_type = OP_ADD then
-                    if e_in.output_carry = '1' then
+	    when OP_ADD =>
+                if e_in.output_carry = '1' then
+                    if e_in.input_carry /= OV then
                         set_carry(v.e, carry_32, carry_64);
-                    end if;
-                    if e_in.oe = '1' then
-                        set_ov(v.e,
-                               calc_ov(a_inv(63), b_in(63), carry_64, result_with_carry(63)),
-                               calc_ov(a_inv(31), b_in(31), carry_32, result_with_carry(31)));
-                    end if;
-                    result_en := '1';
-                else
-                    -- trap, CMP and CMPL instructions
-                    -- Note, we have done RB - RA, not RA - RB
-                    if e_in.insn_type = OP_CMP then
-                        l := insn_l(e_in.insn);
-                    else
-                        l := not e_in.is_32bit;
-                    end if;
-                    zerolo := not (or (a_in(31 downto 0) xor b_in(31 downto 0)));
-                    zerohi := not (or (a_in(63 downto 32) xor b_in(63 downto 32)));
-                    if zerolo = '1' and (l = '0' or zerohi = '1') then
-                        -- values are equal
-                        trapval := "00100";
-                    else
-                        if l = '1' then
-                            -- 64-bit comparison
-                            msb_a := a_in(63);
-                            msb_b := b_in(63);
-                        else
-                            -- 32-bit comparison
-                            msb_a := a_in(31);
-                            msb_b := b_in(31);
-                        end if;
-                        if msb_a /= msb_b then
-                            -- Subtraction might overflow, but
-                            -- comparison is clear from MSB difference.
-                            -- for signed, 0 is greater; for unsigned, 1 is greater
-                            trapval := msb_a & msb_b & '0' & msb_b & msb_a;
-                        else
-                            -- Subtraction cannot overflow since MSBs are equal.
-                            -- carry = 1 indicates RA is smaller (signed or unsigned)
-                            a_lt := (not l and carry_32) or (l and carry_64);
-                            trapval := a_lt & not a_lt & '0' & a_lt & not a_lt;
-                        end if;
-                    end if;
-                    if e_in.insn_type = OP_CMP then
-                        if e_in.is_signed = '1' then
-                            newcrf := trapval(4 downto 2) & v.e.xerc.so;
-                        else
-                            newcrf := trapval(1 downto 0) & trapval(2) & v.e.xerc.so;
-                        end if;
-                        bf := insn_bf(e_in.insn);
-                        crnum := to_integer(unsigned(bf));
-                        v.e.write_cr_enable := '1';
-                        v.e.write_cr_mask := num_to_fxm(crnum);
-                        for i in 0 to 7 loop
-                            lo := i*4;
-                            hi := lo + 3;
-                            v.e.write_cr_data(hi downto lo) := newcrf;
-                        end loop;
                     else
-                        -- trap instructions (tw, twi, td, tdi)
-                        if or (trapval and insn_to(e_in.insn)) = '1' then
-                            -- generate trap-type program interrupt
-                            exception := '1';
-                            ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#700#, 64));
-                            ctrl_tmp.srr1 <= msr_copy(ctrl.msr);
-                            -- set bit 46 to say trap occurred
-                            ctrl_tmp.srr1(63 - 46) <= '1';
-                            report "trap";
-                        end if;
+                        v.e.xerc.ov := carry_64;
+                        v.e.xerc.ov32 := carry_32;
                     end if;
                 end if;
-	    when OP_AND | OP_OR | OP_XOR =>
-		result := logical_result;
-		result_en := '1';
+                if e_in.oe = '1' then
+                    set_ov(v.e, overflow_64, overflow_32);
+                end if;
+            when OP_CMP =>
+            when OP_TRAP =>
+                -- trap instructions (tw, twi, td, tdi)
+                v.e.intr_vec := 16#700#;
+                -- set bit 46 to say trap occurred
+                v.e.srr1(63 - 46) := '1';
+                if or (trapval and insn_to(e_in.insn)) = '1' then
+                    -- generate trap-type program interrupt
+                    exception := '1';
+                    report "trap";
+                end if;
+            when OP_ADDG6S =>
+            when OP_CMPRB =>
+            when OP_CMPEQB =>
+            when OP_AND | OP_OR | OP_XOR | OP_POPCNT | OP_PRTY | OP_CMPB | OP_EXTS |
+                    OP_BPERM | OP_BCD =>
+
 	    when OP_B =>
-		f_out.redirect <= '1';
-		if (insn_aa(e_in.insn)) then
-		    f_out.redirect_nia <= b_in;
-		else
-		    f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
-		end if;
-	    when OP_BC =>
-		-- read_data1 is CTR
-		bo := insn_bo(e_in.insn);
-		bi := insn_bi(e_in.insn);
-		if bo(4-2) = '0' then
-		    result := std_ulogic_vector(unsigned(a_in) - 1);
-		    result_en := '1';
-		    v.e.write_reg := fast_spr_num(SPR_CTR);
-		end if;
-		if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
-		    f_out.redirect <= '1';
-		    if (insn_aa(e_in.insn)) then
-			f_out.redirect_nia <= b_in;
-		    else
-			f_out.redirect_nia <= std_ulogic_vector(signed(e_in.nia) + signed(b_in));
-		    end if;
-		end if;
-	    when OP_BCREG =>
-		-- read_data1 is CTR
-		-- read_data2 is target register (CTR, LR or TAR)
+                is_branch := '1';
+                taken_branch := '1';
+                is_direct_branch := '1';
+                abs_branch := e_in.br_abs;
+                if ctrl.msr(MSR_BE) = '1' then
+                    do_trace := '1';
+                end if;
+            when OP_BC | OP_BCREG =>
+                -- read_data1 is CTR
+		-- for OP_BCREG, read_data2 is target register (CTR, LR or TAR)
+                -- If this instruction updates both CTR and LR, then it is
+                -- doubled; the first instruction decrements CTR and determines
+                -- whether the branch is taken, and the second does the
+                -- redirect and the LR update.
 		bo := insn_bo(e_in.insn);
 		bi := insn_bi(e_in.insn);
-		if bo(4-2) = '0' and e_in.insn(10) = '0' then
-		    result := std_ulogic_vector(unsigned(a_in) - 1);
-		    result_en := '1';
-		    v.e.write_reg := fast_spr_num(SPR_CTR);
-		end if;
-		if ppc_bc_taken(bo, bi, e_in.cr, a_in) = 1 then
-		    f_out.redirect <= '1';
-		    f_out.redirect_nia <= b_in(63 downto 2) & "00";
-		end if;
+                if e_in.second = '0' then
+                    taken_branch := ppc_bc_taken(bo, bi, cr_in, a_in);
+                else
+                    taken_branch := r.br_taken;
+                end if;
+                v.br_taken := taken_branch;
+                abs_branch := e_in.br_abs;
+                if e_in.repeat = '0' or e_in.second = '1' then
+                    is_branch := '1';
+                    if e_in.insn_type = OP_BC then
+                        is_direct_branch := '1';
+                    end if;
+                    if ctrl.msr(MSR_BE) = '1' then
+                        do_trace := '1';
+                    end if;
+                end if;
 
 	    when OP_RFID =>
-		f_out.redirect <= '1';
-		f_out.redirect_nia <= a_in(63 downto 2) & "00"; -- srr0
+                v.e.redir_mode := (a_in(MSR_IR) or a_in(MSR_PR)) & not a_in(MSR_PR) &
+                                  not a_in(MSR_LE) & not a_in(MSR_SF);
                 -- Can't use msr_copy here because the partial function MSR
                 -- bits should be left unchanged, not zeroed.
-                ctrl_tmp.msr(63 downto 31) <= b_in(63 downto 31);
-                ctrl_tmp.msr(26 downto 22) <= b_in(26 downto 22);
-                ctrl_tmp.msr(15 downto 0)  <= b_in(15 downto 0);
-                if b_in(MSR_PR) = '1' then
+                ctrl_tmp.msr(63 downto 31) <= a_in(63 downto 31);
+                ctrl_tmp.msr(26 downto 22) <= a_in(26 downto 22);
+                ctrl_tmp.msr(15 downto 0)  <= a_in(15 downto 0);
+                if a_in(MSR_PR) = '1' then
                     ctrl_tmp.msr(MSR_EE) <= '1';
                     ctrl_tmp.msr(MSR_IR) <= '1';
                     ctrl_tmp.msr(MSR_DR) <= '1';
                 end if;
+                -- mark this as a branch so CFAR gets updated
+                is_branch := '1';
+                taken_branch := '1';
+                abs_branch := '1';
+                if HAS_FPU then
+                    v.fp_exception_next := fp_in.exception and
+                                           (a_in(MSR_FE0) or a_in(MSR_FE1));
+                end if;
+                do_trace := '0';
 
-	    when OP_CMPB =>
-		result := ppc_cmpb(c_in, b_in);
-		result_en := '1';
             when OP_CNTZ =>
                 v.e.valid := '0';
                 v.cntz_in_progress := '1';
-                stall_out <= '1';
-            when OP_EXTS =>
-                -- note data_len is a 1-hot encoding
-		negative := (e_in.data_len(0) and c_in(7)) or
-			    (e_in.data_len(1) and c_in(15)) or
-			    (e_in.data_len(2) and c_in(31));
-		result := (others => negative);
-		if e_in.data_len(2) = '1' then
-		    result(31 downto 16) := c_in(31 downto 16);
-		end if;
-		if e_in.data_len(2) = '1' or e_in.data_len(1) = '1' then
-		    result(15 downto 8) := c_in(15 downto 8);
-		end if;
-		result(7 downto 0) := c_in(7 downto 0);
-		result_en := '1';
+                v.busy := '1';
 	    when OP_ISEL =>
-		crbit := to_integer(unsigned(insn_bc(e_in.insn)));
-		if e_in.cr(31-crbit) = '1' then
-		    result := a_in;
-		else
-		    result := b_in;
-		end if;
-		result_en := '1';
-	    when OP_CROP =>
-		cr_op := insn_cr(e_in.insn);
-		report "CR OP " & to_hstring(cr_op);
-		if cr_op(0) = '0' then -- MCRF
-		    bf := insn_bf(e_in.insn);
-		    bfa := insn_bfa(e_in.insn);
-		    v.e.write_cr_enable := '1';
-		    crnum := to_integer(unsigned(bf));
-		    scrnum := to_integer(unsigned(bfa));
-		    v.e.write_cr_mask := num_to_fxm(crnum);
-		    for i in 0 to 7 loop
-		        lo := (7-i)*4;
-		        hi := lo + 3;
-		        if i = scrnum then
-			    newcrf := e_in.cr(hi downto lo);
-		        end if;
-		    end loop;
-		    for i in 0 to 7 loop
-		        lo := i*4;
-		        hi := lo + 3;
-		        v.e.write_cr_data(hi downto lo) := newcrf;
-		    end loop;
-		else
-		    v.e.write_cr_enable := '1';
-		    bt := insn_bt(e_in.insn);
-		    ba := insn_ba(e_in.insn);
-		    bb := insn_bb(e_in.insn);
-		    btnum := 31 - to_integer(unsigned(bt));
-		    banum := 31 - to_integer(unsigned(ba));
-		    bbnum := 31 - to_integer(unsigned(bb));
-                    -- Bits 5-8 of cr_op give the truth table of the requested
-                    -- logical operation
-                    cr_operands := e_in.cr(banum) & e_in.cr(bbnum);
-                    crresult := cr_op(5 + to_integer(unsigned(cr_operands)));
-		    v.e.write_cr_mask := num_to_fxm((31-btnum) / 4);
-		    for i in 0 to 31 loop
-			if i = btnum then
-		            v.e.write_cr_data(i) := crresult;
-			else
-		            v.e.write_cr_data(i) := e_in.cr(i);
-			end if;
-		    end loop;
-		end if;
+            when OP_CROP =>
+            when OP_MCRXRX =>
+            when OP_DARN =>
 	    when OP_MFMSR =>
-		result := ctrl.msr;
-		result_en := '1';
 	    when OP_MFSPR =>
 		report "MFSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
 		    "=" & to_hstring(a_in);
-		if is_fast_spr(e_in.read_reg1) then
-		    result := a_in;
-		    if decode_spr_num(e_in.insn) = SPR_XER then
+		if is_fast_spr(e_in.read_reg1) = '1' then
+		    spr_val := a_in;
+                    if decode_spr_num(e_in.insn) = SPR_XER then
 			-- bits 0:31 and 35:43 are treated as reserved and return 0s when read using mfxer
-			result(63 downto 32) := (others => '0');
-			result(63-32) := v.e.xerc.so;
-			result(63-33) := v.e.xerc.ov;
-			result(63-34) := v.e.xerc.ca;
-			result(63-35 downto 63-43) := "000000000";
-			result(63-44) := v.e.xerc.ov32;
-			result(63-45) := v.e.xerc.ca32;
-		    end if;
+			spr_val(63 downto 32) := (others => '0');
+			spr_val(63-32) := xerc_in.so;
+			spr_val(63-33) := xerc_in.ov;
+			spr_val(63-34) := xerc_in.ca;
+			spr_val(63-35 downto 63-43) := "000000000";
+			spr_val(63-44) := xerc_in.ov32;
+			spr_val(63-45) := xerc_in.ca32;
+                    end if;
 		else
-		    case decode_spr_num(e_in.insn) is
+                    spr_val := c_in;
+                    case decode_spr_num(e_in.insn) is
 		    when SPR_TB =>
-			result := ctrl.tb;
+			spr_val := ctrl.tb;
+		    when SPR_TBU =>
+                        spr_val(63 downto 32) := (others => '0');
+			spr_val(31 downto 0)  := ctrl.tb(63 downto 32);
 		    when SPR_DEC =>
-			result := ctrl.dec;
-		    when others =>
-			result := (others => '0');
-		    end case;
-		end if;
-		result_en := '1';
+			spr_val := ctrl.dec;
+                    when SPR_CFAR =>
+                        spr_val := ctrl.cfar;
+                    when SPR_PVR =>
+                        spr_val(63 downto 32) := (others => '0');
+                        spr_val(31 downto 0) := PVR_MICROWATT;
+                    when 724 =>     -- LOG_ADDR SPR
+                        spr_val := log_wr_addr & r.log_addr_spr;
+                    when 725 =>     -- LOG_DATA SPR
+                        spr_val := log_rd_data;
+                        v.log_addr_spr := std_ulogic_vector(unsigned(r.log_addr_spr) + 1);
+                    when others =>
+                        -- mfspr from unimplemented SPRs should be a nop in
+                        -- supervisor mode and a program interrupt for user mode
+                        if is_fast_spr(e_in.read_reg1) = '0' and ctrl.msr(MSR_PR) = '1' then
+                            illegal := '1';
+                        end if;
+                    end case;
+                end if;
+                spr_result <= spr_val;
+
 	    when OP_MFCR =>
-		if e_in.insn(20) = '0' then
-		    -- mfcr
-		    result := x"00000000" & e_in.cr;
-		else
-		    -- mfocrf
-		    crnum := fxm_to_num(insn_fxm(e_in.insn));
-		    result := (others => '0');
-		    for i in 0 to 7 loop
-			lo := (7-i)*4;
-			hi := lo + 3;
-			if crnum = i then
-			    result(hi downto lo) := e_in.cr(hi downto lo);
-			end if;
-		    end loop;
-		end if;
-		result_en := '1';
 	    when OP_MTCRF =>
-		v.e.write_cr_enable := '1';
-		if e_in.insn(20) = '0' then
-		    -- mtcrf
-		    v.e.write_cr_mask := insn_fxm(e_in.insn);
-		else
-		    -- mtocrf: We require one hot priority encoding here
-		    crnum := fxm_to_num(insn_fxm(e_in.insn));
-		    v.e.write_cr_mask := num_to_fxm(crnum);
-		end if;
-		v.e.write_cr_data := c_in(31 downto 0);
             when OP_MTMSRD =>
                 if e_in.insn(16) = '1' then
                     -- just update EE and RI
@@ -781,53 +973,57 @@ begin
                 else
                     -- Architecture says to leave out bits 3 (HV), 51 (ME)
                     -- and 63 (LE) (IBM bit numbering)
-                    ctrl_tmp.msr(63 downto 61) <= c_in(63 downto 61);
-                    ctrl_tmp.msr(59 downto 13) <= c_in(59 downto 13);
+                    if e_in.is_32bit = '0' then
+                        ctrl_tmp.msr(63 downto 61) <= c_in(63 downto 61);
+                        ctrl_tmp.msr(59 downto 32) <= c_in(59 downto 32);
+                    end if;
+                    ctrl_tmp.msr(31 downto 13) <= c_in(31 downto 13);
                     ctrl_tmp.msr(11 downto 1)  <= c_in(11 downto 1);
                     if c_in(MSR_PR) = '1' then
                         ctrl_tmp.msr(MSR_EE) <= '1';
                         ctrl_tmp.msr(MSR_IR) <= '1';
                         ctrl_tmp.msr(MSR_DR) <= '1';
                     end if;
+                    if HAS_FPU then
+                        v.fp_exception_next := fp_in.exception and
+                                               (c_in(MSR_FE0) or c_in(MSR_FE1));
+                    end if;
                 end if;
 	    when OP_MTSPR =>
 		report "MTSPR to SPR " & integer'image(decode_spr_num(e_in.insn)) &
 		    "=" & to_hstring(c_in);
 		if is_fast_spr(e_in.write_reg) then
-		    result := c_in;
-		    result_en := '1';
 		    if decode_spr_num(e_in.insn) = SPR_XER then
 			v.e.xerc.so := c_in(63-32);
 			v.e.xerc.ov := c_in(63-33);
 			v.e.xerc.ca := c_in(63-34);
 			v.e.xerc.ov32 := c_in(63-44);
 			v.e.xerc.ca32 := c_in(63-45);
-			v.e.write_xerc_enable := '1';
 		    end if;
 		else
 		    -- slow spr
 		    case decode_spr_num(e_in.insn) is
 		    when SPR_DEC =>
 			ctrl_tmp.dec <= c_in;
+                    when 724 =>     -- LOG_ADDR SPR
+                        v.log_addr_spr := c_in(31 downto 0);
 		    when others =>
+                        -- mtspr to unimplemented SPRs should be a nop in
+                        -- supervisor mode and a program interrupt for user mode
+                        if ctrl.msr(MSR_PR) = '1' then
+                            illegal := '1';
+                        end if;
 		    end case;
 		end if;
-	    when OP_POPCNT =>
-		result := popcnt_result;
-		result_en := '1';
-	    when OP_PRTY =>
-		result := parity_result;
-		result_en := '1';
-	    when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR =>
-		result := rotator_result;
+	    when OP_RLC | OP_RLCL | OP_RLCR | OP_SHL | OP_SHR | OP_EXTSWSLI =>
 		if e_in.output_carry = '1' then
 		    set_carry(v.e, rotator_carry, rotator_carry);
 		end if;
-		result_en := '1';
+            when OP_SETB =>
 
 	    when OP_ISYNC =>
-		f_out.redirect <= '1';
-		f_out.redirect_nia <= next_nia;
+		v.e.redirect := '1';
+                v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64));
 
 	    when OP_ICBI =>
 		icache_inval <= '1';
@@ -835,140 +1031,265 @@ begin
 	    when OP_MUL_L64 | OP_MUL_H64 | OP_MUL_H32 =>
 		v.e.valid := '0';
 		v.mul_in_progress := '1';
-		stall_out <= '1';
+		v.busy := '1';
 		x_to_multiply.valid <= '1';
 
 	    when OP_DIV | OP_DIVE | OP_MOD =>
 		v.e.valid := '0';
 		v.div_in_progress := '1';
-		stall_out <= '1';
+		v.busy := '1';
 		x_to_divider.valid <= '1';
 
             when others =>
-		terminate_out <= '1';
+		v.terminate := '1';
 		report "illegal";
 	    end case;
 
-	    v.e.rc := e_in.rc and e_in.valid;
-
-	    -- Update LR on the next cycle after a branch link
-	    --
-	    -- WARNING: The LR update isn't tracked by our hazard tracker. This
-	    --          will work (well I hope) because it only happens on branches
-	    --          which will flush all decoded instructions. By the time
-	    --          fetch catches up, we'll have the new LR. This will
-	    --          *not* work properly however if we have a branch predictor,
-	    --          in which case the solution would probably be to keep a
-	    --          local cache of the updated LR in execute1 (flushed on
-	    --          exceptions) that is used instead of the value from
-	    --          decode when its content is valid.
-	    if e_in.lr = '1' then
-		v.lr_update := '1';
-		v.next_lr := next_nia;
-		v.e.valid := '0';
-		report "Delayed LR update to " & to_hstring(next_nia);
-		stall_out <= '1';
-	    end if;
+            -- Mispredicted branches cause a redirect
+            if is_branch = '1' then
+                if taken_branch = '1' then
+                    ctrl_tmp.cfar <= e_in.nia;
+                end if;
+                if taken_branch = '1' then
+                    v.e.br_offset := b_in;
+                    v.e.abs_br := abs_branch;
+                else
+                    v.e.br_offset := std_ulogic_vector(to_unsigned(4, 64));
+                end if;
+                if taken_branch /= e_in.br_pred then
+                    v.e.redirect := '1';
+                end if;
+                v.e.br_last := is_direct_branch;
+                v.e.br_taken := taken_branch;
+            end if;
 
-        elsif e_in.valid = '1' then
+        elsif valid_in = '1' and exception = '0' and illegal = '0' then
             -- instruction for other units, i.e. LDST
-            v.e.valid := '0';
             if e_in.unit = LDST then
                 lv.valid := '1';
+            elsif e_in.unit = NONE then
+                illegal := '1';
+            elsif HAS_FPU and e_in.unit = FPU then
+                fv.valid := '1';
+            end if;
+            -- Handling an ITLB miss doesn't count as having executed an instruction
+            if e_in.insn_type = OP_FETCH_FAILED then
+                do_trace := '0';
             end if;
+        end if;
 
-	elsif r.lr_update = '1' then
-	    result_en := '1';
-	    result := r.next_lr;
-	    v.e.write_reg := fast_spr_num(SPR_LR);
-	    v.e.valid := '1';
-        elsif r.cntz_in_progress = '1' then
+        -- The following cases all occur when r.busy = 1 and therefore
+        -- valid_in = 0.  Hence they don't happen in the same cycle as any of
+        -- the cases above which depend on valid_in = 1.
+        if r.cntz_in_progress = '1' then
             -- cnt[lt]z always takes two cycles
-            result := countzero_result;
-            result_en := '1';
-            v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
-            v.e.rc := v.slow_op_rc;
-            v.e.xerc := v.slow_op_xerc;
             v.e.valid := '1';
 	elsif r.mul_in_progress = '1' or r.div_in_progress = '1' then
 	    if (r.mul_in_progress = '1' and multiply_to_x.valid = '1') or
 	       (r.div_in_progress = '1' and divider_to_x.valid = '1') then
 		if r.mul_in_progress = '1' then
-		    result := multiply_to_x.write_reg_data;
-		    overflow := multiply_to_x.overflow;
+                    overflow := '0';
 		else
-		    result := divider_to_x.write_reg_data;
 		    overflow := divider_to_x.overflow;
 		end if;
-		result_en := '1';
-		v.e.write_reg := gpr_to_gspr(v.slow_op_dest);
-		v.e.rc := v.slow_op_rc;
-		v.e.xerc := v.slow_op_xerc;
-		v.e.write_xerc_enable := v.slow_op_oe;
-		-- We must test oe because the RC update code in writeback
-		-- will use the xerc value to set CR0:SO so we must not clobber
-		-- xerc if OE wasn't set.
-		if v.slow_op_oe = '1' then
-		    v.e.xerc.ov := overflow;
-		    v.e.xerc.ov32 := overflow;
-		    v.e.xerc.so := v.slow_op_xerc.so or overflow;
-		end if;
-		v.e.valid := '1';
+                if r.mul_in_progress = '1' and current.oe = '1' then
+                    -- have to wait until next cycle for overflow indication
+                    v.mul_finish := '1';
+                    v.busy := '1';
+                else
+                    -- We must test oe because the RC update code in writeback
+                    -- will use the xerc value to set CR0:SO so we must not clobber
+                    -- xerc if OE wasn't set.
+                    if current.oe = '1' then
+                        v.e.xerc.ov := overflow;
+                        v.e.xerc.ov32 := overflow;
+                        if overflow = '1' then
+                            v.e.xerc.so := '1';
+                        end if;
+                    end if;
+                    v.e.valid := '1';
+                end if;
 	    else
-		stall_out <= '1';
+		v.busy := '1';
 		v.mul_in_progress := r.mul_in_progress;
 		v.div_in_progress := r.div_in_progress;
 	    end if;
+        elsif r.mul_finish = '1' then
+            hold_wr_data := '1';
+            v.e.xerc.ov := multiply_to_x.overflow;
+            v.e.xerc.ov32 := multiply_to_x.overflow;
+            if multiply_to_x.overflow = '1' then
+                v.e.xerc.so := '1';
+            end if;
+            v.e.valid := '1';
 	end if;
 
-        if illegal = '1' then
+        -- Generate FP-type program interrupt.  fp_in.interrupt will only
+        -- be set during the execution of a FP instruction.
+        -- The case where MSR[FE0,FE1] goes from zero to non-zero is
+        -- handled above by mtmsrd and rfid setting v.fp_exception_next.
+        if HAS_FPU and fp_in.interrupt = '1' then
+            v.e.intr_vec := 16#700#;
+            v.e.srr1(63 - 43) := '1';
             exception := '1';
-            ctrl_tmp.irq_nia <= std_logic_vector(to_unsigned(16#700#, 64));
-            ctrl_tmp.srr1 <= msr_copy(ctrl.msr);
+        end if;
+
+        if illegal = '1' or (HAS_FPU and fp_in.illegal = '1') then
+            exception := '1';
+            v.e.intr_vec := 16#700#;
             -- Since we aren't doing Hypervisor emulation assist (0xe40) we
             -- set bit 44 to indicate we have an illegal
-            ctrl_tmp.srr1(63 - 44) <= '1';
+            v.e.srr1(63 - 44) := '1';
             report "illegal";
         end if;
-	if exception = '1' then
-            v.e.exc_write_enable := '1';
-            if exception_nextpc = '1' then
-                v.e.exc_write_data := next_nia;
+
+        -- generate DSI or DSegI for load/store exceptions
+        -- or ISI or ISegI for instruction fetch exceptions
+        if l_in.exception = '1' then
+            if l_in.alignment = '1' then
+                v.e.intr_vec := 16#600#;
+            elsif l_in.instr_fault = '0' then
+                if l_in.segment_fault = '0' then
+                    v.e.intr_vec := 16#300#;
+                else
+                    v.e.intr_vec := 16#380#;
+                end if;
+            else
+                if l_in.segment_fault = '0' then
+                    v.e.srr1(63 - 33) := l_in.invalid;
+                    v.e.srr1(63 - 35) := l_in.perm_error; -- noexec fault
+                    v.e.srr1(63 - 44) := l_in.badtree;
+                    v.e.srr1(63 - 45) := l_in.rc_error;
+                    v.e.intr_vec := 16#400#;
+                else
+                    v.e.intr_vec := 16#480#;
+                end if;
             end if;
-            ctrl_tmp.irq_state <= WRITE_SRR1;
-            v.e.valid := '1';
-	end if;
+        end if;
+
+        v.e.interrupt := exception or l_in.exception;
 
-	v.e.write_data := result;
-	v.e.write_enable := result_en;
+        if do_trace = '1' then
+            v.trace_next := '1';
+        end if;
+
+ 	if interrupt_in = '1' then
+            ctrl_tmp.msr(MSR_SF) <= '1';
+            ctrl_tmp.msr(MSR_EE) <= '0';
+            ctrl_tmp.msr(MSR_PR) <= '0';
+            ctrl_tmp.msr(MSR_SE) <= '0';
+            ctrl_tmp.msr(MSR_BE) <= '0';
+            ctrl_tmp.msr(MSR_FP) <= '0';
+            ctrl_tmp.msr(MSR_FE0) <= '0';
+            ctrl_tmp.msr(MSR_FE1) <= '0';
+            ctrl_tmp.msr(MSR_IR) <= '0';
+            ctrl_tmp.msr(MSR_DR) <= '0';
+            ctrl_tmp.msr(MSR_RI) <= '0';
+            ctrl_tmp.msr(MSR_LE) <= '1';
+            v.trace_next := '0';
+            v.fp_exception_next := '0';
+        end if;
+
+        if hold_wr_data = '0' then
+            v.e.write_data := alu_result;
+        else
+            v.e.write_data := r.e.write_data;
+        end if;
+        v.e.write_reg := current.write_reg;
+	v.e.write_enable := current.write_reg_enable and v.e.valid and not exception;
+        v.e.rc := current.rc and v.e.valid and not exception;
+        v.e.write_cr_data := write_cr_data;
+        v.e.write_cr_mask := write_cr_mask;
+        v.e.write_cr_enable := current.output_cr and v.e.valid and not exception;
+        v.e.write_xerc_enable := current.output_xer and v.e.valid and not exception;
+
+        bypass_data.tag.valid <= current.instr_tag.valid and current.write_reg_enable and v.e.valid;
+        bypass_data.tag.tag <= current.instr_tag.tag;
+        bypass_data.data <= v.e.write_data;
+
+        bypass_cr_data.tag.valid <= current.instr_tag.valid and current.output_cr and v.e.valid;
+        bypass_cr_data.tag.tag <= current.instr_tag.tag;
+        for i in 0 to 7 loop
+            if v.e.write_cr_mask(i) = '1' then
+                bypass_cr_data.data(i*4 + 3 downto i*4) <= v.e.write_cr_data(i*4 + 3 downto i*4);
+            else
+                bypass_cr_data.data(i*4 + 3 downto i*4) <= cr_in(i*4 + 3 downto i*4);
+            end if;
+        end loop;
 
         -- Outputs to loadstore1 (async)
         lv.op := e_in.insn_type;
+        lv.nia := e_in.nia;
+        lv.instr_tag := e_in.instr_tag;
         lv.addr1 := a_in;
         lv.addr2 := b_in;
         lv.data := c_in;
-        lv.write_reg := gspr_to_gpr(e_in.write_reg);
+        lv.write_reg := e_in.write_reg;
         lv.length := e_in.data_len;
-        lv.byte_reverse := e_in.byte_reverse;
+        lv.byte_reverse := e_in.byte_reverse xnor ctrl.msr(MSR_LE);
         lv.sign_extend := e_in.sign_extend;
         lv.update := e_in.update;
-        lv.update_reg := gspr_to_gpr(e_in.read_reg1);
-        lv.xerc := v.e.xerc;
+        lv.xerc := xerc_in;
         lv.reserve := e_in.reserve;
         lv.rc := e_in.rc;
+        lv.insn := e_in.insn;
         -- decode l*cix and st*cix instructions here
         if e_in.insn(31 downto 26) = "011111" and e_in.insn(10 downto 9) = "11" and
             e_in.insn(5 downto 1) = "10101" then
             lv.ci := '1';
         end if;
+        lv.virt_mode := ctrl.msr(MSR_DR);
+        lv.priv_mode := not ctrl.msr(MSR_PR);
+        lv.mode_32bit := not ctrl.msr(MSR_SF);
+        lv.is_32bit := e_in.is_32bit;
+        lv.repeat := e_in.repeat;
+        lv.second := e_in.second;
+
+        -- Outputs to FPU
+        fv.op := e_in.insn_type;
+        fv.nia := e_in.nia;
+        fv.insn := e_in.insn;
+        fv.itag := e_in.instr_tag;
+        fv.single := e_in.is_32bit;
+        fv.fe_mode := ctrl.msr(MSR_FE0) & ctrl.msr(MSR_FE1);
+        fv.fra := a_in;
+        fv.frb := b_in;
+        fv.frc := c_in;
+        fv.frt := e_in.write_reg;
+        fv.rc := e_in.rc;
+        fv.out_cr := e_in.output_cr;
 
 	-- Update registers
 	rin <= v;
 
 	-- update outputs
-	--f_out <= r.f;
         l_out <= lv;
 	e_out <= r.e;
-	flush_out <= f_out.redirect;
+        fp_out <= fv;
+
+        exception_log <= exception;
+        irq_valid_log <= irq_valid;
     end process;
+
+    e1_log: if LOG_LENGTH > 0 generate  -- log capture logic exists only when LOG_LENGTH is non-zero; otherwise nothing here drives log_out
+        signal log_data : std_ulogic_vector(14 downto 0);  -- registered 15-bit log word, same width as the log_out port
+    begin
+        ex1_log : process(clk)  -- clocked process: takes one snapshot of the status bits per rising edge
+        begin
+            if rising_edge(clk) then
+                log_data <= ctrl.msr(MSR_EE) & ctrl.msr(MSR_PR) &  -- bits 14, 13: MSR EE and PR bits
+                            ctrl.msr(MSR_IR) & ctrl.msr(MSR_DR) &  -- bits 12, 11: MSR IR and DR bits
+                            exception_log &  -- bit 10: copy of the main process's 'exception' variable
+                            irq_valid_log &  -- bit 9: copy of irq_valid from the main process
+                            interrupt_in &  -- bit 8: interrupt_in input port
+                            "000" &  -- bits 7..5: constant zero padding, carries no information
+                            r.e.write_enable &  -- bit 4: write_enable of the registered output record (r.e drives e_out)
+                            r.e.valid &  -- bit 3: valid flag of the registered output record
+                            (r.e.redirect or r.e.interrupt) &  -- bit 2: set when the registered output carries a redirect or an interrupt
+                            r.busy &  -- bit 1: registered busy flag
+                            flush_in;  -- bit 0: flush_in input port
+            end if;
+        end process;
+        log_out <= log_data;  -- expose the registered snapshot, so log_out lags the sampled signals by one clock
+    end generate;
 end architecture behaviour;