writeback: Do data formatting and condition recording in writeback
author Paul Mackerras <paulus@ozlabs.org>
Mon, 14 Oct 2019 01:56:01 +0000 (12:56 +1100)
committer Paul Mackerras <paulus@ozlabs.org>
Tue, 15 Oct 2019 04:23:28 +0000 (15:23 +1100)
This adds code to writeback to format data and test the result
against zero for the purpose of setting CR0.  The data formatter
is able to shift and mask by bytes and do byte reversal and sign
extension.  It can also put together bytes from two input
doublewords to support unaligned loads (including unaligned
byte-reversed loads).

The data formatter starts with an 8:1 multiplexer that is able
to direct any byte of the input to any byte of the output.  This
lets us rotate the data and simultaneously byte-reverse it.
The rotated/reversed data goes to a register for the unaligned
cases that overlap two doublewords.  Then there is per-byte logic
that does trimming, sign extension, and splicing together bytes
from a previous input doubleword (stored in data_latched) and the
current doubleword.  Finally the 64-bit result is tested to set
CR0 if rc = 1.

This removes the RC logic from the execute2, multiply and divide
units, and the shift/mask/byte-reverse/sign-extend logic from
loadstore2.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
Makefile
common.vhdl
divider.vhdl
divider_tb.vhdl
execute2.vhdl
loadstore2.vhdl
multiply.vhdl
multiply_tb.vhdl
writeback.vhdl

index af9c91dd2ccc7d703002761c66c5b2c25cbda048..6657d4d6383fab289113073317b1e373302df01c 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -27,7 +27,7 @@ decode1.o: common.o decode_types.o
 decode2.o: decode_types.o common.o helpers.o insn_helpers.o control.o
 decode_types.o:
 execute1.o: decode_types.o common.o helpers.o crhelpers.o insn_helpers.o ppc_fx_insns.o rotator.o logical.o countzero.o
-execute2.o: common.o crhelpers.o ppc_fx_insns.o
+execute2.o: common.o
 fetch1.o: common.o
 fetch2.o: common.o wishbone_types.o
 glibc_random_helpers.o:
@@ -43,9 +43,9 @@ loadstore1.o: common.o helpers.o
 loadstore2.o: common.o helpers.o wishbone_types.o
 logical.o: decode_types.o
 multiply_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o multiply.o
-multiply.o: common.o decode_types.o ppc_fx_insns.o crhelpers.o
+multiply.o: common.o decode_types.o
 divider_tb.o: decode_types.o common.o glibc_random.o ppc_fx_insns.o divider.o
-divider.o: common.o decode_types.o crhelpers.o
+divider.o: common.o decode_types.o
 ppc_fx_insns.o: helpers.o
 register_file.o: common.o
 rotator.o: common.o
@@ -58,7 +58,7 @@ sim_uart.o: wishbone_types.o sim_console.o
 soc.o: common.o wishbone_types.o core.o wishbone_arbiter.o sim_uart.o simple_ram_behavioural.o dmi_dtm_xilinx.o wishbone_debug_master.o
 wishbone_arbiter.o: wishbone_types.o
 wishbone_types.o:
-writeback.o: common.o
+writeback.o: common.o crhelpers.o
 dmi_dtm_tb.o: dmi_dtm_xilinx.o wishbone_debug_master.o
 dmi_dtm_xilinx.o: wishbone_types.o sim-unisim/unisim_vcomponents.o
 wishbone_debug_master.o: wishbone_types.o
index ae61342ff91bad39164d75069a0519b6bb26abdc..321cff1257a69e2244c6108ee5fdef835978b775 100644 (file)
--- a/common.vhdl
+++ b/common.vhdl
@@ -155,8 +155,13 @@ package common is
                write_enable: std_ulogic;
                write_reg : std_ulogic_vector(4 downto 0);
                write_data : std_ulogic_vector(63 downto 0);
+               write_len : std_ulogic_vector(3 downto 0);
+               write_shift : std_ulogic_vector(2 downto 0);
+               sign_extend : std_ulogic;
+               byte_reverse : std_ulogic;
+               second_word : std_ulogic;
        end record;
-       constant Loadstore2ToWritebackInit : Loadstore2ToWritebackType := (valid => '0', write_enable => '0', others => (others => '0'));
+       constant Loadstore2ToWritebackInit : Loadstore2ToWritebackType := (valid => '0', write_enable => '0', sign_extend => '0', byte_reverse => '0', second_word => '0', others => (others => '0'));
 
        type Execute1ToExecute2Type is record
                valid: std_ulogic;
@@ -172,6 +177,7 @@ package common is
 
        type Execute2ToWritebackType is record
                valid: std_ulogic;
+               rc : std_ulogic;
                write_enable : std_ulogic;
                write_reg: std_ulogic_vector(4 downto 0);
                write_data: std_ulogic_vector(63 downto 0);
@@ -179,7 +185,7 @@ package common is
                write_cr_mask : std_ulogic_vector(7 downto 0);
                write_cr_data : std_ulogic_vector(31 downto 0);
        end record;
-       constant Execute2ToWritebackInit : Execute2ToWritebackType := (valid => '0', write_enable => '0', write_cr_enable => '0', others => (others => '0'));
+       constant Execute2ToWritebackInit : Execute2ToWritebackType := (valid => '0', rc => '0', write_enable => '0', write_cr_enable => '0', others => (others => '0'));
 
        type MultiplyToWritebackType is record
                valid: std_ulogic;
@@ -187,11 +193,9 @@ package common is
                write_reg_enable : std_ulogic;
                write_reg_nr: std_ulogic_vector(4 downto 0);
                write_reg_data: std_ulogic_vector(63 downto 0);
-               write_cr_enable: std_ulogic;
-               write_cr_mask: std_ulogic_vector(7 downto 0);
-               write_cr_data: std_ulogic_vector(31 downto 0);
+               rc: std_ulogic;
        end record;
-       constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0', write_cr_enable => '0', others => (others => '0'));
+       constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0', rc => '0', others => (others => '0'));
 
        type DividerToWritebackType is record
                valid: std_ulogic;
@@ -199,11 +203,9 @@ package common is
                write_reg_enable : std_ulogic;
                write_reg_nr: std_ulogic_vector(4 downto 0);
                write_reg_data: std_ulogic_vector(63 downto 0);
-               write_cr_enable: std_ulogic;
-               write_cr_mask: std_ulogic_vector(7 downto 0);
-               write_cr_data: std_ulogic_vector(31 downto 0);
+               rc: std_ulogic;
        end record;
-       constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0', write_cr_enable => '0', others => (others => '0'));
+       constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0', rc => '0', others => (others => '0'));
 
        type WritebackToRegisterFileType is record
                write_reg : std_ulogic_vector(4 downto 0);
index cfadc5115e492ba5481fafd209b5938dbf0e6806..20d4600182015069b43daf3664f8036c15dc550e 100644 (file)
--- a/divider.vhdl
+++ b/divider.vhdl
@@ -5,7 +5,6 @@ use ieee.numeric_std.all;
 library work;
 use work.common.all;
 use work.decode_types.all;
-use work.crhelpers.all;
 
 entity divider is
     port (
@@ -37,7 +36,6 @@ architecture behaviour of divider is
     signal overflow   : std_ulogic;
     signal ovf32      : std_ulogic;
     signal did_ovf    : std_ulogic;
-    signal cr_data    : std_ulogic_vector(2 downto 0);
 
 begin
     divider_0: process(clk)
@@ -114,7 +112,7 @@ begin
     divider_1: process(all)
     begin
         d_out.write_reg_nr <= write_reg;
-        d_out.write_cr_mask <= num_to_fxm(0);
+        d_out.rc <= rc;
 
         if is_modulus = '1' then
             result <= dend(128 downto 65);
@@ -144,29 +142,18 @@ begin
         else
             oresult <= sresult;
         end if;
-
-        if (did_ovf = '1') or (or (sresult) = '0') then
-            cr_data <= "001";
-        elsif (sresult(63) = '1') and not ((is_32bit = '1') and (is_modulus = '0')) then
-            cr_data <= "100";
-        else
-            cr_data <= "010";
-        end if;
     end process;
 
     divider_out: process(clk)
     begin
         if rising_edge(clk) then
             d_out.write_reg_data <= oresult;
-            d_out.write_cr_data <= cr_data & '0' & x"0000000";
             if count = "1000000" then
                 d_out.valid <= '1';
                 d_out.write_reg_enable <= '1';
-                d_out.write_cr_enable <= rc;
             else
                 d_out.valid <= '0';
                 d_out.write_reg_enable <= '0';
-                d_out.write_cr_enable <= '0';
             end if;
         end if;
     end process;
index fdc8da54c11bf4ff9d64712fbe38d7b7ffe2c429..5f809bb44ee1bc896bee0372693b8c5d9b06804b 100644 (file)
--- a/divider_tb.vhdl
+++ b/divider_tb.vhdl
@@ -68,7 +68,7 @@ begin
         assert d2.write_reg_enable = '1';
         assert d2.write_reg_nr = "10001";
         assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
-        assert d2.write_cr_enable = '0';
+        assert d2.rc = '0';
 
         wait for clk_period;
         assert d2.valid = '0' report "valid";
@@ -92,9 +92,7 @@ begin
         assert d2.write_reg_enable = '1';
         assert d2.write_reg_nr = "10001";
         assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
-        assert d2.write_cr_enable = '1';
-        assert d2.write_cr_mask = "10000000";
-        assert d2.write_cr_data = x"40000000" report "cr data is " & to_hstring(d2.write_cr_data);
+        assert d2.rc = '1';
 
         wait for clk_period;
         assert d2.valid = '0';
@@ -129,8 +127,6 @@ begin
                     end if;
                     assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
                         report "bad divd expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for divd";
                 end loop;
             end loop;
         end loop;
@@ -165,8 +161,6 @@ begin
                     end if;
                     assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
                         report "bad divdu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for divdu";
                 end loop;
             end loop;
         end loop;
@@ -207,8 +201,6 @@ begin
                     end if;
                     assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
                         report "bad divde expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for divde";
                 end loop;
             end loop;
         end loop;
@@ -246,8 +238,6 @@ begin
                     end if;
                     assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
                         report "bad divdeu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for divdeu";
                 end loop;
             end loop;
         end loop;
@@ -284,8 +274,6 @@ begin
                     end if;
                     assert behave_rt = d2.write_reg_data
                         report "bad divw expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for divw";
                 end loop;
             end loop;
         end loop;
@@ -322,8 +310,6 @@ begin
                     end if;
                     assert behave_rt = d2.write_reg_data
                         report "bad divwu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for divwu";
                 end loop;
             end loop;
         end loop;
@@ -363,8 +349,6 @@ begin
                         end if;
                         assert behave_rt = d2.write_reg_data
                             report "bad divwe expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
-                        assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                            report "bad CR setting for divwe";
                     end if;
                 end loop;
             end loop;
@@ -402,8 +386,6 @@ begin
                     end if;
                     assert behave_rt = d2.write_reg_data
                         report "bad divweu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for divweu";
                 end loop;
             end loop;
         end loop;
@@ -441,8 +423,6 @@ begin
                     end if;
                     assert behave_rt = d2.write_reg_data
                         report "bad modsd expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for modsd";
                 end loop;
             end loop;
         end loop;
@@ -480,8 +460,6 @@ begin
                     end if;
                     assert behave_rt = d2.write_reg_data
                         report "bad modud expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for modud";
                 end loop;
             end loop;
         end loop;
@@ -524,8 +502,6 @@ begin
                     end if;
                     assert behave_rt = d2.write_reg_data
                         report "bad modsw expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for modsw";
                 end loop;
             end loop;
         end loop;
@@ -563,8 +539,6 @@ begin
                     end if;
                     assert behave_rt(31 downto 0) = d2.write_reg_data(31 downto 0)
                         report "bad moduw expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
-                    assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
-                        report "bad CR setting for moduw";
                 end loop;
             end loop;
         end loop;
index 9fdb1ddc5ee749a840db91dd62b45b822307e395..de55310a41e3203549910db19811a11736b6a034 100644 (file)
--- a/execute2.vhdl
+++ b/execute2.vhdl
@@ -4,8 +4,6 @@ use ieee.numeric_std.all;
 
 library work;
 use work.common.all;
-use work.crhelpers.all;
-use work.ppc_fx_insns.all;
 
 -- 2 cycle ALU
 -- We handle rc form instructions here
@@ -41,12 +39,7 @@ begin
         v.write_cr_enable := e_in.write_cr_enable;
         v.write_cr_mask := e_in.write_cr_mask;
         v.write_cr_data := e_in.write_cr_data;
-
-        if e_in.valid = '1' and e_in.rc = '1' then
-            v.write_cr_enable := '1';
-            v.write_cr_mask := num_to_fxm(0);
-            v.write_cr_data := ppc_cmpi('1', e_in.write_data, x"0000") & x"0000000";
-        end if;
+        v.rc := e_in.rc;
 
         -- Update registers
         rin <= v;
index 17ef7e1c389701b34d15727f58c6f2bf59b22c48..cd7061c55a67822b2c57f48aca65a5a9a20d7bc4 100644 (file)
--- a/loadstore2.vhdl
+++ b/loadstore2.vhdl
@@ -26,9 +26,6 @@ architecture behave of loadstore2 is
     signal l_saved : Loadstore1ToLoadstore2Type;
     signal w_tmp   : Loadstore2ToWritebackType;
     signal m_tmp   : wishbone_master_out;
-    signal read_data : std_ulogic_vector(63 downto 0);
-    signal read_data_shift : std_ulogic_vector(2 downto 0);
-    signal sign_extend_byte_reverse: std_ulogic_vector(1 downto 0);
     signal dlength : std_ulogic_vector(3 downto 0);
 
     type state_t is (IDLE, WAITING_FOR_READ_ACK, WAITING_FOR_WRITE_ACK);
@@ -61,37 +58,6 @@ architecture behave of loadstore2 is
     end function wishbone_data_sel;
 begin
 
-    loadstore2_1: process(all)
-        variable tmp     : std_ulogic_vector(63 downto 0);
-        variable data    : std_ulogic_vector(63 downto 0);
-    begin
-        tmp := std_logic_vector(shift_right(unsigned(read_data), to_integer(unsigned(read_data_shift)) * 8));
-        data := (others => '0');
-        case to_integer(unsigned(dlength)) is
-            when 0 =>
-            when 1 =>
-                data(7 downto 0) := tmp(7 downto 0);
-            when 2 =>
-                data(15 downto 0) := tmp(15 downto 0);
-            when 4 =>
-                data(31 downto 0) := tmp(31 downto 0);
-            when 8 =>
-                data(63 downto 0) := tmp(63 downto 0);
-            when others =>
-                assert false report "invalid length" severity failure;
-                data(63 downto 0) := tmp(63 downto 0);
-        end case;
-
-        case sign_extend_byte_reverse is
-            when "10" =>
-                w_tmp.write_data <= sign_extend(data, to_integer(unsigned(l_saved.length)));
-            when "01" =>
-                w_tmp.write_data <= byte_reverse(data, to_integer(unsigned(l_saved.length)));
-            when others =>
-                w_tmp.write_data <= data;
-        end case;
-    end process;
-
     w_out <= w_tmp;
     m_out <= m_tmp;
 
@@ -102,11 +68,13 @@ begin
             w_tmp.valid <= '0';
             w_tmp.write_enable <= '0';
             w_tmp.write_reg <= (others => '0');
+            w_tmp.write_len <= "1000";
+            w_tmp.write_shift <= "000";
+            w_tmp.sign_extend <= '0';
+            w_tmp.byte_reverse <= '0';
+            w_tmp.second_word <= '0';
 
             l_saved <= l_saved;
-            read_data_shift <= "000";
-            sign_extend_byte_reverse <= "00";
-            dlength <= "1000";
 
             case_0: case state is
                 when IDLE =>
@@ -131,7 +99,7 @@ begin
                             if l_in.update = '1' then
                                 w_tmp.write_enable <= '1';
                                 w_tmp.write_reg <= l_in.update_reg;
-                                read_data <= l_in.addr;
+                                w_tmp.write_data <= l_in.addr;
                             end if;
 
                             state <= WAITING_FOR_READ_ACK;
@@ -148,15 +116,15 @@ begin
 
                 when WAITING_FOR_READ_ACK =>
                     if m_in.ack = '1' then
-                        read_data <= m_in.dat;
-                        read_data_shift <= l_saved.addr(2 downto 0);
-                        dlength <= l_saved.length;
-                        sign_extend_byte_reverse <= l_saved.sign_extend & l_saved.byte_reverse;
-
                         -- write data to register file
                         w_tmp.valid <= '1';
                         w_tmp.write_enable <= '1';
+                        w_tmp.write_data <= m_in.dat;
                         w_tmp.write_reg <= l_saved.write_reg;
+                        w_tmp.write_len <= l_saved.length;
+                        w_tmp.write_shift <= l_saved.addr(2 downto 0);
+                        w_tmp.sign_extend <= l_saved.sign_extend;
+                        w_tmp.byte_reverse <= l_saved.byte_reverse;
 
                         m_tmp <= wishbone_master_out_init;
                         state <= IDLE;
@@ -168,7 +136,7 @@ begin
                         if l_saved.update = '1' then
                             w_tmp.write_enable <= '1';
                             w_tmp.write_reg <= l_saved.update_reg;
-                            read_data <= l_saved.addr;
+                            w_tmp.write_data <= l_saved.addr;
                         end if;
 
                         m_tmp <= wishbone_master_out_init;
index 71aceca8ad553dde48969382fee6c07727621d03..94fa7920b63fbd5ada6b8324069bd139f6759a67 100644 (file)
--- a/multiply.vhdl
+++ b/multiply.vhdl
@@ -5,8 +5,6 @@ use ieee.numeric_std.all;
 library work;
 use work.common.all;
 use work.decode_types.all;
-use work.ppc_fx_insns.all;
-use work.crhelpers.all;
 
 entity multiply is
     generic (
@@ -88,12 +86,7 @@ begin
         if v.multiply_pipeline(PIPELINE_DEPTH-1).valid = '1' then
             m_out.valid <= '1';
             m_out.write_reg_enable <= '1';
-
-            if v.multiply_pipeline(PIPELINE_DEPTH-1).rc = '1' then
-                m_out.write_cr_enable <= '1';
-                m_out.write_cr_mask <= num_to_fxm(0);
-                m_out.write_cr_data <= ppc_cmpi('1', d2, x"0000") & x"0000000";
-            end if;
+            m_out.rc <= v.multiply_pipeline(PIPELINE_DEPTH-1).rc;
         end if;
 
         rin <= v;
index 95c3199e0764a724e430a31e2b20770a0bacd4f0..48f83abf87d1c2c3d4303aa7809d09687ee77cea 100644 (file)
--- a/multiply_tb.vhdl
+++ b/multiply_tb.vhdl
@@ -61,7 +61,7 @@ begin
         assert m2.write_reg_enable = '1';
         assert m2.write_reg_nr = "10001";
         assert m2.write_reg_data = x"0000000001111000";
-        assert m2.write_cr_enable = '0';
+        assert m2.rc = '0';
 
         wait for clk_period;
         assert m2.valid = '0';
@@ -79,8 +79,7 @@ begin
         assert m2.write_reg_enable = '1';
         assert m2.write_reg_nr = "10001";
         assert m2.write_reg_data = x"0000000001111000";
-        assert m2.write_cr_enable = '1';
-        assert m2.write_cr_data = x"40000000";
+        assert m2.rc = '1';
 
         -- test mulld
         mulld_loop : for i in 0 to 1000 loop
index e2449601963f3d33a13dd930ac3d6c3c48d844eb..ba8897031d7fbce0aa5a8205d287111ef66b7ee0 100644 (file)
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -4,6 +4,7 @@ use ieee.numeric_std.all;
 
 library work;
 use work.common.all;
+use work.crhelpers.all;
 
 entity writeback is
     port (
@@ -22,12 +23,44 @@ entity writeback is
 end entity writeback;
 
 architecture behaviour of writeback is
+    subtype byte_index_t is unsigned(2 downto 0);
+    type permutation_t is array(0 to 7) of byte_index_t;
+    subtype byte_trim_t is std_ulogic_vector(1 downto 0);
+    type trim_ctl_t is array(0 to 7) of byte_trim_t;
+    type byte_sel_t is array(0 to 7) of std_ulogic;
+
+    signal data_len : unsigned(3 downto 0);
+    signal data_in : std_ulogic_vector(63 downto 0);
+    signal data_permuted : std_ulogic_vector(63 downto 0);
+    signal data_trimmed : std_ulogic_vector(63 downto 0);
+    signal data_latched : std_ulogic_vector(63 downto 0);
+    signal perm : permutation_t;
+    signal use_second : byte_sel_t;
+    signal byte_offset : unsigned(2 downto 0);
+    signal brev_lenm1 : unsigned(2 downto 0);
+    signal trim_ctl : trim_ctl_t;
+    signal rc : std_ulogic;
+    signal partial_write : std_ulogic;
+    signal sign_extend : std_ulogic;
+    signal negative : std_ulogic;
+    signal second_word : std_ulogic;
 begin
+    writeback_0: process(clk)
+    begin
+        if rising_edge(clk) then
+            if partial_write = '1' then
+                data_latched <= data_permuted;
+            end if;
+        end if;
+    end process;
+
     writeback_1: process(all)
         variable x : std_ulogic_vector(0 downto 0);
         variable y : std_ulogic_vector(0 downto 0);
         variable z : std_ulogic_vector(0 downto 0);
         variable w : std_ulogic_vector(0 downto 0);
+        variable j : integer;
+        variable k : unsigned(3 downto 0);
     begin
         x := "" & e_in.valid;
         y := "" & l_in.valid;
@@ -41,10 +74,11 @@ begin
         w := "" & d_in.write_reg_enable;
         assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure;
 
-        x := "" & e_in.write_cr_enable;
-        y := "" & m_in.write_cr_enable;
-        z := "" & d_in.write_cr_enable;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
+        w := "" & e_in.write_cr_enable;
+        x := "" & (e_in.write_enable and e_in.rc);
+        y := "" & (m_in.valid and m_in.rc);
+        z := "" & (d_in.valid and d_in.rc);
+        assert (to_integer(unsigned(w)) + to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
 
         w_out <= WritebackToRegisterFileInit;
         c_out <= WritebackToCrFileInit;
@@ -54,10 +88,19 @@ begin
             complete_out <= '1';
         end if;
 
+        rc <= '0';
+        brev_lenm1 <= "000";
+        byte_offset <= "000";
+        data_len <= x"8";
+        partial_write <= '0';
+        sign_extend <= '0';
+        second_word <= '0';
+
         if e_in.write_enable = '1' then
             w_out.write_reg <= e_in.write_reg;
-            w_out.write_data <= e_in.write_data;
+            data_in <= e_in.write_data;
             w_out.write_enable <= '1';
+            rc <= e_in.rc;
         end if;
 
         if e_in.write_cr_enable = '1' then
@@ -68,32 +111,89 @@ begin
 
         if l_in.write_enable = '1' then
             w_out.write_reg <= l_in.write_reg;
-            w_out.write_data <= l_in.write_data;
+            data_in <= l_in.write_data;
+            data_len <= unsigned(l_in.write_len);
+            byte_offset <= unsigned(l_in.write_shift);
+            sign_extend <= l_in.sign_extend;
+            if l_in.byte_reverse = '1' then
+                brev_lenm1 <= unsigned(l_in.write_len(2 downto 0)) - 1;
+            end if;
             w_out.write_enable <= '1';
+            second_word <= l_in.second_word;
+            if l_in.valid = '0' and (data_len + byte_offset > 8) then
+                partial_write <= '1';
+            end if;
         end if;
 
         if m_in.write_reg_enable = '1' then
             w_out.write_enable <= '1';
             w_out.write_reg <= m_in.write_reg_nr;
-            w_out.write_data <= m_in.write_reg_data;
-        end if;
-
-        if m_in.write_cr_enable = '1' then
-            c_out.write_cr_enable <= '1';
-            c_out.write_cr_mask <= m_in.write_cr_mask;
-            c_out.write_cr_data <= m_in.write_cr_data;
+            data_in <= m_in.write_reg_data;
+            rc <= m_in.rc;
         end if;
 
         if d_in.write_reg_enable = '1' then
             w_out.write_enable <= '1';
             w_out.write_reg <= d_in.write_reg_nr;
-            w_out.write_data <= d_in.write_reg_data;
+            data_in <= d_in.write_reg_data;
+            rc <= d_in.rc;
         end if;
 
-        if d_in.write_cr_enable = '1' then
+        -- shift and byte-reverse data bytes
+        for i in 0 to 7 loop
+            k := ('0' & (to_unsigned(i, 3) xor brev_lenm1)) + ('0' & byte_offset);
+            perm(i) <= k(2 downto 0);
+            use_second(i) <= k(3);
+        end loop;
+        for i in 0 to 7 loop
+            j := to_integer(perm(i)) * 8;
+            data_permuted(i * 8 + 7 downto i * 8) <= data_in(j + 7 downto j);
+        end loop;
+
+        -- If the data can arrive split over two cycles, this will be correct
+        -- provided we don't have both sign extension and byte reversal.
+        negative <= (data_len(2) and data_permuted(31)) or (data_len(1) and data_permuted(15)) or
+                    (data_len(0) and data_permuted(7));
+
+        -- trim and sign-extend
+        for i in 0 to 7 loop
+            if i < to_integer(data_len) then
+                if second_word = '1' then
+                    trim_ctl(i) <= '1' & not use_second(i);
+                else
+                    trim_ctl(i) <= not use_second(i) & '0';
+                end if;
+            else
+                trim_ctl(i) <= '0' & (negative and sign_extend);
+            end if;
+        end loop;
+        for i in 0 to 7 loop
+            case trim_ctl(i) is
+                when "11" =>
+                    data_trimmed(i * 8 + 7 downto i * 8) <= data_latched(i * 8 + 7 downto i * 8);
+                when "10" =>
+                    data_trimmed(i * 8 + 7 downto i * 8) <= data_permuted(i * 8 + 7 downto i * 8);
+                when "01" =>
+                    data_trimmed(i * 8 + 7 downto i * 8) <= x"FF";
+                when others =>
+                    data_trimmed(i * 8 + 7 downto i * 8) <= x"00";
+            end case;
+        end loop;
+
+        -- deliver to regfile
+        w_out.write_data <= data_trimmed;
+
+        -- test value against 0 and set CR0 if requested
+        if rc = '1' then
             c_out.write_cr_enable <= '1';
-            c_out.write_cr_mask <= d_in.write_cr_mask;
-            c_out.write_cr_data <= d_in.write_cr_data;
+            c_out.write_cr_mask <= num_to_fxm(0);
+            if data_trimmed(63) = '1' then
+                c_out.write_cr_data <= x"80000000";
+            elsif or (data_trimmed(62 downto 0)) = '1' then
+                c_out.write_cr_data <= x"40000000";
+            else
+                c_out.write_cr_data <= x"20000000";
+            end if;
         end if;
     end process;
 end;