From: Paul Mackerras <paulus@ozlabs.org>
Date: Sun, 22 Sep 2019 07:24:14 +0000 (+1000)
Subject: Add a divider unit and a testbench for it
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d5bc6c882499e6644f9fa75fe4ddbeb0b5c49600;p=microwatt.git

Add a divider unit and a testbench for it

This adds a divider unit, connected to the core in much the same way
that the multiplier unit is connected.  The division algorithm is
very simple-minded, taking 64 clock cycles for any division (even
32-bit division instructions).

The decoding is simplified by making use of regularities in the
instruction encoding for div* and mod* instructions.  Instead of
having PPC_* encodings from the first-stage decoder for each of the
different div* and mod* instructions, we now just have PPC_DIV and
PPC_MOD, and the inputs to the divider that indicate what sort of
division operation to do are derived from instruction word bits.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---

diff --git a/Makefile b/Makefile
index 62e9644..318866d 100644
--- a/Makefile
+++ b/Makefile
@@ -2,7 +2,7 @@ GHDL=ghdl
 GHDLFLAGS=--std=08
 CFLAGS=-O2 -Wall
 
-all = core_tb simple_ram_behavioural_tb soc_reset_tb icache_tb multiply_tb
+all = core_tb simple_ram_behavioural_tb soc_reset_tb icache_tb multiply_tb divider_tb
 # XXX
 # loadstore_tb fetch_tb
 
@@ -13,7 +13,7 @@ all: $(all)
 
 common.o: decode_types.o
 core_tb.o: common.o core.o soc.o
-core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o execute2.o loadstore1.o loadstore2.o multiply.o writeback.o
+core.o: common.o wishbone_types.o fetch1.o fetch2.o icache.o decode1.o decode2.o register_file.o cr_file.o execute1.o execute2.o loadstore1.o loadstore2.o multiply.o divider.o writeback.o
 cr_file.o: common.o
 crhelpers.o: common.o
 decode1.o: common.o decode_types.o
@@ -33,6 +33,8 @@ loadstore1.o: common.o
 loadstore2.o: common.o helpers.o wishbone_types.o
 multiply_tb.o: common.o glibc_random.o ppc_fx_insns.o multiply.o
 multiply.o: common.o decode_types.o ppc_fx_insns.o crhelpers.o
+divider_tb.o: common.o glibc_random.o ppc_fx_insns.o divider.o
+divider.o: common.o decode_types.o ppc_fx_insns.o crhelpers.o
 ppc_fx_insns.o: helpers.o
 register_file.o: common.o
 sim_console.o:
@@ -64,6 +66,9 @@ loadstore_tb: loadstore_tb.o
 multiply_tb: multiply_tb.o
 	$(GHDL) -e $(GHDLFLAGS) $@
 
+divider_tb: divider_tb.o
+	$(GHDL) -e $(GHDLFLAGS) $@
+
 simple_ram_tb: simple_ram_tb.o
 	$(GHDL) -e $(GHDLFLAGS) $@
 
diff --git a/common.vhdl b/common.vhdl
index bf383ca..4f19017 100644
--- a/common.vhdl
+++ b/common.vhdl
@@ -73,6 +73,19 @@ package common is
 	end record;
 	constant Decode2ToMultiplyInit : Decode2ToMultiplyType := (valid => '0', insn_type => OP_ILLEGAL, rc => '0', others => (others => '0'));
 
+        type Decode2ToDividerType is record  -- divide/modulus request sent from decode2 to the divider
+                valid: std_ulogic;  -- '1' makes the divider latch this request and start
+		write_reg: std_ulogic_vector(4 downto 0);  -- destination register number
+                dividend: std_ulogic_vector(63 downto 0);  -- decode2 passes the absolute value for signed operations
+                divisor: std_ulogic_vector(63 downto 0);  -- decode2 passes the absolute value for signed operations
+                neg_result: std_ulogic;  -- '1': the divider negates the result before writing it back
+                is_32bit: std_ulogic;  -- '1' for the 32-bit instruction forms
+                is_extended: std_ulogic;  -- '1': the divider shifts the dividend left by 64 bits
+                is_modulus: std_ulogic;  -- '1': return the remainder instead of the quotient
+                rc: std_ulogic;  -- '1': also write a condition register field
+        end record;
+        constant Decode2ToDividerInit: Decode2ToDividerType := (valid => '0', neg_result => '0', is_32bit => '0', is_extended => '0', is_modulus => '0', rc => '0', others => (others => '0'));  -- idle request, every field zero
+
 	type Decode2ToRegisterFileType is record
 		read1_enable : std_ulogic;
 		read1_reg : std_ulogic_vector(4 downto 0);
@@ -173,6 +186,18 @@ package common is
 	end record;
 	constant MultiplyToWritebackInit : MultiplyToWritebackType := (valid => '0', write_reg_enable => '0', write_cr_enable => '0', others => (others => '0'));
 
+	type DividerToWritebackType is record  -- divider result presented to writeback
+		valid: std_ulogic;  -- '1' in the cycle the result is available
+
+		write_reg_enable : std_ulogic;  -- set together with valid by the divider
+		write_reg_nr: std_ulogic_vector(4 downto 0);  -- copy of the request's write_reg
+		write_reg_data: std_ulogic_vector(63 downto 0);  -- quotient or remainder, negated if the request asked for it
+		write_cr_enable: std_ulogic;  -- set when the request had rc = '1'
+		write_cr_mask: std_ulogic_vector(7 downto 0);  -- the divider drives num_to_fxm(0)
+		write_cr_data: std_ulogic_vector(31 downto 0);  -- the divider puts its 4-bit compare-with-zero code in bits 31..28
+	end record;
+	constant DividerToWritebackInit : DividerToWritebackType := (valid => '0', write_reg_enable => '0', write_cr_enable => '0', others => (others => '0'));  -- nothing valid, every field zero
+
 	type WritebackToRegisterFileType is record
 		write_reg : std_ulogic_vector(4 downto 0);
 		write_data : std_ulogic_vector(63 downto 0);
diff --git a/core.vhdl b/core.vhdl
index d34bf71..a52ad6d 100644
--- a/core.vhdl
+++ b/core.vhdl
@@ -63,6 +63,10 @@ architecture behave of core is
     signal decode2_to_multiply: Decode2ToMultiplyType;
     signal multiply_to_writeback: MultiplyToWritebackType;
 
+    -- divider signals
+    signal decode2_to_divider: Decode2ToDividerType;
+    signal divider_to_writeback: DividerToWritebackType;
+
     -- local signals
     signal fetch1_stall_in : std_ulogic;
     signal fetch2_stall_in : std_ulogic;
@@ -146,6 +150,7 @@ begin
             e_out => decode2_to_execute1,
             l_out => decode2_to_loadstore1,
             m_out => decode2_to_multiply,
+            d_out => decode2_to_divider,
             r_in => register_file_to_decode2,
             r_out => decode2_to_register_file,
             c_in => cr_file_to_decode2,
@@ -211,12 +216,21 @@ begin
             m_out => multiply_to_writeback
             );
 
+    divider_0: entity work.divider
+        port map (
+            clk => clk,
+            rst => rst,
+            d_in => decode2_to_divider,
+            d_out => divider_to_writeback
+            );
+
     writeback_0: entity work.writeback
         port map (
             clk => clk,
             e_in => execute2_to_writeback,
             l_in => loadstore2_to_writeback,
             m_in => multiply_to_writeback,
+            d_in => divider_to_writeback,
             w_out => writeback_to_register_file,
             c_out => writeback_to_cr_file,
             complete_out => complete
diff --git a/decode1.vhdl b/decode1.vhdl
index 6e8a521..49e61be 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -79,14 +79,7 @@ architecture behaviour of decode1 is
 		PPC_DCBT       =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'),
 		PPC_DCBTST     =>       (ALU,    OP_NOP,       NONE,       NONE,        NONE, NONE, NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'),
 		--PPC_DCBZ
-		PPC_DIVD       =>       (ALU,    OP_DIVD,      RA,         RB,          NONE, RT,   NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'),
-		--PPC_DIVDE
-		--PPC_DIVDEU
-		PPC_DIVDU      =>       (ALU,    OP_DIVDU,     RA,         RB,          NONE, RT,   NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'),
-		PPC_DIVW       =>       (ALU,    OP_DIVW,      RA,         RB,          NONE, RT,   NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'),
-		--PPC_DIVWE
-		--PPC_DIVWEU
-		PPC_DIVWU      =>       (ALU,    OP_DIVWU,     RA,         RB,          NONE, RT,   NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'),
+		PPC_DIV        =>       (DIV,    OP_DIV,       RA,         RB,          NONE, RT,   NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'),
 		PPC_EQV        =>       (ALU,    OP_EQV,       RS,         RB,          NONE, RA,   NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'),
 		PPC_EXTSB      =>       (ALU,    OP_EXTSB,     RS,         NONE,        NONE, RA,   NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'),
 		PPC_EXTSH      =>       (ALU,    OP_EXTSH,     RS,         NONE,        NONE, RA,   NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'),
@@ -141,10 +134,7 @@ architecture behaviour of decode1 is
 		PPC_MTCTR      =>       (ALU,    OP_MTCTR,     RS,         NONE,        NONE, NONE, NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'),
 		PPC_MTLR       =>       (ALU,    OP_MTLR,      RS,         NONE,        NONE, NONE, NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'),
 		--PPC_MFSPR
-		--PPC_MODSD
-		--PPC_MODSW
-		--PPC_MODUD
-		--PPC_MODUW
+		PPC_MOD        =>       (DIV,    OP_MOD,       RA,         RB,          NONE, RT,   NONE, NONE, NONE, '0', '0', '0', '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '1'),
 		PPC_MTCRF      =>       (ALU,    OP_MTCRF,     RS,         NONE,        NONE, NONE, FXM,  NONE, NONE, '0', '1', '0', '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'),
 		PPC_MTOCRF     =>       (ALU,    OP_MTOCRF,    RS,         NONE,        NONE, NONE, FXM,  NONE, NONE, '0', '1', '0', '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '1'),
 		--PPC_MTSPR
@@ -404,30 +394,9 @@ begin
 			elsif std_match(f_in.insn, "011111---------------1111110110-") then
 				report "PPC_dcbz";
 				ppc_insn := PPC_DCBZ;
-			elsif std_match(f_in.insn, "011111---------------0111101001-") then
-				report "PPC_divd";
-				ppc_insn := PPC_DIVD;
-			elsif std_match(f_in.insn, "011111---------------0110101001-") then
-				report "PPC_divde";
-				ppc_insn := PPC_DIVDE;
-			elsif std_match(f_in.insn, "011111---------------0110001001-") then
-				report "PPC_divdeu";
-				ppc_insn := PPC_DIVDEU;
-			elsif std_match(f_in.insn, "011111---------------0111001001-") then
-				report "PPC_divdu";
-				ppc_insn := PPC_DIVDU;
-			elsif std_match(f_in.insn, "011111---------------0111101011-") then
-				report "PPC_divw";
-				ppc_insn := PPC_DIVW;
-			elsif std_match(f_in.insn, "011111---------------0110101011-") then
-				report "PPC_divwe";
-				ppc_insn := PPC_DIVWE;
-			elsif std_match(f_in.insn, "011111---------------0110001011-") then
-				report "PPC_divweu";
-				ppc_insn := PPC_DIVWEU;
-			elsif std_match(f_in.insn, "011111---------------0111001011-") then
-				report "PPC_divwu";
-				ppc_insn := PPC_DIVWU;
+			elsif std_match(f_in.insn, "011111----------------11--010-1-") then
+				report "PPC_div";
+				ppc_insn := PPC_DIV;
 			elsif std_match(f_in.insn, "011111---------------0100011100-") then
 				report "PPC_eqv";
 				ppc_insn := PPC_EQV;
@@ -588,18 +557,9 @@ begin
 			elsif std_match(f_in.insn, "011111---------------0101010011-") then
 				report "PPC_mfspr";
 				ppc_insn := PPC_MFSPR;
-			elsif std_match(f_in.insn, "011111---------------1100001001-") then
-				report "PPC_modsd";
-				ppc_insn := PPC_MODSD;
-			elsif std_match(f_in.insn, "011111---------------1100001011-") then
-				report "PPC_modsw";
-				ppc_insn := PPC_MODSW;
-			elsif std_match(f_in.insn, "011111---------------0100001001-") then
-				report "PPC_modud";
-				ppc_insn := PPC_MODUD;
-			elsif std_match(f_in.insn, "011111---------------0100001011-") then
-				report "PPC_moduw";
-				ppc_insn := PPC_MODUW;
+			elsif std_match(f_in.insn, "011111----------------1000010-1-") then
+				report "PPC_mod";
+				ppc_insn := PPC_MOD;
 			elsif std_match(f_in.insn, "011111-----0---------0010010000-") then
 				report "PPC_mtcrf";
 				ppc_insn := PPC_MTCRF;
diff --git a/decode2.vhdl b/decode2.vhdl
index 15dae5d..7a00ff2 100644
--- a/decode2.vhdl
+++ b/decode2.vhdl
@@ -22,6 +22,7 @@ entity decode2 is
 
 		e_out : out Decode2ToExecute1Type;
 		m_out : out Decode2ToMultiplyType;
+                d_out : out Decode2ToDividerType;
 		l_out : out Decode2ToLoadstore1Type;
 
 		r_in  : in RegisterFileToDecode2Type;
@@ -43,6 +44,7 @@ architecture behaviour of decode2 is
 	type reg_type is record
 		e : Decode2ToExecute1Type;
 		m : Decode2ToMultiplyType;
+                d : Decode2ToDividerType;
 		l : Decode2ToLoadstore1Type;
 	end record;
 
@@ -190,7 +192,7 @@ begin
 		if rising_edge(clk) then
 			assert r_int.outstanding <= 1 report "Outstanding bad " & integer'image(r_int.outstanding) severity failure;
 
-			if rin.e.valid = '1' or rin.l.valid = '1' or rin.m.valid = '1' then
+			if rin.e.valid = '1' or rin.l.valid = '1' or rin.m.valid = '1' or rin.d.valid = '1' then
 				report "execute " & to_hstring(rin.e.nia);
 			end if;
 			r <= rin;
@@ -217,9 +219,13 @@ begin
 		variable v_int : reg_internal_type;
 		variable mul_a : std_ulogic_vector(63 downto 0);
 		variable mul_b : std_ulogic_vector(63 downto 0);
+		variable dividend : std_ulogic_vector(63 downto 0);
+		variable divisor  : std_ulogic_vector(63 downto 0);
+                variable absdend  : std_ulogic_vector(31 downto 0);
 		variable decoded_reg_a : decode_input_reg_t;
 		variable decoded_reg_b : decode_input_reg_t;
 		variable decoded_reg_c : decode_input_reg_t;
+                variable signed_division: std_ulogic;
 		variable is_valid : std_ulogic;
 	begin
 		v := r;
@@ -228,6 +234,7 @@ begin
 		v.e := Decode2ToExecute1Init;
 		v.l := Decode2ToLoadStore1Init;
 		v.m := Decode2ToMultiplyInit;
+                v.d := Decode2ToDividerInit;
 
 		mul_a := (others => '0');
 		mul_b := (others => '0');
@@ -290,6 +297,73 @@ begin
 			end if;
 		end if;
 
+                -- divide unit
+                -- PPC divide and modulus instruction words have these bits in
+                -- the bottom 11 bits: o1dns 010t1 r
+                -- where o = OE for div instrs, signedness for mod instrs
+                --       d = 1 for div*, 0 for mod*
+                --       n = 1 for normal, 0 for extended (dividend << 32/64)
+                --       s = 1 for signed, 0 for unsigned (for div*)
+                --       t = 1 for 32-bit, 0 for 64-bit
+                --       r = RC bit (record condition code)
+                -- For signed division/modulus, we take absolute values and
+                -- tell the divider what the sign of the result should be,
+                -- which is the dividend sign for modulus, and the XOR of
+                -- the dividend and divisor signs for division.
+                dividend := decoded_reg_a.data;
+                divisor := decoded_reg_b.data;
+		v.d.write_reg := decode_output_reg(d_in.decode.output_reg_a, d_in.insn);
+                v.d.is_modulus := not d_in.insn(8);
+                if d_in.insn(8) = '1' then
+                    signed_division := d_in.insn(6);
+                else
+                    signed_division := d_in.insn(10);
+                end if;
+                if d_in.insn(2) = '0' then
+                    -- 64-bit forms
+                        v.d.is_32bit := '0';
+                        if d_in.insn(8) = '1' and d_in.insn(7) = '0' then
+                                v.d.is_extended := '1';
+                        end if;
+                        if signed_division = '1' and dividend(63) = '1' then
+                                v.d.neg_result := '1';
+                                v.d.dividend := std_ulogic_vector(- signed(dividend));
+                        else
+                                v.d.dividend := dividend;
+                        end if;
+                        if signed_division = '1' and divisor(63) = '1' then
+                                if d_in.insn(8) = '1' then
+                                        v.d.neg_result := not v.d.neg_result;
+                                end if;
+                                v.d.divisor := std_ulogic_vector(- signed(divisor));
+                        else
+                                v.d.divisor := divisor;
+                        end if;
+                else
+                        -- 32-bit forms
+                        v.d.is_32bit := '1';
+                        if signed_division = '1' and dividend(31) = '1' then
+                                v.d.neg_result := '1';
+                                absdend := std_ulogic_vector(- signed(dividend(31 downto 0)));
+                        else
+                                absdend := dividend(31 downto 0);
+                        end if;
+                        if d_in.insn(8) = '1' and d_in.insn(7) = '0' then   -- extended forms
+                                v.d.dividend := absdend & x"00000000";
+                        else
+                                v.d.dividend := x"00000000" & absdend;
+                        end if;
+                        if signed_division = '1' and divisor(31) = '1' then
+                                if d_in.insn(8) = '1' then
+                                        v.d.neg_result := not v.d.neg_result;
+                                end if;
+                                v.d.divisor := x"00000000" & std_ulogic_vector(- signed(divisor(31 downto 0)));
+                        else
+                                v.d.divisor := x"00000000" & divisor(31 downto 0);
+                        end if;
+                end if;
+                v.d.rc := decode_rc(d_in.decode.rc, d_in.insn);
+
 		-- load/store unit
 		v.l.update_reg := decoded_reg_a.reg;
 		v.l.addr1 := decoded_reg_a.data;
@@ -363,6 +437,7 @@ begin
 
 		v.e.valid := '0';
 		v.m.valid := '0';
+                v.d.valid := '0';
 		v.l.valid := '0';
 		case d_in.decode.unit is
 		when ALU =>
@@ -371,6 +446,8 @@ begin
 			v.l.valid := is_valid;
 		when MUL =>
 			v.m.valid := is_valid;
+                when DIV =>
+                        v.d.valid := is_valid;
 		when NONE =>
 			v.e.valid := is_valid;
 			v.e.insn_type := OP_ILLEGAL;
@@ -379,11 +456,12 @@ begin
 		if flush_in = '1' then
 			v.e.valid := '0';
 			v.m.valid := '0';
+                        v.d.valid := '0';
 			v.l.valid := '0';
 		end if;
 
 		-- track outstanding instructions
-		if v.e.valid = '1' or v.l.valid = '1' or v.m.valid = '1' then
+		if v.e.valid = '1' or v.l.valid = '1' or v.m.valid = '1' or v.d.valid = '1' then
 			v_int.outstanding := v_int.outstanding + 1;
 		end if;
 
@@ -393,6 +471,7 @@ begin
 			v.e := Decode2ToExecute1Init;
 			v.l := Decode2ToLoadStore1Init;
 			v.m := Decode2ToMultiplyInit;
+                        v.d := Decode2ToDividerInit;
 		end if;
 
 		-- Update registers
@@ -403,5 +482,6 @@ begin
 		e_out <= r.e;
 		l_out <= r.l;
 		m_out <= r.m;
+                d_out <= r.d;
 	end process;
 end architecture behaviour;
diff --git a/decode_types.vhdl b/decode_types.vhdl
index 9378303..12f9a37 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -11,9 +11,8 @@ package decode_types is
 		PPC_CMPRB, PPC_CNTLZD, PPC_CNTLZW, PPC_CNTTZD, PPC_CNTTZW,
 		PPC_CRAND, PPC_CRANDC, PPC_CREQV, PPC_CRNAND, PPC_CRNOR,
 		PPC_CROR, PPC_CRORC, PPC_CRXOR, PPC_DARN, PPC_DCBF, PPC_DCBST,
-		PPC_DCBT, PPC_DCBTST, PPC_DCBZ, PPC_DIVD, PPC_DIVDE,
-		PPC_DIVDEU, PPC_DIVDU, PPC_DIVW, PPC_DIVWE, PPC_DIVWEU,
-		PPC_DIVWU, PPC_EQV, PPC_EXTSB, PPC_EXTSH, PPC_EXTSW,
+		PPC_DCBT, PPC_DCBTST, PPC_DCBZ, PPC_DIV,
+		PPC_EQV, PPC_EXTSB, PPC_EXTSH, PPC_EXTSW,
 		PPC_EXTSWSLI, PPC_ICBI, PPC_ICBT, PPC_ISEL, PPC_ISYNC,
 		PPC_LBARX, PPC_LBZ, PPC_LBZU, PPC_LBZUX, PPC_LBZX, PPC_LD,
 		PPC_LDARX, PPC_LDBRX, PPC_LDU, PPC_LDUX, PPC_LDX, PPC_LHA,
@@ -22,7 +21,7 @@ package decode_types is
 		PPC_LWAX, PPC_LWBRX, PPC_LWZ, PPC_LWZU, PPC_LWZUX, PPC_LWZX,
 		PPC_MADDHD, PPC_MADDHDU, PPC_MADDLD, PPC_MCRF, PPC_MCRXR,
 		PPC_MCRXRX, PPC_MFCR, PPC_MFOCRF, PPC_MFSPR, PPC_MFTB,
-		PPC_MODSD, PPC_MODSW, PPC_MODUD, PPC_MODUW, PPC_MTCRF,
+		PPC_MOD, PPC_MTCRF,
 		PPC_MFCTR, PPC_MTCTR, PPC_MFLR, PPC_MTLR, PPC_MTOCRF,
 		PPC_MTSPR, PPC_MULHD, PPC_MULHDU, PPC_MULHW, PPC_MULHWU,
 		PPC_MULLD, PPC_MULLI, PPC_MULLW, PPC_NAND, PPC_NEG, PPC_NOR, PPC_NOP,
@@ -46,12 +45,11 @@ package decode_types is
 		OP_CNTLZD, OP_CNTLZW, OP_CNTTZD, OP_CNTTZW, OP_CRAND,
 		OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC,
 		OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
-		OP_DCBZ, OP_DIVD, OP_DIVDE, OP_DIVDEU, OP_DIVDU, OP_DIVW,
-		OP_DIVWE, OP_DIVWEU, OP_DIVWU, OP_EQV, OP_EXTSB, OP_EXTSH,
+		OP_DCBZ, OP_DIV, OP_EQV, OP_EXTSB, OP_EXTSH,
 		OP_EXTSW, OP_EXTSWSLI, OP_ICBI, OP_ICBT, OP_ISEL, OP_ISYNC,
 		OP_LOAD, OP_STORE, OP_MADDHD, OP_MADDHDU, OP_MADDLD, OP_MCRF,
 		OP_MCRXR, OP_MCRXRX, OP_MFCR, OP_MFOCRF, OP_MFCTR, OP_MFLR,
-		OP_MFTB, OP_MFSPR, OP_MODSD, OP_MODSW, OP_MODUD, OP_MODUW,
+		OP_MFTB, OP_MFSPR, OP_MOD,
 		OP_MTCRF, OP_MTOCRF, OP_MTCTR, OP_MTLR, OP_MTSPR, OP_MUL_L64,
 		OP_MUL_H64, OP_MUL_H32, OP_NAND, OP_NEG, OP_NOR, OP_OR,
 		OP_ORC, OP_POPCNTB, OP_POPCNTD, OP_POPCNTW, OP_PRTYD,
@@ -88,7 +86,7 @@ package decode_types is
 
 	constant TOO_OFFSET : integer := 0;
 
-	type unit_t is (NONE, ALU, LDST, MUL);
+	type unit_t is (NONE, ALU, LDST, MUL, DIV);
 	type length_t is (NONE, is1B, is2B, is4B, is8B);
 
 	type decode_rom_t is record
diff --git a/divider.vhdl b/divider.vhdl
new file mode 100644
index 0000000..5cbc856
--- /dev/null
+++ b/divider.vhdl
@@ -0,0 +1,127 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.common.all;
+use work.decode_types.all;
+use work.crhelpers.all;
+
+entity divider is  -- multi-cycle integer divide/modulus unit
+    port (
+        clk   : in std_logic;  -- all state is updated on the rising edge
+        rst   : in std_logic;  -- synchronous reset, sampled on the rising edge of clk
+        d_in  : in Decode2ToDividerType;  -- request; its fields are latched when d_in.valid = '1'
+        d_out : out DividerToWritebackType  -- result; d_out.valid is '1' while count(6) = '1'
+        );
+end entity divider;
+
+architecture behaviour of divider is
+    signal dend       : std_ulogic_vector(127 downto 0);  -- dividend / partial remainder, shifted left each step; bits 127..64 are returned for modulus
+    signal div        : unsigned(63 downto 0);  -- divisor latched from the request
+    signal quot       : std_ulogic_vector(63 downto 0);  -- quotient, one bit shifted in at the bottom per step
+    signal result     : std_ulogic_vector(63 downto 0);  -- remainder or quotient, selected by is_modulus
+    signal sresult    : std_ulogic_vector(63 downto 0);  -- result, negated when neg_result = '1'
+    signal qbit       : std_ulogic;  -- not referenced anywhere in this file
+    signal running    : std_ulogic;  -- '1' while division steps are being performed
+    signal count      : unsigned(6 downto 0);  -- step counter; bit 6 set (count = 64) marks completion
+    signal neg_result : std_ulogic;  -- this and the signals below hold request fields latched when d_in.valid = '1'
+    signal is_modulus : std_ulogic;
+    signal is_32bit   : std_ulogic;
+    signal rc         : std_ulogic;
+    signal write_reg  : std_ulogic_vector(4 downto 0);
+
+    -- Compare value with zero as a signed number, over its low 32 bits when
+    -- is_32 = '1' and over all 64 bits otherwise.  Returns "1000" for a
+    -- negative value, "0100" for a positive one and "0010" for zero.
+    function compare_zero(value : std_ulogic_vector(63 downto 0); is_32 : std_ulogic)
+        return std_ulogic_vector is
+        variable sign_bit  : std_ulogic := value(63);
+        variable magnitude : std_ulogic_vector(62 downto 0) := value(62 downto 0);
+    begin
+        if is_32 = '1' then
+            -- narrow to the low word: bit 31 is the sign, bits 30..0 the rest
+            sign_bit := value(31);
+            magnitude := (others => '0');
+            magnitude(30 downto 0) := value(30 downto 0);
+        end if;
+        if sign_bit = '1' then
+            return "1000";
+        elsif unsigned(magnitude) > 0 then
+            return "0100";
+        end if;
+        return "0010";
+    end function compare_zero;
+
+begin
+    divider_0: process(clk)  -- sequential part: request capture and one shift/subtract step per clock
+    begin
+        if rising_edge(clk) then
+            if rst = '1' then  -- synchronous reset; write_reg, neg_result, is_modulus, is_32bit and rc are left untouched
+                dend <= (others => '0');
+                div <= (others => '0');
+                quot <= (others => '0');
+                running <= '0';
+                count <= "0000000";
+            elsif d_in.valid = '1' then  -- new request: checked before running, so it restarts any division in progress
+                if d_in.is_extended = '1' then
+                    dend <= d_in.dividend & x"0000000000000000";  -- dividend shifted left by 64 bits
+                else
+                    dend <= x"0000000000000000" & d_in.dividend;
+                end if;
+                div <= unsigned(d_in.divisor);
+                quot <= (others => '0');
+                write_reg <= d_in.write_reg;
+                neg_result <= d_in.neg_result;
+                is_modulus <= d_in.is_modulus;
+                is_32bit <= d_in.is_32bit;
+                rc <= d_in.rc;
+                count <= "0000000";
+                running <= '1';
+            elsif running = '1' then  -- one quotient bit is produced per clock
+                if dend(127) = '1' or unsigned(dend(126 downto 63)) >= div then  -- the 65-bit value dend(127 downto 63) is at least div
+                    dend <= std_ulogic_vector(unsigned(dend(126 downto 63)) - div) &
+                            dend(62 downto 0) & '0';  -- subtract div, then shift left by one
+                    quot <= quot(62 downto 0) & '1';
+                else
+                    dend <= dend(126 downto 0) & '0';  -- no subtraction, just shift left by one
+                    quot <= quot(62 downto 0) & '0';
+                end if;
+                if count = "0111111" then  -- count = 63: this is the 64th and last step
+                    running <= '0';
+                end if;
+                count <= count + 1;  -- becomes 64 after the last step, which sets count(6)
+            else
+                count <= "0000000";  -- idle: clears count(6) one cycle after it was set
+            end if;
+        end if;
+    end process;
+
+    divider_1: process(all)  -- combinational output stage feeding writeback
+    begin
+        d_out <= DividerToWritebackInit;  -- defaults: nothing valid, no register or CR write
+        d_out.write_reg_nr <= write_reg;
+        -- result/sresult are driven on every pass so that no latches are inferred
+        if is_modulus = '1' then
+            result <= dend(127 downto 64);  -- remainder is left in the top half of dend
+        else
+            result <= quot;
+        end if;
+        if neg_result = '1' then
+            sresult <= std_ulogic_vector(- signed(result));  -- restore the sign worked out by decode2
+        else
+            sresult <= result;
+        end if;
+        if count(6) = '1' then  -- all 64 steps done: present the result
+            d_out.valid <= '1';
+            d_out.write_reg_enable <= '1';
+            d_out.write_reg_data <= sresult;
+            if rc = '1' then  -- record form: also write the compare-with-zero code
+                d_out.write_cr_enable <= '1';
+                d_out.write_cr_mask <= num_to_fxm(0);
+                d_out.write_cr_data <= compare_zero(sresult, is_32bit) & x"0000000";
+            end if;
+        end if;
+    end process;
+
+end architecture behaviour;
diff --git a/divider_tb.vhdl b/divider_tb.vhdl
new file mode 100644
index 0000000..0fa7f05
--- /dev/null
+++ b/divider_tb.vhdl
@@ -0,0 +1,613 @@
+library ieee;
+use ieee.std_logic_1164.all;
+use ieee.numeric_std.all;
+
+library work;
+use work.decode_types.all;
+use work.common.all;
+use work.glibc_random.all;
+use work.ppc_fx_insns.all;
+
+entity divider_tb is
+end divider_tb;
+
+architecture behave of divider_tb is
+    signal clk              : std_ulogic;
+    signal rst              : std_ulogic;
+    constant clk_period     : time := 10 ns;
+
+    signal d1               : Decode2ToDividerType;
+    signal d2               : DividerToWritebackType;
+begin
+    divider_0: entity work.divider
+        port map (clk => clk, rst => rst, d_in => d1, d_out => d2);
+
+    clk_process: process  -- free-running clock with period clk_period
+    begin
+        clk <= '0';
+        wait for clk_period/2;  -- low half of the period
+        clk <= '1';
+        wait for clk_period/2;  -- high half of the period
+    end process;
+
+    stim_process: process
+        variable ra, rb, rt, behave_rt: std_ulogic_vector(63 downto 0);
+        variable si: std_ulogic_vector(15 downto 0);
+        variable d128: std_ulogic_vector(127 downto 0);
+        variable q128: std_ulogic_vector(127 downto 0);
+    begin
+        rst <= '1';
+        wait for clk_period;
+        rst <= '0';
+
+        d1.valid <= '1';
+        d1.write_reg <= "10001";
+        d1.dividend <= x"0000000010001000";
+        d1.divisor  <= x"0000000000001111";
+        d1.neg_result <= '0';
+        d1.is_32bit <= '0';
+        d1.is_extended <= '0';
+        d1.is_modulus <= '0';
+        d1.rc <= '0';
+
+        wait for clk_period;
+        assert d2.valid = '0';
+
+        d1.valid <= '0';
+
+        for j in 0 to 64 loop
+            wait for clk_period;
+            if d2.valid = '1' then
+                exit;
+            end if;
+        end loop;
+
+        assert d2.valid = '1';
+        assert d2.write_reg_enable = '1';
+        assert d2.write_reg_nr = "10001";
+        assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
+        assert d2.write_cr_enable = '0';
+
+        wait for clk_period;
+        assert d2.valid = '0' report "valid";
+
+        d1.valid <= '1';
+        d1.rc <= '1';
+
+        wait for clk_period;
+        assert d2.valid = '0' report "valid";
+
+        d1.valid <= '0';
+
+        for j in 0 to 64 loop
+            wait for clk_period;
+            if d2.valid = '1' then
+                exit;
+            end if;
+        end loop;
+
+        assert d2.valid = '1';
+        assert d2.write_reg_enable = '1';
+        assert d2.write_reg_nr = "10001";
+        assert d2.write_reg_data = x"000000000000f001" report "result " & to_hstring(d2.write_reg_data);
+        assert d2.write_cr_enable = '1';
+        assert d2.write_cr_mask = "10000000";
+        assert d2.write_cr_data = x"40000000" report "cr data is " & to_hstring(d2.write_cr_data);
+
+        wait for clk_period;
+        assert d2.valid = '0';
+
+        -- test divd
+        report "test divd";
+        divd_loop : for dlength in 1 to 8 loop
+            for vlength in 1 to dlength loop
+                for i in 0 to 100 loop
+                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
+                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
+
+                    if ra(63) = '1' then
+                        d1.dividend <= std_ulogic_vector(- signed(ra));
+                    else
+                        d1.dividend <= ra;
+                    end if;
+                    if rb(63) = '1' then
+                        d1.divisor <= std_ulogic_vector(- signed(rb));
+                    else
+                        d1.divisor <= rb;
+                    end if;
+                    if ra(63) = rb(63) then
+                        d1.neg_result <= '0';
+                    else
+                        d1.neg_result <= '1';
+                    end if;
+                    d1.valid <= '1';
+
+                    wait for clk_period;
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';
+
+                    if rb /= x"0000000000000000" then
+                        behave_rt := ppc_divd(ra, rb);
+                        assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
+                            report "bad divd expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
+                        assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
+                            report "bad CR setting for divd";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test divdu
+        report "test divdu";
+        divdu_loop : for dlength in 1 to 8 loop
+            for vlength in 1 to dlength loop
+                for i in 0 to 100 loop
+                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 64));
+                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
+
+                    d1.dividend <= ra;
+                    d1.divisor <= rb;
+                    d1.neg_result <= '0';
+                    d1.valid <= '1';
+
+                    wait for clk_period;
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';
+
+                    if rb /= x"0000000000000000" then
+                        behave_rt := ppc_divdu(ra, rb);
+                        assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
+                            report "bad divdu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
+                        assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data
+                            report "bad CR setting for divdu";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test divde: signed operands with is_extended set; the dividend is drawn from no more bytes than the divisor
+        report "test divde";
+        divde_loop : for vlength in 1 to 8 loop
+            for dlength in 1 to vlength loop
+                for i in 0 to 100 loop  -- 101 cases per (vlength, dlength) pair
+                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
+                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
+
+                    if ra(63) = '1' then  -- a negative operand is negated before it is driven into the divider
+                        d1.dividend <= std_ulogic_vector(- signed(ra));
+                    else
+                        d1.dividend <= ra;
+                    end if;
+                    if rb(63) = '1' then
+                        d1.divisor <= std_ulogic_vector(- signed(rb));
+                    else
+                        d1.divisor <= rb;
+                    end if;
+                    if ra(63) = rb(63) then  -- neg_result is set only when the operand signs differ
+                        d1.neg_result <= '0';
+                    else
+                        d1.neg_result <= '1';
+                    end if;
+                    d1.is_extended <= '1';  -- NOTE(review): is_32bit/is_modulus are not driven in this loop - confirm they are '0' from earlier tests
+                    d1.valid <= '1';
+
+                    wait for clk_period;  -- the request is held valid for one clk_period
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop  -- poll d2.valid for up to 65 clk_periods
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';  -- fails if no result was seen within the polling window
+
+                    if unsigned(d1.divisor) > unsigned(d1.dividend) then  -- only checked when the driven divisor exceeds the driven dividend (presumably so the quotient fits - confirm)
+                        d128 := ra & x"0000000000000000";  -- reference dividend: ra followed by 64 zero bits
+                        q128 := std_ulogic_vector(signed(d128) / signed(rb));
+                        behave_rt := q128(63 downto 0);  -- expected value is the low 64 bits of the reference quotient
+                        assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
+                            report "bad divde expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
+                        assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data  -- expected CR data: compare of the result with 0, followed by 28 zero bits
+                            report "bad CR setting for divde";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test divdeu: unsigned operands with is_extended set; the dividend is drawn from no more bytes than the divisor
+        report "test divdeu";
+        divdeu_loop : for vlength in 1 to 8 loop
+            for dlength in 1 to vlength loop
+                for i in 0 to 100 loop  -- 101 cases per (vlength, dlength) pair
+                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 64));
+                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
+
+                    d1.dividend <= ra;
+                    d1.divisor <= rb;
+                    d1.neg_result <= '0';
+                    d1.is_extended <= '1';  -- NOTE(review): is_32bit/is_modulus are not driven in this loop - confirm they are '0' from earlier tests
+                    d1.valid <= '1';
+
+                    wait for clk_period;  -- the request is held valid for one clk_period
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop  -- poll d2.valid for up to 65 clk_periods
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';  -- fails if no result was seen within the polling window
+
+                    if unsigned(d1.divisor) > unsigned(d1.dividend) then  -- only checked when the divisor exceeds the dividend (presumably so the quotient fits - confirm)
+                        d128 := ra & x"0000000000000000";  -- reference dividend: ra followed by 64 zero bits
+                        q128 := std_ulogic_vector(unsigned(d128) / unsigned(rb));
+                        behave_rt := q128(63 downto 0);  -- expected value is the low 64 bits of the reference quotient
+                        assert to_hstring(behave_rt) = to_hstring(d2.write_reg_data)
+                            report "bad divdeu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
+                        assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data  -- expected CR data: compare of the result with 0, followed by 28 zero bits
+                            report "bad CR setting for divdeu";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test divw: signed operands of at most 4 bytes with is_32bit set; the divisor is drawn from no more bytes than the dividend
+        report "test divw";
+        divw_loop : for dlength in 1 to 4 loop
+            for vlength in 1 to dlength loop
+                for i in 0 to 100 loop  -- 101 cases per (dlength, vlength) pair
+                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
+                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
+
+                    if ra(63) = '1' then  -- a negative operand is negated before it is driven into the divider
+                        d1.dividend <= std_ulogic_vector(- signed(ra));
+                    else
+                        d1.dividend <= ra;
+                    end if;
+                    if rb(63) = '1' then
+                        d1.divisor <= std_ulogic_vector(- signed(rb));
+                    else
+                        d1.divisor <= rb;
+                    end if;
+                    if ra(63) = rb(63) then  -- neg_result is set only when the operand signs differ
+                        d1.neg_result <= '0';
+                    else
+                        d1.neg_result <= '1';
+                    end if;
+                    d1.is_extended <= '0';
+                    d1.is_32bit <= '1';  -- NOTE(review): is_modulus is not driven in this loop - confirm it is '0' from earlier tests
+                    d1.valid <= '1';
+
+                    wait for clk_period;  -- the request is held valid for one clk_period
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop  -- poll d2.valid for up to 65 clk_periods
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';  -- fails if no result was seen within the polling window
+
+                    if rb /= x"0000000000000000" then  -- the result is only checked for a non-zero divisor
+                        behave_rt := ppc_divw(ra, rb);  -- reference value
+                        assert behave_rt(31 downto 0) = d2.write_reg_data(31 downto 0)  -- only the low 32 bits of the result are compared
+                            report "bad divw expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
+                        assert ppc_cmpi('0', behave_rt, x"0000") & x"0000000" = d2.write_cr_data  -- first argument '0' here vs '1' in the 64-bit tests; presumably selects a 32-bit compare - confirm
+                            report "bad CR setting for divw";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test divwu: unsigned operands of at most 4 bytes with is_32bit set; the divisor is drawn from no more bytes than the dividend
+        report "test divwu";
+        divwu_loop : for dlength in 1 to 4 loop
+            for vlength in 1 to dlength loop
+                for i in 0 to 100 loop  -- 101 cases per (dlength, vlength) pair
+                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 64));
+                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
+
+                    d1.dividend <= ra;
+                    d1.divisor <= rb;
+                    d1.neg_result <= '0';
+                    d1.is_extended <= '0';
+                    d1.is_32bit <= '1';  -- NOTE(review): is_modulus is not driven in this loop - confirm it is '0' from earlier tests
+                    d1.valid <= '1';
+
+                    wait for clk_period;  -- the request is held valid for one clk_period
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop  -- poll d2.valid for up to 65 clk_periods
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';  -- fails if no result was seen within the polling window
+
+                    if rb /= x"0000000000000000" then  -- the result is only checked for a non-zero divisor
+                        behave_rt := ppc_divwu(ra, rb);  -- reference value
+                        assert behave_rt(31 downto 0) = d2.write_reg_data(31 downto 0)  -- only the low 32 bits of the result are compared
+                            report "bad divwu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
+                        assert ppc_cmpi('0', behave_rt, x"0000") & x"0000000" = d2.write_cr_data  -- first argument '0' here vs '1' in the 64-bit tests; presumably selects a 32-bit compare - confirm
+                            report "bad CR setting for divwu";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test divwe: signed 32-bit dividend placed in the upper word of ra; the dividend is drawn from no more bytes than the divisor
+        report "test divwe";
+        divwe_loop : for vlength in 1 to 4 loop
+            for dlength in 1 to vlength loop
+                for i in 0 to 100 loop  -- 101 cases per (vlength, dlength) pair
+                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 32)) & x"00000000";  -- 32-bit value followed by 32 zero bits
+                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
+
+                    if ra(63) = '1' then  -- a negative operand is negated before it is driven into the divider
+                        d1.dividend <= std_ulogic_vector(- signed(ra));
+                    else
+                        d1.dividend <= ra;
+                    end if;
+                    if rb(63) = '1' then
+                        d1.divisor <= std_ulogic_vector(- signed(rb));
+                    else
+                        d1.divisor <= rb;
+                    end if;
+                    if ra(63) = rb(63) then  -- neg_result is set only when the operand signs differ
+                        d1.neg_result <= '0';
+                    else
+                        d1.neg_result <= '1';
+                    end if;
+                    d1.is_extended <= '0';  -- is_extended stays '0' here: the testbench itself has already shifted the dividend
+                    d1.is_32bit <= '1';  -- NOTE(review): is_modulus is not driven in this loop - confirm it is '0' from earlier tests
+                    d1.valid <= '1';
+
+                    wait for clk_period;  -- the request is held valid for one clk_period
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop  -- poll d2.valid for up to 65 clk_periods
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';  -- fails if no result was seen within the polling window
+
+                    if unsigned(d1.divisor(31 downto 0)) > unsigned(d1.dividend(63 downto 32)) then  -- only checked when the divisor's low word exceeds the dividend's high word (presumably so the quotient fits - confirm)
+                        behave_rt := std_ulogic_vector(signed(ra) / signed(rb));  -- reference quotient computed on the full 64-bit values
+                        assert behave_rt(31 downto 0) = d2.write_reg_data(31 downto 0)  -- only the low 32 bits of the result are compared
+                            report "bad divwe expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
+                        assert ppc_cmpi('0', behave_rt, x"0000") & x"0000000" = d2.write_cr_data  -- first argument '0' here vs '1' in the 64-bit tests; presumably selects a 32-bit compare - confirm
+                            report "bad CR setting for divwe";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test divweu: unsigned 32-bit dividend placed in the upper word of ra; the dividend is drawn from no more bytes than the divisor
+        report "test divweu";
+        divweu_loop : for vlength in 1 to 4 loop
+            for dlength in 1 to vlength loop
+                for i in 0 to 100 loop  -- 101 cases per (vlength, dlength) pair
+                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 32)) & x"00000000";  -- 32-bit value followed by 32 zero bits
+                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
+
+                    d1.dividend <= ra;
+                    d1.divisor <= rb;
+                    d1.neg_result <= '0';
+                    d1.is_extended <= '0';  -- is_extended stays '0' here: the testbench itself has already shifted the dividend
+                    d1.is_32bit <= '1';  -- NOTE(review): is_modulus is not driven in this loop - confirm it is '0' from earlier tests
+                    d1.valid <= '1';
+
+                    wait for clk_period;  -- the request is held valid for one clk_period
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop  -- poll d2.valid for up to 65 clk_periods
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';  -- fails if no result was seen within the polling window
+
+                    if unsigned(d1.divisor(31 downto 0)) > unsigned(d1.dividend(63 downto 32)) then  -- only checked when the divisor's low word exceeds the dividend's high word (presumably so the quotient fits - confirm)
+                        behave_rt := std_ulogic_vector(unsigned(ra) / unsigned(rb));  -- reference quotient computed on the full 64-bit values
+                        assert behave_rt(31 downto 0) = d2.write_reg_data(31 downto 0)  -- only the low 32 bits of the result are compared
+                            report "bad divweu expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data) & " for ra = " & to_hstring(ra) & " rb = " & to_hstring(rb);
+                        assert ppc_cmpi('0', behave_rt, x"0000") & x"0000000" = d2.write_cr_data  -- first argument '0' here vs '1' in the 64-bit tests; presumably selects a 32-bit compare - confirm
+                            report "bad CR setting for divweu";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test modsd: signed 64-bit operands with is_modulus set; the divisor is drawn from no more bytes than the dividend
+        report "test modsd";
+        modsd_loop : for dlength in 1 to 8 loop
+            for vlength in 1 to dlength loop
+                for i in 0 to 100 loop  -- 101 cases per (dlength, vlength) pair
+                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
+                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
+
+                    if ra(63) = '1' then  -- a negative operand is negated before it is driven into the divider
+                        d1.dividend <= std_ulogic_vector(- signed(ra));
+                    else
+                        d1.dividend <= ra;
+                    end if;
+                    if rb(63) = '1' then
+                        d1.divisor <= std_ulogic_vector(- signed(rb));
+                    else
+                        d1.divisor <= rb;
+                    end if;
+                    d1.neg_result <= ra(63);  -- for modulus the requested result sign follows the dividend only
+                    d1.is_extended <= '0';
+                    d1.is_32bit <= '0';
+                    d1.is_modulus <= '1';
+                    d1.valid <= '1';
+
+                    wait for clk_period;  -- the request is held valid for one clk_period
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop  -- poll d2.valid for up to 65 clk_periods
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';  -- fails if no result was seen within the polling window
+
+                    if rb /= x"0000000000000000" then  -- the result is only checked for a non-zero divisor
+                        behave_rt := std_ulogic_vector(signed(ra) rem signed(rb));  -- reference value uses the signed rem operator
+                        assert behave_rt = d2.write_reg_data
+                            report "bad modsd expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
+                        assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data  -- expected CR data: compare of the result with 0, followed by 28 zero bits
+                            report "bad CR setting for modsd";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test modud: unsigned 64-bit operands with is_modulus set; the divisor is drawn from no more bytes than the dividend
+        report "test modud";
+        modud_loop : for dlength in 1 to 8 loop
+            for vlength in 1 to dlength loop
+                for i in 0 to 100 loop  -- 101 cases per (dlength, vlength) pair
+                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 64));
+                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
+
+                    d1.dividend <= ra;
+                    d1.divisor <= rb;
+                    d1.neg_result <= '0';
+                    d1.is_extended <= '0';
+                    d1.is_32bit <= '0';
+                    d1.is_modulus <= '1';
+                    d1.valid <= '1';
+
+                    wait for clk_period;  -- the request is held valid for one clk_period
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop  -- poll d2.valid for up to 65 clk_periods
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';  -- fails if no result was seen within the polling window
+
+                    if rb /= x"0000000000000000" then  -- the result is only checked for a non-zero divisor
+                        behave_rt := std_ulogic_vector(unsigned(ra) rem unsigned(rb));  -- reference value uses the unsigned rem operator
+                        assert behave_rt = d2.write_reg_data
+                            report "bad modud expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
+                        assert ppc_cmpi('1', behave_rt, x"0000") & x"0000000" = d2.write_cr_data  -- expected CR data: compare of the result with 0, followed by 28 zero bits
+                            report "bad CR setting for modud";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test modsw: signed operands of at most 4 bytes with is_32bit and is_modulus set; the divisor is drawn from no more bytes than the dividend
+        report "test modsw";
+        modsw_loop : for dlength in 1 to 4 loop
+            for vlength in 1 to dlength loop
+                for i in 0 to 100 loop  -- 101 cases per (dlength, vlength) pair
+                    ra := std_ulogic_vector(resize(signed(pseudorand(dlength * 8)), 64));
+                    rb := std_ulogic_vector(resize(signed(pseudorand(vlength * 8)), 64));
+
+                    if ra(63) = '1' then  -- a negative operand is negated before it is driven into the divider
+                        d1.dividend <= std_ulogic_vector(- signed(ra));
+                    else
+                        d1.dividend <= ra;
+                    end if;
+                    if rb(63) = '1' then
+                        d1.divisor <= std_ulogic_vector(- signed(rb));
+                    else
+                        d1.divisor <= rb;
+                    end if;
+                    d1.neg_result <= ra(63);  -- for modulus the requested result sign follows the dividend only
+                    d1.is_extended <= '0';
+                    d1.is_32bit <= '1';
+                    d1.is_modulus <= '1';
+                    d1.valid <= '1';
+
+                    wait for clk_period;  -- the request is held valid for one clk_period
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop  -- poll d2.valid for up to 65 clk_periods
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';  -- fails if no result was seen within the polling window
+
+                    if rb /= x"0000000000000000" then  -- the result is only checked for a non-zero divisor
+                        behave_rt := x"00000000" & std_ulogic_vector(signed(ra(31 downto 0)) rem signed(rb(31 downto 0)));  -- 32-bit signed remainder, upper word of the reference zeroed
+                        assert behave_rt(31 downto 0) = d2.write_reg_data(31 downto 0)  -- only the low 32 bits of the result are compared
+                            report "bad modsw expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
+                        assert ppc_cmpi('0', behave_rt, x"0000") & x"0000000" = d2.write_cr_data  -- first argument '0' here vs '1' in the 64-bit tests; presumably selects a 32-bit compare - confirm
+                            report "bad CR setting for modsw";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        -- test moduw: unsigned operands of at most 4 bytes with is_32bit and is_modulus set; the divisor is drawn from no more bytes than the dividend
+        report "test moduw";
+        moduw_loop : for dlength in 1 to 4 loop
+            for vlength in 1 to dlength loop
+                for i in 0 to 100 loop  -- 101 cases per (dlength, vlength) pair
+                    ra := std_ulogic_vector(resize(unsigned(pseudorand(dlength * 8)), 64));
+                    rb := std_ulogic_vector(resize(unsigned(pseudorand(vlength * 8)), 64));
+
+                    d1.dividend <= ra;
+                    d1.divisor <= rb;
+                    d1.neg_result <= '0';
+                    d1.is_extended <= '0';
+                    d1.is_32bit <= '1';
+                    d1.is_modulus <= '1';
+                    d1.valid <= '1';
+
+                    wait for clk_period;  -- the request is held valid for one clk_period
+
+                    d1.valid <= '0';
+                    for j in 0 to 64 loop  -- poll d2.valid for up to 65 clk_periods
+                        wait for clk_period;
+                        if d2.valid = '1' then
+                            exit;
+                        end if;
+                    end loop;
+                    assert d2.valid = '1';  -- fails if no result was seen within the polling window
+
+                    if rb /= x"0000000000000000" then  -- the result is only checked for a non-zero divisor
+                        behave_rt := x"00000000" & std_ulogic_vector(unsigned(ra(31 downto 0)) rem unsigned(rb(31 downto 0)));  -- 32-bit unsigned remainder, upper word of the reference zeroed
+                        assert behave_rt(31 downto 0) = d2.write_reg_data(31 downto 0)  -- only the low 32 bits of the result are compared
+                            report "bad moduw expected " & to_hstring(behave_rt) & " got " & to_hstring(d2.write_reg_data);
+                        assert ppc_cmpi('0', behave_rt, x"0000") & x"0000000" = d2.write_cr_data  -- first argument '0' here vs '1' in the 64-bit tests; presumably selects a 32-bit compare - confirm
+                            report "bad CR setting for moduw";
+                    end if;
+                end loop;
+            end loop;
+        end loop;
+
+        assert false report "end of test" severity failure;
+        wait;
+    end process;
+end behave;
diff --git a/execute1.vhdl b/execute1.vhdl
index 858dc5b..3cc4200 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -332,38 +332,6 @@ begin
 					-- Keep our test cases happy for now, ignore trap instructions
 					report "OP_TDI FIXME";
 
-				when OP_DIVDU =>
-					if SIM = true then
-						result := ppc_divdu(e_in.read_data1, e_in.read_data2);
-						result_en := 1;
-					else
-						terminate_out <= '1';
-						report "illegal";
-					end if;
-				when OP_DIVD =>
-					if SIM = true then
-						result := ppc_divd(e_in.read_data1, e_in.read_data2);
-						result_en := 1;
-					else
-						terminate_out <= '1';
-						report "illegal";
-					end if;
-				when OP_DIVWU =>
-					if SIM = true then
-						result := ppc_divwu(e_in.read_data1, e_in.read_data2);
-						result_en := 1;
-					else
-						terminate_out <= '1';
-						report "illegal";
-					end if;
-				when OP_DIVW =>
-					if SIM = true then
-						result := ppc_divw(e_in.read_data1, e_in.read_data2);
-						result_en := 1;
-					else
-						terminate_out <= '1';
-						report "illegal";
-					end if;
 				when others =>
 					terminate_out <= '1';
 					report "illegal";
diff --git a/microwatt.core b/microwatt.core
index b62aef9..c2020a4 100644
--- a/microwatt.core
+++ b/microwatt.core
@@ -23,6 +23,7 @@ filesets:
       - loadstore1.vhdl
       - loadstore2.vhdl
       - multiply.vhdl
+      - divider.vhdl
       - writeback.vhdl
       - insn_helpers.vhdl
       - core.vhdl
diff --git a/writeback.vhdl b/writeback.vhdl
index 82f5e5e..e244960 100644
--- a/writeback.vhdl
+++ b/writeback.vhdl
@@ -12,6 +12,7 @@ entity writeback is
         e_in         : in Execute2ToWritebackType;
         l_in         : in Loadstore2ToWritebackType;
         m_in         : in MultiplyToWritebackType;
+        d_in         : in DividerToWritebackType;
 
         w_out        : out WritebackToRegisterFileType;
         c_out        : out WritebackToCrFileType;
@@ -26,24 +27,30 @@ begin
         variable x : std_ulogic_vector(0 downto 0);
         variable y : std_ulogic_vector(0 downto 0);
         variable z : std_ulogic_vector(0 downto 0);
+        variable w : std_ulogic_vector(0 downto 0);
     begin
         x := "" & e_in.valid;
         y := "" & l_in.valid;
         z := "" & m_in.valid;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
+        w := "" & d_in.valid;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure;
 
         x := "" & e_in.write_enable;
         y := "" & l_in.write_enable;
         z := "" & m_in.write_reg_enable;
-        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
+        w := "" & d_in.write_reg_enable;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z)) + to_integer(unsigned(w))) <= 1 severity failure;
 
-        assert not(e_in.write_cr_enable = '1' and m_in.write_cr_enable = '1');
+        x := "" & e_in.write_cr_enable;
+        y := "" & m_in.write_cr_enable;
+        z := "" & d_in.write_cr_enable;
+        assert (to_integer(unsigned(x)) + to_integer(unsigned(y)) + to_integer(unsigned(z))) <= 1 severity failure;
 
         w_out <= WritebackToRegisterFileInit;
         c_out <= WritebackToCrFileInit;
 
         complete_out <= '0';
-        if e_in.valid = '1' or l_in.valid = '1' or m_in.valid = '1' then
+        if e_in.valid = '1' or l_in.valid = '1' or m_in.valid = '1' or d_in.valid = '1' then
             complete_out <= '1';
         end if;
 
@@ -76,5 +83,17 @@ begin
             c_out.write_cr_mask <= m_in.write_cr_mask;
             c_out.write_cr_data <= m_in.write_cr_data;
         end if;
+
+        if d_in.write_reg_enable = '1' then  -- divider has a register result: copy its register number and data to w_out
+            w_out.write_enable <= '1';
+            w_out.write_reg <= d_in.write_reg_nr;
+            w_out.write_data <= d_in.write_reg_data;
+        end if;
+
+        if d_in.write_cr_enable = '1' then  -- divider has a CR update: copy its mask and data to c_out
+            c_out.write_cr_enable <= '1';
+            c_out.write_cr_mask <= d_in.write_cr_mask;
+            c_out.write_cr_data <= d_in.write_cr_data;
+        end if;
     end process;
 end;