From: Paul Mackerras <paulus@ozlabs.org>
Date: Fri, 13 Dec 2019 04:48:54 +0000 (+1100)
Subject: execute: Do comparisons using the main adder
X-Git-Url: https://git.libre-soc.org/?a=commitdiff_plain;h=d2ca625b3b9c98de607b2a56f8428c70ab343891;p=microwatt.git

execute: Do comparisons using the main adder

This handles OP_CMP like a subtraction; the main adder computes
~RA + RB + 1, and the condition codes are computed from the results.
A direct comparison of the two input operands is used to calculate the
EQ bit of the condition result.  The LT and GT bits are computed from
the carry out from the subtraction and the MSBs of the operands: when
the operand MSBs differ they decide the result directly (taking the
signedness of the comparison into account), otherwise the carry out
decides.  For a 32-bit comparison, the 32-bit carry and bit 31 of the
input operands are used; for a 64-bit comparison, the 64-bit carry and
bit 63 of the operands are used.

It turns out to be more convenient to use the 'signed' field of
the decode table to distinguish signed from unsigned comparisons,
rather than the insn_type.  Therefore this uses OP_CMP for both
cmp and cmpl, which also has the benefit of reducing the number
of values in insn_type_t.

Doing this saves over 200 slice LUTs on the Arty A7-100 and improves
timing slightly as well.

Signed-off-by: Paul Mackerras <paulus@ozlabs.org>
---

diff --git a/decode1.vhdl b/decode1.vhdl
index 6ac3f01..0e42d1b 100644
--- a/decode1.vhdl
+++ b/decode1.vhdl
@@ -44,8 +44,8 @@ architecture behaviour of decode1 is
 		29 =>       (ALU,    OP_AND,       NONE,       CONST_UI_HI, RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', ONE,  '0', '0'), -- andis.
 		18 =>       (ALU,    OP_B,         NONE,       CONST_LI,    NONE, NONE, '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- b
 		16 =>       (ALU,    OP_BC,        SPR,        CONST_BD,    NONE, SPR , '1', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '1', '0'), -- bc
-		11 =>       (ALU,    OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpi
-		10 =>       (ALU,    OP_CMPL,      RA,         CONST_UI,    NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
+		11 =>       (ALU,    OP_CMP,       RA,         CONST_SI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmpi
+		10 =>       (ALU,    OP_CMP,       RA,         CONST_UI,    NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpli
 		34 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '0', '0', '0', '0', NONE, '0', '1'), -- lbz
 		35 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is1B, '0', '0', '1', '0', '0', '0', NONE, '0', '1'), -- lbzu
 		42 =>       (LDST,   OP_LOAD,      RA_OR_ZERO, CONST_SI,    NONE, RT,   '0', '0', '0', '0', ZERO, '0', is2B, '0', '1', '0', '0', '0', '0', NONE, '0', '1'), -- lha
@@ -145,10 +145,10 @@ architecture behaviour of decode1 is
 		2#0000011100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- and
 		2#0000111100#  =>       (ALU,    OP_AND,       NONE,       RB,          RS,   RA,   '0', '0', '1', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- andc
 		-- 2#0011111100# bperm
-		2#0000000000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmp
+		2#0000000000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '1', NONE, '0', '0'), -- cmp
 		2#0111111100#  =>       (ALU,    OP_CMPB,      NONE,       RB,          RS,   RA,   '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpb
 		-- 2#0011100000# cmpeqb
-		2#0000100000#  =>       (ALU,    OP_CMPL,      RA,         RB,          NONE, NONE, '0', '1', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl
+		2#0000100000#  =>       (ALU,    OP_CMP,       RA,         RB,          NONE, NONE, '0', '1', '1', '0', ONE,  '0', NONE, '0', '0', '0', '0', '0', '0', NONE, '0', '0'), -- cmpl
 		-- 2#0011000000# cmprb
 		2#0000111010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '0', '0', RC,   '0', '0'), -- cntlzd
 		2#0000011010#  =>       (ALU,    OP_CNTZ,      NONE,       NONE,        RS,   RA,   '0', '0', '0', '0', ZERO, '0', NONE, '0', '0', '0', '0', '1', '0', RC,   '0', '0'), -- cntlzw
diff --git a/decode_types.vhdl b/decode_types.vhdl
index fdc1e6e..82039bd 100644
--- a/decode_types.vhdl
+++ b/decode_types.vhdl
@@ -4,7 +4,7 @@ use ieee.std_logic_1164.all;
 package decode_types is
     type insn_type_t is (OP_ILLEGAL, OP_NOP, OP_ADD,
 			 OP_ADDPCIS, OP_AND, OP_ATTN, OP_B, OP_BC, OP_BCREG,
-			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPL, OP_CMPRB,
+			 OP_BPERM, OP_CMP, OP_CMPB, OP_CMPEQB, OP_CMPRB,
 			 OP_CNTZ, OP_CRAND,
 			 OP_CRANDC, OP_CREQV, OP_CRNAND, OP_CRNOR, OP_CROR, OP_CRORC,
 			 OP_CRXOR, OP_DARN, OP_DCBF, OP_DCBST, OP_DCBT, OP_DCBTST,
diff --git a/execute1.vhdl b/execute1.vhdl
index 1991009..6889a6a 100644
--- a/execute1.vhdl
+++ b/execute1.vhdl
@@ -193,6 +193,9 @@ begin
         variable abs1, abs2 : signed(63 downto 0);
 	variable overflow : std_ulogic;
 	variable negative : std_ulogic;
+        variable zerohi, zerolo : std_ulogic;
+        variable msb_a, msb_b : std_ulogic;
+        variable a_lt : std_ulogic;
     begin
 	result := (others => '0');
 	result_with_carry := (others => '0');
@@ -348,7 +351,7 @@ begin
 		report "illegal";
 	    when OP_NOP =>
 		-- Do nothing
-	    when OP_ADD =>
+	    when OP_ADD | OP_CMP =>
 		if e_in.invert_a = '0' then
 		    a_inv := e_in.read_data1;
 		else
@@ -359,15 +362,57 @@ begin
 		result := result_with_carry(63 downto 0);
                 carry_32 := result(32) xor a_inv(32) xor e_in.read_data2(32);
                 carry_64 := result_with_carry(64);
-		if e_in.output_carry = '1' then
-		    set_carry(v.e, carry_32, carry_64);
-		end if;
-		if e_in.oe = '1' then
-		    set_ov(v.e,
-			   calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)),
-			   calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31)));
-		end if;
-		result_en := '1';
+                if e_in.insn_type = OP_ADD then
+                    if e_in.output_carry = '1' then
+                        set_carry(v.e, carry_32, carry_64);
+                    end if;
+                    if e_in.oe = '1' then
+                        set_ov(v.e,
+                               calc_ov(a_inv(63), e_in.read_data2(63), carry_64, result_with_carry(63)),
+                               calc_ov(a_inv(31), e_in.read_data2(31), carry_32, result_with_carry(31)));
+                    end if;
+                    result_en := '1';
+                else
+                    -- CMP and CMPL instructions
+                    -- Note, we have done RB - RA, not RA - RB
+                    bf := insn_bf(e_in.insn);
+                    l := insn_l(e_in.insn);
+                    v.e.write_cr_enable := '1';
+                    crnum := to_integer(unsigned(bf));
+                    v.e.write_cr_mask := num_to_fxm(crnum);
+                    zerolo := not (or (e_in.read_data1(31 downto 0) xor e_in.read_data2(31 downto 0)));
+                    zerohi := not (or (e_in.read_data1(63 downto 32) xor e_in.read_data2(63 downto 32)));
+                    if zerolo = '1' and (l = '0' or zerohi = '1') then
+                        -- values are equal
+                        newcrf := "001" & v.e.xerc.so;
+                    else
+                        if l = '1' then
+                            -- 64-bit comparison
+                            msb_a := e_in.read_data1(63);
+                            msb_b := e_in.read_data2(63);
+                        else
+                            -- 32-bit comparison
+                            msb_a := e_in.read_data1(31);
+                            msb_b := e_in.read_data2(31);
+                        end if;
+                        if msb_a /= msb_b then
+                            -- Subtraction might overflow, but
+                            -- comparison is clear from MSB difference.
+                            -- for signed, 0 is greater; for unsigned, 1 is greater
+                            a_lt := msb_a xnor e_in.is_signed;
+                        else
+                            -- Subtraction cannot overflow since MSBs are equal.
+                            -- carry = 1 indicates RA is smaller (signed or unsigned)
+                            a_lt := (not l and carry_32) or (l and carry_64);
+                        end if;
+                        newcrf := a_lt & not a_lt & '0' & v.e.xerc.so;
+                    end if;
+                    for i in 0 to 7 loop
+                        lo := i*4;
+                        hi := lo + 3;
+                        v.e.write_cr_data(hi downto lo) := newcrf;
+                    end loop;
+                end if;
 	    when OP_AND | OP_OR | OP_XOR =>
 		result := logical_result;
 		result_en := '1';
@@ -412,28 +457,6 @@ begin
 	    when OP_CMPB =>
 		result := ppc_cmpb(e_in.read_data3, e_in.read_data2);
 		result_en := '1';
-	    when OP_CMP =>
-		bf := insn_bf(e_in.insn);
-		l := insn_l(e_in.insn);
-		v.e.write_cr_enable := '1';
-		crnum := to_integer(unsigned(bf));
-		v.e.write_cr_mask := num_to_fxm(crnum);
-		for i in 0 to 7 loop
-		    lo := i*4;
-		    hi := lo + 3;
-		    v.e.write_cr_data(hi downto lo) := ppc_cmp(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so);
-		end loop;
-	    when OP_CMPL =>
-		bf := insn_bf(e_in.insn);
-		l := insn_l(e_in.insn);
-		v.e.write_cr_enable := '1';
-		crnum := to_integer(unsigned(bf));
-		v.e.write_cr_mask := num_to_fxm(crnum);
-		for i in 0 to 7 loop
-		    lo := i*4;
-		    hi := lo + 3;
-		    v.e.write_cr_data(hi downto lo) := ppc_cmpl(l, e_in.read_data1, e_in.read_data2, v.e.xerc.so);
-		end loop;
 	    when OP_CNTZ =>
 		result := countzero_result;
 		result_en := '1';