From 16a559c9c66b3e810860b59c4099527b38a5337e Mon Sep 17 00:00:00 2001
From: Marc Orr <marc.orr@gmail.com>
Date: Sat, 19 May 2012 04:32:25 -0700
Subject: [PATCH] x86 ISA: Implement the sse3 haddps instruction.

Shuffle the 32-bit values into position, and then add them in parallel.
---
 src/arch/x86/isa/decoder/two_byte_opcodes.isa |  2 +-
 .../arithmetic/horizontal_addition.py         | 37 ++++++++++++++++++-
 2 files changed, 37 insertions(+), 2 deletions(-)

diff --git a/src/arch/x86/isa/decoder/two_byte_opcodes.isa b/src/arch/x86/isa/decoder/two_byte_opcodes.isa
index 378d426e3..34b79a6a4 100644
--- a/src/arch/x86/isa/decoder/two_byte_opcodes.isa
+++ b/src/arch/x86/isa/decoder/two_byte_opcodes.isa
@@ -669,7 +669,7 @@
                     }
                     // repne (0xF2)
                     0x8: decode OPCODE_OP_BOTTOM3 {
-                        0x4: WarnUnimpl::haddps_Vo_Wo();
+                        0x4: HADDPS(Vo,Wo);
                         0x5: WarnUnimpl::hsubps_Vo_Wo();
                         default: UD2();
                     }
diff --git a/src/arch/x86/isa/insts/simd128/floating_point/arithmetic/horizontal_addition.py b/src/arch/x86/isa/insts/simd128/floating_point/arithmetic/horizontal_addition.py
index 8e5a01fbf..53d8d9354 100644
--- a/src/arch/x86/isa/insts/simd128/floating_point/arithmetic/horizontal_addition.py
+++ b/src/arch/x86/isa/insts/simd128/floating_point/arithmetic/horizontal_addition.py
@@ -36,7 +36,42 @@
 # Authors: Gabe Black
 
 microcode = '''
-# HADDPS
+def macroop HADDPS_XMM_XMM {
+    shuffle ufp3, xmml, xmmh, ext=((2 << 2) | 0), size=4
+    shuffle ufp4, xmml, xmmh, ext=((3 << 2) | 1), size=4
+    shuffle ufp5, xmmlm, xmmhm, ext=((2 << 2) | 0), size=4
+    shuffle ufp6, xmmlm, xmmhm, ext=((3 << 2) | 1), size=4
+    # Adds come last: xmmlm/xmmhm may be the same register as xmml/xmmh.
+    maddf xmml, ufp3, ufp4, size=4
+    maddf xmmh, ufp5, ufp6, size=4
+};
+
+def macroop HADDPS_XMM_M {
+    ldfp ufp5, seg, sib, disp, dataSize=8
+    ldfp ufp6, seg, sib, "DISPLACEMENT+8", dataSize=8
+    # Split each operand by the ext selectors: 0 and 2, then 1 and 3.
+    shuffle ufp1, xmml, xmmh, ext=((2 << 2) | 0), size=4
+    shuffle ufp2, xmml, xmmh, ext=((3 << 2) | 1), size=4
+    shuffle ufp3, ufp5, ufp6, ext=((2 << 2) | 0), size=4
+    shuffle ufp4, ufp5, ufp6, ext=((3 << 2) | 1), size=4
+    # Sum the pairs: register operand into xmml, memory operand into xmmh.
+    maddf xmml, ufp1, ufp2, size=4
+    maddf xmmh, ufp3, ufp4, size=4
+};
+
+def macroop HADDPS_XMM_P {
+    rdip t7
+    ldfp ufp5, seg, riprel, disp, dataSize=8
+    ldfp ufp6, seg, riprel, "DISPLACEMENT+8", dataSize=8
+    # Split each operand by the ext selectors: 0 and 2, then 1 and 3.
+    shuffle ufp1, xmml, xmmh, ext=((2 << 2) | 0), size=4
+    shuffle ufp2, xmml, xmmh, ext=((3 << 2) | 1), size=4
+    shuffle ufp3, ufp5, ufp6, ext=((2 << 2) | 0), size=4
+    shuffle ufp4, ufp5, ufp6, ext=((3 << 2) | 1), size=4
+    # Sum the pairs: register operand into xmml, memory operand into xmmh.
+    maddf xmml, ufp1, ufp2, size=4
+    maddf xmmh, ufp3, ufp4, size=4
+};
 
 def macroop HADDPD_XMM_XMM {
     maddf ufp1, xmmh , xmml, size=8, ext=Scalar
-- 
2.30.2