From a2c875c746a7b9b5dcb94fd93d94ab70286dbbb4 Mon Sep 17 00:00:00 2001 From: Steve Reinhardt Date: Tue, 6 Oct 2015 17:26:50 -0700 Subject: [PATCH] x86: implement rcpps and rcpss SSE insts These are packed single-precision approximate reciprocal operations, vector and scalar versions, respectively. This code was basically developed by copying the code for sqrtps and sqrtss. The mrcp micro-op was simplified relative to msqrt since there are no double-precision versions of this operation. --- src/arch/x86/isa/decoder/two_byte_opcodes.isa | 4 +- .../arithmetic/reciprocal_estimation.py | 39 ++++++++++++++++++- src/arch/x86/isa/microops/mediaop.isa | 39 ++++++++++++++++++- 3 files changed, 77 insertions(+), 5 deletions(-) diff --git a/src/arch/x86/isa/decoder/two_byte_opcodes.isa b/src/arch/x86/isa/decoder/two_byte_opcodes.isa index 0ba7434e8..4a21e2900 100644 --- a/src/arch/x86/isa/decoder/two_byte_opcodes.isa +++ b/src/arch/x86/isa/decoder/two_byte_opcodes.isa @@ -463,7 +463,7 @@ 0x0: MOVMSKPS(Gd,VRo); 0x1: SQRTPS(Vo,Wo); 0x2: WarnUnimpl::rqsrtps_Vo_Wo(); - 0x3: WarnUnimpl::rcpps_Vo_Wo(); + 0x3: RCPPS(Vo,Wo); 0x4: ANDPS(Vo,Wo); 0x5: ANDNPS(Vo,Wo); 0x6: ORPS(Vo,Wo); @@ -473,7 +473,7 @@ 0x4: decode OPCODE_OP_BOTTOM3 { 0x1: SQRTSS(Vd,Wd); 0x2: WarnUnimpl::rsqrtss_Vd_Wd(); - 0x3: WarnUnimpl::rcpss_Vd_Wd(); + 0x3: RCPSS(Vd,Wd); default: UD2(); } // operand size (0x66) diff --git a/src/arch/x86/isa/insts/simd128/floating_point/arithmetic/reciprocal_estimation.py b/src/arch/x86/isa/insts/simd128/floating_point/arithmetic/reciprocal_estimation.py index 6e0d7fbb6..666c45ca1 100644 --- a/src/arch/x86/isa/insts/simd128/floating_point/arithmetic/reciprocal_estimation.py +++ b/src/arch/x86/isa/insts/simd128/floating_point/arithmetic/reciprocal_estimation.py @@ -1,4 +1,6 @@ # Copyright (c) 2007 The Hewlett-Packard Development Company +# Copyright (c) 2015 Advanced Micro Devices, Inc. +# # All rights reserved. 
# # The license below extends only to copyright in the software and shall @@ -34,8 +36,41 @@ # OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. # # Authors: Gabe Black +# Steve Reinhardt microcode = ''' -# RCPPS -# RCPSS +def macroop RCPSS_XMM_XMM { + mrcp xmml, xmmlm, size=4, ext=Scalar +}; + +def macroop RCPSS_XMM_M { + ldfp ufp1, seg, sib, disp, dataSize=8 + mrcp xmml, ufp1, size=4, ext=Scalar +}; + +def macroop RCPSS_XMM_P { + rdip t7 + ldfp ufp1, seg, riprel, disp, dataSize=8 + mrcp xmml, ufp1, size=4, ext=Scalar +}; + +def macroop RCPPS_XMM_XMM { + mrcp xmml, xmmlm, size=4, ext=0 + mrcp xmmh, xmmhm, size=4, ext=0 +}; + +def macroop RCPPS_XMM_M { + ldfp ufp1, seg, sib, "DISPLACEMENT", dataSize=8 + ldfp ufp2, seg, sib, "DISPLACEMENT + 8", dataSize=8 + mrcp xmml, ufp1, size=4, ext=0 + mrcp xmmh, ufp2, size=4, ext=0 +}; + +def macroop RCPPS_XMM_P { + rdip t7 + ldfp ufp1, seg, riprel, "DISPLACEMENT", dataSize=8 + ldfp ufp2, seg, riprel, "DISPLACEMENT + 8", dataSize=8 + mrcp xmml, ufp1, size=4, ext=0 + mrcp xmmh, ufp2, size=4, ext=0 +}; ''' diff --git a/src/arch/x86/isa/microops/mediaop.isa b/src/arch/x86/isa/microops/mediaop.isa index e382151ef..e5f04109f 100644 --- a/src/arch/x86/isa/microops/mediaop.isa +++ b/src/arch/x86/isa/microops/mediaop.isa @@ -1,4 +1,6 @@ -/// Copyright (c) 2009 The Regents of The University of Michigan +// Copyright (c) 2009 The Regents of The University of Michigan +// Copyright (c) 2015 Advanced Micro Devices, Inc. +// // All rights reserved. 
 //
 // Redistribution and use in source and binary forms, with or without
@@ -691,6 +693,41 @@ let {{
             FpDestReg_uqw = result;
         '''
 
+    # Mrcp: approximate reciprocal (1/x) per element --- single-precision only
+    class Mrcp(MediaOp):
+        def __init__(self, dest, src, \
+                size = None, destSize = None, srcSize = None, ext = None):
+            super(Mrcp, self).__init__(dest, src,\
+                    "InstRegIndex(0)", size, destSize, srcSize, ext)
+        code = '''
+            union floatInt
+            {
+                float f;
+                uint32_t i;
+            };
+
+            assert(srcSize == 4);  // ISA defines single-precision only
+            assert(srcSize == destSize);
+            const int size = 4;
+            const int sizeBits = size * 8;
+            int items = numItems(size);  // elements to process (helper not shown)
+            uint64_t result = FpDestReg_uqw;  // keep bits not overwritten below
+
+            for (int i = 0; i < items; i++) {
+                int hiIndex = (i + 1) * sizeBits - 1;
+                int loIndex = (i + 0) * sizeBits;
+                uint64_t argBits = bits(FpSrcReg1_uqw, hiIndex, loIndex);
+
+                floatInt fi;
+                fi.i = argBits;  // reinterpret the raw bits as a float
+                // True division: more accuracy than HW provides, but oh well
+                fi.f = 1.0 / fi.f;
+                argBits = fi.i;  // back to raw bits for insertion
+                result = insertBits(result, hiIndex, loIndex, argBits);
+            }
+            FpDestReg_uqw = result;
+        '''
+
     class Maddf(MediaOp):
         code = '''
             union floatInt
-- 
2.30.2