+2017-05-12 Steven Munroe <munroesj@gcc.gnu.org>
+
+ * config.gcc (powerpc*-*-*): Add bmi2intrin.h, bmiintrin.h,
+ and x86intrin.h.
+ * config/rs6000/bmiintrin.h: New file.
+ * config/rs6000/bmi2intrin.h: New file.
+ * config/rs6000/x86intrin.h: New file.
+
2017-05-12 Jeff Law <law@redhat.com>

 * tree-vrp.c (vrp_dom_walker::before_dom_children): Push unwinding
;;
powerpc*-*-*)
cpu_type=rs6000
- extra_headers="ppc-asm.h altivec.h spe.h ppu_intrinsics.h paired.h spu2vmx.h vec_types.h si2vmx.h htmintrin.h htmxlintrin.h"
+ extra_headers="ppc-asm.h altivec.h htmintrin.h htmxlintrin.h"
+ extra_headers="${extra_headers} bmi2intrin.h bmiintrin.h x86intrin.h"
+ extra_headers="${extra_headers} ppu_intrinsics.h spu2vmx.h vec_types.h si2vmx.h"
+ extra_headers="${extra_headers} spe.h paired.h"
case x$with_cpu in
xpowerpc64|xdefault64|x6[23]0|x970|xG5|xpower[3456789]|xpower6x|xrs64a|xcell|xa2|xe500mc64|xe5500|xe6500)
cpu_is_64bit=yes
--- /dev/null
+/* Copyright (C) 2011-2017 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to powerpc64le.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+
+#if !defined _X86INTRIN_H_INCLUDED
+# error "Never use <bmi2intrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _BMI2INTRIN_H_INCLUDED
+#define _BMI2INTRIN_H_INCLUDED
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u32 (unsigned int __X, unsigned int __Y)
+{
+ return ((__X << (32 - __Y)) >> (32 - __Y));
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u32 (unsigned int __X, unsigned int __Y, unsigned int *__P)
+{
+ unsigned long long __res = (unsigned long long) __X * __Y;
+ *__P = (unsigned int) (__res >> 32);
+ return (unsigned int) __res;
+}
+
+#ifdef __PPC64__
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bzhi_u64 (unsigned long long __X, unsigned long long __Y)
+{
+ return ((__X << (64 - __Y)) >> (64 - __Y));
+}
+
+/* __int128 requires the base 64-bit ISA. */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_mulx_u64 (unsigned long long __X, unsigned long long __Y,
+ unsigned long long *__P)
+{
+ unsigned __int128 __res = (unsigned __int128) __X * __Y;
+ *__P = (unsigned long long) (__res >> 64);
+ return (unsigned long long) __res;
+}
+
+#ifdef _ARCH_PWR7
+/* popcount and bpermd require power7 minimum. */
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u64 (unsigned long long __X, unsigned long long __M)
+{
+ unsigned long result = 0x0UL;
+ const unsigned long mask = 0x8000000000000000UL;
+ unsigned long m = __M;
+ unsigned long c, t;
+ unsigned long p;
+
+ /* The pop-count of the mask gives the number of bits from the
+ source to process. This is also needed to shift bits from the
+ source into the correct position for the result. */
+ p = 64 - __builtin_popcountl (__M);
+
+ /* The loop iterates once for each '1' bit in the mask, clearing
+ each mask bit as it is processed. */
+ while (m != 0)
+ {
+ c = __builtin_clzl (m);
+ t = __X << (p - c);
+ m ^= (mask >> c);
+ result |= (t & (mask >> c));
+ p++;
+ }
+ return (result);
+}
+
+extern __inline unsigned long long
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u64 (unsigned long long __X, unsigned long long __M)
+{
+ unsigned long p = 0x4040404040404040UL; /* Initial bit permute control. */
+ const unsigned long mask = 0x8000000000000000UL;
+ unsigned long m = __M;
+ unsigned long c;
+ unsigned long result;
+
+ /* If the mask is constant and selects 8 bits or fewer, we can use
+ the Bit Permute Doubleword (bpermd) instruction, available from
+ PowerISA 2.06 (power7). */
+ if (__builtin_constant_p (__M) && (__builtin_popcountl (__M) <= 8))
+ {
+ /* Since the mask is constant, its popcount is also constant,
+ so the compiler can evaluate the following loop at compile
+ time and use a constant bit permute vector. */
+ for (long i = 0; i < __builtin_popcountl (__M); i++)
+ {
+ c = __builtin_clzl (m);
+ p = (p << 8) | c;
+ m ^= (mask >> c);
+ }
+ result = __builtin_bpermd (p, __X);
+ }
+ else
+ {
+ p = 64 - __builtin_popcountl (__M);
+ result = 0;
+ /* We could use a for loop here, but that combined with
+ -funroll-loops can expand to a lot of code. The while
+ loop avoids unrolling, and the compiler commons the xor
+ that clears the mask bit with the (m != 0) test. The
+ result is a more compact loop setup and body. */
+ while (m != 0)
+ {
+ unsigned long t;
+ c = __builtin_clzl (m);
+ t = (__X & (mask >> c)) >> (p - c);
+ m ^= (mask >> c);
+ result |= (t);
+ p++;
+ }
+ }
+ return (result);
+}
+
+/* These 32-bit implementations depend on the 64-bit pdep/pext
+ above, which require _ARCH_PWR7. */
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pdep_u32 (unsigned int __X, unsigned int __Y)
+{
+ return _pdep_u64 (__X, __Y);
+}
+
+extern __inline unsigned int
+__attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_pext_u32 (unsigned int __X, unsigned int __Y)
+{
+ return _pext_u64 (__X, __Y);
+}
+#endif /* _ARCH_PWR7 */
+#endif /* __PPC64__ */
+
+#endif /* _BMI2INTRIN_H_INCLUDED */
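
A quick illustration of the semantics the pdep/pext/mulx loops above emulate (a minimal sketch, not part of the patch; it assumes a 64-bit power7-or-newer target so that _pdep_u64 and _pext_u64 are defined, and the expected values follow directly from the Intel definitions of these operations):

    #define NO_WARN_X86_INTRINSICS 1
    #include <x86intrin.h>
    #include <assert.h>

    int
    main (void)
    {
      unsigned long long hi, lo;

      /* _pext_u64 gathers the source bits selected by the mask into
         the low-order bits of the result.  */
      assert (_pext_u64 (0x12345678ULL, 0x0000FF00ULL) == 0x56);
      /* _pdep_u64 scatters the low-order source bits to the bit
         positions selected by the mask.  */
      assert (_pdep_u64 (0x56ULL, 0x0000FF00ULL) == 0x5600);
      /* _mulx_u64 returns the low 64 bits of the full product and
         stores the high 64 bits through the pointer.  */
      lo = _mulx_u64 (~0ULL, 2ULL, &hi);
      assert (lo == ~1ULL && hi == 1ULL);
      return 0;
    }
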
--- /dev/null
+/* Copyright (C) 2010-2017 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to powerpc64le.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+
+#if !defined _X86INTRIN_H_INCLUDED
+# error "Never use <bmiintrin.h> directly; include <x86intrin.h> instead."
+#endif
+
+#ifndef _BMIINTRIN_H_INCLUDED
+#define _BMIINTRIN_H_INCLUDED
+
+extern __inline unsigned short __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u16 (unsigned short __X)
+{
+ return __builtin_ctz (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__andn_u32 (unsigned int __X, unsigned int __Y)
+{
+ return (~__X & __Y);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bextr_u32 (unsigned int __X, unsigned int __P, unsigned int __L)
+{
+ return ((__X << (32 - (__L + __P))) >> (32 - __L));
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextr_u32 (unsigned int __X, unsigned int __Y)
+{
+ unsigned int __P, __L;
+ __P = __Y & 0xFF;
+ __L = (__Y >> 8) & 0xFF;
+ return (_bextr_u32 (__X, __P, __L));
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsi_u32 (unsigned int __X)
+{
+ return (__X & -__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsi_u32 (unsigned int __X)
+{
+ return __blsi_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsmsk_u32 (unsigned int __X)
+{
+ return (__X ^ (__X - 1));
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsmsk_u32 (unsigned int __X)
+{
+ return __blsmsk_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsr_u32 (unsigned int __X)
+{
+ return (__X & (__X - 1));
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsr_u32 (unsigned int __X)
+{
+ return __blsr_u32 (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u32 (unsigned int __X)
+{
+ return __builtin_ctz (__X);
+}
+
+extern __inline unsigned int __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tzcnt_u32 (unsigned int __X)
+{
+ return __builtin_ctz (__X);
+}
+
+/* Use the 64-bit shift, rotate, and count-leading-zeros instructions
+ for long long. */
+#ifdef __PPC64__
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__andn_u64 (unsigned long long __X, unsigned long long __Y)
+{
+ return (~__X & __Y);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_bextr_u64 (unsigned long long __X, unsigned int __P, unsigned int __L)
+{
+ return ((__X << (64 - (__L + __P))) >> (64 - __L));
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__bextr_u64 (unsigned long long __X, unsigned long long __Y)
+{
+ unsigned int __P, __L;
+ __P = __Y & 0xFF;
+ __L = (__Y & 0xFF00) >> 8;
+ return (_bextr_u64 (__X, __P, __L));
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsi_u64 (unsigned long long __X)
+{
+ return __X & -__X;
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsi_u64 (unsigned long long __X)
+{
+ return __blsi_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsmsk_u64 (unsigned long long __X)
+{
+ return (__X ^ (__X - 1));
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsmsk_u64 (unsigned long long __X)
+{
+ return __blsmsk_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__blsr_u64 (unsigned long long __X)
+{
+ return (__X & (__X - 1));
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_blsr_u64 (unsigned long long __X)
+{
+ return __blsr_u64 (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+__tzcnt_u64 (unsigned long long __X)
+{
+ return __builtin_ctzll (__X);
+}
+
+extern __inline unsigned long long __attribute__((__gnu_inline__, __always_inline__, __artificial__))
+_tzcnt_u64 (unsigned long long __X)
+{
+ return __builtin_ctzll (__X);
+}
+#endif /* __PPC64__ */
+
+#endif /* _BMIINTRIN_H_INCLUDED */
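
The two-operand __bextr_u32/__bextr_u64 forms pack the start position into bits 0-7 of the control word and the field length into bits 8-15, exactly as the implementations above decode it. A small sketch of the equivalence (not part of the patch; the values follow from the formulas above):

    #define NO_WARN_X86_INTRINSICS 1
    #include <x86intrin.h>
    #include <assert.h>

    int
    main (void)
    {
      unsigned x = 0x12345678;

      /* Extract the 8-bit field starting at bit 8 of x: 0x56.  */
      assert (_bextr_u32 (x, 8, 8) == 0x56);
      /* The same extraction through the packed control word.  */
      assert (__bextr_u32 (x, 8 | (8 << 8)) == 0x56);
      /* __blsi_u32 isolates the lowest set bit: 0x678 & -0x678.  */
      assert (__blsi_u32 (0x678) == 0x8);
      return 0;
    }
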
--- /dev/null
+/* Copyright (C) 2008-2017 Free Software Foundation, Inc.
+
+ This file is part of GCC.
+
+ GCC is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 3, or (at your option)
+ any later version.
+
+ GCC is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ Under Section 7 of GPL version 3, you are granted additional
+ permissions described in the GCC Runtime Library Exception, version
+ 3.1, as published by the Free Software Foundation.
+
+ You should have received a copy of the GNU General Public License and
+ a copy of the GCC Runtime Library Exception along with this program;
+ see the files COPYING3 and COPYING.RUNTIME respectively. If not, see
+ <http://www.gnu.org/licenses/>. */
+
+#ifndef NO_WARN_X86_INTRINSICS
+/* This header is distributed to simplify porting x86_64 code that
+ makes explicit use of Intel intrinsics to powerpc64le.
+ It is the user's responsibility to determine if the results are
+ acceptable and make additional changes as necessary.
+ Note that much code that uses Intel intrinsics can be rewritten in
+ standard C or GNU C extensions, which are more portable and better
+ optimized across multiple targets. */
+#warning "Please read comment above. Use -DNO_WARN_X86_INTRINSICS to disable this warning."
+#endif
+
+#ifndef _X86INTRIN_H_INCLUDED
+#define _X86INTRIN_H_INCLUDED
+
+#include <bmiintrin.h>
+
+#include <bmi2intrin.h>
+
+#endif /* _X86INTRIN_H_INCLUDED */
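
As the porting note above suggests, a consumer either defines NO_WARN_X86_INTRINSICS before the include (or passes -DNO_WARN_X86_INTRINSICS) to accept the caveats, or rewrites the intrinsic calls in GNU C. A sketch of both options (hypothetical functions, not part of the patch; the equivalence holds for nonzero inputs, since the emulation maps __tzcnt_u64 to __builtin_ctzll):

    /* Accept the porting caveats and use the emulated intrinsic...  */
    #define NO_WARN_X86_INTRINSICS 1
    #include <x86intrin.h>

    unsigned long long
    trailing_zeros_intrin (unsigned long long x)
    {
      return __tzcnt_u64 (x);
    }

    /* ...or rewrite in GNU C, which is portable across targets.  */
    unsigned long long
    trailing_zeros_gnu (unsigned long long x)
    {
      return __builtin_ctzll (x);
    }
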
+2017-05-12 Steven Munroe <munroesj@gcc.gnu.org>
+
+ * gcc.target/powerpc/bmi-andn-1.c: New file.
+ * gcc.target/powerpc/bmi-andn-2.c: New file.
+ * gcc.target/powerpc/bmi-bextr-1.c: New file.
+ * gcc.target/powerpc/bmi-bextr-2.c: New file.
+ * gcc.target/powerpc/bmi-bextr-4.c: New file.
+ * gcc.target/powerpc/bmi-bextr-5.c: New file.
+ * gcc.target/powerpc/bmi-blsi-1.c: New file.
+ * gcc.target/powerpc/bmi-blsi-2.c: New file.
+ * gcc.target/powerpc/bmi-blsmsk-1.c: New file.
+ * gcc.target/powerpc/bmi-blsmsk-2.c: New file.
+ * gcc.target/powerpc/bmi-blsr-1.c: New file.
+ * gcc.target/powerpc/bmi-blsr-2.c: New file.
+ * gcc.target/powerpc/bmi-check.h: New file.
+ * gcc.target/powerpc/bmi-tzcnt-1.c: New file.
+ * gcc.target/powerpc/bmi-tzcnt-2.c: New file.
+ * gcc.target/powerpc/bmi2-bzhi32-1.c: New file.
+ * gcc.target/powerpc/bmi2-bzhi64-1.c: New file.
+ * gcc.target/powerpc/bmi2-bzhi64-1a.c: New file.
+ * gcc.target/powerpc/bmi2-check.h: New file.
+ * gcc.target/powerpc/bmi2-mulx32-1.c: New file.
+ * gcc.target/powerpc/bmi2-mulx32-2.c: New file.
+ * gcc.target/powerpc/bmi2-mulx64-1.c: New file.
+ * gcc.target/powerpc/bmi2-mulx64-2.c: New file.
+ * gcc.target/powerpc/bmi2-pdep32-1.c: New file.
+ * gcc.target/powerpc/bmi2-pdep64-1.c: New file.
+ * gcc.target/powerpc/bmi2-pext32-1.c: New file.
+ * gcc.target/powerpc/bmi2-pext64-1.c: New file.
+ * gcc.target/powerpc/bmi2-pext64-1a.c: New file.
+
2017-05-12 Paolo Carlini <paolo.carlini@oracle.com>

 PR c++/60430
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_andn_u64 (long long src1,
+ long long src2,
+ long long dummy)
+{
+ return (~src1 + dummy) & (src2);
+}
+
+static void
+bmi_test()
+{
+ unsigned i;
+
+ long long src = 0xfacec0ffeefacec0;
+ long long res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_andn_u64 (src, src+i, 0);
+ res = __andn_u64 (src, src+i);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64" } */
+/* { dg-require-effective-target lp64 } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_andn_u32 (int src1, int src2, int dummy)
+{
+ return (~src1+dummy) & (src2);
+}
+
+static void
+bmi_test()
+{
+ unsigned i;
+
+ int src = 0xfacec0ff;
+ int res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_andn_u32 (src, src+i, 0);
+ res = __andn_u32 (src, src+i);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O2 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_bextr_u64 (unsigned long long src1,
+ unsigned long long src2)
+{
+ long long res = 0;
+ unsigned char start = (src2 & 0xff);
+ unsigned char len = (int) ((src2 >> 8) & 0xff);
+ if (start < 64) {
+ unsigned i;
+ unsigned last = (start+len) < 64 ? start+len : 64;
+
+ src1 >>= start;
+ for (i=start; i<last; ++i) {
+ res |= (src1 & 1) << (i-start);
+ src1 >>= 1;
+ }
+ }
+
+ return res;
+}
+
+static void
+bmi_test ()
+{
+ unsigned i;
+ unsigned char start, len;
+ unsigned long long src1 = 0xfacec0ffeefacec0;
+ unsigned long long res, res_ref, src2;
+
+ for (i=0; i<5; ++i) {
+ start = (i * 1983) % 64;
+ len = (i + (i * 1983)) % 64;
+
+ src1 = src1 * 3;
+ src2 = start | (((unsigned long long)len) << 8);
+
+ res_ref = calc_bextr_u64 (src1, src2);
+ res = __bextr_u64 (src1, src2);
+
+ if (res != res_ref)
+ abort ();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+unsigned calc_bextr_u32 (unsigned src1, unsigned src2)
+{
+ unsigned res = 0;
+ unsigned char start = (src2 & 0xff);
+ unsigned char len = (int) ((src2 >> 8) & 0xff);
+ if (start < 32) {
+ unsigned i;
+ unsigned last = (start+len) < 32 ? start+len : 32;
+
+ src1 >>= start;
+ for (i=start; i<last; ++i) {
+ res |= (src1 & 1) << (i-start);
+ src1 >>= 1;
+ }
+ }
+
+ return res;
+}
+
+static void
+bmi_test ()
+{
+ unsigned i;
+ unsigned char start, len;
+ unsigned src1 = 0xfacec0ff;
+ unsigned res, res_ref, src2;
+
+ for (i=0; i<5; ++i) {
+ start = (i * 1983) % 32;
+ len = (i + (i * 1983)) % 32;
+
+ src1 = src1 * 3;
+ src2 = start | (((unsigned)len) << 8);
+
+ res_ref = calc_bextr_u32 (src1, src2);
+ res = __bextr_u32 (src1, src2);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+unsigned calc_bextr_u32 (unsigned src1, unsigned src2)
+{
+ unsigned res = 0;
+ unsigned char start = (src2 & 0xff);
+ unsigned char len = (int) ((src2 >> 8) & 0xff);
+ if (start < 32) {
+ unsigned i;
+ unsigned last = (start+len) < 32 ? start+len : 32;
+
+ src1 >>= start;
+ for (i=start; i<last; ++i) {
+ res |= (src1 & 1) << (i-start);
+ src1 >>= 1;
+ }
+ }
+
+ return res;
+}
+
+static void
+bmi_test ()
+{
+ unsigned i;
+ unsigned char start, len;
+ unsigned src1 = 0xfacec0ff;
+ unsigned res, res_ref, src2;
+
+ for (i=0; i<5; ++i) {
+ start = i * 4;
+ len = i * 4;
+
+ src1 = src1 * 3;
+ src2 = (start & 0xff) | ((len & 0xff) << 8);
+
+ res_ref = calc_bextr_u32 (src1, src2);
+ res = _bextr_u32 (src1, start, len);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_bextr_u64 (unsigned long long src1,
+ unsigned long long src2)
+{
+ long long res = 0;
+ unsigned char start = (src2 & 0xff);
+ unsigned char len = (int) ((src2 >> 8) & 0xff);
+ if (start < 64) {
+ unsigned i;
+ unsigned last = (start+len) < 64 ? start+len : 64;
+
+ src1 >>= start;
+ for (i=start; i<last; ++i) {
+ res |= (src1 & 1) << (i-start);
+ src1 >>= 1;
+ }
+ }
+
+ return res;
+}
+
+static void
+bmi_test ()
+{
+ unsigned i;
+ unsigned char start, len;
+ unsigned long long src1 = 0xfacec0ffeefacec0;
+ unsigned long long res, res_ref, src2;
+
+ for (i=0; i<5; ++i) {
+ start = i * 4;
+ len = i * 3;
+ src1 = src1 * 3;
+ src2 = (start & 0xff) | ((len & 0xff) << 8);
+
+ res_ref = calc_bextr_u64 (src1, src2);
+ res = _bextr_u64 (src1, start, len);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Fool the compiler so it does not generate blsi here. */
+long long calc_blsi_u64 (long long src1, long long src2)
+{
+ return (-src1) & (src2);
+}
+
+static void
+bmi_test()
+{
+ unsigned i;
+
+ long long src = 0xfacec0ffeefacec0;
+ long long res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsi_u64 (src, src);
+ res = __blsi_u64 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Fool the compiler so it does not generate blsi here. */
+int calc_blsi_u32 (int src1, int src2)
+{
+ return (-src1) & (src2);
+}
+
+static void
+bmi_test()
+{
+ unsigned i;
+ int src = 0xfacec0ff;
+ int res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsi_u32 (src, src);
+ res = __blsi_u32 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Trick the compiler so it does not generate the target insn here. */
+long long calc_blsmsk_u64 (long long src1, long long src2)
+{
+ return (src1-1) ^ (src2);
+}
+
+static void
+bmi_test ()
+{
+ unsigned i;
+ long long src = 0xfacec0ffeefacec0;
+ long long res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsmsk_u64 (src, src);
+ res = __blsmsk_u64 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+/* Trick the compiler so it does not generate the target insn here. */
+int calc_blsmsk_u32 (int src1, int src2)
+{
+ return (src1-1) ^ (src2);
+}
+
+static void
+bmi_test ()
+{
+ unsigned i;
+ int src = 0xfacec0ff;
+ int res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsmsk_u32 (src, src);
+ res = __blsmsk_u32 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_blsr_u64 (long long src1, long long src2)
+{
+ return (src1-1) & (src2);
+}
+
+static void
+bmi_test()
+{
+ unsigned i;
+ long long src = 0xfacec0ffeefacec0;
+ long long res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsr_u64 (src, src);
+ res = __blsr_u64 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+int calc_blsr_u32 (int src1, int src2)
+{
+ return (src1-1) & (src2);
+}
+
+static void
+bmi_test ()
+{
+ unsigned i;
+ int src = 0xfacec0ff;
+ int res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_blsr_u32 (src, src);
+ res = __blsr_u32 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+
+static void bmi_test (void);
+
+static void
+__attribute__ ((noinline))
+do_test (void)
+{
+ bmi_test ();
+}
+
+int
+main ()
+{
+ /* Need 64-bit hardware to handle 64-bit longs with single instructions. */
+ if ( __builtin_cpu_supports ("ppc64") )
+ {
+ do_test ();
+#ifdef DEBUG
+ printf ("PASSED\n");
+#endif
+ }
+#ifdef DEBUG
+ else
+ printf ("SKIPPED\n");
+#endif
+
+ return 0;
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+long long calc_tzcnt_u64 (long long src)
+{
+ int i;
+ int res = 0;
+
+ while ( (res<64) && ((src&1) == 0)) {
+ ++res;
+ src >>= 1;
+ }
+
+ return res;
+}
+
+static void
+bmi_test ()
+{
+ unsigned i;
+ long long src = 0xfacec0ffeefacec0;
+ long long res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = (i + src) << i;
+
+ res_ref = calc_tzcnt_u64 (src);
+ res = __tzcnt_u64 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-require-effective-target lp64 } */
+/* { dg-options "-O3 -m64 -fno-inline" } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi-check.h"
+
+int calc_tzcnt_u32 (int src)
+{
+ int i;
+ int res = 0;
+
+ while ( (res<32) && ((src&1) == 0)) {
+ ++res;
+ src >>= 1;
+ }
+ return res;
+}
+
+static void
+bmi_test ()
+{
+ unsigned i;
+ int src = 0xfacec0ff;
+ int res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ src = i + (src << i);
+
+ res_ref = calc_tzcnt_u32 (src);
+ res = __tzcnt_u32 (src);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_bzhi_u32 (unsigned a, int l)
+{
+ unsigned res = a;
+ int i;
+ for (i = 0; i < 32 - l; ++i)
+ res &= ~(1 << (31 - i));
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned src = 0xce7ace0f;
+ unsigned res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_bzhi_u32 (src, i * 2);
+ res = _bzhi_u32 (src, i * 2);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_bzhi_u64 (unsigned long long a, int l)
+{
+ unsigned long long res = a;
+ int i;
+ for (i = 0; i < 64 - l; ++i)
+ res &= ~(1LL << (63 - i));
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned long long src = 0xce7ace0ce7ace0ff;
+ unsigned long long res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_bzhi_u64 (src, i * 2);
+ res = _bzhi_u64 (src, i * 2);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+
+unsigned long long
+test__bzhi_u64_group (unsigned long long a)
+{
+ /* bzhi is implemented in source as shift left then shift right
+ to clear the high order bits.
+ For the case where the starting index is const, the compiler
+ should reduce this to a single Rotate Left Doubleword
+ Immediate then Clear Left (rldicl) instruction. */
+ unsigned long long res;
+ res = _bzhi_u64 (a, 8);
+ res += _bzhi_u64 (a, 16);
+ res += _bzhi_u64 (a, 24);
+ res += _bzhi_u64 (a, 32);
+ res += _bzhi_u64 (a, 40);
+ res += _bzhi_u64 (a, 48);
+ return (res);
+}
+/* The resulting assembler should have 6 rldicl instructions and no
+ sld or srd instructions. */
+
+/* { dg-final { scan-assembler-times "rldicl" 6 } } */
+/* { dg-final { scan-assembler-not "sld" } } */
+/* { dg-final { scan-assembler-not "srd" } } */
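
The expected reduction follows from the identity that a constant-index bzhi keeps only the low n bits, a mask rldicl can form in one instruction. A sketch of the identity (hypothetical helper, not part of the patch; it assumes 0 < n < 64, matching the constant arguments used above):

    /* For a constant 0 < n < 64, the shift pair in _bzhi_u64 is
       equivalent to masking off all but the low n bits.  */
    unsigned long long
    bzhi_as_mask (unsigned long long x, unsigned n)
    {
      return x & ((1ULL << n) - 1);
    }
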
--- /dev/null
+#include <stdio.h>
+#include <stdlib.h>
+
+static void bmi2_test (void);
+
+static void
+__attribute__ ((noinline))
+do_test (void)
+{
+ bmi2_test ();
+}
+
+int
+main ()
+{
+ /* The BMI2 pext tests require the Bit Permute Doubleword
+ (bpermd) instruction, added in PowerISA 2.06 along with the VSX
+ facility, so we can test for arch_2_06. */
+ if ( __builtin_cpu_supports ("arch_2_06") )
+ {
+ do_test ();
+#ifdef DEBUG
+ printf ("PASSED\n");
+#endif
+ }
+#ifdef DEBUG
+ else
+ printf ("SKIPPED\n");
+#endif
+
+ return 0;
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned volatile a, unsigned b)
+{
+ unsigned long long res = 0;
+ int i;
+ for (i = 0; i < b; ++i)
+ res += a;
+
+ return res;
+}
+
+__attribute__((noinline))
+unsigned long long
+gen_mulx (unsigned a, unsigned b)
+{
+ unsigned long long res;
+
+ res = (unsigned long long)a * b;
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned a = 0xce7ace0;
+ unsigned b = 0xfacefff;
+ unsigned long long res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ a = a * (i + 1);
+ b = b / (i + 1);
+
+ res_ref = calc_mul_u32 (a, b);
+ res = gen_mulx (a, b);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_mul_u32 (unsigned volatile a, unsigned b)
+{
+ unsigned long long res = 0;
+ int i;
+ for (i = 0; i < b; ++i)
+ res += a;
+
+ return res;
+}
+
+__attribute__((noinline))
+unsigned calc_mulx_u32 (unsigned x, unsigned y, unsigned *res_h)
+{
+ return (unsigned) _mulx_u32 (x, y, res_h);
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned a = 0xce7ace0;
+ unsigned b = 0xfacefff;
+ unsigned res_l, res_h;
+ unsigned long long res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ a = a * (i + 1);
+ b = b / (i + 1);
+
+ res_ref = calc_mul_u32 (a, b);
+ res_l = calc_mulx_u32 (a, b, &res_h);
+
+ res = ((unsigned long long) res_h << 32) | res_l;
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned __int128
+calc_mul_u64 (unsigned long long volatile a, unsigned long long b)
+{
+ unsigned __int128 res = 0;
+ int i;
+ for (i = 0; i < b; ++i)
+ res += (unsigned __int128) a;
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned long long a = 0xce7ace0ce7ace0;
+ unsigned long long b = 0xface;
+ unsigned __int128 res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ a = a * (i + 1);
+ b = b / (i + 1);
+
+ res_ref = calc_mul_u64 (a, b);
+ res = (unsigned __int128) a * b;
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned __int128
+calc_mul_u64 (unsigned long long volatile a, unsigned long long b)
+{
+ unsigned __int128 res = 0;
+ int i;
+ for (i = 0; i < b; ++i)
+ res += (unsigned __int128) a;
+
+ return res;
+}
+
+__attribute__((noinline))
+unsigned long long
+calc_mulx_u64 (unsigned long long x,
+ unsigned long long y,
+ unsigned long long *res_h)
+{
+ return _mulx_u64 (x, y, res_h);
+}
+
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned long long a = 0xce7ace0ce7ace0;
+ unsigned long long b = 0xface;
+ unsigned long long res_l, res_h;
+ unsigned __int128 res, res_ref;
+
+ for (i=0; i<5; ++i) {
+ a = a * (i + 1);
+ b = b / (i + 1);
+
+ res_ref = calc_mul_u64 (a, b);
+
+ res_l = calc_mulx_u64 (a, b, &res_h);
+
+ res = ((unsigned __int128) res_h << 64) | res_l;
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pdep_u32 (unsigned a, int mask)
+{
+ unsigned res = 0;
+ int i, k = 0;
+
+ for (i = 0; i < 32; ++i)
+ if (mask & (1 << i)) {
+ res |= ((a & (1 << k)) >> k) << i;
+ ++k;
+ }
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned src = 0xce7acc;
+ unsigned res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_pdep_u32 (src, i * 3);
+ res = _pdep_u32 (src, i * 3);
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pdep_u64 (unsigned long long a, unsigned long long mask)
+{
+ unsigned long long res = 0;
+ unsigned long long i, k = 0;
+
+ for (i = 0; i < 64; ++i)
+ if (mask & (1LL << i)) {
+ res |= ((a & (1LL << k)) >> k) << i;
+ ++k;
+ }
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned long long i;
+ unsigned long long src = 0xce7acce7acce7ac;
+ unsigned long long res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_pdep_u64 (src, ~(i * 3));
+ res = _pdep_u64 (src, ~(i * 3));
+
+ if (res != res_ref)
+ abort ();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned
+calc_pext_u32 (unsigned a, unsigned mask)
+{
+ unsigned res = 0;
+ int i, k = 0;
+
+ for (i = 0; i < 32; ++i)
+ if (mask & (1 << i)) {
+ res |= ((a & (1 << i)) >> i) << k;
+ ++k;
+ }
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned i;
+ unsigned src = 0xce7acc;
+ unsigned res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_pext_u32 (src, ~(i * 3));
+ res = _pext_u32 (src, ~(i * 3));
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do run } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+#include "bmi2-check.h"
+
+__attribute__((noinline))
+unsigned long long
+calc_pext_u64 (unsigned long long a, unsigned long long mask)
+{
+ unsigned long long res = 0;
+ int i, k = 0;
+
+ for (i = 0; i < 64; ++i)
+ if (mask & (1LL << i)) {
+ res |= ((a & (1LL << i)) >> i) << k;
+ ++k;
+ }
+
+ return res;
+}
+
+static void
+bmi2_test ()
+{
+ unsigned long long i;
+ unsigned long long src = 0xce7acce7acce7ac;
+ unsigned long long res, res_ref;
+
+ for (i = 0; i < 5; ++i) {
+ src = src * (i + 1);
+
+ res_ref = calc_pext_u64 (src, ~(i * 3));
+ res = _pext_u64 (src, ~(i * 3));
+
+ if (res != res_ref)
+ abort();
+ }
+}
--- /dev/null
+/* { dg-do compile } */
+/* { dg-options "-O3 -m64 -mcpu=power7" } */
+/* { dg-require-effective-target powerpc_vsx_ok } */
+
+#define NO_WARN_X86_INTRINSICS 1
+#include <x86intrin.h>
+
+unsigned long long
+test__pext_cmask_u64 (unsigned long long a[4])
+{
+ /* The _pext implementation is nominally a popcount of the mask,
+ followed by a loop using count leading zeros to find the
+ next bit to process.
+ If the mask is a const, the popcount should be folded and
+ constant propagation should eliminate the control-word
+ generation loop, producing a single constant bpermd permute
+ control word.
+ This test verifies that the compiler replaces the mask
+ popcount and loop with a const bpermd control word and
+ generates the bpermd for this case. */
+ const unsigned long mask = 0x00000000100000a4UL;
+ unsigned long res;
+ res = _pext_u64 (a[0], mask);
+ res = (res << 8) | _pext_u64 (a[1], mask);
+ res = (res << 8) | _pext_u64 (a[2], mask);
+ res = (res << 8) | _pext_u64 (a[3], mask);
+ return (res);
+}
+/* The resulting assembler should have 4 bpermd instructions and no
+ popcntd or cntlzd instructions. */
+
+/* { dg-final { scan-assembler-times "bpermd" 4 } } */
+/* { dg-final { scan-assembler-not "popcntd" } } */
+/* { dg-final { scan-assembler-not "cntlzd" } } */